Upgrade to Encode 2.11, plus a patch to PerlIO::encoding
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
85982a32 1/*
56ff7374 2 $Id: Encode.xs,v 2.5 2005/08/05 10:58:25 dankogai Exp dankogai $
85982a32 3 */
4
fcf2db38 5#define PERL_NO_GET_CONTEXT
2c674647 6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
2f2b4ff2 9#define U8 U8
10#include "encode.h"
10c5ecbb 11
12# define PERLIO_MODNAME "PerlIO::encoding"
0ab8f81e 13# define PERLIO_FILENAME "PerlIO/encoding.pm"
aae85ceb 14
85982a32 15/* set 1 or more to profile. t/encoding.t dumps core because of
16 Perl_warner and PerlIO don't work well */
c6a7db43 17#define ENCODE_XS_PROFILE 0
39cf9a5e 18
85982a32 19/* set 0 to disable floating point to calculate buffer size for
20 encode_method(). 1 is recommended. 2 restores NI-S original */
c6a7db43 21#define ENCODE_XS_USEFP 1
39cf9a5e 22
fcf2db38 23#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
2f5768b8 24 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 25 return (y)0; /* fool picky compilers */ \
87714904 26 }
85982a32 27/**/
011b2d2f 28
7e9a885a 29UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
30UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
33af2bc7 31
7f0d54d7 32#define UTF8_ALLOW_STRICT 0
33#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \
34 ~(UTF8_ALLOW_CONTINUATION | \
35 UTF8_ALLOW_NON_CONTINUATION | \
36 UTF8_ALLOW_LONG))
37
b0b300a3 38void
aa0053b7 39Encode_XSEncoding(pTHX_ encode_t * enc)
2f2b4ff2 40{
aa0053b7 41 dSP;
42 HV *stash = gv_stashpv("Encode::XS", TRUE);
43 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
44 int i = 0;
45 PUSHMARK(sp);
46 XPUSHs(sv);
47 while (enc->name[i]) {
48 const char *name = enc->name[i++];
49 XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
50 }
51 PUTBACK;
52 call_pv("Encode::define_encoding", G_DISCARD);
53 SvREFCNT_dec(sv);
2f2b4ff2 54}
55
aa0053b7 56void
57call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
58{
85982a32 59 /* Exists for breakpointing */
aa0053b7 60}
67e989fb 61
85982a32 62
2fc614e0 63#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s"
64#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
65
2f2b4ff2 66static SV *
aa0053b7 67encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
220e2d4e 68 int check, STRLEN * offset, SV * term, int * retcode)
2f2b4ff2 69{
aa0053b7 70 STRLEN slen;
71 U8 *s = (U8 *) SvPV(src, slen);
3aececda 72 STRLEN tlen = slen;
73 STRLEN ddone = 0;
74 STRLEN sdone = 0;
39cf9a5e 75
3c49ab08 76 /* We allocate slen+1.
85982a32 77 PerlIO dumps core if this value is smaller than this. */
3c49ab08 78 SV *dst = sv_2mortal(newSV(slen+1));
85982a32 79 U8 *d = (U8 *)SvPVX(dst);
80 STRLEN dlen = SvLEN(dst)-1;
220e2d4e 81 int code = 0;
82 STRLEN trmlen = 0;
cc7dbc11 83 U8 *trm = term ? (U8*) SvPV(term, trmlen) : NULL;
220e2d4e 84
85 if (offset) {
86 s += *offset;
6be7c101 87 if (slen > *offset){ /* safeguard against slen overflow */
88 slen -= *offset;
89 }else{
90 slen = 0;
91 }
220e2d4e 92 tlen = slen;
93 }
85982a32 94
6be7c101 95 if (slen == 0){
85982a32 96 SvCUR_set(dst, 0);
97 SvPOK_only(dst);
98 goto ENCODE_END;
99 }
100
220e2d4e 101 while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check,
102 trm, trmlen)) )
85982a32 103 {
104 SvCUR_set(dst, dlen+ddone);
105 SvPOK_only(dst);
0b3236bb 106
220e2d4e 107 if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL ||
108 code == ENCODE_FOUND_TERM) {
85982a32 109 break;
110 }
111 switch (code) {
112 case ENCODE_NOSPACE:
113 {
114 STRLEN more = 0; /* make sure you initialize! */
115 STRLEN sleft;
116 sdone += slen;
117 ddone += dlen;
118 sleft = tlen - sdone;
fcb875d4 119#if ENCODE_XS_PROFILE >= 2
85982a32 120 Perl_warn(aTHX_
121 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
122 more, sdone, sleft, SvLEN(dst));
fcb875d4 123#endif
85982a32 124 if (sdone != 0) { /* has src ever been processed ? */
39cf9a5e 125#if ENCODE_XS_USEFP == 2
85982a32 126 more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
127 - SvLEN(dst);
39cf9a5e 128#elif ENCODE_XS_USEFP
6e21dc91 129 more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft);
39cf9a5e 130#else
85982a32 131 /* safe until SvLEN(dst) == MAX_INT/16 */
132 more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
39cf9a5e 133#endif
39cf9a5e 134 }
85982a32 135 more += UTF8_MAXLEN; /* insurance policy */
136 d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
137 /* dst need to grow need MORE bytes! */
138 if (ddone >= SvLEN(dst)) {
139 Perl_croak(aTHX_ "Destination couldn't be grown.");
140 }
141 dlen = SvLEN(dst)-ddone-1;
142 d += ddone;
143 s += slen;
144 slen = tlen-sdone;
145 continue;
146 }
147 case ENCODE_NOREP:
148 /* encoding */
c6a7db43 149 if (dir == enc->f_utf8) {
85982a32 150 STRLEN clen;
151 UV ch =
3e952a88 152 utf8n_to_uvuni(s+slen, (SvCUR(src)-slen),
b0b300a3 153 &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY);
56ff7374 154 /* if non-representable multibyte prefix at end of current buffer - break*/
155 if (clen > tlen - sdone) break;
85982a32 156 if (check & ENCODE_DIE_ON_ERR) {
2fc614e0 157 Perl_croak(aTHX_ ERR_ENCODE_NOMAP,
158 (UV)ch, enc->name[0]);
4089adc4 159 return &PL_sv_undef; /* never reaches but be safe */
160 }
161 if (check & ENCODE_WARN_ON_ERR){
162 Perl_warner(aTHX_ packWARN(WARN_UTF8),
2fc614e0 163 ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]);
4089adc4 164 }
165 if (check & ENCODE_RETURN_ON_ERR){
166 goto ENCODE_SET_SRC;
167 }
f9d05ba3 168 if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
169 SV* subchar =
170 newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" :
171 check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
172 "&#x%" UVxf ";", (UV)ch);
4089adc4 173 sdone += slen + clen;
f9d05ba3 174 ddone += dlen + SvCUR(subchar);
175 sv_catsv(dst, subchar);
176 SvREFCNT_dec(subchar);
4089adc4 177 } else {
178 /* fallback char */
179 sdone += slen + clen;
180 ddone += dlen + enc->replen;
181 sv_catpvn(dst, (char*)enc->rep, enc->replen);
c6a7db43 182 }
b2704119 183 }
85982a32 184 /* decoding */
c6a7db43 185 else {
85982a32 186 if (check & ENCODE_DIE_ON_ERR){
2fc614e0 187 Perl_croak(aTHX_ ERR_DECODE_NOMAP,
436c6dd3 188 enc->name[0], (UV)s[slen]);
4089adc4 189 return &PL_sv_undef; /* never reaches but be safe */
190 }
191 if (check & ENCODE_WARN_ON_ERR){
192 Perl_warner(
193 aTHX_ packWARN(WARN_UTF8),
2fc614e0 194 ERR_DECODE_NOMAP,
436c6dd3 195 enc->name[0], (UV)s[slen]);
4089adc4 196 }
197 if (check & ENCODE_RETURN_ON_ERR){
198 goto ENCODE_SET_SRC;
199 }
200 if (check &
201 (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
f9d05ba3 202 SV* subchar = newSVpvf("\\x%02" UVXf, (UV)s[slen]);
4089adc4 203 sdone += slen + 1;
f9d05ba3 204 ddone += dlen + SvCUR(subchar);
205 sv_catsv(dst, subchar);
206 SvREFCNT_dec(subchar);
4089adc4 207 } else {
208 sdone += slen + 1;
209 ddone += dlen + strlen(FBCHAR_UTF8);
210 sv_catpv(dst, FBCHAR_UTF8);
aa0053b7 211 }
b2704119 212 }
85982a32 213 /* settle variables when fallback */
b0b300a3 214 d = (U8 *)SvEND(dst);
215 dlen = SvLEN(dst) - ddone - 1;
3e952a88 216 s = (U8*)SvPVX(src) + sdone;
b2704119 217 slen = tlen - sdone;
218 break;
2f2b4ff2 219
85982a32 220 default:
221 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
222 code, (dir == enc->f_utf8) ? "to" : "from",
223 enc->name[0]);
224 return &PL_sv_undef;
aa0053b7 225 }
85982a32 226 }
227 ENCODE_SET_SRC:
ca777f1c 228 if (check && !(check & ENCODE_LEAVE_SRC)){
229 sdone = SvCUR(src) - (slen+sdone);
85982a32 230 if (sdone) {
231 sv_setpvn(src, (char*)s+slen, sdone);
aa0053b7 232 }
85982a32 233 SvCUR_set(src, sdone);
2f2b4ff2 234 }
85982a32 235 /* warn("check = 0x%X, code = 0x%d\n", check, code); */
c6a7db43 236
85982a32 237 SvCUR_set(dst, dlen+ddone);
238 SvPOK_only(dst);
c6a7db43 239
39cf9a5e 240#if ENCODE_XS_PROFILE
241 if (SvCUR(dst) > SvCUR(src)){
85982a32 242 Perl_warn(aTHX_
243 "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n",
244 SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
245 (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
39cf9a5e 246 }
3c49ab08 247#endif
c6a7db43 248
220e2d4e 249 if (offset)
250 *offset += sdone + slen;
251
85982a32 252 ENCODE_END:
0b3236bb 253 *SvEND(dst) = '\0';
220e2d4e 254 if (retcode) *retcode = code;
aa0053b7 255 return dst;
2f2b4ff2 256}
257
7f0d54d7 258static bool
259strict_utf8(pTHX_ SV* sv)
260{
261 HV* hv;
262 SV** svp;
263 sv = SvRV(sv);
264 if (!sv || SvTYPE(sv) != SVt_PVHV)
265 return 0;
266 hv = (HV*)sv;
267 svp = hv_fetch(hv, "strict_utf8", 11, 0);
268 if (!svp)
269 return 0;
270 return SvTRUE(*svp);
271}
272
273static U8*
274process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check,
275 bool encode, bool strict, bool stop_at_partial)
276{
277 UV uv;
278 STRLEN ulen;
279
280 SvPOK_only(dst);
281 SvCUR_set(dst,0);
282
283 while (s < e) {
284 if (UTF8_IS_INVARIANT(*s)) {
285 sv_catpvn(dst, (char *)s, 1);
286 s++;
287 continue;
288 }
289
290 if (UTF8_IS_START(*s)) {
291 U8 skip = UTF8SKIP(s);
292 if ((s + skip) > e) {
293 /* Partial character */
294 /* XXX could check that rest of bytes are UTF8_IS_CONTINUATION(ch) */
56ff7374 295 if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL))
7f0d54d7 296 break;
297
298 goto malformed_byte;
299 }
300
301 uv = utf8n_to_uvuni(s, e - s, &ulen,
302 UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT :
303 UTF8_ALLOW_NONSTRICT)
304 );
305#if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */
306 if (strict && uv > PERL_UNICODE_MAX)
307 ulen = -1;
308#endif
309 if (ulen == -1) {
310 if (strict) {
311 uv = utf8n_to_uvuni(s, e - s, &ulen,
312 UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT);
313 if (ulen == -1)
314 goto malformed_byte;
315 goto malformed;
316 }
317 goto malformed_byte;
318 }
319
320
321 /* Whole char is good */
322 sv_catpvn(dst,(char *)s,skip);
323 s += skip;
324 continue;
325 }
326
327 /* If we get here there is something wrong with alleged UTF-8 */
328 malformed_byte:
329 uv = (UV)*s;
330 ulen = 1;
331
332 malformed:
333 if (check & ENCODE_DIE_ON_ERR){
334 if (encode)
335 Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8");
336 else
337 Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv);
338 }
339 if (check & ENCODE_WARN_ON_ERR){
340 if (encode)
341 Perl_warner(aTHX_ packWARN(WARN_UTF8),
342 ERR_ENCODE_NOMAP, uv, "utf8");
343 else
344 Perl_warner(aTHX_ packWARN(WARN_UTF8),
345 ERR_DECODE_NOMAP, "utf8", uv);
346 }
347 if (check & ENCODE_RETURN_ON_ERR) {
348 break;
349 }
350 if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
351 SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}"):
352 check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
353 "&#x%" UVxf ";", uv);
354 sv_catsv(dst, subchar);
355 SvREFCNT_dec(subchar);
356 } else {
357 sv_catpv(dst, FBCHAR_UTF8);
358 }
359 s += ulen;
360 }
361 *SvEND(dst) = '\0';
362
363 return s;
364}
365
366
ab3374e4 367MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_
368
a0d8a30e 369PROTOTYPES: DISABLE
370
371void
b536bf57 372Method_decode_xs(obj,src,check = 0)
ab3374e4 373SV * obj
374SV * src
375int check
376CODE:
377{
378 STRLEN slen;
379 U8 *s = (U8 *) SvPV(src, slen);
380 U8 *e = (U8 *) SvEND(src);
b536bf57 381 SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */
cc836e95 382
383 /*
7f0d54d7 384 * PerlIO check -- we assume the object is of PerlIO if renewed
cc836e95 385 */
386 int renewed = 0;
387 dSP; ENTER; SAVETMPS;
388 PUSHMARK(sp);
389 XPUSHs(obj);
390 PUTBACK;
391 if (call_method("renewed",G_SCALAR) == 1) {
392 SPAGAIN;
393 renewed = POPi;
394 PUTBACK;
395#if 0
396 fprintf(stderr, "renewed == %d\n", renewed);
397#endif
cc836e95 398 }
399 FREETMPS; LEAVE;
400 /* end PerlIO check */
401
ab3374e4 402 if (SvUTF8(src)) {
403 s = utf8_to_bytes(s,&slen);
404 if (s) {
405 SvCUR_set(src,slen);
406 SvUTF8_off(src);
407 e = s+slen;
408 }
409 else {
410 croak("Cannot decode string with wide characters");
411 }
412 }
7f0d54d7 413
414 s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), renewed);
ab3374e4 415
416 /* Clear out translated part of source unless asked not to */
417 if (check && !(check & ENCODE_LEAVE_SRC)){
418 slen = e-s;
419 if (slen) {
420 sv_setpvn(src, (char*)s, slen);
421 }
422 SvCUR_set(src, slen);
423 }
424 SvUTF8_on(dst);
425 ST(0) = sv_2mortal(dst);
426 XSRETURN(1);
427}
428
429void
b536bf57 430Method_encode_xs(obj,src,check = 0)
ab3374e4 431SV * obj
432SV * src
433int check
434CODE:
435{
436 STRLEN slen;
437 U8 *s = (U8 *) SvPV(src, slen);
438 U8 *e = (U8 *) SvEND(src);
b536bf57 439 SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */
ab3374e4 440 if (SvUTF8(src)) {
7f0d54d7 441 /* Already encoded */
442 if (strict_utf8(aTHX_ obj)) {
443 s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0);
444 }
445 else {
446 /* trust it and just copy the octets */
447 sv_setpvn(dst,(char *)s,(e-s));
448 s = e;
449 }
ab3374e4 450 }
451 else {
452 /* Native bytes - can always encode */
b536bf57 453 U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */
ab3374e4 454 while (s < e) {
455 UV uv = NATIVE_TO_UNI((UV) *s++);
456 if (UNI_IS_INVARIANT(uv))
457 *d++ = (U8)UTF_TO_NATIVE(uv);
458 else {
459 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
460 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
461 }
462 }
463 SvCUR_set(dst, d- (U8 *)SvPVX(dst));
464 *SvEND(dst) = '\0';
465 }
466
467 /* Clear out translated part of source unless asked not to */
468 if (check && !(check & ENCODE_LEAVE_SRC)){
469 slen = e-s;
470 if (slen) {
471 sv_setpvn(src, (char*)s, slen);
472 }
473 SvCUR_set(src, slen);
474 }
475 SvPOK_only(dst);
476 SvUTF8_off(dst);
477 ST(0) = sv_2mortal(dst);
478 XSRETURN(1);
479}
480
50d26985 481MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
2f2b4ff2 482
483PROTOTYPES: ENABLE
484
485void
a0d8a30e 486Method_renew(obj)
487SV * obj
488CODE:
489{
490 XSRETURN(1);
491}
492
cc836e95 493int
494Method_renewed(obj)
495SV * obj
496CODE:
497 RETVAL = 0;
498OUTPUT:
499 RETVAL
500
a0d8a30e 501void
0a95303c 502Method_name(obj)
503SV * obj
504CODE:
85982a32 505{
506 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
507 ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
508 XSRETURN(1);
509}
0a95303c 510
511void
220e2d4e 512Method_cat_decode(obj, dst, src, off, term, check = 0)
513SV * obj
514SV * dst
515SV * src
516SV * off
517SV * term
518int check
519CODE:
520{
521 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
522 STRLEN offset = (STRLEN)SvIV(off);
523 int code = 0;
524 if (SvUTF8(src)) {
525 sv_utf8_downgrade(src, FALSE);
526 }
527 sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check,
528 &offset, term, &code));
b162af07 529 SvIV_set(off, (IV)offset);
220e2d4e 530 if (code == ENCODE_FOUND_TERM) {
531 ST(0) = &PL_sv_yes;
532 }else{
533 ST(0) = &PL_sv_no;
534 }
535 XSRETURN(1);
536}
537
538void
b2704119 539Method_decode(obj,src,check = 0)
2f2b4ff2 540SV * obj
541SV * src
b2704119 542int check
2f2b4ff2 543CODE:
aae85ceb 544{
85982a32 545 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
ab3374e4 546 if (SvUTF8(src)) {
547 sv_utf8_downgrade(src, FALSE);
548 }
220e2d4e 549 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check,
550 NULL, Nullsv, NULL);
85982a32 551 SvUTF8_on(ST(0));
aae85ceb 552 XSRETURN(1);
553}
554
555void
85982a32 556Method_encode(obj,src,check = 0)
aae85ceb 557SV * obj
85982a32 558SV * src
559int check
aae85ceb 560CODE:
561{
85982a32 562 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
563 sv_utf8_upgrade(src);
220e2d4e 564 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check,
565 NULL, Nullsv, NULL);
aae85ceb 566 XSRETURN(1);
567}
568
0ab8f81e 569void
570Method_needs_lines(obj)
571SV * obj
572CODE:
573{
b32afa7c 574 /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */
0ab8f81e 575 ST(0) = &PL_sv_no;
576 XSRETURN(1);
577}
578
579void
580Method_perlio_ok(obj)
581SV * obj
582CODE:
583{
b32afa7c 584 /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */
10c5ecbb 585 /* require_pv(PERLIO_FILENAME); */
586
587 eval_pv("require PerlIO::encoding", 0);
588
589 if (SvTRUE(get_sv("@", 0))) {
0ab8f81e 590 ST(0) = &PL_sv_no;
10c5ecbb 591 }else{
592 ST(0) = &PL_sv_yes;
0ab8f81e 593 }
594 XSRETURN(1);
595}
596
67e989fb 597MODULE = Encode PACKAGE = Encode
2c674647 598
599PROTOTYPES: ENABLE
600
67e989fb 601I32
2c674647 602_bytes_to_utf8(sv, ...)
85982a32 603SV * sv
604CODE:
605{
606 SV * encoding = items == 2 ? ST(1) : Nullsv;
c6a7db43 607
85982a32 608 if (encoding)
609 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
610 else {
611 STRLEN len;
612 U8* s = (U8*)SvPV(sv, len);
613 U8* converted;
614
615 converted = bytes_to_utf8(s, &len); /* This allocs */
616 sv_setpvn(sv, (char *)converted, len);
617 SvUTF8_on(sv); /* XXX Should we? */
618 Safefree(converted); /* ... so free it */
619 RETVAL = len;
620 }
621}
622OUTPUT:
623 RETVAL
2c674647 624
67e989fb 625I32
2c674647 626_utf8_to_bytes(sv, ...)
85982a32 627SV * sv
628CODE:
629{
630 SV * to = items > 1 ? ST(1) : Nullsv;
631 SV * check = items > 2 ? ST(2) : Nullsv;
632
633 if (to) {
634 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
635 } else {
636 STRLEN len;
637 U8 *s = (U8*)SvPV(sv, len);
638
639 RETVAL = 0;
640 if (SvTRUE(check)) {
641 /* Must do things the slow way */
642 U8 *dest;
643 /* We need a copy to pass to check() */
c6a7db43 644 U8 *src = (U8*)savepv((char *)s);
85982a32 645 U8 *send = s + len;
646
647 New(83, dest, len, U8); /* I think */
648
649 while (s < send) {
650 if (*s < 0x80){
651 *dest++ = *s++;
652 } else {
653 STRLEN ulen;
654 UV uv = *s++;
655
656 /* Have to do it all ourselves because of error routine,
657 aargh. */
658 if (!(uv & 0x40)){ goto failure; }
659 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
660 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
661 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
662 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
663 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
664 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
665 else { ulen = 13; uv = 0; }
87714904 666
85982a32 667 /* Note change to utf8.c variable naming, for variety */
668 while (ulen--) {
c6a7db43 669 if ((*s & 0xc0) != 0x80){
670 goto failure;
85982a32 671 } else {
672 uv = (uv << 6) | (*s++ & 0x3f);
673 }
87714904 674 }
67e989fb 675 if (uv > 256) {
676 failure:
85982a32 677 call_failure(check, s, dest, src);
678 /* Now what happens? */
67e989fb 679 }
680 *dest++ = (U8)uv;
85982a32 681 }
682 }
683 } else {
684 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
2c674647 685 }
85982a32 686 }
687}
688OUTPUT:
689 RETVAL
2c674647 690
2c674647 691bool
b2704119 692is_utf8(sv, check = 0)
4411f3b6 693SV * sv
b2704119 694int check
85982a32 695CODE:
696{
697 if (SvGMAGICAL(sv)) /* it could be $1, for example */
698 sv = newSVsv(sv); /* GMAGIG will be done */
699 if (SvPOK(sv)) {
700 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
701 if (RETVAL &&
702 check &&
703 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
2c674647 704 RETVAL = FALSE;
85982a32 705 } else {
706 RETVAL = FALSE;
707 }
708 if (sv != ST(0))
709 SvREFCNT_dec(sv); /* it was a temp copy */
710}
711OUTPUT:
712 RETVAL
2c674647 713
714SV *
4411f3b6 715_utf8_on(sv)
85982a32 716SV * sv
717CODE:
718{
719 if (SvPOK(sv)) {
720 SV *rsv = newSViv(SvUTF8(sv));
721 RETVAL = rsv;
722 SvUTF8_on(sv);
723 } else {
724 RETVAL = &PL_sv_undef;
725 }
726}
727OUTPUT:
728 RETVAL
2c674647 729
730SV *
4411f3b6 731_utf8_off(sv)
85982a32 732SV * sv
733CODE:
734{
735 if (SvPOK(sv)) {
736 SV *rsv = newSViv(SvUTF8(sv));
737 RETVAL = rsv;
738 SvUTF8_off(sv);
739 } else {
740 RETVAL = &PL_sv_undef;
741 }
742}
743OUTPUT:
744 RETVAL
745
85982a32 746int
747DIE_ON_ERR()
748CODE:
749 RETVAL = ENCODE_DIE_ON_ERR;
750OUTPUT:
751 RETVAL
752
c6a7db43 753int
85982a32 754WARN_ON_ERR()
755CODE:
756 RETVAL = ENCODE_WARN_ON_ERR;
757OUTPUT:
758 RETVAL
759
760int
761LEAVE_SRC()
762CODE:
763 RETVAL = ENCODE_LEAVE_SRC;
764OUTPUT:
765 RETVAL
766
767int
768RETURN_ON_ERR()
769CODE:
770 RETVAL = ENCODE_RETURN_ON_ERR;
771OUTPUT:
772 RETVAL
773
774int
775PERLQQ()
776CODE:
777 RETVAL = ENCODE_PERLQQ;
778OUTPUT:
779 RETVAL
780
781int
af1f55d9 782HTMLCREF()
783CODE:
784 RETVAL = ENCODE_HTMLCREF;
785OUTPUT:
786 RETVAL
787
788int
789XMLCREF()
790CODE:
791 RETVAL = ENCODE_XMLCREF;
792OUTPUT:
793 RETVAL
794
795int
56ff7374 796STOP_AT_PARTIAL()
797CODE:
798 RETVAL = ENCODE_STOP_AT_PARTIAL;
799OUTPUT:
800 RETVAL
801
802int
85982a32 803FB_DEFAULT()
804CODE:
805 RETVAL = ENCODE_FB_DEFAULT;
806OUTPUT:
807 RETVAL
808
809int
810FB_CROAK()
811CODE:
812 RETVAL = ENCODE_FB_CROAK;
813OUTPUT:
814 RETVAL
815
816int
817FB_QUIET()
818CODE:
819 RETVAL = ENCODE_FB_QUIET;
820OUTPUT:
821 RETVAL
822
823int
824FB_WARN()
825CODE:
826 RETVAL = ENCODE_FB_WARN;
827OUTPUT:
828 RETVAL
829
830int
831FB_PERLQQ()
832CODE:
833 RETVAL = ENCODE_FB_PERLQQ;
834OUTPUT:
835 RETVAL
2c674647 836
af1f55d9 837int
838FB_HTMLCREF()
839CODE:
840 RETVAL = ENCODE_FB_HTMLCREF;
841OUTPUT:
842 RETVAL
843
844int
845FB_XMLCREF()
846CODE:
847 RETVAL = ENCODE_FB_XMLCREF;
848OUTPUT:
849 RETVAL
850
33af2bc7 851BOOT:
852{
85982a32 853#include "def_t.h"
e7cbefb8 854#include "def_t.exh"
33af2bc7 855}