Upgrade to Encode 2.10
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
85982a32 1/*
dc4a2e29 2 $Id: Encode.xs,v 2.4 2005/05/16 18:46:36 dankogai Exp dankogai $
85982a32 3 */
4
fcf2db38 5#define PERL_NO_GET_CONTEXT
2c674647 6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
2f2b4ff2 9#define U8 U8
10#include "encode.h"
10c5ecbb 11
12# define PERLIO_MODNAME "PerlIO::encoding"
0ab8f81e 13# define PERLIO_FILENAME "PerlIO/encoding.pm"
aae85ceb 14
/* Set to 1 or more to profile.  t/encoding.t then dumps core because
   Perl_warner and PerlIO don't work well together. */
c6a7db43 17#define ENCODE_XS_PROFILE 0
39cf9a5e 18
/* Set to 0 to avoid floating point when calculating the buffer size in
   encode_method().  1 is recommended; 2 restores NI-S's original formula. */
c6a7db43 21#define ENCODE_XS_USEFP 1
39cf9a5e 22
fcf2db38 23#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
2f5768b8 24 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 25 return (y)0; /* fool picky compilers */ \
87714904 26 }
85982a32 27/**/
011b2d2f 28
7e9a885a 29UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
30UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
33af2bc7 31
/*
 * Flag sets passed to utf8n_to_uvuni() by process_utf8():
 *   strict     - no UTF8_ALLOW_* flag at all;
 *   non-strict - every UTF8_ALLOW_ANY flag except CONTINUATION,
 *                NON_CONTINUATION and LONG.
 */
#define UTF8_ALLOW_STRICT 0
#define UTF8_ALLOW_NONSTRICT                                    \
    (UTF8_ALLOW_ANY & ~(UTF8_ALLOW_CONTINUATION     |           \
                        UTF8_ALLOW_NON_CONTINUATION |           \
                        UTF8_ALLOW_LONG))
37
b0b300a3 38void
aa0053b7 39Encode_XSEncoding(pTHX_ encode_t * enc)
2f2b4ff2 40{
aa0053b7 41 dSP;
42 HV *stash = gv_stashpv("Encode::XS", TRUE);
43 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
44 int i = 0;
45 PUSHMARK(sp);
46 XPUSHs(sv);
47 while (enc->name[i]) {
48 const char *name = enc->name[i++];
49 XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
50 }
51 PUTBACK;
52 call_pv("Encode::define_encoding", G_DISCARD);
53 SvREFCNT_dec(sv);
2f2b4ff2 54}
55
aa0053b7 56void
57call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
58{
85982a32 59 /* Exists for breakpointing */
aa0053b7 60}
67e989fb 61
85982a32 62
/*
 * printf-style message formats used when a character cannot be mapped.
 *   ERR_ENCODE_NOMAP: (UV value, encoding name)
 *   ERR_DECODE_NOMAP: (encoding name, UV value)
 */
#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s"
#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
65
2f2b4ff2 66static SV *
aa0053b7 67encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
220e2d4e 68 int check, STRLEN * offset, SV * term, int * retcode)
2f2b4ff2 69{
aa0053b7 70 STRLEN slen;
71 U8 *s = (U8 *) SvPV(src, slen);
3aececda 72 STRLEN tlen = slen;
73 STRLEN ddone = 0;
74 STRLEN sdone = 0;
39cf9a5e 75
3c49ab08 76 /* We allocate slen+1.
85982a32 77 PerlIO dumps core if this value is smaller than this. */
3c49ab08 78 SV *dst = sv_2mortal(newSV(slen+1));
85982a32 79 U8 *d = (U8 *)SvPVX(dst);
80 STRLEN dlen = SvLEN(dst)-1;
220e2d4e 81 int code = 0;
82 STRLEN trmlen = 0;
cc7dbc11 83 U8 *trm = term ? (U8*) SvPV(term, trmlen) : NULL;
220e2d4e 84
85 if (offset) {
86 s += *offset;
6be7c101 87 if (slen > *offset){ /* safeguard against slen overflow */
88 slen -= *offset;
89 }else{
90 slen = 0;
91 }
220e2d4e 92 tlen = slen;
93 }
85982a32 94
6be7c101 95 if (slen == 0){
85982a32 96 SvCUR_set(dst, 0);
97 SvPOK_only(dst);
98 goto ENCODE_END;
99 }
100
220e2d4e 101 while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check,
102 trm, trmlen)) )
85982a32 103 {
104 SvCUR_set(dst, dlen+ddone);
105 SvPOK_only(dst);
0b3236bb 106
220e2d4e 107 if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL ||
108 code == ENCODE_FOUND_TERM) {
85982a32 109 break;
110 }
111 switch (code) {
112 case ENCODE_NOSPACE:
113 {
114 STRLEN more = 0; /* make sure you initialize! */
115 STRLEN sleft;
116 sdone += slen;
117 ddone += dlen;
118 sleft = tlen - sdone;
fcb875d4 119#if ENCODE_XS_PROFILE >= 2
85982a32 120 Perl_warn(aTHX_
121 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
122 more, sdone, sleft, SvLEN(dst));
fcb875d4 123#endif
85982a32 124 if (sdone != 0) { /* has src ever been processed ? */
39cf9a5e 125#if ENCODE_XS_USEFP == 2
85982a32 126 more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
127 - SvLEN(dst);
39cf9a5e 128#elif ENCODE_XS_USEFP
6e21dc91 129 more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft);
39cf9a5e 130#else
85982a32 131 /* safe until SvLEN(dst) == MAX_INT/16 */
132 more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
39cf9a5e 133#endif
39cf9a5e 134 }
85982a32 135 more += UTF8_MAXLEN; /* insurance policy */
136 d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
137 /* dst need to grow need MORE bytes! */
138 if (ddone >= SvLEN(dst)) {
139 Perl_croak(aTHX_ "Destination couldn't be grown.");
140 }
141 dlen = SvLEN(dst)-ddone-1;
142 d += ddone;
143 s += slen;
144 slen = tlen-sdone;
145 continue;
146 }
147 case ENCODE_NOREP:
148 /* encoding */
c6a7db43 149 if (dir == enc->f_utf8) {
85982a32 150 STRLEN clen;
151 UV ch =
3e952a88 152 utf8n_to_uvuni(s+slen, (SvCUR(src)-slen),
b0b300a3 153 &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY);
85982a32 154 if (check & ENCODE_DIE_ON_ERR) {
2fc614e0 155 Perl_croak(aTHX_ ERR_ENCODE_NOMAP,
156 (UV)ch, enc->name[0]);
4089adc4 157 return &PL_sv_undef; /* never reaches but be safe */
158 }
159 if (check & ENCODE_WARN_ON_ERR){
160 Perl_warner(aTHX_ packWARN(WARN_UTF8),
2fc614e0 161 ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]);
4089adc4 162 }
163 if (check & ENCODE_RETURN_ON_ERR){
164 goto ENCODE_SET_SRC;
165 }
f9d05ba3 166 if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
167 SV* subchar =
168 newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" :
169 check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
170 "&#x%" UVxf ";", (UV)ch);
4089adc4 171 sdone += slen + clen;
f9d05ba3 172 ddone += dlen + SvCUR(subchar);
173 sv_catsv(dst, subchar);
174 SvREFCNT_dec(subchar);
4089adc4 175 } else {
176 /* fallback char */
177 sdone += slen + clen;
178 ddone += dlen + enc->replen;
179 sv_catpvn(dst, (char*)enc->rep, enc->replen);
c6a7db43 180 }
b2704119 181 }
85982a32 182 /* decoding */
c6a7db43 183 else {
85982a32 184 if (check & ENCODE_DIE_ON_ERR){
2fc614e0 185 Perl_croak(aTHX_ ERR_DECODE_NOMAP,
436c6dd3 186 enc->name[0], (UV)s[slen]);
4089adc4 187 return &PL_sv_undef; /* never reaches but be safe */
188 }
189 if (check & ENCODE_WARN_ON_ERR){
190 Perl_warner(
191 aTHX_ packWARN(WARN_UTF8),
2fc614e0 192 ERR_DECODE_NOMAP,
436c6dd3 193 enc->name[0], (UV)s[slen]);
4089adc4 194 }
195 if (check & ENCODE_RETURN_ON_ERR){
196 goto ENCODE_SET_SRC;
197 }
198 if (check &
199 (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
f9d05ba3 200 SV* subchar = newSVpvf("\\x%02" UVXf, (UV)s[slen]);
4089adc4 201 sdone += slen + 1;
f9d05ba3 202 ddone += dlen + SvCUR(subchar);
203 sv_catsv(dst, subchar);
204 SvREFCNT_dec(subchar);
4089adc4 205 } else {
206 sdone += slen + 1;
207 ddone += dlen + strlen(FBCHAR_UTF8);
208 sv_catpv(dst, FBCHAR_UTF8);
aa0053b7 209 }
b2704119 210 }
85982a32 211 /* settle variables when fallback */
b0b300a3 212 d = (U8 *)SvEND(dst);
213 dlen = SvLEN(dst) - ddone - 1;
3e952a88 214 s = (U8*)SvPVX(src) + sdone;
b2704119 215 slen = tlen - sdone;
216 break;
2f2b4ff2 217
85982a32 218 default:
219 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
220 code, (dir == enc->f_utf8) ? "to" : "from",
221 enc->name[0]);
222 return &PL_sv_undef;
aa0053b7 223 }
85982a32 224 }
225 ENCODE_SET_SRC:
ca777f1c 226 if (check && !(check & ENCODE_LEAVE_SRC)){
227 sdone = SvCUR(src) - (slen+sdone);
85982a32 228 if (sdone) {
229 sv_setpvn(src, (char*)s+slen, sdone);
aa0053b7 230 }
85982a32 231 SvCUR_set(src, sdone);
2f2b4ff2 232 }
85982a32 233 /* warn("check = 0x%X, code = 0x%d\n", check, code); */
c6a7db43 234
85982a32 235 SvCUR_set(dst, dlen+ddone);
236 SvPOK_only(dst);
c6a7db43 237
39cf9a5e 238#if ENCODE_XS_PROFILE
239 if (SvCUR(dst) > SvCUR(src)){
85982a32 240 Perl_warn(aTHX_
241 "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n",
242 SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
243 (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
39cf9a5e 244 }
3c49ab08 245#endif
c6a7db43 246
220e2d4e 247 if (offset)
248 *offset += sdone + slen;
249
85982a32 250 ENCODE_END:
0b3236bb 251 *SvEND(dst) = '\0';
220e2d4e 252 if (retcode) *retcode = code;
aa0053b7 253 return dst;
2f2b4ff2 254}
255
7f0d54d7 256static bool
257strict_utf8(pTHX_ SV* sv)
258{
259 HV* hv;
260 SV** svp;
261 sv = SvRV(sv);
262 if (!sv || SvTYPE(sv) != SVt_PVHV)
263 return 0;
264 hv = (HV*)sv;
265 svp = hv_fetch(hv, "strict_utf8", 11, 0);
266 if (!svp)
267 return 0;
268 return SvTRUE(*svp);
269}
270
271static U8*
272process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check,
273 bool encode, bool strict, bool stop_at_partial)
274{
275 UV uv;
276 STRLEN ulen;
277
278 SvPOK_only(dst);
279 SvCUR_set(dst,0);
280
281 while (s < e) {
282 if (UTF8_IS_INVARIANT(*s)) {
283 sv_catpvn(dst, (char *)s, 1);
284 s++;
285 continue;
286 }
287
288 if (UTF8_IS_START(*s)) {
289 U8 skip = UTF8SKIP(s);
290 if ((s + skip) > e) {
291 /* Partial character */
292 /* XXX could check that rest of bytes are UTF8_IS_CONTINUATION(ch) */
293 if (stop_at_partial)
294 break;
295
296 goto malformed_byte;
297 }
298
299 uv = utf8n_to_uvuni(s, e - s, &ulen,
300 UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT :
301 UTF8_ALLOW_NONSTRICT)
302 );
303#if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */
304 if (strict && uv > PERL_UNICODE_MAX)
305 ulen = -1;
306#endif
307 if (ulen == -1) {
308 if (strict) {
309 uv = utf8n_to_uvuni(s, e - s, &ulen,
310 UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT);
311 if (ulen == -1)
312 goto malformed_byte;
313 goto malformed;
314 }
315 goto malformed_byte;
316 }
317
318
319 /* Whole char is good */
320 sv_catpvn(dst,(char *)s,skip);
321 s += skip;
322 continue;
323 }
324
325 /* If we get here there is something wrong with alleged UTF-8 */
326 malformed_byte:
327 uv = (UV)*s;
328 ulen = 1;
329
330 malformed:
331 if (check & ENCODE_DIE_ON_ERR){
332 if (encode)
333 Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8");
334 else
335 Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv);
336 }
337 if (check & ENCODE_WARN_ON_ERR){
338 if (encode)
339 Perl_warner(aTHX_ packWARN(WARN_UTF8),
340 ERR_ENCODE_NOMAP, uv, "utf8");
341 else
342 Perl_warner(aTHX_ packWARN(WARN_UTF8),
343 ERR_DECODE_NOMAP, "utf8", uv);
344 }
345 if (check & ENCODE_RETURN_ON_ERR) {
346 break;
347 }
348 if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
349 SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}"):
350 check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
351 "&#x%" UVxf ";", uv);
352 sv_catsv(dst, subchar);
353 SvREFCNT_dec(subchar);
354 } else {
355 sv_catpv(dst, FBCHAR_UTF8);
356 }
357 s += ulen;
358 }
359 *SvEND(dst) = '\0';
360
361 return s;
362}
363
364
ab3374e4 365MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_
366
a0d8a30e 367PROTOTYPES: DISABLE
368
void
Method_decode_xs(obj,src,check = 0)
SV *    obj
SV *    src
int     check
CODE:
{
    /*
     * Decode the octets in src as UTF-8 and return a new string with the
     * UTF8 flag on.  Unless check is 0 or has ENCODE_LEAVE_SRC set, src is
     * replaced by whatever process_utf8() left unconsumed.
     */
    STRLEN slen;
    U8 *s = (U8 *) SvPV(src, slen);
    U8 *e = (U8 *) SvEND(src);
    /* Mortal from the start: call_method(), croak() and process_utf8()
       below can all die, and a non-mortal dst would then be leaked.
       It is created before SAVETMPS, so FREETMPS does not touch it. */
    SV *dst = sv_2mortal(newSV(slen>0?slen:1)); /* newSV() abhors 0 -- inaba */

    /*
     * PerlIO check -- we assume the object is of PerlIO if renewed
     */
    int renewed = 0;
    dSP; ENTER; SAVETMPS;
    PUSHMARK(sp);
    XPUSHs(obj);
    PUTBACK;
    if (call_method("renewed",G_SCALAR) == 1) {
        SPAGAIN;
        renewed = POPi;
        PUTBACK;
#if 0
        fprintf(stderr, "renewed == %d\n", renewed);
#endif
    }
    FREETMPS; LEAVE;
    /* end PerlIO check */

    if (SvUTF8(src)) {
        /* characters were passed in: turn them back into octets in place */
        s = utf8_to_bytes(s,&slen);
        if (s) {
            SvCUR_set(src,slen);
            SvUTF8_off(src);
            e = s+slen;
        }
        else {
            croak("Cannot decode string with wide characters");
        }
    }

    s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), renewed);

    /* Clear out translated part of source unless asked not to */
    if (check && !(check & ENCODE_LEAVE_SRC)){
        slen = e-s;
        if (slen) {
            sv_setpvn(src, (char*)s, slen);
        }
        SvCUR_set(src, slen);
    }
    SvUTF8_on(dst);
    ST(0) = dst;
    XSRETURN(1);
}
426
void
Method_encode_xs(obj,src,check = 0)
SV *    obj
SV *    src
int     check
CODE:
{
    /*
     * Produce the UTF-8 octets for src and return them with the UTF8 flag
     * off.  A src that already carries the UTF8 flag is validated through
     * process_utf8() when the object is strict, otherwise copied as is;
     * native bytes are expanded to at most two octets each.  Unless check
     * is 0 or has ENCODE_LEAVE_SRC set, src is replaced by its unconsumed
     * tail.
     */
    STRLEN slen;
    U8 *s = (U8 *) SvPV(src, slen);
    U8 *e = (U8 *) SvEND(src);
    /* Mortal from the start so that a croak inside process_utf8()
       (ENCODE_DIE_ON_ERR) cannot leak it. */
    SV *dst = sv_2mortal(newSV(slen>0?slen:1)); /* newSV() abhors 0 -- inaba */
    if (SvUTF8(src)) {
        /* Already encoded */
        if (strict_utf8(aTHX_ obj)) {
            s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0);
        }
        else {
            /* trust it and just copy the octets */
            sv_setpvn(dst,(char *)s,(e-s));
            s = e;
        }
    }
    else {
        /* Native bytes - can always encode */
        U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */
        while (s < e) {
            UV uv = NATIVE_TO_UNI((UV) *s++);
            if (UNI_IS_INVARIANT(uv))
                *d++ = (U8)UTF_TO_NATIVE(uv);
            else {
                *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
                *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
            }
        }
        SvCUR_set(dst, d- (U8 *)SvPVX(dst));
        *SvEND(dst) = '\0';
    }

    /* Clear out translated part of source unless asked not to */
    if (check && !(check & ENCODE_LEAVE_SRC)){
        slen = e-s;
        if (slen) {
            sv_setpvn(src, (char*)s, slen);
        }
        SvCUR_set(src, slen);
    }
    SvPOK_only(dst);
    SvUTF8_off(dst);
    ST(0) = dst;
    XSRETURN(1);
}
478
50d26985 479MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
2f2b4ff2 480
481PROTOTYPES: ENABLE
482
void
Method_renew(obj)
SV *    obj
CODE:
{
    /* Nothing is copied: the single argument already sitting in ST(0)
       is handed straight back to the caller. */
    XSRETURN(1);
}
490
int
Method_renewed(obj)
SV *    obj
CODE:
{
    /* This implementation always reports 0, whatever obj is. */
    RETVAL = 0;
}
OUTPUT:
    RETVAL
498
void
Method_name(obj)
SV *    obj
CODE:
{
    /* Return the first entry of the encoding's name list as a new
       mortal string. */
    encode_t   *enc     = INT2PTR(encode_t *, SvIV(SvRV(obj)));
    const char *primary = enc->name[0];

    ST(0) = sv_2mortal(newSVpvn(primary, strlen(primary)));
    XSRETURN(1);
}
0a95303c 508
void
Method_cat_decode(obj, dst, src, off, term, check = 0)
SV *    obj
SV *    dst
SV *    src
SV *    off
SV *    term
int     check
CODE:
{
    /*
     * Decode src starting at byte offset `off`, append the result to dst
     * and write the advanced offset back into `off`.  Returns true when
     * encode_method() reported ENCODE_FOUND_TERM, false otherwise.
     */
    encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
    STRLEN offset = (STRLEN)SvIV(off);
    int code = 0;
    if (SvUTF8(src)) {
        sv_utf8_downgrade(src, FALSE);
    }
    sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check,
                                &offset, term, &code));
    /* sv_setiv() upgrades `off` as needed and drops any cached string or
       NV value, so the caller sees the new offset in every context.
       Poking the raw IV slot left stale caches behind and silently
       modified read-only scalars. */
    sv_setiv(off, (IV)offset);
    if (code == ENCODE_FOUND_TERM) {
        ST(0) = &PL_sv_yes;
    }else{
        ST(0) = &PL_sv_no;
    }
    XSRETURN(1);
}
535
void
Method_decode(obj,src,check = 0)
SV *    obj
SV *    src
int     check
CODE:
{
    /* Run src through the encoding's t_utf8 table and return the result
       flagged as UTF-8.  A src carrying the UTF8 flag is downgraded
       first. */
    encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
    SV *decoded;

    if (SvUTF8(src))
        sv_utf8_downgrade(src, FALSE);

    decoded = encode_method(aTHX_ enc, enc->t_utf8, src, check,
                            NULL, Nullsv, NULL);
    ST(0) = decoded;
    SvUTF8_on(decoded);
    XSRETURN(1);
}
552
void
Method_encode(obj,src,check = 0)
SV *    obj
SV *    src
int     check
CODE:
{
    /* Upgrade src to UTF-8 and run it through the encoding's f_utf8
       table; the converted string is returned. */
    encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
    SV *encoded;

    sv_utf8_upgrade(src);
    encoded = encode_method(aTHX_ enc, enc->f_utf8, src, check,
                            NULL, Nullsv, NULL);
    ST(0) = encoded;
    XSRETURN(1);
}
566
void
Method_needs_lines(obj)
SV *    obj
CODE:
{
    /* Always answers false; obj is not inspected. */
    XSRETURN_NO;
}
576
void
Method_perlio_ok(obj)
SV *    obj
CODE:
{
    /* Try to load PerlIO::encoding; answer true when $@ is empty
       afterwards, false otherwise.  obj is not inspected. */
    SV *err;

    eval_pv("require PerlIO::encoding", 0);
    err = get_sv("@", 0);

    ST(0) = SvTRUE(err) ? &PL_sv_no : &PL_sv_yes;
    XSRETURN(1);
}
594
67e989fb 595MODULE = Encode PACKAGE = Encode
2c674647 596
597PROTOTYPES: ENABLE
598
I32
_bytes_to_utf8(sv, ...)
SV *    sv
CODE:
{
    /* With exactly two arguments the second one names an encoding and the
       work is delegated to _encoded_bytes_to_utf8().  Otherwise sv is
       converted in place with bytes_to_utf8(), flagged as UTF-8, and the
       new byte length is returned. */
    SV *encoding = (items == 2) ? ST(1) : Nullsv;

    if (!encoding) {
        STRLEN len;
        U8 *octets = (U8*)SvPV(sv, len);
        U8 *utf8   = bytes_to_utf8(octets, &len); /* This allocs */

        sv_setpvn(sv, (char *)utf8, len);
        SvUTF8_on(sv);                            /* XXX Should we? */
        Safefree(utf8);                           /* ... so free it */
        RETVAL = len;
    }
    else {
        RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
    }
}
OUTPUT:
    RETVAL
2c674647 622
I32
_utf8_to_bytes(sv, ...)
SV *    sv
CODE:
{
    /*
     * _utf8_to_bytes(sv [, to [, check]])
     *
     * With `to`, the work is delegated to _encoded_utf8_to_bytes().
     * Without `to` and with a false `check`, sv's buffer is converted in
     * place by utf8_to_bytes() and the new length (0 on failure) is
     * returned.  With a true `check` the string is walked by hand so that
     * call_failure() can be invoked on bad input; that path still returns
     * 0 and does not store the converted bytes anywhere.
     */
    SV * to    = items > 1 ? ST(1) : Nullsv;
    SV * check = items > 2 ? ST(2) : Nullsv;

    if (to) {
        RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
    } else {
        STRLEN len;
        U8 *s = (U8*)SvPV(sv, len);

        RETVAL = 0;
        if (SvTRUE(check)) {
            /* Must do things the slow way */
            U8 *dest;
            U8 *dest_base;   /* start of the scratch buffer, for freeing */
            /* We need a copy to pass to check() */
            U8 *src  = (U8*)savepv((char *)s);
            U8 *send = s + len;

            New(83, dest, len, U8); /* I think */
            dest_base = dest;

            while (s < send) {
                if (*s < 0x80){
                    *dest++ = *s++;
                } else {
                    STRLEN ulen;
                    UV uv = *s++;

                    /* Have to do it all ourselves because of error routine,
                       aargh. */
                    if (!(uv & 0x40)){ goto failure; }
                    if      (!(uv & 0x20)) { ulen = 2;  uv &= 0x1f; }
                    else if (!(uv & 0x10)) { ulen = 3;  uv &= 0x0f; }
                    else if (!(uv & 0x08)) { ulen = 4;  uv &= 0x07; }
                    else if (!(uv & 0x04)) { ulen = 5;  uv &= 0x03; }
                    else if (!(uv & 0x02)) { ulen = 6;  uv &= 0x01; }
                    else if (!(uv & 0x01)) { ulen = 7;  uv = 0; }
                    else                   { ulen = 13; uv = 0; }

                    /* Note change to utf8.c variable naming, for variety */
                    /* ulen is the total sequence length and the lead byte
                       is already consumed, so ulen-1 continuation bytes
                       remain */
                    while (--ulen) {
                        if ((*s & 0xc0) != 0x80){
                            goto failure;
                        } else {
                            uv = (uv << 6) | (*s++ & 0x3f);
                        }
                    }
                    /* anything above 255 does not fit in one byte */
                    if (uv > 255) {
                    failure:
                        call_failure(check, s, dest, src);
                        /* Now what happens? */
                    }
                    *dest++ = (U8)uv;
                }
            }
            /* both scratch buffers are private to this branch */
            Safefree(dest_base);
            Safefree(src);
        } else {
            RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
        }
    }
}
OUTPUT:
    RETVAL
2c674647 688
bool
is_utf8(sv, check = 0)
SV *    sv
int     check
CODE:
{
    /* True when sv is a string with the UTF8 flag on; with a non-zero
       check the buffer must additionally pass is_utf8_string(). */
    SV *probe = sv;

    if (SvGMAGICAL(sv))         /* it could be $1, for example */
        probe = newSVsv(sv);    /* GMAGIG will be done */

    RETVAL = FALSE;
    if (SvPOK(probe) && SvUTF8(probe)) {
        RETVAL = TRUE;
        if (check && !is_utf8_string((U8*)SvPVX(probe), SvCUR(probe)))
            RETVAL = FALSE;
    }

    if (probe != sv)
        SvREFCNT_dec(probe);    /* it was a temp copy */
}
OUTPUT:
    RETVAL
2c674647 711
SV *
_utf8_on(sv)
SV *    sv
CODE:
{
    /* Turn the UTF8 flag on for a string and return the previous flag
       state; anything that is not a string yields undef. */
    if (!SvPOK(sv)) {
        RETVAL = &PL_sv_undef;
    }
    else {
        RETVAL = newSViv(SvUTF8(sv));
        SvUTF8_on(sv);
    }
}
OUTPUT:
    RETVAL
2c674647 727
SV *
_utf8_off(sv)
SV *    sv
CODE:
{
    /* Turn the UTF8 flag off for a string and return the previous flag
       state; anything that is not a string yields undef. */
    if (!SvPOK(sv)) {
        RETVAL = &PL_sv_undef;
    }
    else {
        RETVAL = newSViv(SvUTF8(sv));
        SvUTF8_off(sv);
    }
}
OUTPUT:
    RETVAL
743
int
DIE_ON_ERR()
CODE:
{
    /* Each accessor from here to FB_XMLCREF returns the ENCODE_* macro
       of the same name as a plain integer. */
    RETVAL = ENCODE_DIE_ON_ERR;
}
OUTPUT:
    RETVAL

int
WARN_ON_ERR()
CODE:
{
    RETVAL = ENCODE_WARN_ON_ERR;
}
OUTPUT:
    RETVAL

int
LEAVE_SRC()
CODE:
{
    RETVAL = ENCODE_LEAVE_SRC;
}
OUTPUT:
    RETVAL

int
RETURN_ON_ERR()
CODE:
{
    RETVAL = ENCODE_RETURN_ON_ERR;
}
OUTPUT:
    RETVAL

int
PERLQQ()
CODE:
{
    RETVAL = ENCODE_PERLQQ;
}
OUTPUT:
    RETVAL

int
HTMLCREF()
CODE:
{
    RETVAL = ENCODE_HTMLCREF;
}
OUTPUT:
    RETVAL

int
XMLCREF()
CODE:
{
    RETVAL = ENCODE_XMLCREF;
}
OUTPUT:
    RETVAL

int
FB_DEFAULT()
CODE:
{
    RETVAL = ENCODE_FB_DEFAULT;
}
OUTPUT:
    RETVAL

int
FB_CROAK()
CODE:
{
    RETVAL = ENCODE_FB_CROAK;
}
OUTPUT:
    RETVAL

int
FB_QUIET()
CODE:
{
    RETVAL = ENCODE_FB_QUIET;
}
OUTPUT:
    RETVAL

int
FB_WARN()
CODE:
{
    RETVAL = ENCODE_FB_WARN;
}
OUTPUT:
    RETVAL

int
FB_PERLQQ()
CODE:
{
    RETVAL = ENCODE_FB_PERLQQ;
}
OUTPUT:
    RETVAL

int
FB_HTMLCREF()
CODE:
{
    RETVAL = ENCODE_FB_HTMLCREF;
}
OUTPUT:
    RETVAL

int
FB_XMLCREF()
CODE:
{
    RETVAL = ENCODE_FB_XMLCREF;
}
OUTPUT:
    RETVAL
841
BOOT:
{
    /* Both headers are pulled into the boot code at module load time.
       NOTE(review): presumably generated files that declare and register
       the built-in encodings via Encode_XSEncoding() -- confirm against
       the build. */
#include "def_t.h"
#include "def_t.exh"
}