1 #define PERL_NO_GET_CONTEXT
10 #define FBCHAR_UTF8 "\xEF\xBF\xBD"
12 #define BOM16LE 0xFFFe
13 #define BOM32LE 0xFFFe0000
14 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
15 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
16 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
17 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
20 enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
25 croak("Partial character %c",(char) endian);
45 croak("Unknown endian %c",(char) endian);
53 enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
55 U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size);
60 SvCUR_set(result,SvCUR(result)+size);
68 SvCUR_set(result,SvCUR(result)+size);
76 croak("Unknown endian %c",(char) endian);
81 #define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
82 t/encoding.t dumps core because of
83 Perl_warner and PerlIO don't work well */
85 #define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate
86 buffer size for encode_method().
87 1 is recommended. 2 restores NI-S original */
89 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
90 Perl_croak(aTHX_ "panic_unimplemented"); \
91 return (y)0; /* fool picky compilers */ \
93 UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
94 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
97 Encode_XSEncoding(pTHX_ encode_t * enc)
100 HV *stash = gv_stashpv("Encode::XS", TRUE);
101 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
105 while (enc->name[i]) {
106 const char *name = enc->name[i++];
107 XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
110 call_pv("Encode::define_encoding", G_DISCARD);
115 call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
117 /* Exists for breakpointing */
121 encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
125 U8 *s = (U8 *) SvPV(src, slen);
130 /* We allocate slen+1.
131 PerlIO dumps core if this value is smaller than this. */
132 SV *dst = sv_2mortal(newSV(slen+1));
134 U8 *d = (U8 *) SvPVX(dst);
135 STRLEN dlen = SvLEN(dst)-1;
137 while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
138 SvCUR_set(dst, dlen+ddone);
141 #if ENCODE_XS_PROFILE >= 3
142 Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1);
145 if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
151 STRLEN more = 0; /* make sure you initialize! */
155 sleft = tlen - sdone;
156 #if ENCODE_XS_PROFILE >= 2
158 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
159 more, sdone, sleft, SvLEN(dst));
161 if (sdone != 0) { /* has src ever been processed ? */
162 #if ENCODE_XS_USEFP == 2
163 more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
165 #elif ENCODE_XS_USEFP
166 more = (1.0*SvLEN(dst)+1)/sdone * sleft;
168 /* safe until SvLEN(dst) == MAX_INT/16 */
169 more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
172 more += UTF8_MAXLEN; /* insurance policy */
173 #if ENCODE_XS_PROFILE >= 2
175 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
176 more, sdone, sleft, SvLEN(dst));
178 d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
179 /* dst need to grow need MORE bytes! */
180 if (ddone >= SvLEN(dst)) {
181 Perl_croak(aTHX_ "Destination couldn't be grown.");
183 dlen = SvLEN(dst)-ddone-1;
191 if (dir == enc->f_utf8) {
194 utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
196 if (!check) { /* fallback char */
197 sdone += slen + clen;
198 ddone += dlen + enc->replen;
199 sv_catpvn(dst, (char*)enc->rep, enc->replen);
201 else if (check == -1){ /* perlqq */
203 sv_2mortal(newSVpvf("\\x{%x}", ch));
204 sdone += slen + clen;
205 ddone += dlen + SvLEN(perlqq);
206 sv_catsv(dst, perlqq);
211 "}\" does not map to %s", ch,
216 if (!check){ /* fallback char */
218 ddone += dlen + strlen(FBCHAR_UTF8);
219 sv_catpv(dst, FBCHAR_UTF8);
221 else if (check == -1){ /* perlqq */
223 sv_2mortal(newSVpvf("\\x%02X", s[slen]));
225 ddone += dlen + SvLEN(perlqq);
226 sv_catsv(dst, perlqq);
229 /* UTF-8 is supposed to be "Universal" so should not
230 happen for real characters, but some encodings
231 have non-assigned codes which may occur. */
232 Perl_croak(aTHX_ "%s \"\\x%02X\" "
233 "does not map to Unicode (%d)",
234 enc->name[0], (U8) s[slen], code);
238 d = (U8*)SvPVX(dst) + dlen;
239 s = (U8*)SvPVX(src) + sdone;
244 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
245 code, (dir == enc->f_utf8) ? "to" : "from",
250 SvCUR_set(dst, dlen+ddone);
253 sdone = SvCUR(src) - (slen+sdone);
256 /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly
257 SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0
258 type SVs and sv_clear() calls it ...
260 sv_setpvn(src, (char*)s+slen, sdone);
262 Move(s + slen, SvPVX(src), sdone , U8);
265 SvCUR_set(src, sdone);
272 #if ENCODE_XS_PROFILE
273 if (SvCUR(dst) > SvCUR(src)){
275 "SvLEN(dst)=%d, SvCUR(dst)=%d. "
276 "%d bytes unused(%f %%)\n",
277 SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
278 (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
286 MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
295 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
296 ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
301 Method_decode(obj,src,check = 0)
307 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
308 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
314 Method_encode(obj,src,check = 0)
320 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
321 sv_utf8_upgrade(src);
322 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
326 MODULE = Encode PACKAGE = Encode::Unicode
329 decode_xs(obj, str, chk = &PL_sv_undef)
335 int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
336 U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
337 int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
338 SV *result = newSVpvn("",0);
340 U8 *s = (U8 *)SvPVbyte(str,ulen);
341 U8 *e = (U8 *)SvEND(str);
342 ST(0) = sv_2mortal(result);
345 if (!endian && s+size <= e) {
347 endian = (size == 4) ? 'N' : 'n';
348 bom = enc_unpack(aTHX_ &s,e,size,endian);
350 if (bom == BOM16LE) {
353 else if (bom == BOM32LE) {
357 croak("%s:Unregognised BOM %"UVxf,
358 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),bom);
362 /* Update endian for this sequence */
363 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
366 while (s < e && s+size <= e) {
367 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
369 if (size != 4 && invalid_ucs2(ord)) {
372 croak("%s:no surrogates allowed %"UVxf,
373 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
376 enc_unpack(aTHX_ &s,e,size,endian); /* skip the next one as well */
382 if (!isHiSurrogate(ord)) {
383 croak("%s:Malformed HI surrogate %"UVxf,
384 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
387 /* Partial character */
388 s -= size; /* back up to 1st half */
389 break; /* And exit loop */
391 lo = enc_unpack(aTHX_ &s,e,size,endian);
392 if (!isLoSurrogate(lo)){
393 croak("%s:Malformed LO surrogate %"UVxf,
394 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
396 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
399 d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
400 d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
401 SvCUR_set(result,d - (U8 *)SvPVX(result));
405 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
406 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
407 Move(s,SvPVX(str),e-s,U8);
408 SvCUR_set(str,(e-s));
419 encode_xs(obj, utf8, chk = &PL_sv_undef)
425 int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
426 U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
427 int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
428 SV *result = newSVpvn("",0);
430 U8 *s = (U8 *)SvPVutf8(utf8,ulen);
431 U8 *e = (U8 *)SvEND(utf8);
432 ST(0) = sv_2mortal(result);
434 endian = (size == 4) ? 'N' : 'n';
435 enc_pack(aTHX_ result,size,endian,BOM_BE);
437 /* Update endian for this sequence */
438 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
441 while (s < e && s+UTF8SKIP(s) <= e) {
443 UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
445 if (size != 4 && invalid_ucs2(ord)) {
446 if (!issurrogate(ord)){
449 croak("%s:code point \"\\x{"UVxf"}\" too high",
450 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
452 enc_pack(aTHX_ result,size,endian,FBCHAR);
454 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
455 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
456 enc_pack(aTHX_ result,size,endian,hi);
457 enc_pack(aTHX_ result,size,endian,lo);
461 /* not supposed to happen */
462 enc_pack(aTHX_ result,size,endian,FBCHAR);
466 enc_pack(aTHX_ result,size,endian,ord);
471 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
472 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
473 Move(s,SvPVX(utf8),e-s,U8);
474 SvCUR_set(utf8,(e-s));
484 MODULE = Encode PACKAGE = Encode
489 _bytes_to_utf8(sv, ...)
493 SV * encoding = items == 2 ? ST(1) : Nullsv;
496 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
499 U8* s = (U8*)SvPV(sv, len);
502 converted = bytes_to_utf8(s, &len); /* This allocs */
503 sv_setpvn(sv, (char *)converted, len);
504 SvUTF8_on(sv); /* XXX Should we? */
505 Safefree(converted); /* ... so free it */
513 _utf8_to_bytes(sv, ...)
517 SV * to = items > 1 ? ST(1) : Nullsv;
518 SV * check = items > 2 ? ST(2) : Nullsv;
521 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
524 U8 *s = (U8*)SvPV(sv, len);
528 /* Must do things the slow way */
530 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
533 New(83, dest, len, U8); /* I think */
542 /* Have to do it all ourselves because of error routine,
546 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
547 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
548 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
549 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
550 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
551 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
552 else { ulen = 13; uv = 0; }
554 /* Note change to utf8.c variable naming, for variety */
556 if ((*s & 0xc0) != 0x80)
560 uv = (uv << 6) | (*s++ & 0x3f);
564 call_failure(check, s, dest, src);
565 /* Now what happens? */
571 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
578 is_utf8(sv, check = 0)
583 if (SvGMAGICAL(sv)) /* it could be $1, for example */
584 sv = newSVsv(sv); /* GMAGIG will be done */
586 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
589 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
595 SvREFCNT_dec(sv); /* it was a temp copy */
606 SV *rsv = newSViv(SvUTF8(sv));
610 RETVAL = &PL_sv_undef;
622 SV *rsv = newSViv(SvUTF8(sv));
626 RETVAL = &PL_sv_undef;
634 #if defined(USE_PERLIO) && !defined(USE_SFIO)
635 /* PerlIO_define_layer(aTHX_ &PerlIO_encode); */