1 #define PERL_NO_GET_CONTEXT
9 #define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
10 t/encoding.t dumps core because of
11 Perl_warner and PerlIO don't work well */
13 #define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate
14 buffer size for encode_method().
15 1 is recommended. 2 restores NI-S original */
17 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
18 Perl_croak(aTHX_ "panic_unimplemented"); \
19 return (y)0; /* fool picky compilers */ \
21 UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
22 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
23 #if defined(USE_PERLIO) && !defined(USE_SFIO)
24 /* Define an encoding "layer" in the perliol.h sense.
25 The layer defined here "inherits" in an object-oriented sense from the
26 "perlio" layer with its PerlIOBuf_* "methods".
27 The implementation is particularly efficient as until Encode settles down
28 there is no point in tryint to tune it.
30 The layer works by overloading the "fill" and "flush" methods.
32 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
33 to convert the encoded data to UTF-8 form, then copies it back to the
34 buffer. The "base class's" read methods then see the UTF-8 data.
36 "flush" transforms the UTF-8 data deposited by the "base class's write
37 method in the buffer back into the encoded form using the encode OO perl API,
38 then copies data back into the buffer and calls "SUPER::flush.
40 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
41 so that the the base class's "flush" sees the correct number of encoded chars
42 for positioning the seek pointer. (This double translation is the worst performance
43 issue - particularly with all-perl encode engine.)
48 PerlIOBuf base; /* PerlIOBuf stuff */
49 SV *bufsv; /* buffer seen by layers above */
50 SV *dataSV; /* data we have read from layer below */
51 SV *enc; /* the encoding object */
55 PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags)
57 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
58 SV *sv = &PL_sv_undef;
66 if (perl_call_method("name", G_SCALAR) == 1) {
76 PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg)
78 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
81 code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv);
87 if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) {
88 /* should never happen */
89 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
98 Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"",
103 SvREFCNT_inc(e->enc);
104 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
112 PerlIOEncode_popped(pTHX_ PerlIO * f)
114 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
116 SvREFCNT_dec(e->enc);
120 SvREFCNT_dec(e->bufsv);
124 SvREFCNT_dec(e->dataSV);
131 PerlIOEncode_get_base(pTHX_ PerlIO * f)
133 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
135 e->base.bufsiz = 1024;
137 e->bufsv = newSV(e->base.bufsiz);
138 sv_setpvn(e->bufsv, "", 0);
140 e->base.buf = (STDCHAR *) SvPVX(e->bufsv);
142 e->base.ptr = e->base.buf;
144 e->base.end = e->base.buf;
145 if (e->base.ptr < e->base.buf
146 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
147 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
148 e->base.buf + SvLEN(e->bufsv));
151 if (SvLEN(e->bufsv) < e->base.bufsiz) {
152 SSize_t poff = e->base.ptr - e->base.buf;
153 SSize_t eoff = e->base.end - e->base.buf;
154 e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz);
155 e->base.ptr = e->base.buf + poff;
156 e->base.end = e->base.buf + eoff;
158 if (e->base.ptr < e->base.buf
159 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
160 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
161 e->base.buf + SvLEN(e->bufsv));
168 PerlIOEncode_fill(pTHX_ PerlIO * f)
170 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
175 if (PerlIO_flush(f) != 0)
178 if (!PerlIO_fast_gets(n)) {
179 /* Things get too messy if we don't have a buffer layer
180 push a :perlio to do the job */
182 n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv);
184 Perl_die(aTHX_ "panic: cannot push :perlio for %p",f);
190 avail = PerlIO_get_cnt(n);
192 avail = PerlIO_fill(n);
194 avail = PerlIO_get_cnt(n);
197 if (!PerlIO_error(n) && PerlIO_eof(n))
202 STDCHAR *ptr = PerlIO_get_ptr(n);
207 e->base.ptr = e->base.end = (STDCHAR *) Nullch;
208 (void) PerlIOEncode_get_base(aTHX_ f);
210 e->dataSV = newSV(0);
211 if (SvTYPE(e->dataSV) < SVt_PV) {
212 sv_upgrade(e->dataSV,SVt_PV);
214 if (SvCUR(e->dataSV)) {
215 /* something left over from last time - create a normal
216 SV with new data appended
218 if (use + SvCUR(e->dataSV) > e->base.bufsiz) {
219 use = e->base.bufsiz - SvCUR(e->dataSV);
221 sv_catpvn(e->dataSV,(char*)ptr,use);
224 /* Create a "dummy" SV to represent the available data from layer below */
225 if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) {
226 Safefree(SvPVX(e->dataSV));
228 if (use > e->base.bufsiz) {
229 use = e->base.bufsiz;
231 SvPVX(e->dataSV) = (char *) ptr;
232 SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */
233 SvCUR_set(e->dataSV,use);
234 SvPOK_only(e->dataSV);
236 SvUTF8_off(e->dataSV);
242 if (perl_call_method("decode", G_SCALAR) != 1) {
243 Perl_die(aTHX_ "panic: decode did not return a value");
248 /* Now get translated string (forced to UTF-8) and use as buffer */
250 s = SvPVutf8(uni, len);
251 #ifdef PARANOID_ENCODE_CHECKS
252 if (len && !is_utf8_string((U8*)s,len)) {
253 Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s);
259 /* if decode gave us back dataSV then data may vanish when
260 we do ptrcnt adjust - so take our copy now.
261 (The copy is a pain - need a put-it-here option for decode.)
263 sv_setpvn(e->bufsv,s,len);
264 e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv);
265 e->base.end = e->base.ptr + SvCUR(e->bufsv);
266 PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
269 /* Adjust ptr/cnt not taking anything which
270 did not translate - not clear this is a win */
271 /* compute amount we took */
272 use -= SvCUR(e->dataSV);
273 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
274 /* and as we did not take it it isn't pending */
275 SvCUR_set(e->dataSV,0);
277 /* Got nothing - assume partial character so we need some more */
278 /* Make sure e->dataSV is a normal SV before re-filling as
279 buffer alias will change under us
281 s = SvPV(e->dataSV,len);
282 sv_setpvn(e->dataSV,s,len);
283 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
292 PerlIOBase(f)->flags |= PERLIO_F_EOF;
294 PerlIOBase(f)->flags |= PERLIO_F_ERROR;
300 PerlIOEncode_flush(pTHX_ PerlIO * f)
302 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
304 if (e->bufsv && (e->base.ptr > e->base.buf)) {
310 if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) {
311 /* Write case encode the buffer and write() to layer below */
316 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
321 if (perl_call_method("encode", G_SCALAR) != 1) {
322 Perl_die(aTHX_ "panic: encode did not return a value");
328 count = PerlIO_write(PerlIONext(f),s,len);
334 if (PerlIO_flush(PerlIONext(f)) != 0) {
337 if (SvCUR(e->bufsv)) {
338 /* Did not all translate */
339 e->base.ptr = e->base.buf+SvCUR(e->bufsv);
343 else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
345 /* if we have any untranslated stuff then unread that first */
346 if (e->dataSV && SvCUR(e->dataSV)) {
347 s = SvPV(e->dataSV, len);
348 count = PerlIO_unread(PerlIONext(f),s,len);
353 /* See if there is anything left in the buffer */
354 if (e->base.ptr < e->base.end) {
355 /* Bother - have unread data.
356 re-encode and unread() to layer below
360 str = sv_newmortal();
361 sv_upgrade(str, SVt_PV);
362 SvPVX(str) = (char*)e->base.ptr;
364 SvCUR_set(str, e->base.end - e->base.ptr);
372 if (perl_call_method("encode", G_SCALAR) != 1) {
373 Perl_die(aTHX_ "panic: encode did not return a value");
379 count = PerlIO_unread(PerlIONext(f),s,len);
387 e->base.ptr = e->base.end = e->base.buf;
388 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
394 PerlIOEncode_close(pTHX_ PerlIO * f)
396 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
397 IV code = PerlIOBase_close(aTHX_ f);
399 if (e->base.buf && e->base.ptr > e->base.buf) {
400 Perl_croak(aTHX_ "Close with partial character");
402 SvREFCNT_dec(e->bufsv);
408 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
413 PerlIOEncode_tell(pTHX_ PerlIO * f)
415 PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
416 /* Unfortunately the only way to get a postion is to (re-)translate,
417 the UTF8 we have in bufefr and then ask layer below
420 if (b->buf && b->ptr > b->buf) {
421 Perl_croak(aTHX_ "Cannot tell at partial character");
423 return PerlIO_tell(PerlIONext(f));
427 PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o,
428 CLONE_PARAMS * params, int flags)
430 if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) {
431 PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode);
432 PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode);
434 fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params);
440 PerlIO_funcs PerlIO_encode = {
442 sizeof(PerlIOEncode),
443 PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT,
461 PerlIOBase_setlinebuf,
462 PerlIOEncode_get_base,
466 PerlIOBuf_set_ptrcnt,
468 #endif /* encode layer */
471 Encode_XSEncoding(pTHX_ encode_t * enc)
474 HV *stash = gv_stashpv("Encode::XS", TRUE);
475 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
479 while (enc->name[i]) {
480 const char *name = enc->name[i++];
481 XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
484 call_pv("Encode::define_encoding", G_DISCARD);
489 call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
491 /* Exists for breakpointing */
495 encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
499 U8 *s = (U8 *) SvPV(src, slen);
504 /* We allocate slen+1.
505 PerlIO dumps core if this value is smaller than this. */
506 SV *dst = sv_2mortal(newSV(slen+1));
508 U8 *d = (U8 *) SvPVX(dst);
509 STRLEN dlen = SvLEN(dst)-1;
511 while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
512 SvCUR_set(dst, dlen+ddone);
515 #if ENCODE_XS_PROFILE >= 3
516 Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1);
519 if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
525 STRLEN more = 0; /* make sure you initialize! */
529 sleft = tlen - sdone;
530 #if ENCODE_XS_PROFILE >= 2
532 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
533 more, sdone, sleft, SvLEN(dst));
535 if (sdone != 0) { /* has src ever been processed ? */
536 #if ENCODE_XS_USEFP == 2
537 more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
539 #elif ENCODE_XS_USEFP
540 more = (1.0*SvLEN(dst)+1)/sdone * sleft;
542 /* safe until SvLEN(dst) == MAX_INT/16 */
543 more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
546 more += UTF8_MAXLEN; /* insurance policy */
547 #if ENCODE_XS_PROFILE >= 2
549 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
550 more, sdone, sleft, SvLEN(dst));
552 d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
553 /* dst need to grow need MORE bytes! */
554 if (ddone >= SvLEN(dst)) {
555 Perl_croak(aTHX_ "Destination couldn't be grown.");
557 dlen = SvLEN(dst)-ddone-1;
565 if (dir == enc->f_utf8) {
566 if (!check && ckWARN_d(WARN_UTF8)) {
569 utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
571 Perl_warner(aTHX_ packWARN(WARN_UTF8),
573 "}\" does not map to %s", ch,
575 /* FIXME: Skip over the character, copy in replacement and continue
576 * but that is messy so for now just fail.
585 /* UTF-8 is supposed to be "Universal" so should not happen
586 for real characters, but some encodings have non-assigned
587 codes which may occur.
589 Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)",
590 enc->name[0], (U8) s[slen], code);
595 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
596 code, (dir == enc->f_utf8) ? "to" : "from",
601 SvCUR_set(dst, dlen+ddone);
604 sdone = SvCUR(src) - (slen+sdone);
607 /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly
608 SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0
609 type SVs and sv_clear() calls it ...
611 sv_setpvn(src, (char*)s+slen, sdone);
613 Move(s + slen, SvPVX(src), sdone , U8);
616 SvCUR_set(src, sdone);
623 #if ENCODE_XS_PROFILE
624 if (SvCUR(dst) > SvCUR(src)){
626 "SvLEN(dst)=%d, SvCUR(dst)=%d. "
627 "%d bytes unused(%f %%)\n",
628 SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
629 (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
637 MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
646 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
647 ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
652 Method_decode(obj,src,check = FALSE)
658 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
659 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
665 Method_encode(obj,src,check = FALSE)
671 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
672 sv_utf8_upgrade(src);
673 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
677 MODULE = Encode PACKAGE = Encode
682 _bytes_to_utf8(sv, ...)
686 SV * encoding = items == 2 ? ST(1) : Nullsv;
689 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
692 U8* s = (U8*)SvPV(sv, len);
695 converted = bytes_to_utf8(s, &len); /* This allocs */
696 sv_setpvn(sv, (char *)converted, len);
697 SvUTF8_on(sv); /* XXX Should we? */
698 Safefree(converted); /* ... so free it */
706 _utf8_to_bytes(sv, ...)
710 SV * to = items > 1 ? ST(1) : Nullsv;
711 SV * check = items > 2 ? ST(2) : Nullsv;
714 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
717 U8 *s = (U8*)SvPV(sv, len);
721 /* Must do things the slow way */
723 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
726 New(83, dest, len, U8); /* I think */
735 /* Have to do it all ourselves because of error routine,
739 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
740 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
741 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
742 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
743 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
744 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
745 else { ulen = 13; uv = 0; }
747 /* Note change to utf8.c variable naming, for variety */
749 if ((*s & 0xc0) != 0x80)
753 uv = (uv << 6) | (*s++ & 0x3f);
757 call_failure(check, s, dest, src);
758 /* Now what happens? */
764 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
771 is_utf8(sv, check = FALSE)
776 if (SvGMAGICAL(sv)) /* it could be $1, for example */
777 sv = newSVsv(sv); /* GMAGIG will be done */
779 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
782 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
788 SvREFCNT_dec(sv); /* it was a temp copy */
799 SV *rsv = newSViv(SvUTF8(sv));
803 RETVAL = &PL_sv_undef;
815 SV *rsv = newSViv(SvUTF8(sv));
819 RETVAL = &PL_sv_undef;
827 #if defined(USE_PERLIO) && !defined(USE_SFIO)
828 PerlIO_define_layer(aTHX_ &PerlIO_encode);