1 #define PERL_NO_GET_CONTEXT
9 #define ENCODE_XS_PROFILE 0 /* set 1 to profile.
10 t/encoding.t dumps core because of
11 Perl_warner and PerlIO don't work well */
13 #define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate
14 buffer size for encode_method().
15 1 is recommended. 2 restores NI-S original */
17 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
18 Perl_croak(aTHX_ "panic_unimplemented"); \
19 return (y)0; /* fool picky compilers */ \
21 UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
22 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
23 #if defined(USE_PERLIO) && !defined(USE_SFIO)
24 /* Define an encoding "layer" in the perliol.h sense.
25 The layer defined here "inherits" in an object-oriented sense from the
26 "perlio" layer with its PerlIOBuf_* "methods".
27 The implementation is particularly efficient as until Encode settles down
28 there is no point in tryint to tune it.
30 The layer works by overloading the "fill" and "flush" methods.
32 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
33 to convert the encoded data to UTF-8 form, then copies it back to the
34 buffer. The "base class's" read methods then see the UTF-8 data.
36 "flush" transforms the UTF-8 data deposited by the "base class's write
37 method in the buffer back into the encoded form using the encode OO perl API,
38 then copies data back into the buffer and calls "SUPER::flush.
40 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
41 so that the the base class's "flush" sees the correct number of encoded chars
42 for positioning the seek pointer. (This double translation is the worst performance
43 issue - particularly with all-perl encode engine.)
48 PerlIOBuf base; /* PerlIOBuf stuff */
49 SV *bufsv; /* buffer seen by layers above */
50 SV *dataSV; /* data we have read from layer below */
51 SV *enc; /* the encoding object */
55 PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags)
57 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
58 SV *sv = &PL_sv_undef;
66 if (perl_call_method("name", G_SCALAR) == 1) {
76 PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg)
78 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
81 code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv);
87 if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) {
88 /* should never happen */
89 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
98 Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"",
103 SvREFCNT_inc(e->enc);
104 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
112 PerlIOEncode_popped(pTHX_ PerlIO * f)
114 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
116 SvREFCNT_dec(e->enc);
120 SvREFCNT_dec(e->bufsv);
124 SvREFCNT_dec(e->dataSV);
131 PerlIOEncode_get_base(pTHX_ PerlIO * f)
133 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
135 e->base.bufsiz = 1024;
137 e->bufsv = newSV(e->base.bufsiz);
138 sv_setpvn(e->bufsv, "", 0);
140 e->base.buf = (STDCHAR *) SvPVX(e->bufsv);
142 e->base.ptr = e->base.buf;
144 e->base.end = e->base.buf;
145 if (e->base.ptr < e->base.buf
146 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
147 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
148 e->base.buf + SvLEN(e->bufsv));
151 if (SvLEN(e->bufsv) < e->base.bufsiz) {
152 SSize_t poff = e->base.ptr - e->base.buf;
153 SSize_t eoff = e->base.end - e->base.buf;
154 e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz);
155 e->base.ptr = e->base.buf + poff;
156 e->base.end = e->base.buf + eoff;
158 if (e->base.ptr < e->base.buf
159 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
160 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
161 e->base.buf + SvLEN(e->bufsv));
168 PerlIOEncode_fill(pTHX_ PerlIO * f)
170 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
175 if (PerlIO_flush(f) != 0)
178 if (!PerlIO_fast_gets(n)) {
179 /* Things get too messy if we don't have a buffer layer
180 push a :perlio to do the job */
182 n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv);
184 Perl_die(aTHX_ "panic: cannot push :perlio for %p",f);
190 avail = PerlIO_get_cnt(n);
192 avail = PerlIO_fill(n);
194 avail = PerlIO_get_cnt(n);
197 if (!PerlIO_error(n) && PerlIO_eof(n))
202 STDCHAR *ptr = PerlIO_get_ptr(n);
207 e->base.ptr = e->base.end = (STDCHAR *) Nullch;
208 (void) PerlIOEncode_get_base(aTHX_ f);
210 e->dataSV = newSV(0);
211 if (SvTYPE(e->dataSV) < SVt_PV) {
212 sv_upgrade(e->dataSV,SVt_PV);
214 if (SvCUR(e->dataSV)) {
215 /* something left over from last time - create a normal
216 SV with new data appended
218 if (use + SvCUR(e->dataSV) > e->base.bufsiz) {
219 use = e->base.bufsiz - SvCUR(e->dataSV);
221 sv_catpvn(e->dataSV,(char*)ptr,use);
224 /* Create a "dummy" SV to represent the available data from layer below */
225 if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) {
226 Safefree(SvPVX(e->dataSV));
228 if (use > e->base.bufsiz) {
229 use = e->base.bufsiz;
231 SvPVX(e->dataSV) = (char *) ptr;
232 SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */
233 SvCUR_set(e->dataSV,use);
234 SvPOK_only(e->dataSV);
236 SvUTF8_off(e->dataSV);
242 if (perl_call_method("decode", G_SCALAR) != 1) {
243 Perl_die(aTHX_ "panic: decode did not return a value");
248 /* Now get translated string (forced to UTF-8) and use as buffer */
250 s = SvPVutf8(uni, len);
251 if (len && !is_utf8_string((U8*)s,len)) {
252 Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s);
257 /* if decode gave us back dataSV then data may vanish when
258 we do ptrcnt adjust - so take our copy now.
259 (The copy is a pain - need a put-it-here option for decode.)
261 sv_setpvn(e->bufsv,s,len);
262 e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv);
263 e->base.end = e->base.ptr + SvCUR(e->bufsv);
264 PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
267 /* Adjust ptr/cnt not taking anything which
268 did not translate - not clear this is a win */
269 /* compute amount we took */
270 use -= SvCUR(e->dataSV);
271 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
272 /* and as we did not take it it isn't pending */
273 SvCUR_set(e->dataSV,0);
275 /* Got nothing - assume partial character so we need some more */
276 /* Make sure e->dataSV is a normal SV before re-filling as
277 buffer alias will change under us
279 s = SvPV(e->dataSV,len);
280 sv_setpvn(e->dataSV,s,len);
281 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
290 PerlIOBase(f)->flags |= PERLIO_F_EOF;
292 PerlIOBase(f)->flags |= PERLIO_F_ERROR;
298 PerlIOEncode_flush(pTHX_ PerlIO * f)
300 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
302 if (e->bufsv && (e->base.ptr > e->base.buf)) {
308 if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) {
309 /* Write case encode the buffer and write() to layer below */
314 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
319 if (perl_call_method("encode", G_SCALAR) != 1) {
320 Perl_die(aTHX_ "panic: encode did not return a value");
326 count = PerlIO_write(PerlIONext(f),s,len);
332 if (PerlIO_flush(PerlIONext(f)) != 0) {
335 if (SvCUR(e->bufsv)) {
336 /* Did not all translate */
337 e->base.ptr = e->base.buf+SvCUR(e->bufsv);
341 else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
343 /* if we have any untranslated stuff then unread that first */
344 if (e->dataSV && SvCUR(e->dataSV)) {
345 s = SvPV(e->dataSV, len);
346 count = PerlIO_unread(PerlIONext(f),s,len);
351 /* See if there is anything left in the buffer */
352 if (e->base.ptr < e->base.end) {
353 /* Bother - have unread data.
354 re-encode and unread() to layer below
358 str = sv_newmortal();
359 sv_upgrade(str, SVt_PV);
360 SvPVX(str) = (char*)e->base.ptr;
362 SvCUR_set(str, e->base.end - e->base.ptr);
370 if (perl_call_method("encode", G_SCALAR) != 1) {
371 Perl_die(aTHX_ "panic: encode did not return a value");
377 count = PerlIO_unread(PerlIONext(f),s,len);
385 e->base.ptr = e->base.end = e->base.buf;
386 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
392 PerlIOEncode_close(pTHX_ PerlIO * f)
394 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
395 IV code = PerlIOBase_close(aTHX_ f);
397 if (e->base.buf && e->base.ptr > e->base.buf) {
398 Perl_croak(aTHX_ "Close with partial character");
400 SvREFCNT_dec(e->bufsv);
406 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
411 PerlIOEncode_tell(pTHX_ PerlIO * f)
413 PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
414 /* Unfortunately the only way to get a postion is to (re-)translate,
415 the UTF8 we have in bufefr and then ask layer below
418 if (b->buf && b->ptr > b->buf) {
419 Perl_croak(aTHX_ "Cannot tell at partial character");
421 return PerlIO_tell(PerlIONext(f));
425 PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o,
426 CLONE_PARAMS * params, int flags)
428 if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) {
429 PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode);
430 PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode);
432 fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params);
438 PerlIO_funcs PerlIO_encode = {
440 sizeof(PerlIOEncode),
441 PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT,
459 PerlIOBase_setlinebuf,
460 PerlIOEncode_get_base,
464 PerlIOBuf_set_ptrcnt,
466 #endif /* encode layer */
469 Encode_XSEncoding(pTHX_ encode_t * enc)
472 HV *stash = gv_stashpv("Encode::XS", TRUE);
473 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
477 while (enc->name[i]) {
478 const char *name = enc->name[i++];
479 XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
482 call_pv("Encode::define_encoding", G_DISCARD);
487 call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
489 /* Exists for breakpointing */
493 encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
497 U8 *s = (U8 *) SvPV(src, slen);
502 /* We allocate slen+1.
503 PerlIO dumps core if this value is smaller than this. */
504 SV *dst = sv_2mortal(newSV(slen+1));
506 U8 *d = (U8 *) SvPVX(dst);
507 STRLEN dlen = SvLEN(dst)-1;
509 while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
510 SvCUR_set(dst, dlen+ddone);
513 #if ENCODE_XS_PROFILE >= 3
514 Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1);
517 if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
526 sleft = tlen - sdone;
527 if (sdone) { /* has src ever been processed ? */
528 #if ENCODE_XS_USEFP == 2
529 more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
531 #elif ENCODE_XS_USEFP
532 more = (1.0*SvLEN(dst)+1)/sdone * sleft;
534 /* safe until SvLEN(dst) == MAX_INT/16 */
535 more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
539 more += UTF8_MAXLEN; /* insurance policy */
540 #if ENCODE_XS_PROFILE >= 2
542 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
543 more, sdone, sleft, SvLEN(dst));
545 d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
546 /* dst need to grow need MORE bytes! */
547 if (ddone >= SvLEN(dst)) {
548 Perl_croak(aTHX_ "Destination couldn't be grown.");
550 dlen = SvLEN(dst)-ddone-1;
558 if (dir == enc->f_utf8) {
559 if (!check && ckWARN_d(WARN_UTF8)) {
562 utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
564 Perl_warner(aTHX_ packWARN(WARN_UTF8),
566 "}\" does not map to %s", ch,
568 /* FIXME: Skip over the character, copy in replacement and continue
569 * but that is messy so for now just fail.
578 /* UTF-8 is supposed to be "Universal" so should not happen
579 for real characters, but some encodings have non-assigned
580 codes which may occur.
582 Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)",
583 enc->name[0], (U8) s[slen], code);
588 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
589 code, (dir == enc->f_utf8) ? "to" : "from",
594 SvCUR_set(dst, dlen+ddone);
597 sdone = SvCUR(src) - (slen+sdone);
600 /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly
601 SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0
602 type SVs and sv_clear() calls it ...
604 sv_setpvn(src, (char*)s+slen, sdone);
606 Move(s + slen, SvPVX(src), sdone , U8);
609 SvCUR_set(src, sdone);
616 #if ENCODE_XS_PROFILE
617 if (SvCUR(dst) > SvCUR(src)){
619 "SvLEN(dst)=%d, SvCUR(dst)=%d. "
620 "%d bytes unused(%f %%)\n",
621 SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
622 (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
630 MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
639 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
640 ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
645 Method_decode(obj,src,check = FALSE)
651 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
652 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
658 Method_encode(obj,src,check = FALSE)
664 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
665 sv_utf8_upgrade(src);
666 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
670 MODULE = Encode PACKAGE = Encode
675 _bytes_to_utf8(sv, ...)
679 SV * encoding = items == 2 ? ST(1) : Nullsv;
682 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
685 U8* s = (U8*)SvPV(sv, len);
688 converted = bytes_to_utf8(s, &len); /* This allocs */
689 sv_setpvn(sv, (char *)converted, len);
690 SvUTF8_on(sv); /* XXX Should we? */
691 Safefree(converted); /* ... so free it */
699 _utf8_to_bytes(sv, ...)
703 SV * to = items > 1 ? ST(1) : Nullsv;
704 SV * check = items > 2 ? ST(2) : Nullsv;
707 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
710 U8 *s = (U8*)SvPV(sv, len);
714 /* Must do things the slow way */
716 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
719 New(83, dest, len, U8); /* I think */
728 /* Have to do it all ourselves because of error routine,
732 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
733 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
734 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
735 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
736 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
737 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
738 else { ulen = 13; uv = 0; }
740 /* Note change to utf8.c variable naming, for variety */
742 if ((*s & 0xc0) != 0x80)
746 uv = (uv << 6) | (*s++ & 0x3f);
750 call_failure(check, s, dest, src);
751 /* Now what happens? */
757 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
764 is_utf8(sv, check = FALSE)
769 if (SvGMAGICAL(sv)) /* it could be $1, for example */
770 sv = newSVsv(sv); /* GMAGIG will be done */
772 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
775 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
781 SvREFCNT_dec(sv); /* it was a temp copy */
792 SV *rsv = newSViv(SvUTF8(sv));
796 RETVAL = &PL_sv_undef;
808 SV *rsv = newSViv(SvUTF8(sv));
812 RETVAL = &PL_sv_undef;
820 #if defined(USE_PERLIO) && !defined(USE_SFIO)
821 PerlIO_define_layer(aTHX_ &PerlIO_encode);