1 #define PERL_NO_GET_CONTEXT
9 #define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
10 t/encoding.t dumps core because of
11 Perl_warner and PerlIO don't work well */
13 #define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate
14 buffer size for encode_method().
15 1 is recommended. 2 restores NI-S original */
17 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
18 Perl_croak(aTHX_ "panic_unimplemented"); \
19 return (y)0; /* fool picky compilers */ \
21 UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
22 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
23 #if defined(USE_PERLIO) && !defined(USE_SFIO)
24 /* Define an encoding "layer" in the perliol.h sense.
25 The layer defined here "inherits" in an object-oriented sense from the
26 "perlio" layer with its PerlIOBuf_* "methods".
27 The implementation is particularly efficient as until Encode settles down
28 there is no point in tryint to tune it.
30 The layer works by overloading the "fill" and "flush" methods.
32 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
33 to convert the encoded data to UTF-8 form, then copies it back to the
34 buffer. The "base class's" read methods then see the UTF-8 data.
36 "flush" transforms the UTF-8 data deposited by the "base class's write
37 method in the buffer back into the encoded form using the encode OO perl API,
38 then copies data back into the buffer and calls "SUPER::flush.
40 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
41 so that the the base class's "flush" sees the correct number of encoded chars
42 for positioning the seek pointer. (This double translation is the worst performance
43 issue - particularly with all-perl encode engine.)
48 PerlIOBuf base; /* PerlIOBuf stuff */
49 SV *bufsv; /* buffer seen by layers above */
50 SV *dataSV; /* data we have read from layer below */
51 SV *enc; /* the encoding object */
55 PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags)
57 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
58 SV *sv = &PL_sv_undef;
66 if (perl_call_method("name", G_SCALAR) == 1) {
76 PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg)
78 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
81 code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv);
87 if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) {
88 /* should never happen */
89 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
98 Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"",
103 SvREFCNT_inc(e->enc);
104 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
112 PerlIOEncode_popped(pTHX_ PerlIO * f)
114 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
116 SvREFCNT_dec(e->enc);
120 SvREFCNT_dec(e->bufsv);
124 SvREFCNT_dec(e->dataSV);
131 PerlIOEncode_get_base(pTHX_ PerlIO * f)
133 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
135 e->base.bufsiz = 1024;
137 e->bufsv = newSV(e->base.bufsiz);
138 sv_setpvn(e->bufsv, "", 0);
140 e->base.buf = (STDCHAR *) SvPVX(e->bufsv);
142 e->base.ptr = e->base.buf;
144 e->base.end = e->base.buf;
145 if (e->base.ptr < e->base.buf
146 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
147 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
148 e->base.buf + SvLEN(e->bufsv));
151 if (SvLEN(e->bufsv) < e->base.bufsiz) {
152 SSize_t poff = e->base.ptr - e->base.buf;
153 SSize_t eoff = e->base.end - e->base.buf;
154 e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz);
155 e->base.ptr = e->base.buf + poff;
156 e->base.end = e->base.buf + eoff;
158 if (e->base.ptr < e->base.buf
159 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
160 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
161 e->base.buf + SvLEN(e->bufsv));
168 PerlIOEncode_fill(pTHX_ PerlIO * f)
170 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
175 if (PerlIO_flush(f) != 0)
178 if (!PerlIO_fast_gets(n)) {
179 /* Things get too messy if we don't have a buffer layer
180 push a :perlio to do the job */
182 n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv);
184 Perl_die(aTHX_ "panic: cannot push :perlio for %p",f);
190 avail = PerlIO_get_cnt(n);
192 avail = PerlIO_fill(n);
194 avail = PerlIO_get_cnt(n);
197 if (!PerlIO_error(n) && PerlIO_eof(n))
202 STDCHAR *ptr = PerlIO_get_ptr(n);
207 e->base.ptr = e->base.end = (STDCHAR *) Nullch;
208 (void) PerlIOEncode_get_base(aTHX_ f);
210 e->dataSV = newSV(0);
211 if (SvTYPE(e->dataSV) < SVt_PV) {
212 sv_upgrade(e->dataSV,SVt_PV);
214 if (SvCUR(e->dataSV)) {
215 /* something left over from last time - create a normal
216 SV with new data appended
218 if (use + SvCUR(e->dataSV) > e->base.bufsiz) {
219 use = e->base.bufsiz - SvCUR(e->dataSV);
221 sv_catpvn(e->dataSV,(char*)ptr,use);
224 /* Create a "dummy" SV to represent the available data from layer below */
225 if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) {
226 Safefree(SvPVX(e->dataSV));
228 if (use > e->base.bufsiz) {
229 use = e->base.bufsiz;
231 SvPVX(e->dataSV) = (char *) ptr;
232 SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */
233 SvCUR_set(e->dataSV,use);
234 SvPOK_only(e->dataSV);
236 SvUTF8_off(e->dataSV);
242 if (perl_call_method("decode", G_SCALAR) != 1) {
243 Perl_die(aTHX_ "panic: decode did not return a value");
248 /* Now get translated string (forced to UTF-8) and use as buffer */
250 s = SvPVutf8(uni, len);
251 if (len && !is_utf8_string((U8*)s,len)) {
252 Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s);
257 /* if decode gave us back dataSV then data may vanish when
258 we do ptrcnt adjust - so take our copy now.
259 (The copy is a pain - need a put-it-here option for decode.)
261 sv_setpvn(e->bufsv,s,len);
262 e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv);
263 e->base.end = e->base.ptr + SvCUR(e->bufsv);
264 PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
267 /* Adjust ptr/cnt not taking anything which
268 did not translate - not clear this is a win */
269 /* compute amount we took */
270 use -= SvCUR(e->dataSV);
271 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
272 /* and as we did not take it it isn't pending */
273 SvCUR_set(e->dataSV,0);
275 /* Got nothing - assume partial character so we need some more */
276 /* Make sure e->dataSV is a normal SV before re-filling as
277 buffer alias will change under us
279 s = SvPV(e->dataSV,len);
280 sv_setpvn(e->dataSV,s,len);
281 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
290 PerlIOBase(f)->flags |= PERLIO_F_EOF;
292 PerlIOBase(f)->flags |= PERLIO_F_ERROR;
298 PerlIOEncode_flush(pTHX_ PerlIO * f)
300 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
302 if (e->bufsv && (e->base.ptr > e->base.buf)) {
308 if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) {
309 /* Write case encode the buffer and write() to layer below */
314 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
319 if (perl_call_method("encode", G_SCALAR) != 1) {
320 Perl_die(aTHX_ "panic: encode did not return a value");
326 count = PerlIO_write(PerlIONext(f),s,len);
332 if (PerlIO_flush(PerlIONext(f)) != 0) {
335 if (SvCUR(e->bufsv)) {
336 /* Did not all translate */
337 e->base.ptr = e->base.buf+SvCUR(e->bufsv);
341 else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
343 /* if we have any untranslated stuff then unread that first */
344 if (e->dataSV && SvCUR(e->dataSV)) {
345 s = SvPV(e->dataSV, len);
346 count = PerlIO_unread(PerlIONext(f),s,len);
351 /* See if there is anything left in the buffer */
352 if (e->base.ptr < e->base.end) {
353 /* Bother - have unread data.
354 re-encode and unread() to layer below
358 str = sv_newmortal();
359 sv_upgrade(str, SVt_PV);
360 SvPVX(str) = (char*)e->base.ptr;
362 SvCUR_set(str, e->base.end - e->base.ptr);
370 if (perl_call_method("encode", G_SCALAR) != 1) {
371 Perl_die(aTHX_ "panic: encode did not return a value");
377 count = PerlIO_unread(PerlIONext(f),s,len);
385 e->base.ptr = e->base.end = e->base.buf;
386 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
392 PerlIOEncode_close(pTHX_ PerlIO * f)
394 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
395 IV code = PerlIOBase_close(aTHX_ f);
397 if (e->base.buf && e->base.ptr > e->base.buf) {
398 Perl_croak(aTHX_ "Close with partial character");
400 SvREFCNT_dec(e->bufsv);
406 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
411 PerlIOEncode_tell(pTHX_ PerlIO * f)
413 PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
414 /* Unfortunately the only way to get a postion is to (re-)translate,
415 the UTF8 we have in bufefr and then ask layer below
418 if (b->buf && b->ptr > b->buf) {
419 Perl_croak(aTHX_ "Cannot tell at partial character");
421 return PerlIO_tell(PerlIONext(f));
425 PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o,
426 CLONE_PARAMS * params, int flags)
428 if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) {
429 PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode);
430 PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode);
432 fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params);
438 PerlIO_funcs PerlIO_encode = {
440 sizeof(PerlIOEncode),
441 PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT,
459 PerlIOBase_setlinebuf,
460 PerlIOEncode_get_base,
464 PerlIOBuf_set_ptrcnt,
466 #endif /* encode layer */
469 Encode_XSEncoding(pTHX_ encode_t * enc)
472 HV *stash = gv_stashpv("Encode::XS", TRUE);
473 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
477 while (enc->name[i]) {
478 const char *name = enc->name[i++];
479 XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
482 call_pv("Encode::define_encoding", G_DISCARD);
487 call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
489 /* Exists for breakpointing */
493 encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
497 U8 *s = (U8 *) SvPV(src, slen);
502 /* We allocate slen+1.
503 PerlIO dumps core if this value is smaller than this. */
504 SV *dst = sv_2mortal(newSV(slen+1));
506 U8 *d = (U8 *) SvPVX(dst);
507 STRLEN dlen = SvLEN(dst)-1;
509 while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
510 SvCUR_set(dst, dlen+ddone);
513 #if ENCODE_XS_PROFILE >= 3
514 Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1);
517 if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
523 STRLEN more = 0; /* make sure you initialize! */
527 sleft = tlen - sdone;
528 #if ENCODE_XS_PROFILE >= 2
530 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
531 more, sdone, sleft, SvLEN(dst));
533 if (sdone != 0) { /* has src ever been processed ? */
534 #if ENCODE_XS_USEFP == 2
535 more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
537 #elif ENCODE_XS_USEFP
538 more = (1.0*SvLEN(dst)+1)/sdone * sleft;
540 /* safe until SvLEN(dst) == MAX_INT/16 */
541 more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
544 more += UTF8_MAXLEN; /* insurance policy */
545 #if ENCODE_XS_PROFILE >= 2
547 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
548 more, sdone, sleft, SvLEN(dst));
550 d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
551 /* dst need to grow need MORE bytes! */
552 if (ddone >= SvLEN(dst)) {
553 Perl_croak(aTHX_ "Destination couldn't be grown.");
555 dlen = SvLEN(dst)-ddone-1;
563 if (dir == enc->f_utf8) {
564 if (!check && ckWARN_d(WARN_UTF8)) {
567 utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
569 Perl_warner(aTHX_ packWARN(WARN_UTF8),
571 "}\" does not map to %s", ch,
573 /* FIXME: Skip over the character, copy in replacement and continue
574 * but that is messy so for now just fail.
583 /* UTF-8 is supposed to be "Universal" so should not happen
584 for real characters, but some encodings have non-assigned
585 codes which may occur.
587 Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)",
588 enc->name[0], (U8) s[slen], code);
593 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
594 code, (dir == enc->f_utf8) ? "to" : "from",
599 SvCUR_set(dst, dlen+ddone);
602 sdone = SvCUR(src) - (slen+sdone);
605 /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly
606 SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0
607 type SVs and sv_clear() calls it ...
609 sv_setpvn(src, (char*)s+slen, sdone);
611 Move(s + slen, SvPVX(src), sdone , U8);
614 SvCUR_set(src, sdone);
621 #if ENCODE_XS_PROFILE
622 if (SvCUR(dst) > SvCUR(src)){
624 "SvLEN(dst)=%d, SvCUR(dst)=%d. "
625 "%d bytes unused(%f %%)\n",
626 SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
627 (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
635 MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
644 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
645 ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
650 Method_decode(obj,src,check = FALSE)
656 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
657 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
663 Method_encode(obj,src,check = FALSE)
669 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
670 sv_utf8_upgrade(src);
671 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
675 MODULE = Encode PACKAGE = Encode
680 _bytes_to_utf8(sv, ...)
684 SV * encoding = items == 2 ? ST(1) : Nullsv;
687 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
690 U8* s = (U8*)SvPV(sv, len);
693 converted = bytes_to_utf8(s, &len); /* This allocs */
694 sv_setpvn(sv, (char *)converted, len);
695 SvUTF8_on(sv); /* XXX Should we? */
696 Safefree(converted); /* ... so free it */
704 _utf8_to_bytes(sv, ...)
708 SV * to = items > 1 ? ST(1) : Nullsv;
709 SV * check = items > 2 ? ST(2) : Nullsv;
712 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
715 U8 *s = (U8*)SvPV(sv, len);
719 /* Must do things the slow way */
721 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
724 New(83, dest, len, U8); /* I think */
733 /* Have to do it all ourselves because of error routine,
737 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
738 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
739 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
740 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
741 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
742 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
743 else { ulen = 13; uv = 0; }
745 /* Note change to utf8.c variable naming, for variety */
747 if ((*s & 0xc0) != 0x80)
751 uv = (uv << 6) | (*s++ & 0x3f);
755 call_failure(check, s, dest, src);
756 /* Now what happens? */
762 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
769 is_utf8(sv, check = FALSE)
774 if (SvGMAGICAL(sv)) /* it could be $1, for example */
775 sv = newSVsv(sv); /* GMAGIG will be done */
777 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
780 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
786 SvREFCNT_dec(sv); /* it was a temp copy */
797 SV *rsv = newSViv(SvUTF8(sv));
801 RETVAL = &PL_sv_undef;
813 SV *rsv = newSViv(SvUTF8(sv));
817 RETVAL = &PL_sv_undef;
825 #if defined(USE_PERLIO) && !defined(USE_SFIO)
826 PerlIO_define_layer(aTHX_ &PerlIO_encode);