1 #define PERL_NO_GET_CONTEXT
13 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
14 Perl_croak(aTHX_ "panic_unimplemented"); \
15 return (y)0; /* fool picky compilers */ \
17 UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
18 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
19 #if defined(USE_PERLIO) && !defined(USE_SFIO)
20 /* Define an encoding "layer" in the perliol.h sense.
21 The layer defined here "inherits" in an object-oriented sense from the
22 "perlio" layer with its PerlIOBuf_* "methods".
23 The implementation is particularly efficient as until Encode settles down
24 there is no point in tryint to tune it.
26 The layer works by overloading the "fill" and "flush" methods.
28 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
29 to convert the encoded data to UTF-8 form, then copies it back to the
30 buffer. The "base class's" read methods then see the UTF-8 data.
32 "flush" transforms the UTF-8 data deposited by the "base class's write
33 method in the buffer back into the encoded form using the encode OO perl API,
34 then copies data back into the buffer and calls "SUPER::flush.
36 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
37 so that the the base class's "flush" sees the correct number of encoded chars
38 for positioning the seek pointer. (This double translation is the worst performance
39 issue - particularly with all-perl encode engine.)
44 PerlIOBuf base; /* PerlIOBuf stuff */
45 SV *bufsv; /* buffer seen by layers above */
46 SV *dataSV; /* data we have read from layer below */
47 SV *enc; /* the encoding object */
51 PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags)
53 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
54 SV *sv = &PL_sv_undef;
62 if (perl_call_method("name", G_SCALAR) == 1) {
72 PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg)
74 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
77 code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv);
83 if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) {
84 /* should never happen */
85 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
94 Perl_warner(aTHX_ WARN_IO, "Cannot find encoding \"%" SVf "\"",
100 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
108 PerlIOEncode_popped(pTHX_ PerlIO * f)
110 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
112 SvREFCNT_dec(e->enc);
116 SvREFCNT_dec(e->bufsv);
120 SvREFCNT_dec(e->dataSV);
127 PerlIOEncode_get_base(pTHX_ PerlIO * f)
129 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
131 e->base.bufsiz = 1024;
133 e->bufsv = newSV(e->base.bufsiz);
134 sv_setpvn(e->bufsv, "", 0);
136 e->base.buf = (STDCHAR *) SvPVX(e->bufsv);
138 e->base.ptr = e->base.buf;
140 e->base.end = e->base.buf;
141 if (e->base.ptr < e->base.buf
142 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
143 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
144 e->base.buf + SvLEN(e->bufsv));
147 if (SvLEN(e->bufsv) < e->base.bufsiz) {
148 SSize_t poff = e->base.ptr - e->base.buf;
149 SSize_t eoff = e->base.end - e->base.buf;
150 e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz);
151 e->base.ptr = e->base.buf + poff;
152 e->base.end = e->base.buf + eoff;
154 if (e->base.ptr < e->base.buf
155 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
156 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
157 e->base.buf + SvLEN(e->bufsv));
164 PerlIOEncode_fill(pTHX_ PerlIO * f)
166 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
171 if (PerlIO_flush(f) != 0)
174 if (!PerlIO_fast_gets(n)) {
175 /* Things get too messy if we don't have a buffer layer
176 push a :perlio to do the job */
178 n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv);
180 Perl_die(aTHX_ "panic: cannot push :perlio for %p",f);
186 avail = PerlIO_get_cnt(n);
188 avail = PerlIO_fill(n);
190 avail = PerlIO_get_cnt(n);
193 if (!PerlIO_error(n) && PerlIO_eof(n))
198 STDCHAR *ptr = PerlIO_get_ptr(n);
203 e->base.ptr = e->base.end = (STDCHAR *) Nullch;
204 (void) PerlIOEncode_get_base(aTHX_ f);
206 e->dataSV = newSV(0);
207 if (SvTYPE(e->dataSV) < SVt_PV) {
208 sv_upgrade(e->dataSV,SVt_PV);
210 if (SvCUR(e->dataSV)) {
211 /* something left over from last time - create a normal
212 SV with new data appended
214 if (use + SvCUR(e->dataSV) > e->base.bufsiz) {
215 use = e->base.bufsiz - SvCUR(e->dataSV);
217 sv_catpvn(e->dataSV,(char*)ptr,use);
220 /* Create a "dummy" SV to represent the available data from layer below */
221 if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) {
222 Safefree(SvPVX(e->dataSV));
224 if (use > e->base.bufsiz) {
225 use = e->base.bufsiz;
227 SvPVX(e->dataSV) = (char *) ptr;
228 SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */
229 SvCUR_set(e->dataSV,use);
230 SvPOK_only(e->dataSV);
232 SvUTF8_off(e->dataSV);
238 if (perl_call_method("decode", G_SCALAR) != 1) {
239 Perl_die(aTHX_ "panic: decode did not return a value");
244 /* Now get translated string (forced to UTF-8) and use as buffer */
246 s = SvPVutf8(uni, len);
247 if (len && !is_utf8_string((U8*)s,len)) {
248 Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s);
253 /* if decode gave us back dataSV then data may vanish when
254 we do ptrcnt adjust - so take our copy now.
255 (The copy is a pain - need a put-it-here option for decode.)
257 sv_setpvn(e->bufsv,s,len);
258 e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv);
259 e->base.end = e->base.ptr + SvCUR(e->bufsv);
260 PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
263 /* Adjust ptr/cnt not taking anything which
264 did not translate - not clear this is a win */
265 /* compute amount we took */
266 use -= SvCUR(e->dataSV);
267 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
268 /* and as we did not take it it isn't pending */
269 SvCUR_set(e->dataSV,0);
271 /* Got nothing - assume partial character so we need some more */
272 /* Make sure e->dataSV is a normal SV before re-filling as
273 buffer alias will change under us
275 s = SvPV(e->dataSV,len);
276 sv_setpvn(e->dataSV,s,len);
277 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
286 PerlIOBase(f)->flags |= PERLIO_F_EOF;
288 PerlIOBase(f)->flags |= PERLIO_F_ERROR;
294 PerlIOEncode_flush(pTHX_ PerlIO * f)
296 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
298 if (e->bufsv && (e->base.ptr > e->base.buf)) {
304 if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) {
305 /* Write case encode the buffer and write() to layer below */
310 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
315 if (perl_call_method("encode", G_SCALAR) != 1) {
316 Perl_die(aTHX_ "panic: encode did not return a value");
322 count = PerlIO_write(PerlIONext(f),s,len);
328 if (PerlIO_flush(PerlIONext(f)) != 0) {
331 if (SvCUR(e->bufsv)) {
332 /* Did not all translate */
333 e->base.ptr = e->base.buf+SvCUR(e->bufsv);
337 else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
339 /* if we have any untranslated stuff then unread that first */
340 if (e->dataSV && SvCUR(e->dataSV)) {
341 s = SvPV(e->dataSV, len);
342 count = PerlIO_unread(PerlIONext(f),s,len);
347 /* See if there is anything left in the buffer */
348 if (e->base.ptr < e->base.end) {
349 /* Bother - have unread data.
350 re-encode and unread() to layer below
354 str = sv_newmortal();
355 sv_upgrade(str, SVt_PV);
356 SvPVX(str) = (char*)e->base.ptr;
358 SvCUR_set(str, e->base.end - e->base.ptr);
366 if (perl_call_method("encode", G_SCALAR) != 1) {
367 Perl_die(aTHX_ "panic: encode did not return a value");
373 count = PerlIO_unread(PerlIONext(f),s,len);
381 e->base.ptr = e->base.end = e->base.buf;
382 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
388 PerlIOEncode_close(pTHX_ PerlIO * f)
390 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
391 IV code = PerlIOBase_close(aTHX_ f);
393 if (e->base.buf && e->base.ptr > e->base.buf) {
394 Perl_croak(aTHX_ "Close with partial character");
396 SvREFCNT_dec(e->bufsv);
402 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
407 PerlIOEncode_tell(pTHX_ PerlIO * f)
409 PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
410 /* Unfortunately the only way to get a postion is to (re-)translate,
411 the UTF8 we have in bufefr and then ask layer below
414 if (b->buf && b->ptr > b->buf) {
415 Perl_croak(aTHX_ "Cannot tell at partial character");
417 return PerlIO_tell(PerlIONext(f));
421 PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o,
422 CLONE_PARAMS * params, int flags)
424 if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) {
425 PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode);
426 PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode);
428 fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params);
434 PerlIO_funcs PerlIO_encode = {
436 sizeof(PerlIOEncode),
437 PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT,
455 PerlIOBase_setlinebuf,
456 PerlIOEncode_get_base,
460 PerlIOBuf_set_ptrcnt,
462 #endif /* encode layer */
465 Encode_XSEncoding(pTHX_ encode_t * enc)
468 HV *stash = gv_stashpv("Encode::XS", TRUE);
469 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
473 while (enc->name[i]) {
474 const char *name = enc->name[i++];
475 XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
478 call_pv("Encode::define_encoding", G_DISCARD);
483 call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
485 /* Exists for breakpointing */
489 encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
493 U8 *s = (U8 *) SvPV(src, slen);
497 SV *dst = sv_2mortal(newSV(slen+1));
499 U8 *d = (U8 *) SvPVX(dst);
500 STRLEN dlen = SvLEN(dst)-1;
502 while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
503 SvCUR_set(dst, dlen+ddone);
507 Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1);
510 if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
520 need = (tlen*SvLEN(dst)+sdone-1)/sdone+UTF8_MAXLEN;
523 need = SvLEN(dst) + UTF8_MAXLEN;
526 d = (U8 *) SvGROW(dst, need);
527 if (ddone >= SvLEN(dst)) {
528 Perl_croak(aTHX_ "Destination couldn't be grown.");
530 dlen = SvLEN(dst)-ddone-1;
538 if (dir == enc->f_utf8) {
539 if (!check && ckWARN_d(WARN_UTF8)) {
542 utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
544 Perl_warner(aTHX_ WARN_UTF8,
546 "}\" does not map to %s", ch,
548 /* FIXME: Skip over the character, copy in replacement and continue
549 * but that is messy so for now just fail.
558 /* UTF-8 is supposed to be "Universal" so should not happen
559 for real characters, but some encodings have non-assigned
560 codes which may occur.
562 Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode",
563 enc->name[0], (U8) s[slen]);
568 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
569 code, (dir == enc->f_utf8) ? "to" : "from",
574 SvCUR_set(dst, dlen+ddone);
577 sdone = SvCUR(src) - (slen+sdone);
579 Move(s + slen, SvPVX(src), sdone , U8);
581 SvCUR_set(src, sdone);
592 MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
601 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
602 ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
607 Method_decode(obj,src,check = FALSE)
613 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
614 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
620 Method_encode(obj,src,check = FALSE)
626 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
627 sv_utf8_upgrade(src);
628 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
632 MODULE = Encode PACKAGE = Encode
637 _bytes_to_utf8(sv, ...)
641 SV * encoding = items == 2 ? ST(1) : Nullsv;
644 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
647 U8* s = (U8*)SvPV(sv, len);
650 converted = bytes_to_utf8(s, &len); /* This allocs */
651 sv_setpvn(sv, (char *)converted, len);
652 SvUTF8_on(sv); /* XXX Should we? */
653 Safefree(converted); /* ... so free it */
661 _utf8_to_bytes(sv, ...)
665 SV * to = items > 1 ? ST(1) : Nullsv;
666 SV * check = items > 2 ? ST(2) : Nullsv;
669 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
672 U8 *s = (U8*)SvPV(sv, len);
676 /* Must do things the slow way */
678 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
681 New(83, dest, len, U8); /* I think */
690 /* Have to do it all ourselves because of error routine,
694 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
695 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
696 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
697 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
698 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
699 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
700 else { ulen = 13; uv = 0; }
702 /* Note change to utf8.c variable naming, for variety */
704 if ((*s & 0xc0) != 0x80)
708 uv = (uv << 6) | (*s++ & 0x3f);
712 call_failure(check, s, dest, src);
713 /* Now what happens? */
719 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
726 is_utf8(sv, check = FALSE)
731 if (SvGMAGICAL(sv)) /* it could be $1, for example */
732 sv = newSVsv(sv); /* GMAGIG will be done */
734 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
737 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
743 SvREFCNT_dec(sv); /* it was a temp copy */
754 SV *rsv = newSViv(SvUTF8(sv));
758 RETVAL = &PL_sv_undef;
770 SV *rsv = newSViv(SvUTF8(sv));
774 RETVAL = &PL_sv_undef;
782 #if defined(USE_PERLIO) && !defined(USE_SFIO)
783 PerlIO_define_layer(aTHX_ &PerlIO_encode);
785 #include "8859_def.h"
786 #include "EBCDIC_def.h"
787 #include "Symbols_def.h"