10 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
11 Perl_croak(aTHX_ "panic_unimplemented"); \
12 return (y)0; /* fool picky compilers */ \
14 UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
18 /* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
24 The layer works by overloading the "fill" and "flush" methods.
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
46 PerlIOBuf base; /* PerlIOBuf stuff */
53 PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len)
55 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
59 code = PerlIOBuf_pushed(f,mode,Nullch,0);
63 XPUSHs(sv_2mortal(newSVpv("Encode",0)));
64 XPUSHs(sv_2mortal(newSVpvn(arg,len)));
66 if (perl_call_method("getEncoding",G_SCALAR) != 1)
68 /* should never happen */
69 Perl_die(aTHX_ "Encode::getEncoding did not return a value");
79 Perl_warner(aTHX_ WARN_IO, "Cannot find encoding \"%.*s\"", (int) len, arg);
85 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
90 PerlIOEncode_popped(PerlIO *f)
92 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
101 SvREFCNT_dec(e->bufsv);
108 PerlIOEncode_get_base(PerlIO *f)
110 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
113 e->base.bufsiz = 1024;
116 e->bufsv = newSV(e->base.bufsiz);
117 sv_setpvn(e->bufsv,"",0);
119 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
121 e->base.ptr = e->base.buf;
123 e->base.end = e->base.buf;
124 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
126 Perl_warn(aTHX_ " ptr %p(%p)%p",
127 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
130 if (SvLEN(e->bufsv) < e->base.bufsiz)
132 SSize_t poff = e->base.ptr - e->base.buf;
133 SSize_t eoff = e->base.end - e->base.buf;
134 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
135 e->base.ptr = e->base.buf + poff;
136 e->base.end = e->base.buf + eoff;
138 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
140 Perl_warn(aTHX_ " ptr %p(%p)%p",
141 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
148 PerlIOEncode_fill(PerlIO *f)
150 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
154 code = PerlIOBuf_fill(f);
160 /* Set SV that is the buffer to be buf..ptr */
161 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
162 SvUTF8_off(e->bufsv);
170 if (perl_call_method("toUnicode",G_SCALAR) != 1)
175 /* Now get translated string (forced to UTF-8) and copy back to buffer
176 don't use sv_setsv as that may "steal" PV from returned temp
177 and so free() our known-large-enough buffer.
178 sv_setpvn() should do but let us do it long hand.
180 s = SvPVutf8(uni,len);
181 if (s != SvPVX(e->bufsv))
183 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
184 Move(s,e->base.buf,len,char);
185 SvCUR_set(e->bufsv,len);
188 e->base.end = e->base.buf+len;
189 e->base.ptr = e->base.buf;
197 PerlIOEncode_flush(PerlIO *f)
199 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
202 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
209 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
211 /* This is really just a flag to see if we took all the data, if
212 we did PerlIOBase_flush avoids a seek to lower layer.
213 Need to revisit if we start getting clever with unreads or seeks-in-buffer
215 left = e->base.end - e->base.ptr;
221 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
226 if (perl_call_method("fromUnicode",G_SCALAR) != 1)
232 if (s != SvPVX(e->bufsv))
234 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
235 Move(s,e->base.buf,len,char);
236 SvCUR_set(e->bufsv,len);
238 SvUTF8_off(e->bufsv);
239 e->base.ptr = e->base.buf+len;
240 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
241 e->base.end = e->base.ptr + left;
244 if (PerlIOBuf_flush(f) != 0)
251 PerlIOEncode_close(PerlIO *f)
253 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
254 IV code = PerlIOBase_close(f);
258 SvREFCNT_dec(e->bufsv);
264 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
269 PerlIOEncode_tell(PerlIO *f)
271 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
272 /* Unfortunately the only way to get a postion is to back-translate,
273 the UTF8-bytes we have buf..ptr and adjust accordingly.
274 But we will try and save any unread data in case stream
277 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
279 Size_t count = b->end - b->ptr;
280 PerlIO_push(f,&PerlIO_pending,"r",Nullch,0);
281 /* Save what we have left to read */
282 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
283 PerlIO_unread(f,b->ptr,count);
284 /* There isn't any unread data - we just saved it - so avoid the lower seek */
286 /* Flush ourselves - now one layer down,
287 this does the back translate and adjusts position
289 PerlIO_flush(PerlIONext(f));
290 /* Set position of the saved data */
291 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
300 PerlIO_funcs PerlIO_encode = {
302 sizeof(PerlIOEncode),
321 PerlIOBuf_setlinebuf,
322 PerlIOEncode_get_base,
326 PerlIOBuf_set_ptrcnt,
331 Encode_Define(pTHX_ encode_t *enc)
333 HV *hash = get_hv("Encode::encoding",GV_ADD|GV_ADDMULTI);
334 HV *stash = gv_stashpv("Encode::XS", TRUE);
335 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
339 const char *name = enc->name[i++];
340 hv_store(hash,name,strlen(name),SvREFCNT_inc(sv),0);
345 void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
348 encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
351 U8 *s = (U8 *) SvPV(src,slen);
352 SV *dst = sv_2mortal(newSV(2*slen+1));
355 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
356 STRLEN dlen = SvLEN(dst);
358 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
363 if (code == ENCODE_FALLBACK)
370 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
371 if (need <= SvLEN(dst))
373 d = (U8 *) SvGROW(dst, need);
380 if (dir == enc->f_utf8)
382 if (!check && ckWARN_d(WARN_UTF8))
385 UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0);
386 Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%"UVxf"}\" does not map to %s", ch, enc->name[0]);
387 /* FIXME: Skip over the character, copy in replacement and continue
388 * but that is messy so for now just fail.
399 /* UTF-8 is supposed to be "Universal" so should not happen */
400 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
401 enc->name[0], (int)(SvCUR(src)-slen),s+slen);
406 if (!check && ckWARN_d(WARN_UTF8))
408 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
409 (dir == enc->f_utf8) ? "UTF-8" : enc->name[0]);
414 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
415 code, (dir == enc->f_utf8) ? "to" : "from",enc->name[0]);
423 if (slen < SvCUR(src))
425 Move(s+slen,s,SvCUR(src)-slen,U8);
427 SvCUR_set(src,SvCUR(src)-slen);
433 MODULE = Encode PACKAGE = Encode PREFIX = sv_
441 char *s = SvPV(sv,len);
442 if (!SvUTF8(sv) || is_utf8_string((U8*)s,len))
461 sv_utf8_downgrade(sv,failok=0)
465 MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_
470 Encode_toUnicode(obj,src,check = 0)
476 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
477 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
483 Encode_fromUnicode(obj,src,check = 0)
489 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
490 sv_utf8_upgrade(src);
491 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
495 MODULE = Encode PACKAGE = Encode
500 _bytes_to_utf8(sv, ...)
504 SV * encoding = items == 2 ? ST(1) : Nullsv;
507 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
510 U8* s = (U8*)SvPV(sv, len);
513 converted = bytes_to_utf8(s, &len); /* This allocs */
514 sv_setpvn(sv, (char *)converted, len);
515 SvUTF8_on(sv); /* XXX Should we? */
516 Safefree(converted); /* ... so free it */
524 _utf8_to_bytes(sv, ...)
528 SV * to = items > 1 ? ST(1) : Nullsv;
529 SV * check = items > 2 ? ST(2) : Nullsv;
532 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
535 U8 *s = (U8*)SvPV(sv, len);
538 /* Must do things the slow way */
540 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
543 New(83, dest, len, U8); /* I think */
552 /* Have to do it all ourselves because of error routine,
556 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
557 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
558 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
559 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
560 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
561 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
562 else { ulen = 13; uv = 0; }
564 /* Note change to utf8.c variable naming, for variety */
566 if ((*s & 0xc0) != 0x80)
570 uv = (uv << 6) | (*s++ & 0x3f);
574 call_failure(check, s, dest, src);
575 /* Now what happens? */
581 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
588 _chars_to_utf8(sv, from, ...)
593 SV * check = items == 3 ? ST(2) : Nullsv;
594 RETVAL = &PL_sv_undef;
600 _utf8_to_chars(sv, to, ...)
605 SV * check = items == 3 ? ST(2) : Nullsv;
606 RETVAL = &PL_sv_undef;
612 _utf8_to_chars_check(sv, ...)
616 SV * check = items == 2 ? ST(1) : Nullsv;
617 RETVAL = &PL_sv_undef;
623 _bytes_to_chars(sv, from, ...)
628 SV * check = items == 3 ? ST(2) : Nullsv;
629 RETVAL = &PL_sv_undef;
635 _chars_to_bytes(sv, to, ...)
640 SV * check = items == 3 ? ST(2) : Nullsv;
641 RETVAL = &PL_sv_undef;
647 _from_to(sv, from, to, ...)
653 SV * check = items == 4 ? ST(3) : Nullsv;
654 RETVAL = &PL_sv_undef;
664 SV * check = items == 2 ? ST(1) : Nullsv;
666 RETVAL = SvUTF8(sv) ? 1 : 0;
669 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
684 SV *rsv = newSViv(SvUTF8(sv));
688 RETVAL = &PL_sv_undef;
700 SV *rsv = newSViv(SvUTF8(sv));
704 RETVAL = &PL_sv_undef;
711 _utf_to_utf(sv, from, to, ...)
717 SV * check = items == 4 ? ST(3) : Nullsv;
718 RETVAL = &PL_sv_undef;
726 PerlIO_define_layer(&PerlIO_encode);
728 #include "iso8859.def"
729 #include "EBCDIC.def"
730 #include "Symbols.def"