10 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
11 Perl_croak(aTHX_ "panic_unimplemented"); \
12 return (y)0; /* fool picky compilers */ \
14 UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
18 /* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
24 The layer works by overloading the "fill" and "flush" methods.
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
46 PerlIOBuf base; /* PerlIOBuf stuff */
53 PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len)
55 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
59 code = PerlIOBuf_pushed(f,mode,Nullch,0);
63 XPUSHs(sv_2mortal(newSVpv("Encode",0)));
64 XPUSHs(sv_2mortal(newSVpvn(arg,len)));
66 if (perl_call_method("getEncoding",G_SCALAR) != 1)
76 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
81 PerlIOEncode_popped(PerlIO *f)
83 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
92 SvREFCNT_dec(e->bufsv);
99 PerlIOEncode_get_base(PerlIO *f)
101 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
104 e->base.bufsiz = 1024;
107 e->bufsv = newSV(e->base.bufsiz);
108 sv_setpvn(e->bufsv,"",0);
110 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
112 e->base.ptr = e->base.buf;
114 e->base.end = e->base.buf;
115 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
117 Perl_warn(aTHX_ " ptr %p(%p)%p",
118 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
121 if (SvLEN(e->bufsv) < e->base.bufsiz)
123 SSize_t poff = e->base.ptr - e->base.buf;
124 SSize_t eoff = e->base.end - e->base.buf;
125 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
126 e->base.ptr = e->base.buf + poff;
127 e->base.end = e->base.buf + eoff;
129 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
131 Perl_warn(aTHX_ " ptr %p(%p)%p",
132 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
139 PerlIOEncode_fill(PerlIO *f)
141 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
145 code = PerlIOBuf_fill(f);
151 /* Set SV that is the buffer to be buf..ptr */
152 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
153 SvUTF8_off(e->bufsv);
161 if (perl_call_method("toUnicode",G_SCALAR) != 1)
166 /* Now get translated string (forced to UTF-8) and copy back to buffer
167 don't use sv_setsv as that may "steal" PV from returned temp
168 and so free() our known-large-enough buffer.
169 sv_setpvn() should do but let us do it long hand.
171 s = SvPVutf8(uni,len);
172 if (s != SvPVX(e->bufsv))
174 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
175 Move(s,e->base.buf,len,char);
176 SvCUR_set(e->bufsv,len);
179 e->base.end = e->base.buf+len;
180 e->base.ptr = e->base.buf;
188 PerlIOEncode_flush(PerlIO *f)
190 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
193 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
200 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
202 /* This is really just a flag to see if we took all the data, if
203 we did PerlIOBase_flush avoids a seek to lower layer.
204 Need to revisit if we start getting clever with unreads or seeks-in-buffer
206 left = e->base.end - e->base.ptr;
212 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
217 if (perl_call_method("fromUnicode",G_SCALAR) != 1)
223 if (s != SvPVX(e->bufsv))
225 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
226 Move(s,e->base.buf,len,char);
227 SvCUR_set(e->bufsv,len);
229 SvUTF8_off(e->bufsv);
230 e->base.ptr = e->base.buf+len;
231 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
232 e->base.end = e->base.ptr + left;
235 if (PerlIOBuf_flush(f) != 0)
242 PerlIOEncode_close(PerlIO *f)
244 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
245 IV code = PerlIOBase_close(f);
249 SvREFCNT_dec(e->bufsv);
255 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
260 PerlIOEncode_tell(PerlIO *f)
262 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
263 /* Unfortunately the only way to get a postion is to back-translate,
264 the UTF8-bytes we have buf..ptr and adjust accordingly.
265 But we will try and save any unread data in case stream
268 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
270 Size_t count = b->end - b->ptr;
271 PerlIO_push(f,&PerlIO_pending,"r",Nullch,0);
272 /* Save what we have left to read */
273 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
274 PerlIO_unread(f,b->ptr,count);
275 /* There isn't any unread data - we just saved it - so avoid the lower seek */
277 /* Flush ourselves - now one layer down,
278 this does the back translate and adjusts position
280 PerlIO_flush(PerlIONext(f));
281 /* Set position of the saved data */
282 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
291 PerlIO_funcs PerlIO_encode = {
293 sizeof(PerlIOEncode),
312 PerlIOBuf_setlinebuf,
313 PerlIOEncode_get_base,
317 PerlIOBuf_set_ptrcnt,
322 Encode_Define(pTHX_ encode_t *enc)
324 HV *hash = get_hv("Encode::encoding",GV_ADD|GV_ADDMULTI);
325 HV *stash = gv_stashpv("Encode::XS", TRUE);
326 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
327 hv_store(hash,enc->name,strlen(enc->name),sv,0);
330 void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
333 encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
336 U8 *s = (U8 *) SvPV(src,slen);
337 SV *dst = sv_2mortal(newSV(2*slen+1));
340 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
341 STRLEN dlen = SvLEN(dst);
343 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
348 if (code == ENCODE_FALLBACK)
355 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
356 if (need <= SvLEN(dst))
358 d = (U8 *) SvGROW(dst, need);
365 if (dir == enc->f_utf8)
367 if (!check && ckWARN_d(WARN_UTF8))
370 UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0);
371 Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%x}\" does not map to %s", ch, enc->name);
372 /* FIXME: Skip over the character, copy in replacement and continue
373 * but that is messy so for now just fail.
384 /* UTF-8 is supposed to be "Universal" so should not happen */
385 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
386 enc->name, (SvCUR(src)-slen),s+slen);
391 if (!check && ckWARN_d(WARN_UTF8))
393 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
394 (dir == enc->f_utf8) ? "UTF-8" : enc->name);
399 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
400 code, (dir == enc->f_utf8) ? "to" : "from",enc->name);
408 if (slen < SvCUR(src))
410 Move(s+slen,s,SvCUR(src)-slen,U8);
412 SvCUR_set(src,SvCUR(src)-slen);
418 MODULE = Encode PACKAGE = Encode PREFIX = sv_
426 char *s = SvPV(sv,len);
427 if (!SvUTF8(sv) || is_utf8_string(s,len))
446 sv_utf8_downgrade(sv,failok=0)
450 MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_
455 Encode_toUnicode(obj,src,check = 0)
461 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
462 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
468 Encode_fromUnicode(obj,src,check = 0)
474 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
475 sv_utf8_upgrade(src);
476 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
480 MODULE = Encode PACKAGE = Encode
485 _bytes_to_utf8(sv, ...)
489 SV * encoding = items == 2 ? ST(1) : Nullsv;
492 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
495 U8* s = (U8*)SvPV(sv, len);
498 converted = bytes_to_utf8(s, &len); /* This allocs */
499 sv_setpvn(sv, (char *)converted, len);
500 SvUTF8_on(sv); /* XXX Should we? */
501 Safefree(converted); /* ... so free it */
509 _utf8_to_bytes(sv, ...)
513 SV * to = items > 1 ? ST(1) : Nullsv;
514 SV * check = items > 2 ? ST(2) : Nullsv;
517 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
520 U8 *s = (U8*)SvPV(sv, len);
523 /* Must do things the slow way */
525 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
528 New(83, dest, len, U8); /* I think */
537 /* Have to do it all ourselves because of error routine,
541 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
542 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
543 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
544 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
545 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
546 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
547 else { ulen = 13; uv = 0; }
549 /* Note change to utf8.c variable naming, for variety */
551 if ((*s & 0xc0) != 0x80)
555 uv = (uv << 6) | (*s++ & 0x3f);
559 call_failure(check, s, dest, src);
560 /* Now what happens? */
566 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
573 _chars_to_utf8(sv, from, ...)
578 SV * check = items == 3 ? ST(2) : Nullsv;
579 RETVAL = &PL_sv_undef;
585 _utf8_to_chars(sv, to, ...)
590 SV * check = items == 3 ? ST(2) : Nullsv;
591 RETVAL = &PL_sv_undef;
597 _utf8_to_chars_check(sv, ...)
601 SV * check = items == 2 ? ST(1) : Nullsv;
602 RETVAL = &PL_sv_undef;
608 _bytes_to_chars(sv, from, ...)
613 SV * check = items == 3 ? ST(2) : Nullsv;
614 RETVAL = &PL_sv_undef;
620 _chars_to_bytes(sv, to, ...)
625 SV * check = items == 3 ? ST(2) : Nullsv;
626 RETVAL = &PL_sv_undef;
632 _from_to(sv, from, to, ...)
638 SV * check = items == 4 ? ST(3) : Nullsv;
639 RETVAL = &PL_sv_undef;
649 SV * check = items == 2 ? ST(1) : Nullsv;
651 RETVAL = SvUTF8(sv) ? 1 : 0;
654 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
669 SV *rsv = newSViv(SvUTF8(sv));
673 RETVAL = &PL_sv_undef;
685 SV *rsv = newSViv(SvUTF8(sv));
689 RETVAL = &PL_sv_undef;
696 _utf_to_utf(sv, from, to, ...)
702 SV * check = items == 4 ? ST(3) : Nullsv;
703 RETVAL = &PL_sv_undef;
711 PerlIO_define_layer(&PerlIO_encode);
713 #include "iso8859.def"
714 #include "EBCDIC.def"
715 #include "Symbols.def"