X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2FEncode.xs;h=36d5f3dac63d02ac8ebcb76536d4a97155b6e020;hb=21d92c23f49d139d8bddefbab6f984eb17e12d43;hp=e4f7b10cbd633e9356087cceb5a7c74c69b46b4e;hpb=f54fca96add61189e2fde1d41312bc8885ac2d97;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index e4f7b10..36d5f3d 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,641 +1,488 @@ -#define PERL_NO_GET_CONTEXT +/* + $Id: Encode.xs,v 1.55 2003/02/28 01:40:27 dankogai Exp $ + */ +#define PERL_NO_GET_CONTEXT #include "EXTERN.h" #include "perl.h" #include "XSUB.h" #define U8 U8 #include "encode.h" -/* #include "8859.h" */ -/* #include "EBCDIC.h" */ -/* #include "Symbols.h" */ -#include "def_t.h" + +# define PERLIO_MODNAME "PerlIO::encoding" +# define PERLIO_FILENAME "PerlIO/encoding.pm" + +/* set 1 or more to profile. t/encoding.t dumps core because of + Perl_warner and PerlIO don't work well */ +#define ENCODE_XS_PROFILE 0 + +/* set 0 to disable floating point to calculate buffer size for + encode_method(). 1 is recommended. 2 restores NI-S original */ +#define ENCODE_XS_USEFP 1 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ Perl_croak(aTHX_ "panic_unimplemented"); \ return (y)0; /* fool picky compilers */ \ } -UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) - UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) -#if defined(USE_PERLIO) && !defined(USE_SFIO) -/* Define an encoding "layer" in the perliol.h sense. - The layer defined here "inherits" in an object-oriented sense from the - "perlio" layer with its PerlIOBuf_* "methods". - The implementation is particularly efficient as until Encode settles down - there is no point in tryint to tune it. - - The layer works by overloading the "fill" and "flush" methods. - - "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API - to convert the encoded data to UTF-8 form, then copies it back to the - buffer. The "base class's" read methods then see the UTF-8 data. - - "flush" transforms the UTF-8 data deposited by the "base class's write - method in the buffer back into the encoded form using the encode OO perl API, - then copies data back into the buffer and calls "SUPER::flush. - - Note that "flush" is _also_ called for read mode - we still do the (back)-translate - so that the the base class's "flush" sees the correct number of encoded chars - for positioning the seek pointer. (This double translation is the worst performance - issue - particularly with all-perl encode engine.) - -*/ -#include "perliol.h" -typedef struct { - PerlIOBuf base; /* PerlIOBuf stuff */ - SV *bufsv; /* buffer seen by layers above */ - SV *dataSV; /* data we have read from layer below */ - SV *enc; /* the encoding object */ -} PerlIOEncode; +/**/ -SV * -PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - SV *sv = &PL_sv_undef; - if (e->enc) { - dSP; - ENTER; - SAVETMPS; - PUSHMARK(sp); - XPUSHs(e->enc); - PUTBACK; - if (perl_call_method("name", G_SCALAR) == 1) { - SPAGAIN; - sv = newSVsv(POPs); - PUTBACK; - } - } - return sv; -} +UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) +UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) -IV -PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg) +void +Encode_XSEncoding(pTHX_ encode_t * enc) { - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); dSP; - IV code; - code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv); - ENTER; - SAVETMPS; + HV *stash = gv_stashpv("Encode::XS", TRUE); + SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash); + int i = 0; PUSHMARK(sp); - XPUSHs(arg); - PUTBACK; - if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) { - /* should never happen */ - Perl_die(aTHX_ "Encode::find_encoding did not return a value"); - return -1; + XPUSHs(sv); + while (enc->name[i]) { + const char *name = enc->name[i++]; + XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); } - SPAGAIN; - e->enc = POPs; PUTBACK; - if (!SvROK(e->enc)) { - e->enc = Nullsv; - errno = EINVAL; - Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"", - arg); - code = -1; - } - else { - SvREFCNT_inc(e->enc); - PerlIOBase(f)->flags |= PERLIO_F_UTF8; - } - FREETMPS; - LEAVE; - return code; + call_pv("Encode::define_encoding", G_DISCARD); + SvREFCNT_dec(sv); } -IV -PerlIOEncode_popped(pTHX_ PerlIO * f) +void +call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) { - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - if (e->enc) { - SvREFCNT_dec(e->enc); - e->enc = Nullsv; - } - if (e->bufsv) { - SvREFCNT_dec(e->bufsv); - e->bufsv = Nullsv; - } - if (e->dataSV) { - SvREFCNT_dec(e->dataSV); - e->dataSV = Nullsv; - } - return 0; + /* Exists for breakpointing */ } -STDCHAR * -PerlIOEncode_get_base(pTHX_ PerlIO * f) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - if (!e->base.bufsiz) - e->base.bufsiz = 1024; - if (!e->bufsv) { - e->bufsv = newSV(e->base.bufsiz); - sv_setpvn(e->bufsv, "", 0); - } - e->base.buf = (STDCHAR *) SvPVX(e->bufsv); - if (!e->base.ptr) - e->base.ptr = e->base.buf; - if (!e->base.end) - e->base.end = e->base.buf; - if (e->base.ptr < e->base.buf - || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) { - Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr, - e->base.buf + SvLEN(e->bufsv)); - abort(); - } - if (SvLEN(e->bufsv) < e->base.bufsiz) { - SSize_t poff = e->base.ptr - e->base.buf; - SSize_t eoff = e->base.end - e->base.buf; - e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz); - e->base.ptr = e->base.buf + poff; - e->base.end = e->base.buf + eoff; - } - if (e->base.ptr < e->base.buf - || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) { - Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr, - e->base.buf + SvLEN(e->bufsv)); - abort(); - } - return e->base.buf; -} -IV -PerlIOEncode_fill(pTHX_ PerlIO * f) +#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" +#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" + +static SV * +encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, + int check, STRLEN * offset, SV * term, int * retcode) { - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - dSP; - IV code = 0; - PerlIO *n; - SSize_t avail; - if (PerlIO_flush(f) != 0) - return -1; - n = PerlIONext(f); - if (!PerlIO_fast_gets(n)) { - /* Things get too messy if we don't have a buffer layer - push a :perlio to do the job */ - char mode[8]; - n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv); - if (!n) { - Perl_die(aTHX_ "panic: cannot push :perlio for %p",f); - } + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + STRLEN tlen = slen; + STRLEN ddone = 0; + STRLEN sdone = 0; + + /* We allocate slen+1. + PerlIO dumps core if this value is smaller than this. */ + SV *dst = sv_2mortal(newSV(slen+1)); + U8 *d = (U8 *)SvPVX(dst); + STRLEN dlen = SvLEN(dst)-1; + int code = 0; + STRLEN trmlen = 0; + U8 *trm = term ? (U8*) SvPV(term, trmlen) : NULL; + + if (offset) { + s += *offset; + if (slen > *offset){ /* safeguard against slen overflow */ + slen -= *offset; + }else{ + slen = 0; + } + tlen = slen; } - ENTER; - SAVETMPS; - retry: - avail = PerlIO_get_cnt(n); - if (avail <= 0) { - avail = PerlIO_fill(n); - if (avail == 0) { - avail = PerlIO_get_cnt(n); - } - else { - if (!PerlIO_error(n) && PerlIO_eof(n)) - avail = 0; - } + + if (slen == 0){ + SvCUR_set(dst, 0); + SvPOK_only(dst); + goto ENCODE_END; } - if (avail > 0) { - STDCHAR *ptr = PerlIO_get_ptr(n); - SSize_t use = avail; - SV *uni; - char *s; - STRLEN len = 0; - e->base.ptr = e->base.end = (STDCHAR *) Nullch; - (void) PerlIOEncode_get_base(aTHX_ f); - if (!e->dataSV) - e->dataSV = newSV(0); - if (SvTYPE(e->dataSV) < SVt_PV) { - sv_upgrade(e->dataSV,SVt_PV); - } - if (SvCUR(e->dataSV)) { - /* something left over from last time - create a normal - SV with new data appended - */ - if (use + SvCUR(e->dataSV) > e->base.bufsiz) { - use = e->base.bufsiz - SvCUR(e->dataSV); - } - sv_catpvn(e->dataSV,(char*)ptr,use); + + while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check, + trm, trmlen)) ) + { + SvCUR_set(dst, dlen+ddone); + SvPOK_only(dst); + + if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || + code == ENCODE_FOUND_TERM) { + break; } - else { - /* Create a "dummy" SV to represent the available data from layer below */ - if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) { - Safefree(SvPVX(e->dataSV)); + switch (code) { + case ENCODE_NOSPACE: + { + STRLEN more = 0; /* make sure you initialize! */ + STRLEN sleft; + sdone += slen; + ddone += dlen; + sleft = tlen - sdone; +#if ENCODE_XS_PROFILE >= 2 + Perl_warn(aTHX_ + "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", + more, sdone, sleft, SvLEN(dst)); +#endif + if (sdone != 0) { /* has src ever been processed ? */ +#if ENCODE_XS_USEFP == 2 + more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone + - SvLEN(dst); +#elif ENCODE_XS_USEFP + more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); +#else + /* safe until SvLEN(dst) == MAX_INT/16 */ + more = (16*SvLEN(dst)+1)/sdone/16 * sleft; +#endif } - if (use > e->base.bufsiz) { - use = e->base.bufsiz; + more += UTF8_MAXLEN; /* insurance policy */ + d = (U8 *) SvGROW(dst, SvLEN(dst) + more); + /* dst need to grow need MORE bytes! */ + if (ddone >= SvLEN(dst)) { + Perl_croak(aTHX_ "Destination couldn't be grown."); } - SvPVX(e->dataSV) = (char *) ptr; - SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */ - SvCUR_set(e->dataSV,use); - SvPOK_only(e->dataSV); - } - SvUTF8_off(e->dataSV); - PUSHMARK(sp); - XPUSHs(e->enc); - XPUSHs(e->dataSV); - XPUSHs(&PL_sv_yes); - PUTBACK; - if (perl_call_method("decode", G_SCALAR) != 1) { - Perl_die(aTHX_ "panic: decode did not return a value"); + dlen = SvLEN(dst)-ddone-1; + d += ddone; + s += slen; + slen = tlen-sdone; + continue; } - SPAGAIN; - uni = POPs; - PUTBACK; - /* Now get translated string (forced to UTF-8) and use as buffer */ - if (SvPOK(uni)) { - s = SvPVutf8(uni, len); - if (len && !is_utf8_string((U8*)s,len)) { - Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s); + case ENCODE_NOREP: + /* encoding */ + if (dir == enc->f_utf8) { + STRLEN clen; + UV ch = + utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), + &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); + if (check & ENCODE_DIE_ON_ERR) { + Perl_croak(aTHX_ ERR_ENCODE_NOMAP, + (UV)ch, enc->name[0]); + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & ENCODE_PERLQQ){ + SV* perlqq = + sv_2mortal(newSVpvf("\\x{%04"UVxf"}", (UV)ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(perlqq); + sv_catsv(dst, perlqq); + }else if (check & ENCODE_HTMLCREF){ + SV* htmlcref = + sv_2mortal(newSVpvf("&#%" UVuf ";", (UV)ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(htmlcref); + sv_catsv(dst, htmlcref); + }else if (check & ENCODE_XMLCREF){ + SV* xmlcref = + sv_2mortal(newSVpvf("&#x%" UVxf ";", (UV)ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(xmlcref); + sv_catsv(dst, xmlcref); + } else { + /* fallback char */ + sdone += slen + clen; + ddone += dlen + enc->replen; + sv_catpvn(dst, (char*)enc->rep, enc->replen); + } + } + /* decoding */ + else { + if (check & ENCODE_DIE_ON_ERR){ + Perl_croak(aTHX_ ERR_DECODE_NOMAP, + enc->name[0], (UV)s[slen]); + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner( + aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, + enc->name[0], (UV)s[slen]); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & + (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* perlqq = + sv_2mortal(newSVpvf("\\x%02" UVXf, (UV)s[slen])); + sdone += slen + 1; + ddone += dlen + SvCUR(perlqq); + sv_catsv(dst, perlqq); + } else { + sdone += slen + 1; + ddone += dlen + strlen(FBCHAR_UTF8); + sv_catpv(dst, FBCHAR_UTF8); + } } + /* settle variables when fallback */ + d = (U8 *)SvEND(dst); + dlen = SvLEN(dst) - ddone - 1; + s = (U8*)SvPVX(src) + sdone; + slen = tlen - sdone; + break; + + default: + Perl_croak(aTHX_ "Unexpected code %d converting %s %s", + code, (dir == enc->f_utf8) ? "to" : "from", + enc->name[0]); + return &PL_sv_undef; } - if (len > 0) { - /* Got _something */ - /* if decode gave us back dataSV then data may vanish when - we do ptrcnt adjust - so take our copy now. - (The copy is a pain - need a put-it-here option for decode.) - */ - sv_setpvn(e->bufsv,s,len); - e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv); - e->base.end = e->base.ptr + SvCUR(e->bufsv); - PerlIOBase(f)->flags |= PERLIO_F_RDBUF; - SvUTF8_on(e->bufsv); - - /* Adjust ptr/cnt not taking anything which - did not translate - not clear this is a win */ - /* compute amount we took */ - use -= SvCUR(e->dataSV); - PerlIO_set_ptrcnt(n, ptr+use, (avail-use)); - /* and as we did not take it it isn't pending */ - SvCUR_set(e->dataSV,0); - } else { - /* Got nothing - assume partial character so we need some more */ - /* Make sure e->dataSV is a normal SV before re-filling as - buffer alias will change under us - */ - s = SvPV(e->dataSV,len); - sv_setpvn(e->dataSV,s,len); - PerlIO_set_ptrcnt(n, ptr+use, (avail-use)); - goto retry; + } + ENCODE_SET_SRC: + if (check && !(check & ENCODE_LEAVE_SRC)){ + sdone = SvCUR(src) - (slen+sdone); + if (sdone) { + sv_setpvn(src, (char*)s+slen, sdone); } - FREETMPS; - LEAVE; - return code; + SvCUR_set(src, sdone); } - else { - if (avail == 0) - PerlIOBase(f)->flags |= PERLIO_F_EOF; - else - PerlIOBase(f)->flags |= PERLIO_F_ERROR; - return -1; + /* warn("check = 0x%X, code = 0x%d\n", check, code); */ + + SvCUR_set(dst, dlen+ddone); + SvPOK_only(dst); + +#if ENCODE_XS_PROFILE + if (SvCUR(dst) > SvCUR(src)){ + Perl_warn(aTHX_ + "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", + SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), + (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); } +#endif + + if (offset) + *offset += sdone + slen; + + ENCODE_END: + *SvEND(dst) = '\0'; + if (retcode) *retcode = code; + return dst; } -IV -PerlIOEncode_flush(pTHX_ PerlIO * f) +MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ + +void +Method_decode_xs(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: { - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - IV code = 0; - if (e->bufsv && (e->base.ptr > e->base.buf)) { - dSP; - SV *str; - char *s; - STRLEN len; - SSize_t count = 0; - if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) { - /* Write case encode the buffer and write() to layer below */ - ENTER; - SAVETMPS; - PUSHMARK(sp); - XPUSHs(e->enc); - SvCUR_set(e->bufsv, e->base.ptr - e->base.buf); - SvUTF8_on(e->bufsv); - XPUSHs(e->bufsv); - XPUSHs(&PL_sv_yes); - PUTBACK; - if (perl_call_method("encode", G_SCALAR) != 1) { - Perl_die(aTHX_ "panic: encode did not return a value"); - } - SPAGAIN; - str = POPs; - PUTBACK; - s = SvPV(str, len); - count = PerlIO_write(PerlIONext(f),s,len); - if (count != len) { - code = -1; + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ + SvPOK_only(dst); + SvCUR_set(dst,0); + if (SvUTF8(src)) { + s = utf8_to_bytes(s,&slen); + if (s) { + SvCUR_set(src,slen); + SvUTF8_off(src); + e = s+slen; + } + else { + croak("Cannot decode string with wide characters"); + } + } + while (s < e) { + if (UTF8_IS_INVARIANT(*s) || UTF8_IS_START(*s)) { + U8 skip = UTF8SKIP(s); + if ((s + skip) > e) { + /* Partial character - done */ + break; } - FREETMPS; - LEAVE; - if (PerlIO_flush(PerlIONext(f)) != 0) { - code = -1; + else if (is_utf8_char(s)) { + /* Whole char is good */ + sv_catpvn(dst,(char *)s,skip); + s += skip; + continue; } - if (SvCUR(e->bufsv)) { - /* Did not all translate */ - e->base.ptr = e->base.buf+SvCUR(e->bufsv); - return code; + else { + /* starts ok but isn't "good" */ } } - else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) { - /* read case */ - /* if we have any untranslated stuff then unread that first */ - if (e->dataSV && SvCUR(e->dataSV)) { - s = SvPV(e->dataSV, len); - count = PerlIO_unread(PerlIONext(f),s,len); - if (count != len) { - code = -1; - } - } - /* See if there is anything left in the buffer */ - if (e->base.ptr < e->base.end) { - /* Bother - have unread data. - re-encode and unread() to layer below - */ - ENTER; - SAVETMPS; - str = sv_newmortal(); - sv_upgrade(str, SVt_PV); - SvPVX(str) = (char*)e->base.ptr; - SvLEN(str) = 0; - SvCUR_set(str, e->base.end - e->base.ptr); - SvPOK_only(str); - SvUTF8_on(str); - PUSHMARK(sp); - XPUSHs(e->enc); - XPUSHs(str); - XPUSHs(&PL_sv_yes); - PUTBACK; - if (perl_call_method("encode", G_SCALAR) != 1) { - Perl_die(aTHX_ "panic: encode did not return a value"); - } - SPAGAIN; - str = POPs; - PUTBACK; - s = SvPV(str, len); - count = PerlIO_unread(PerlIONext(f),s,len); - if (count != len) { - code = -1; - } - FREETMPS; - LEAVE; - } + else { + /* Invalid start byte */ + } + /* If we get here there is something wrong with alleged UTF-8 */ + if (check & ENCODE_DIE_ON_ERR){ + Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", (UV)*s); + XSRETURN(0); + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, "utf8", (UV)*s); + } + if (check & ENCODE_RETURN_ON_ERR) { + break; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* perlqq = newSVpvf("\\x%02" UVXf, (UV)*s); + sv_catsv(dst, perlqq); + SvREFCNT_dec(perlqq); + } else { + sv_catpv(dst, FBCHAR_UTF8); } - e->base.ptr = e->base.end = e->base.buf; - PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF); + s++; } - return code; -} + *SvEND(dst) = '\0'; -IV -PerlIOEncode_close(pTHX_ PerlIO * f) -{ - PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); - IV code = PerlIOBase_close(aTHX_ f); - if (e->bufsv) { - if (e->base.buf && e->base.ptr > e->base.buf) { - Perl_croak(aTHX_ "Close with partial character"); + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); } - SvREFCNT_dec(e->bufsv); - e->bufsv = Nullsv; + SvCUR_set(src, slen); } - e->base.buf = NULL; - e->base.ptr = NULL; - e->base.end = NULL; - PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF); - return code; + SvUTF8_on(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); } -Off_t -PerlIOEncode_tell(pTHX_ PerlIO * f) +void +Method_encode_xs(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: { - PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf); - /* Unfortunately the only way to get a postion is to (re-)translate, - the UTF8 we have in bufefr and then ask layer below - */ - PerlIO_flush(f); - if (b->buf && b->ptr > b->buf) { - Perl_croak(aTHX_ "Cannot tell at partial character"); + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ + if (SvUTF8(src)) { + /* Already encoded - trust it and just copy the octets */ + sv_setpvn(dst,(char *)s,(e-s)); + s = e; + } + else { + /* Native bytes - can always encode */ + U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ + while (s < e) { + UV uv = NATIVE_TO_UNI((UV) *s++); + if (UNI_IS_INVARIANT(uv)) + *d++ = (U8)UTF_TO_NATIVE(uv); + else { + *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); + *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); + } + } + SvCUR_set(dst, d- (U8 *)SvPVX(dst)); + *SvEND(dst) = '\0'; } - return PerlIO_tell(PerlIONext(f)); -} -PerlIO * -PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o, - CLONE_PARAMS * params, int flags) -{ - if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) { - PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode); - PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode); - if (oe->enc) { - fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params); + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); } + SvCUR_set(src, slen); } - return f; + SvPOK_only(dst); + SvUTF8_off(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); } -PerlIO_funcs PerlIO_encode = { - "encoding", - sizeof(PerlIOEncode), - PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT, - PerlIOEncode_pushed, - PerlIOEncode_popped, - PerlIOBuf_open, - PerlIOEncode_getarg, - PerlIOBase_fileno, - PerlIOEncode_dup, - PerlIOBuf_read, - PerlIOBuf_unread, - PerlIOBuf_write, - PerlIOBuf_seek, - PerlIOEncode_tell, - PerlIOEncode_close, - PerlIOEncode_flush, - PerlIOEncode_fill, - PerlIOBase_eof, - PerlIOBase_error, - PerlIOBase_clearerr, - PerlIOBase_setlinebuf, - PerlIOEncode_get_base, - PerlIOBuf_bufsiz, - PerlIOBuf_get_ptr, - PerlIOBuf_get_cnt, - PerlIOBuf_set_ptrcnt, -}; -#endif /* encode layer */ +MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ + +PROTOTYPES: ENABLE void -Encode_XSEncoding(pTHX_ encode_t * enc) +Method_name(obj) +SV * obj +CODE: { - dSP; - HV *stash = gv_stashpv("Encode::XS", TRUE); - SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash); - int i = 0; - PUSHMARK(sp); - XPUSHs(sv); - while (enc->name[i]) { - const char *name = enc->name[i++]; - XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); - } - PUTBACK; - call_pv("Encode::define_encoding", G_DISCARD); - SvREFCNT_dec(sv); + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); + XSRETURN(1); } void -call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) +Method_cat_decode(obj, dst, src, off, term, check = 0) +SV * obj +SV * dst +SV * src +SV * off +SV * term +int check +CODE: { - /* Exists for breakpointing */ + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + STRLEN offset = (STRLEN)SvIV(off); + int code = 0; + if (SvUTF8(src)) { + sv_utf8_downgrade(src, FALSE); + } + sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, + &offset, term, &code)); + SvIVX(off) = (IV)offset; + if (code == ENCODE_FOUND_TERM) { + ST(0) = &PL_sv_yes; + }else{ + ST(0) = &PL_sv_no; + } + XSRETURN(1); } -static SV * -encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, - int check) +void +Method_decode(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: { - STRLEN slen; - U8 *s = (U8 *) SvPV(src, slen); - STRLEN tlen = slen; - STRLEN ddone = 0; - STRLEN sdone = 0; - SV *dst = sv_2mortal(newSV(slen+1)); - if (slen) { - U8 *d = (U8 *) SvPVX(dst); - STRLEN dlen = SvLEN(dst)-1; - int code; - while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) { - SvCUR_set(dst, dlen+ddone); - SvPOK_only(dst); - -#if 0 - Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1); -#endif - - if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL) - break; - - switch (code) { - case ENCODE_NOSPACE: - { - STRLEN need ; - sdone += slen; - ddone += dlen; - if (sdone) { - need = (tlen*SvLEN(dst)+sdone-1)/sdone+UTF8_MAXLEN; - } - else { - need = SvLEN(dst) + UTF8_MAXLEN; - } - - d = (U8 *) SvGROW(dst, need); - if (ddone >= SvLEN(dst)) { - Perl_croak(aTHX_ "Destination couldn't be grown."); - } - dlen = SvLEN(dst)-ddone-1; - d += ddone; - s += slen; - slen = tlen-sdone; - continue; - } - - case ENCODE_NOREP: - if (dir == enc->f_utf8) { - if (!check && ckWARN_d(WARN_UTF8)) { - STRLEN clen; - UV ch = - utf8n_to_uvuni(s + slen, (SvCUR(src) - slen), - &clen, 0); - Perl_warner(aTHX_ packWARN(WARN_UTF8), - "\"\\N{U+%" UVxf - "}\" does not map to %s", ch, - enc->name[0]); - /* FIXME: Skip over the character, copy in replacement and continue - * but that is messy so for now just fail. - */ - return &PL_sv_undef; - } - else { - return &PL_sv_undef; - } - } - else { - /* UTF-8 is supposed to be "Universal" so should not happen - for real characters, but some encodings have non-assigned - codes which may occur. - */ - Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)", - enc->name[0], (U8) s[slen], code); - } - break; - - default: - Perl_croak(aTHX_ "Unexpected code %d converting %s %s", - code, (dir == enc->f_utf8) ? "to" : "from", - enc->name[0]); - return &PL_sv_undef; - } - } - SvCUR_set(dst, dlen+ddone); - SvPOK_only(dst); - if (check) { - sdone = SvCUR(src) - (slen+sdone); - if (sdone) { -#if 1 - /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly - SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0 - type SVs and sv_clear() calls it ... - */ - sv_setpvn(src,s+slen,sdone); -#else - Move(s + slen, SvPVX(src), sdone , U8); -#endif - } - SvCUR_set(src, sdone); - } - } - else { - SvCUR_set(dst, 0); - SvPOK_only(dst); + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + if (SvUTF8(src)) { + sv_utf8_downgrade(src, FALSE); } - *SvEND(dst) = '\0'; - return dst; + ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, + NULL, Nullsv, NULL); + SvUTF8_on(ST(0)); + XSRETURN(1); } -MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ - -PROTOTYPES: ENABLE - void -Method_name(obj) +Method_encode(obj,src,check = 0) SV * obj +SV * src +int check CODE: - { - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); - XSRETURN(1); - } +{ + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + sv_utf8_upgrade(src); + ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, + NULL, Nullsv, NULL); + XSRETURN(1); +} void -Method_decode(obj,src,check = FALSE) +Method_needs_lines(obj) SV * obj -SV * src -bool check CODE: - { - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); - SvUTF8_on(ST(0)); - XSRETURN(1); - } +{ + /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ + ST(0) = &PL_sv_no; + XSRETURN(1); +} void -Method_encode(obj,src,check = FALSE) +Method_perlio_ok(obj) SV * obj -SV * src -bool check CODE: - { - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - sv_utf8_upgrade(src); - ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); - XSRETURN(1); - } +{ + /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ + /* require_pv(PERLIO_FILENAME); */ + + eval_pv("require PerlIO::encoding", 0); + + if (SvTRUE(get_sv("@", 0))) { + ST(0) = &PL_sv_no; + }else{ + ST(0) = &PL_sv_yes; + } + XSRETURN(1); +} MODULE = Encode PACKAGE = Encode @@ -643,155 +490,249 @@ PROTOTYPES: ENABLE I32 _bytes_to_utf8(sv, ...) - SV * sv - CODE: - { - SV * encoding = items == 2 ? ST(1) : Nullsv; - - if (encoding) - RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); - else { - STRLEN len; - U8* s = (U8*)SvPV(sv, len); - U8* converted; - - converted = bytes_to_utf8(s, &len); /* This allocs */ - sv_setpvn(sv, (char *)converted, len); - SvUTF8_on(sv); /* XXX Should we? */ - Safefree(converted); /* ... so free it */ - RETVAL = len; - } - } - OUTPUT: - RETVAL +SV * sv +CODE: +{ + SV * encoding = items == 2 ? ST(1) : Nullsv; + + if (encoding) + RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); + else { + STRLEN len; + U8* s = (U8*)SvPV(sv, len); + U8* converted; + + converted = bytes_to_utf8(s, &len); /* This allocs */ + sv_setpvn(sv, (char *)converted, len); + SvUTF8_on(sv); /* XXX Should we? */ + Safefree(converted); /* ... so free it */ + RETVAL = len; + } +} +OUTPUT: + RETVAL I32 _utf8_to_bytes(sv, ...) - SV * sv - CODE: - { - SV * to = items > 1 ? ST(1) : Nullsv; - SV * check = items > 2 ? ST(2) : Nullsv; - - if (to) - RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); - else { - STRLEN len; - U8 *s = (U8*)SvPV(sv, len); - - RETVAL = 0; - if (SvTRUE(check)) { - /* Must do things the slow way */ - U8 *dest; - U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */ - U8 *send = s + len; - - New(83, dest, len, U8); /* I think */ - - while (s < send) { - if (*s < 0x80) - *dest++ = *s++; - else { - STRLEN ulen; - UV uv = *s++; - - /* Have to do it all ourselves because of error routine, - aargh. */ - if (!(uv & 0x40)) - goto failure; - if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } - else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } - else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } - else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } - else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } - else if (!(uv & 0x01)) { ulen = 7; uv = 0; } - else { ulen = 13; uv = 0; } - - /* Note change to utf8.c variable naming, for variety */ - while (ulen--) { - if ((*s & 0xc0) != 0x80) - goto failure; +SV * sv +CODE: +{ + SV * to = items > 1 ? ST(1) : Nullsv; + SV * check = items > 2 ? ST(2) : Nullsv; + + if (to) { + RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); + } else { + STRLEN len; + U8 *s = (U8*)SvPV(sv, len); + + RETVAL = 0; + if (SvTRUE(check)) { + /* Must do things the slow way */ + U8 *dest; + /* We need a copy to pass to check() */ + U8 *src = (U8*)savepv((char *)s); + U8 *send = s + len; + + New(83, dest, len, U8); /* I think */ + + while (s < send) { + if (*s < 0x80){ + *dest++ = *s++; + } else { + STRLEN ulen; + UV uv = *s++; + + /* Have to do it all ourselves because of error routine, + aargh. */ + if (!(uv & 0x40)){ goto failure; } + if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } + else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } + else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } + else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } + else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } + else if (!(uv & 0x01)) { ulen = 7; uv = 0; } + else { ulen = 13; uv = 0; } - else - uv = (uv << 6) | (*s++ & 0x3f); + /* Note change to utf8.c variable naming, for variety */ + while (ulen--) { + if ((*s & 0xc0) != 0x80){ + goto failure; + } else { + uv = (uv << 6) | (*s++ & 0x3f); + } } if (uv > 256) { failure: - call_failure(check, s, dest, src); - /* Now what happens? */ + call_failure(check, s, dest, src); + /* Now what happens? */ } *dest++ = (U8)uv; - } - } - } else - RETVAL = (utf8_to_bytes(s, &len) ? len : 0); - } + } + } + } else { + RETVAL = (utf8_to_bytes(s, &len) ? len : 0); } - OUTPUT: - RETVAL + } +} +OUTPUT: + RETVAL bool -is_utf8(sv, check = FALSE) +is_utf8(sv, check = 0) SV * sv -bool check - CODE: - { - if (SvGMAGICAL(sv)) /* it could be $1, for example */ - sv = newSVsv(sv); /* GMAGIG will be done */ - if (SvPOK(sv)) { - RETVAL = SvUTF8(sv) ? TRUE : FALSE; - if (RETVAL && - check && - !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) - RETVAL = FALSE; - } else { +int check +CODE: +{ + if (SvGMAGICAL(sv)) /* it could be $1, for example */ + sv = newSVsv(sv); /* GMAGIG will be done */ + if (SvPOK(sv)) { + RETVAL = SvUTF8(sv) ? TRUE : FALSE; + if (RETVAL && + check && + !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) RETVAL = FALSE; - } - if (sv != ST(0)) - SvREFCNT_dec(sv); /* it was a temp copy */ - } - OUTPUT: - RETVAL + } else { + RETVAL = FALSE; + } + if (sv != ST(0)) + SvREFCNT_dec(sv); /* it was a temp copy */ +} +OUTPUT: + RETVAL SV * _utf8_on(sv) - SV * sv - CODE: - { - if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - SvUTF8_on(sv); - } else { - RETVAL = &PL_sv_undef; - } - } - OUTPUT: - RETVAL +SV * sv +CODE: +{ + if (SvPOK(sv)) { + SV *rsv = newSViv(SvUTF8(sv)); + RETVAL = rsv; + SvUTF8_on(sv); + } else { + RETVAL = &PL_sv_undef; + } +} +OUTPUT: + RETVAL SV * _utf8_off(sv) - SV * sv - CODE: - { - if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - SvUTF8_off(sv); - } else { - RETVAL = &PL_sv_undef; - } - } - OUTPUT: - RETVAL +SV * sv +CODE: +{ + if (SvPOK(sv)) { + SV *rsv = newSViv(SvUTF8(sv)); + RETVAL = rsv; + SvUTF8_off(sv); + } else { + RETVAL = &PL_sv_undef; + } +} +OUTPUT: + RETVAL + +int +DIE_ON_ERR() +CODE: + RETVAL = ENCODE_DIE_ON_ERR; +OUTPUT: + RETVAL + +int +WARN_ON_ERR() +CODE: + RETVAL = ENCODE_WARN_ON_ERR; +OUTPUT: + RETVAL + +int +LEAVE_SRC() +CODE: + RETVAL = ENCODE_LEAVE_SRC; +OUTPUT: + RETVAL + +int +RETURN_ON_ERR() +CODE: + RETVAL = ENCODE_RETURN_ON_ERR; +OUTPUT: + RETVAL + +int +PERLQQ() +CODE: + RETVAL = ENCODE_PERLQQ; +OUTPUT: + RETVAL + +int +HTMLCREF() +CODE: + RETVAL = ENCODE_HTMLCREF; +OUTPUT: + RETVAL + +int +XMLCREF() +CODE: + RETVAL = ENCODE_XMLCREF; +OUTPUT: + RETVAL + +int +FB_DEFAULT() +CODE: + RETVAL = ENCODE_FB_DEFAULT; +OUTPUT: + RETVAL + +int +FB_CROAK() +CODE: + RETVAL = ENCODE_FB_CROAK; +OUTPUT: + RETVAL + +int +FB_QUIET() +CODE: + RETVAL = ENCODE_FB_QUIET; +OUTPUT: + RETVAL + +int +FB_WARN() +CODE: + RETVAL = ENCODE_FB_WARN; +OUTPUT: + RETVAL + +int +FB_PERLQQ() +CODE: + RETVAL = ENCODE_FB_PERLQQ; +OUTPUT: + RETVAL + +int +FB_HTMLCREF() +CODE: + RETVAL = ENCODE_FB_HTMLCREF; +OUTPUT: + RETVAL + +int +FB_XMLCREF() +CODE: + RETVAL = ENCODE_FB_XMLCREF; +OUTPUT: + RETVAL BOOT: { -#if defined(USE_PERLIO) && !defined(USE_SFIO) - PerlIO_define_layer(aTHX_ &PerlIO_encode); -#endif -/* #include "8859_def.h" */ -/* #include "EBCDIC_def.h" */ -/* #include "Symbols_def.h" */ -#include "def_t_def.h" +#include "def_t.h" +#include "def_t.exh" }