X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2FEncode.xs;h=1424071f3a72da1a85dfeda47849e7e30dd39d4e;hb=4ac71550d23cca4632a2bcdfcb1d83a6bf705e45;hp=d9e33bf31e40170a0f5c431d0ef0e59cddd1fbe7;hpb=7211588c6aa4629aa6b38db94c2ae732db5ef8c7;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index d9e33bf..1424071 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 1.49 2002/10/21 19:47:47 dankogai Exp $ + $Id: Encode.xs,v 2.14 2007/05/29 18:15:32 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT @@ -21,14 +21,20 @@ #define ENCODE_XS_USEFP 1 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ - Perl_croak(aTHX_ "panic_unimplemented"); \ - return (y)0; /* fool picky compilers */ \ - } + Perl_croak(aTHX_ "panic_unimplemented"); \ + return (y)0; /* fool picky compilers */ \ + } /**/ UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) +#define UTF8_ALLOW_STRICT 0 +#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \ + ~(UTF8_ALLOW_CONTINUATION | \ + UTF8_ALLOW_NON_CONTINUATION | \ + UTF8_ALLOW_LONG)) + void Encode_XSEncoding(pTHX_ encode_t * enc) { @@ -39,8 +45,8 @@ Encode_XSEncoding(pTHX_ encode_t * enc) PUSHMARK(sp); XPUSHs(sv); while (enc->name[i]) { - const char *name = enc->name[i++]; - XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); + const char *name = enc->name[i++]; + XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); } PUTBACK; call_pv("Encode::define_encoding", G_DISCARD); @@ -58,8 +64,35 @@ call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" static SV * -encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, - int check) +do_fallback_cb(pTHX_ UV ch, SV *fallback_cb) +{ + dSP; + int argc; + SV *temp, *retval; + ENTER; + SAVETMPS; + PUSHMARK(sp); + XPUSHs(sv_2mortal(newSVnv((UV)ch))); + PUTBACK; + argc = call_sv(fallback_cb, G_SCALAR); + SPAGAIN; + if (argc != 1){ + croak("fallback sub must return scalar!"); + } + temp = newSVsv(POPs); + PUTBACK; + FREETMPS; + LEAVE; + retval = newSVpv("",0); + sv_catsv(retval, temp); + SvREFCNT_dec(temp); + return retval; +} + +static SV * +encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, + int check, STRLEN * offset, SV * term, int * retcode, + SV *fallback_cb) { STRLEN slen; U8 *s = (U8 *) SvPV(src, slen); @@ -72,152 +105,164 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, SV *dst = sv_2mortal(newSV(slen+1)); U8 *d = (U8 *)SvPVX(dst); STRLEN dlen = SvLEN(dst)-1; - int code; + int code = 0; + STRLEN trmlen = 0; + U8 *trm = term ? (U8*) SvPV(term, trmlen) : NULL; + + if (offset) { + s += *offset; + if (slen > *offset){ /* safeguard against slen overflow */ + slen -= *offset; + }else{ + slen = 0; + } + tlen = slen; + } - if (!slen){ - SvCUR_set(dst, 0); - SvPOK_only(dst); - goto ENCODE_END; + if (slen == 0){ + SvCUR_set(dst, 0); + SvPOK_only(dst); + goto ENCODE_END; } - while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check)) ) + while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check, + trm, trmlen)) ) { - SvCUR_set(dst, dlen+ddone); - SvPOK_only(dst); - - if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL){ - break; - } - switch (code) { - case ENCODE_NOSPACE: - { - STRLEN more = 0; /* make sure you initialize! */ - STRLEN sleft; - sdone += slen; - ddone += dlen; - sleft = tlen - sdone; + SvCUR_set(dst, dlen+ddone); + SvPOK_only(dst); + + if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || + code == ENCODE_FOUND_TERM) { + break; + } + switch (code) { + case ENCODE_NOSPACE: + { + STRLEN more = 0; /* make sure you initialize! */ + STRLEN sleft; + sdone += slen; + ddone += dlen; + sleft = tlen - sdone; #if ENCODE_XS_PROFILE >= 2 - Perl_warn(aTHX_ - "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", - more, sdone, sleft, SvLEN(dst)); + Perl_warn(aTHX_ + "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", + more, sdone, sleft, SvLEN(dst)); #endif - if (sdone != 0) { /* has src ever been processed ? */ + if (sdone != 0) { /* has src ever been processed ? */ #if ENCODE_XS_USEFP == 2 - more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone - - SvLEN(dst); + more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone + - SvLEN(dst); #elif ENCODE_XS_USEFP - more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); + more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); #else - /* safe until SvLEN(dst) == MAX_INT/16 */ - more = (16*SvLEN(dst)+1)/sdone/16 * sleft; + /* safe until SvLEN(dst) == MAX_INT/16 */ + more = (16*SvLEN(dst)+1)/sdone/16 * sleft; #endif - } - more += UTF8_MAXLEN; /* insurance policy */ - d = (U8 *) SvGROW(dst, SvLEN(dst) + more); - /* dst need to grow need MORE bytes! */ - if (ddone >= SvLEN(dst)) { - Perl_croak(aTHX_ "Destination couldn't be grown."); - } - dlen = SvLEN(dst)-ddone-1; - d += ddone; - s += slen; - slen = tlen-sdone; - continue; - } - case ENCODE_NOREP: - /* encoding */ - if (dir == enc->f_utf8) { - STRLEN clen; - UV ch = - utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), - &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); - if (check & ENCODE_DIE_ON_ERR) { - Perl_croak(aTHX_ ERR_ENCODE_NOMAP, - (UV)ch, enc->name[0]); - return &PL_sv_undef; /* never reaches but be safe */ - } - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner(aTHX_ packWARN(WARN_UTF8), - ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); - } - if (check & ENCODE_RETURN_ON_ERR){ - goto ENCODE_SET_SRC; - } - if (check & ENCODE_PERLQQ){ - SV* perlqq = - sv_2mortal(newSVpvf("\\x{%04"UVxf"}", (UV)ch)); - sdone += slen + clen; - ddone += dlen + SvCUR(perlqq); - sv_catsv(dst, perlqq); - }else if (check & ENCODE_HTMLCREF){ - SV* htmlcref = - sv_2mortal(newSVpvf("&#%" UVuf ";", (UV)ch)); - sdone += slen + clen; - ddone += dlen + SvCUR(htmlcref); - sv_catsv(dst, htmlcref); - }else if (check & ENCODE_XMLCREF){ - SV* xmlcref = - sv_2mortal(newSVpvf("&#x%" UVxf ";", (UV)ch)); - sdone += slen + clen; - ddone += dlen + SvCUR(xmlcref); - sv_catsv(dst, xmlcref); - } else { - /* fallback char */ - sdone += slen + clen; - ddone += dlen + enc->replen; - sv_catpvn(dst, (char*)enc->rep, enc->replen); - } - } - /* decoding */ - else { - if (check & ENCODE_DIE_ON_ERR){ - Perl_croak(aTHX_ ERR_DECODE_NOMAP, - enc->name[0], (UV)s[slen]); - return &PL_sv_undef; /* never reaches but be safe */ - } - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner( - aTHX_ packWARN(WARN_UTF8), - ERR_DECODE_NOMAP, - enc->name[0], (UV)s[slen]); - } - if (check & ENCODE_RETURN_ON_ERR){ - goto ENCODE_SET_SRC; - } - if (check & - (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ - SV* perlqq = - sv_2mortal(newSVpvf("\\x%02" UVXf, (UV)s[slen])); - sdone += slen + 1; - ddone += dlen + SvCUR(perlqq); - sv_catsv(dst, perlqq); - } else { - sdone += slen + 1; - ddone += dlen + strlen(FBCHAR_UTF8); - sv_catpv(dst, FBCHAR_UTF8); - } - } - /* settle variables when fallback */ - d = (U8 *)SvEND(dst); - dlen = SvLEN(dst) - ddone - 1; - s = (U8*)SvPVX(src) + sdone; - slen = tlen - sdone; - break; - - default: - Perl_croak(aTHX_ "Unexpected code %d converting %s %s", - code, (dir == enc->f_utf8) ? "to" : "from", - enc->name[0]); - return &PL_sv_undef; - } + } + more += UTF8_MAXLEN; /* insurance policy */ + d = (U8 *) SvGROW(dst, SvLEN(dst) + more); + /* dst need to grow need MORE bytes! */ + if (ddone >= SvLEN(dst)) { + Perl_croak(aTHX_ "Destination couldn't be grown."); + } + dlen = SvLEN(dst)-ddone-1; + d += ddone; + s += slen; + slen = tlen-sdone; + continue; + } + case ENCODE_NOREP: + /* encoding */ + if (dir == enc->f_utf8) { + STRLEN clen; + UV ch = + utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), + &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); + /* if non-representable multibyte prefix at end of current buffer - break*/ + if (clen > tlen - sdone) break; + if (check & ENCODE_DIE_ON_ERR) { + Perl_croak(aTHX_ ERR_ENCODE_NOMAP, + (UV)ch, enc->name[0]); + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* subchar = + (fallback_cb != &PL_sv_undef) + ? do_fallback_cb(aTHX_ ch, fallback_cb) + : newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : + check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : + "&#x%" UVxf ";", (UV)ch); + sdone += slen + clen; + ddone += dlen + SvCUR(subchar); + sv_catsv(dst, subchar); + SvREFCNT_dec(subchar); + } else { + /* fallback char */ + sdone += slen + clen; + ddone += dlen + enc->replen; + sv_catpvn(dst, (char*)enc->rep, enc->replen); + } + } + /* decoding */ + else { + if (check & ENCODE_DIE_ON_ERR){ + Perl_croak(aTHX_ ERR_DECODE_NOMAP, + enc->name[0], (UV)s[slen]); + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner( + aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, + enc->name[0], (UV)s[slen]); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & + (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* subchar = + (fallback_cb != &PL_sv_undef) + ? do_fallback_cb(aTHX_ (UV)s[slen], fallback_cb) + : newSVpvf("\\x%02" UVXf, (UV)s[slen]); + sdone += slen + 1; + ddone += dlen + SvCUR(subchar); + sv_catsv(dst, subchar); + SvREFCNT_dec(subchar); + } else { + sdone += slen + 1; + ddone += dlen + strlen(FBCHAR_UTF8); + sv_catpv(dst, FBCHAR_UTF8); + } + } + /* settle variables when fallback */ + d = (U8 *)SvEND(dst); + dlen = SvLEN(dst) - ddone - 1; + s = (U8*)SvPVX(src) + sdone; + slen = tlen - sdone; + break; + + default: + Perl_croak(aTHX_ "Unexpected code %d converting %s %s", + code, (dir == enc->f_utf8) ? "to" : "from", + enc->name[0]); + return &PL_sv_undef; + } } ENCODE_SET_SRC: if (check && !(check & ENCODE_LEAVE_SRC)){ - sdone = SvCUR(src) - (slen+sdone); - if (sdone) { - sv_setpvn(src, (char*)s+slen, sdone); - } - SvCUR_set(src, sdone); + sdone = SvCUR(src) - (slen+sdone); + if (sdone) { + sv_setpvn(src, (char*)s+slen, sdone); + } + SvCUR_set(src, sdone); } /* warn("check = 0x%X, code = 0x%d\n", check, code); */ @@ -226,94 +271,187 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, #if ENCODE_XS_PROFILE if (SvCUR(dst) > SvCUR(src)){ - Perl_warn(aTHX_ - "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", - SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), - (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); + Perl_warn(aTHX_ + "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", + SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), + (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); } #endif + if (offset) + *offset += sdone + slen; + ENCODE_END: *SvEND(dst) = '\0'; + if (retcode) *retcode = code; return dst; } -MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ +static bool +strict_utf8(pTHX_ SV* sv) +{ + HV* hv; + SV** svp; + sv = SvRV(sv); + if (!sv || SvTYPE(sv) != SVt_PVHV) + return 0; + hv = (HV*)sv; + svp = hv_fetch(hv, "strict_utf8", 11, 0); + if (!svp) + return 0; + return SvTRUE(*svp); +} + +static U8* +process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check, + bool encode, bool strict, bool stop_at_partial) +{ + UV uv; + STRLEN ulen; + + SvPOK_only(dst); + SvCUR_set(dst,0); + + while (s < e) { + if (UTF8_IS_INVARIANT(*s)) { + sv_catpvn(dst, (char *)s, 1); + s++; + continue; + } + + if (UTF8_IS_START(*s)) { + U8 skip = UTF8SKIP(s); + if ((s + skip) > e) { + /* Partial character */ + /* XXX could check that rest of bytes are UTF8_IS_CONTINUATION(ch) */ + if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) + break; + + goto malformed_byte; + } + + uv = utf8n_to_uvuni(s, e - s, &ulen, + UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT : + UTF8_ALLOW_NONSTRICT) + ); +#if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */ + if (strict && uv > PERL_UNICODE_MAX) + ulen = (STRLEN) -1; +#endif + if (ulen == -1) { + if (strict) { + uv = utf8n_to_uvuni(s, e - s, &ulen, + UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT); + if (ulen == -1) + goto malformed_byte; + goto malformed; + } + goto malformed_byte; + } + + + /* Whole char is good */ + sv_catpvn(dst,(char *)s,skip); + s += skip; + continue; + } + + /* If we get here there is something wrong with alleged UTF-8 */ + malformed_byte: + uv = (UV)*s; + ulen = 1; + + malformed: + if (check & ENCODE_DIE_ON_ERR){ + if (encode) + Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8"); + else + Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv); + } + if (check & ENCODE_WARN_ON_ERR){ + if (encode) + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_ENCODE_NOMAP, uv, "utf8"); + else + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, "utf8", uv); + } + if (check & ENCODE_RETURN_ON_ERR) { + break; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}"): + check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : + "&#x%" UVxf ";", uv); + sv_catsv(dst, subchar); + SvREFCNT_dec(subchar); + } else { + sv_catpv(dst, FBCHAR_UTF8); + } + s += ulen; + } + *SvEND(dst) = '\0'; + + return s; +} + + +MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ + +PROTOTYPES: DISABLE void -Method_decode(obj,src,check = 0) -SV * obj -SV * src -int check +Method_decode_xs(obj,src,check = 0) +SV * obj +SV * src +int check CODE: { STRLEN slen; U8 *s = (U8 *) SvPV(src, slen); U8 *e = (U8 *) SvEND(src); - SV *dst = newSV(slen); - SvPOK_only(dst); - SvCUR_set(dst,0); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ + + /* + * PerlIO check -- we assume the object is of PerlIO if renewed + */ + bool renewed = 0; + dSP; ENTER; SAVETMPS; + PUSHMARK(sp); + XPUSHs(obj); + PUTBACK; + if (call_method("renewed",G_SCALAR) == 1) { + SPAGAIN; + renewed = (bool)POPi; + PUTBACK; +#if 0 + fprintf(stderr, "renewed == %d\n", renewed); +#endif + } + FREETMPS; LEAVE; + /* end PerlIO check */ + if (SvUTF8(src)) { - s = utf8_to_bytes(s,&slen); - if (s) { - SvCUR_set(src,slen); - SvUTF8_off(src); - e = s+slen; - } - else { - croak("Cannot decode string with wide characters"); - } + s = utf8_to_bytes(s,&slen); + if (s) { + SvCUR_set(src,slen); + SvUTF8_off(src); + e = s+slen; + } + else { + croak("Cannot decode string with wide characters"); } - while (s < e) { - if (UTF8_IS_INVARIANT(*s) || UTF8_IS_START(*s)) { - U8 skip = UTF8SKIP(s); - if ((s + skip) > e) { - /* Partial character - done */ - break; - } - else if (is_utf8_char(s)) { - /* Whole char is good */ - sv_catpvn(dst,(char *)s,skip); - s += skip; - continue; - } - else { - /* starts ok but isn't "good" */ - } - } - else { - /* Invalid start byte */ - } - /* If we get here there is something wrong with alleged UTF-8 */ - if (check & ENCODE_DIE_ON_ERR){ - Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", (UV)*s); - XSRETURN(0); - } - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner(aTHX_ packWARN(WARN_UTF8), - ERR_DECODE_NOMAP, "utf8", (UV)*s); - } - if (check & ENCODE_RETURN_ON_ERR) { - break; - } - if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ - SV* perlqq = newSVpvf("\\x%02" UVXf, (UV)*s); - sv_catsv(dst, perlqq); - SvREFCNT_dec(perlqq); - } else { - sv_catpv(dst, FBCHAR_UTF8); - } - s++; } - *SvEND(dst) = '\0'; + + s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), renewed); /* Clear out translated part of source unless asked not to */ if (check && !(check & ENCODE_LEAVE_SRC)){ - slen = e-s; - if (slen) { - sv_setpvn(src, (char*)s, slen); - } - SvCUR_set(src, slen); + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); } SvUTF8_on(dst); ST(0) = sv_2mortal(dst); @@ -321,44 +459,51 @@ CODE: } void -Method_encode(obj,src,check = 0) -SV * obj -SV * src -int check +Method_encode_xs(obj,src,check = 0) +SV * obj +SV * src +int check CODE: { STRLEN slen; U8 *s = (U8 *) SvPV(src, slen); U8 *e = (U8 *) SvEND(src); - SV *dst = newSV(slen); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ if (SvUTF8(src)) { - /* Already encoded - trust it and just copy the octets */ - sv_setpvn(dst,(char *)s,(e-s)); - s = e; + /* Already encoded */ + if (strict_utf8(aTHX_ obj)) { + s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0); + } + else { + /* trust it and just copy the octets */ + sv_setpvn(dst,(char *)s,(e-s)); + s = e; + } } else { - /* Native bytes - can always encode */ - U8 *d = (U8 *) SvGROW(dst,2*slen+1); - while (s < e) { - UV uv = NATIVE_TO_UNI((UV) *s++); - if (UNI_IS_INVARIANT(uv)) - *d++ = (U8)UTF_TO_NATIVE(uv); - else { - *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); - *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); - } - } - SvCUR_set(dst, d- (U8 *)SvPVX(dst)); - *SvEND(dst) = '\0'; + /* Native bytes - can always encode */ + U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ + while (s < e) { + UV uv = NATIVE_TO_UNI((UV) *s); + s++; /* Above expansion of NATIVE_TO_UNI() is safer this way. */ + if (UNI_IS_INVARIANT(uv)) + *d++ = (U8)UTF_TO_NATIVE(uv); + else { + *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); + *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); + } + } + SvCUR_set(dst, d- (U8 *)SvPVX(dst)); + *SvEND(dst) = '\0'; } /* Clear out translated part of source unless asked not to */ if (check && !(check & ENCODE_LEAVE_SRC)){ - slen = e-s; - if (slen) { - sv_setpvn(src, (char*)s, slen); - } - SvCUR_set(src, slen); + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); } SvPOK_only(dst); SvUTF8_off(dst); @@ -366,14 +511,30 @@ CODE: XSRETURN(1); } -MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ +MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ PROTOTYPES: ENABLE void -Method_name(obj) +Method_renew(obj) +SV * obj +CODE: +{ + XSRETURN(1); +} + +int +Method_renewed(obj) SV * obj CODE: + RETVAL = 0; +OUTPUT: + RETVAL + +void +Method_name(obj) +SV * obj +CODE: { encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); @@ -381,37 +542,90 @@ CODE: } void -Method_decode(obj,src,check = 0) -SV * obj -SV * src -int check +Method_cat_decode(obj, dst, src, off, term, check_sv = &PL_sv_no) +SV * obj +SV * dst +SV * src +SV * off +SV * term +SV * check_sv CODE: { + int check; + SV *fallback_cb = &PL_sv_undef; encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + STRLEN offset = (STRLEN)SvIV(off); + int code = 0; if (SvUTF8(src)) { sv_utf8_downgrade(src, FALSE); } - ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); + if (SvROK(check_sv)){ + fallback_cb = check_sv; + check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ + }else{ + check = SvIV(check_sv); + } + sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, + &offset, term, &code, fallback_cb)); + SvIV_set(off, (IV)offset); + if (code == ENCODE_FOUND_TERM) { + ST(0) = &PL_sv_yes; + }else{ + ST(0) = &PL_sv_no; + } + XSRETURN(1); +} + +void +Method_decode(obj,src,check_sv = &PL_sv_no) +SV * obj +SV * src +SV * check_sv +CODE: +{ + int check; + SV *fallback_cb = &PL_sv_undef; + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + if (SvUTF8(src)) { + sv_utf8_downgrade(src, FALSE); + } + if (SvROK(check_sv)){ + fallback_cb = check_sv; + check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ + }else{ + check = SvIV(check_sv); + } + ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, + NULL, Nullsv, NULL, fallback_cb); SvUTF8_on(ST(0)); XSRETURN(1); } void -Method_encode(obj,src,check = 0) -SV * obj -SV * src -int check +Method_encode(obj,src,check_sv = &PL_sv_no) +SV * obj +SV * src +SV * check_sv CODE: { + int check; + SV *fallback_cb = &PL_sv_undef; encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); sv_utf8_upgrade(src); - ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); + if (SvROK(check_sv)){ + fallback_cb = check_sv; + check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ + }else{ + check = SvIV(check_sv); + } + ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, + NULL, Nullsv, NULL, fallback_cb); XSRETURN(1); } void Method_needs_lines(obj) -SV * obj +SV * obj CODE: { /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ @@ -421,7 +635,7 @@ CODE: void Method_perlio_ok(obj) -SV * obj +SV * obj CODE: { /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ @@ -430,9 +644,38 @@ CODE: eval_pv("require PerlIO::encoding", 0); if (SvTRUE(get_sv("@", 0))) { - ST(0) = &PL_sv_no; + ST(0) = &PL_sv_no; + }else{ + ST(0) = &PL_sv_yes; + } + XSRETURN(1); +} + +void +Method_mime_name(obj) +SV * obj +CODE: +{ + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + SV *retval; + eval_pv("require Encode::MIME::Name", 0); + + if (SvTRUE(get_sv("@", 0))) { + ST(0) = &PL_sv_undef; }else{ - ST(0) = &PL_sv_yes; + ENTER; + SAVETMPS; + PUSHMARK(sp); + XPUSHs(sv_2mortal(newSVpvn(enc->name[0], strlen(enc->name[0])))); + PUTBACK; + call_pv("Encode::MIME::Name::get_mime_name", G_SCALAR); + SPAGAIN; + retval = newSVsv(POPs); + PUTBACK; + FREETMPS; + LEAVE; + /* enc->name[0] */ + ST(0) = retval; } XSRETURN(1); } @@ -451,15 +694,15 @@ CODE: if (encoding) RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); else { - STRLEN len; - U8* s = (U8*)SvPV(sv, len); - U8* converted; - - converted = bytes_to_utf8(s, &len); /* This allocs */ - sv_setpvn(sv, (char *)converted, len); - SvUTF8_on(sv); /* XXX Should we? */ - Safefree(converted); /* ... so free it */ - RETVAL = len; + STRLEN len; + U8* s = (U8*)SvPV(sv, len); + U8* converted; + + converted = bytes_to_utf8(s, &len); /* This allocs */ + sv_setpvn(sv, (char *)converted, len); + SvUTF8_on(sv); /* XXX Should we? */ + Safefree(converted); /* ... so free it */ + RETVAL = len; } } OUTPUT: @@ -474,58 +717,63 @@ CODE: SV * check = items > 2 ? ST(2) : Nullsv; if (to) { - RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); + RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); + } else { + STRLEN len; + U8 *s = (U8*)SvPV(sv, len); + + RETVAL = 0; + if (SvTRUE(check)) { + /* Must do things the slow way */ + U8 *dest; + /* We need a copy to pass to check() */ + U8 *src = s; + U8 *send = s + len; + U8 *d0; + + New(83, dest, len, U8); /* I think */ + d0 = dest; + + while (s < send) { + if (*s < 0x80){ + *dest++ = *s++; + } else { + STRLEN ulen; + UV uv = *s++; + + /* Have to do it all ourselves because of error routine, + aargh. */ + if (!(uv & 0x40)){ goto failure; } + if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } + else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } + else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } + else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } + else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } + else if (!(uv & 0x01)) { ulen = 7; uv = 0; } + else { ulen = 13; uv = 0; } + + /* Note change to utf8.c variable naming, for variety */ + while (ulen--) { + if ((*s & 0xc0) != 0x80){ + goto failure; + } else { + uv = (uv << 6) | (*s++ & 0x3f); + } + } + if (uv > 256) { + failure: + call_failure(check, s, dest, src); + /* Now what happens? */ + } + *dest++ = (U8)uv; + } + } + RETVAL = dest - d0; + sv_usepvn(sv, (char *)dest, RETVAL); + SvUTF8_off(sv); } else { - STRLEN len; - U8 *s = (U8*)SvPV(sv, len); - - RETVAL = 0; - if (SvTRUE(check)) { - /* Must do things the slow way */ - U8 *dest; - /* We need a copy to pass to check() */ - U8 *src = (U8*)savepv((char *)s); - U8 *send = s + len; - - New(83, dest, len, U8); /* I think */ - - while (s < send) { - if (*s < 0x80){ - *dest++ = *s++; - } else { - STRLEN ulen; - UV uv = *s++; - - /* Have to do it all ourselves because of error routine, - aargh. */ - if (!(uv & 0x40)){ goto failure; } - if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } - else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } - else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } - else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } - else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } - else if (!(uv & 0x01)) { ulen = 7; uv = 0; } - else { ulen = 13; uv = 0; } - - /* Note change to utf8.c variable naming, for variety */ - while (ulen--) { - if ((*s & 0xc0) != 0x80){ - goto failure; - } else { - uv = (uv << 6) | (*s++ & 0x3f); - } - } - if (uv > 256) { - failure: - call_failure(check, s, dest, src); - /* Now what happens? */ - } - *dest++ = (U8)uv; - } - } - } else { - RETVAL = (utf8_to_bytes(s, &len) ? len : 0); - } + RETVAL = (utf8_to_bytes(s, &len) ? len : 0); + } } } OUTPUT: @@ -533,38 +781,34 @@ OUTPUT: bool is_utf8(sv, check = 0) -SV * sv -int check +SV * sv +int check CODE: { if (SvGMAGICAL(sv)) /* it could be $1, for example */ - sv = newSVsv(sv); /* GMAGIG will be done */ - if (SvPOK(sv)) { - RETVAL = SvUTF8(sv) ? TRUE : FALSE; - if (RETVAL && - check && - !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) - RETVAL = FALSE; - } else { - RETVAL = FALSE; - } + sv = newSVsv(sv); /* GMAGIG will be done */ + RETVAL = SvUTF8(sv) ? TRUE : FALSE; + if (RETVAL && + check && + !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) + RETVAL = FALSE; if (sv != ST(0)) - SvREFCNT_dec(sv); /* it was a temp copy */ + SvREFCNT_dec(sv); /* it was a temp copy */ } OUTPUT: RETVAL SV * _utf8_on(sv) -SV * sv +SV * sv CODE: { if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - SvUTF8_on(sv); + SV *rsv = newSViv(SvUTF8(sv)); + RETVAL = rsv; + SvUTF8_on(sv); } else { - RETVAL = &PL_sv_undef; + RETVAL = &PL_sv_undef; } } OUTPUT: @@ -572,15 +816,15 @@ OUTPUT: SV * _utf8_off(sv) -SV * sv +SV * sv CODE: { if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - SvUTF8_off(sv); + SV *rsv = newSViv(SvUTF8(sv)); + RETVAL = rsv; + SvUTF8_off(sv); } else { - RETVAL = &PL_sv_undef; + RETVAL = &PL_sv_undef; } } OUTPUT: @@ -636,6 +880,13 @@ OUTPUT: RETVAL int +STOP_AT_PARTIAL() +CODE: + RETVAL = ENCODE_STOP_AT_PARTIAL; +OUTPUT: + RETVAL + +int FB_DEFAULT() CODE: RETVAL = ENCODE_FB_DEFAULT;