X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2FEncode.xs;h=8e225cd68cb34463a2aa63f7f0d12b6472ea6b3c;hb=a19d7498e238ac7c03cb96036dee4a734a2a0356;hp=65cf13758ae8a327a2a604c885ee8a4526c3847c;hpb=10c5ecbb45a6581439752935880506669f2d618c;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 65cf137..8e225cd 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 1.41 2002/04/27 18:59:50 dankogai Exp $ + $Id: Encode.xs,v 2.6 2005/09/08 14:17:17 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -14,11 +14,11 @@ /* set 1 or more to profile. t/encoding.t dumps core because of Perl_warner and PerlIO don't work well */ -#define ENCODE_XS_PROFILE 0 +#define ENCODE_XS_PROFILE 0 /* set 0 to disable floating point to calculate buffer size for encode_method(). 1 is recommended. 2 restores NI-S original */ -#define ENCODE_XS_USEFP 1 +#define ENCODE_XS_USEFP 1 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ Perl_croak(aTHX_ "panic_unimplemented"); \ @@ -29,6 +29,14 @@ UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) +#define UTF8_ALLOW_STRICT 0 +#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \ + ~(UTF8_ALLOW_CONTINUATION | \ + UTF8_ALLOW_NON_CONTINUATION | \ + UTF8_ALLOW_LONG)) + +static SV* fallback_cb = (SV*)NULL ; + void Encode_XSEncoding(pTHX_ encode_t * enc) { @@ -54,9 +62,35 @@ call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) } +#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" +#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" + +static SV * +do_fallback_cb(pTHX_ UV ch) +{ + dSP; + int argc; + SV* retval; + ENTER; + SAVETMPS; + PUSHMARK(sp); + XPUSHs(sv_2mortal(newSVnv((UV)ch))); + PUTBACK; + argc = call_sv(fallback_cb, G_SCALAR); + SPAGAIN; + if (argc != 1){ + croak("fallback sub must return scalar!"); + } + retval = newSVsv(POPs); + PUTBACK; + FREETMPS; + LEAVE; + return retval; +} + static SV * encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, - int check) + int check, STRLEN * offset, SV * term, int * retcode) { STRLEN slen; U8 *s = (U8 *) SvPV(src, slen); @@ -69,20 +103,34 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, SV *dst = sv_2mortal(newSV(slen+1)); U8 *d = (U8 *)SvPVX(dst); STRLEN dlen = SvLEN(dst)-1; - int code; + int code = 0; + STRLEN trmlen = 0; + U8 *trm = term ? (U8*) SvPV(term, trmlen) : NULL; + + if (offset) { + s += *offset; + if (slen > *offset){ /* safeguard against slen overflow */ + slen -= *offset; + }else{ + slen = 0; + } + tlen = slen; + } - if (!slen){ + if (slen == 0){ SvCUR_set(dst, 0); SvPOK_only(dst); goto ENCODE_END; } - while (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check)) + while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check, + trm, trmlen)) ) { SvCUR_set(dst, dlen+ddone); SvPOK_only(dst); - if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL){ + if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || + code == ENCODE_FOUND_TERM) { break; } switch (code) { @@ -103,7 +151,7 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone - SvLEN(dst); #elif ENCODE_XS_USEFP - more = (1.0*SvLEN(dst)+1)/sdone * sleft; + more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); #else /* safe until SvLEN(dst) == MAX_INT/16 */ more = (16*SvLEN(dst)+1)/sdone/16 * sleft; @@ -123,85 +171,78 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, } case ENCODE_NOREP: /* encoding */ - if (dir == enc->f_utf8) { + if (dir == enc->f_utf8) { STRLEN clen; UV ch = - utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), + utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); + /* if non-representable multibyte prefix at end of current buffer - break*/ + if (clen > tlen - sdone) break; if (check & ENCODE_DIE_ON_ERR) { - Perl_croak( - aTHX_ "\"\\N{U+%" UVxf "}\" does not map to %s, %d", - ch, enc->name[0], __LINE__); - }else{ - if (check & ENCODE_RETURN_ON_ERR){ - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner( - aTHX_ packWARN(WARN_UTF8), - "\"\\N{U+%" UVxf "}\" does not map to %s", - ch,enc->name[0]); - } - goto ENCODE_SET_SRC; - }else if (check & ENCODE_PERLQQ){ - SV* perlqq = - sv_2mortal(newSVpvf("\\x{%04"UVxf"}", ch)); - sdone += slen + clen; - ddone += dlen + SvCUR(perlqq); - sv_catsv(dst, perlqq); - }else if (check & ENCODE_HTMLCREF){ - SV* htmlcref = - sv_2mortal(newSVpvf("&#%" UVuf ";", ch)); - sdone += slen + clen; - ddone += dlen + SvCUR(htmlcref); - sv_catsv(dst, htmlcref); - }else if (check & ENCODE_XMLCREF){ - SV* xmlcref = - sv_2mortal(newSVpvf("&#x%" UVxf ";", ch)); - sdone += slen + clen; - ddone += dlen + SvCUR(xmlcref); - sv_catsv(dst, xmlcref); - } else { - /* fallback char */ - sdone += slen + clen; - ddone += dlen + enc->replen; - sv_catpvn(dst, (char*)enc->rep, enc->replen); - } - } + Perl_croak(aTHX_ ERR_ENCODE_NOMAP, + (UV)ch, enc->name[0]); + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* subchar = + (fallback_cb != (SV*)NULL) ? do_fallback_cb(aTHX_ ch) : + newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : + check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : + "&#x%" UVxf ";", (UV)ch); + sdone += slen + clen; + ddone += dlen + SvCUR(subchar); + sv_catsv(dst, subchar); + SvREFCNT_dec(subchar); + } else { + /* fallback char */ + sdone += slen + clen; + ddone += dlen + enc->replen; + sv_catpvn(dst, (char*)enc->rep, enc->replen); + } } /* decoding */ - else { + else { if (check & ENCODE_DIE_ON_ERR){ - Perl_croak( - aTHX_ "%s \"\\x%02" UVXf - "\" does not map to Unicode (%d)", - enc->name[0], (U8) s[slen], code); - }else{ - if (check & ENCODE_RETURN_ON_ERR){ - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner( - aTHX_ packWARN(WARN_UTF8), - "%s \"\\x%02" UVXf - "\" does not map to Unicode (%d)", - enc->name[0], (U8) s[slen], code); - } - goto ENCODE_SET_SRC; - }else if (check & - (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ - SV* perlqq = - sv_2mortal(newSVpvf("\\x%02" UVXf, s[slen])); - sdone += slen + 1; - ddone += dlen + SvCUR(perlqq); - sv_catsv(dst, perlqq); - } else { - sdone += slen + 1; - ddone += dlen + strlen(FBCHAR_UTF8); - sv_catpv(dst, FBCHAR_UTF8); - } + Perl_croak(aTHX_ ERR_DECODE_NOMAP, + enc->name[0], (UV)s[slen]); + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner( + aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, + enc->name[0], (UV)s[slen]); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & + (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* subchar = + (fallback_cb != (SV*)NULL) ? + do_fallback_cb(aTHX_ (UV)s[slen]) : + newSVpvf("\\x%02" UVXf, (UV)s[slen]); + sdone += slen + 1; + ddone += dlen + SvCUR(subchar); + sv_catsv(dst, subchar); + SvREFCNT_dec(subchar); + } else { + sdone += slen + 1; + ddone += dlen + strlen(FBCHAR_UTF8); + sv_catpv(dst, FBCHAR_UTF8); } } /* settle variables when fallback */ d = (U8 *)SvEND(dst); dlen = SvLEN(dst) - ddone - 1; - s = (U8*)SvPVX(src) + sdone; + s = (U8*)SvPVX(src) + sdone; slen = tlen - sdone; break; @@ -221,10 +262,10 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, SvCUR_set(src, sdone); } /* warn("check = 0x%X, code = 0x%d\n", check, code); */ - + SvCUR_set(dst, dlen+ddone); SvPOK_only(dst); - + #if ENCODE_XS_PROFILE if (SvCUR(dst) > SvCUR(src)){ Perl_warn(aTHX_ @@ -233,17 +274,260 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); } #endif - + + if (offset) + *offset += sdone + slen; + ENCODE_END: *SvEND(dst) = '\0'; + if (retcode) *retcode = code; return dst; } +static bool +strict_utf8(pTHX_ SV* sv) +{ + HV* hv; + SV** svp; + sv = SvRV(sv); + if (!sv || SvTYPE(sv) != SVt_PVHV) + return 0; + hv = (HV*)sv; + svp = hv_fetch(hv, "strict_utf8", 11, 0); + if (!svp) + return 0; + return SvTRUE(*svp); +} + +static U8* +process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check, + bool encode, bool strict, bool stop_at_partial) +{ + UV uv; + STRLEN ulen; + + SvPOK_only(dst); + SvCUR_set(dst,0); + + while (s < e) { + if (UTF8_IS_INVARIANT(*s)) { + sv_catpvn(dst, (char *)s, 1); + s++; + continue; + } + + if (UTF8_IS_START(*s)) { + U8 skip = UTF8SKIP(s); + if ((s + skip) > e) { + /* Partial character */ + /* XXX could check that rest of bytes are UTF8_IS_CONTINUATION(ch) */ + if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) + break; + + goto malformed_byte; + } + + uv = utf8n_to_uvuni(s, e - s, &ulen, + UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT : + UTF8_ALLOW_NONSTRICT) + ); +#if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */ + if (strict && uv > PERL_UNICODE_MAX) + ulen = -1; +#endif + if (ulen == -1) { + if (strict) { + uv = utf8n_to_uvuni(s, e - s, &ulen, + UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT); + if (ulen == -1) + goto malformed_byte; + goto malformed; + } + goto malformed_byte; + } + + + /* Whole char is good */ + sv_catpvn(dst,(char *)s,skip); + s += skip; + continue; + } + + /* If we get here there is something wrong with alleged UTF-8 */ + malformed_byte: + uv = (UV)*s; + ulen = 1; + + malformed: + if (check & ENCODE_DIE_ON_ERR){ + if (encode) + Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8"); + else + Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv); + } + if (check & ENCODE_WARN_ON_ERR){ + if (encode) + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_ENCODE_NOMAP, uv, "utf8"); + else + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, "utf8", uv); + } + if (check & ENCODE_RETURN_ON_ERR) { + break; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}"): + check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : + "&#x%" UVxf ";", uv); + sv_catsv(dst, subchar); + SvREFCNT_dec(subchar); + } else { + sv_catpv(dst, FBCHAR_UTF8); + } + s += ulen; + } + *SvEND(dst) = '\0'; + + return s; +} + + +MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ + +PROTOTYPES: DISABLE + +void +Method_decode_xs(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ + + /* + * PerlIO check -- we assume the object is of PerlIO if renewed + */ + int renewed = 0; + dSP; ENTER; SAVETMPS; + PUSHMARK(sp); + XPUSHs(obj); + PUTBACK; + if (call_method("renewed",G_SCALAR) == 1) { + SPAGAIN; + renewed = POPi; + PUTBACK; +#if 0 + fprintf(stderr, "renewed == %d\n", renewed); +#endif + } + FREETMPS; LEAVE; + /* end PerlIO check */ + + if (SvUTF8(src)) { + s = utf8_to_bytes(s,&slen); + if (s) { + SvCUR_set(src,slen); + SvUTF8_off(src); + e = s+slen; + } + else { + croak("Cannot decode string with wide characters"); + } + } + + s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), renewed); + + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); + } + SvUTF8_on(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); +} + +void +Method_encode_xs(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ + if (SvUTF8(src)) { + /* Already encoded */ + if (strict_utf8(aTHX_ obj)) { + s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0); + } + else { + /* trust it and just copy the octets */ + sv_setpvn(dst,(char *)s,(e-s)); + s = e; + } + } + else { + /* Native bytes - can always encode */ + U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ + while (s < e) { + UV uv = NATIVE_TO_UNI((UV) *s++); + if (UNI_IS_INVARIANT(uv)) + *d++ = (U8)UTF_TO_NATIVE(uv); + else { + *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); + *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); + } + } + SvCUR_set(dst, d- (U8 *)SvPVX(dst)); + *SvEND(dst) = '\0'; + } + + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); + } + SvPOK_only(dst); + SvUTF8_off(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); +} + MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ PROTOTYPES: ENABLE void +Method_renew(obj) +SV * obj +CODE: +{ + XSRETURN(1); +} + +int +Method_renewed(obj) +SV * obj +CODE: + RETVAL = 0; +OUTPUT: + RETVAL + +void Method_name(obj) SV * obj CODE: @@ -254,28 +538,86 @@ CODE: } void -Method_decode(obj,src,check = 0) +Method_cat_decode(obj, dst, src, off, term, check = 0) SV * obj +SV * dst SV * src +SV * off +SV * term int check CODE: { encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); + STRLEN offset = (STRLEN)SvIV(off); + int code = 0; + if (SvUTF8(src)) { + sv_utf8_downgrade(src, FALSE); + } + sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, + &offset, term, &code)); + SvIV_set(off, (IV)offset); + if (code == ENCODE_FOUND_TERM) { + ST(0) = &PL_sv_yes; + }else{ + ST(0) = &PL_sv_no; + } + XSRETURN(1); +} + +void +Method_decode(obj,src,check_sv = &PL_sv_no) +SV * obj +SV * src +SV * check_sv +CODE: +{ + int check; + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + if (SvUTF8(src)) { + sv_utf8_downgrade(src, FALSE); + } + if (SvROK(check_sv)){ + if (fallback_cb == (SV*)NULL){ + fallback_cb = newSVsv(check_sv); /* First time */ + }else{ + SvSetSV(fallback_cb, check_sv); /* Been here before */ + } + check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ + }else{ + fallback_cb = (SV*)NULL; + check = SvIV(check_sv); + } + ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, + NULL, Nullsv, NULL); SvUTF8_on(ST(0)); XSRETURN(1); } + + void -Method_encode(obj,src,check = 0) +Method_encode(obj,src,check_sv = &PL_sv_no) SV * obj SV * src -int check +SV * check_sv CODE: { + int check; encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); sv_utf8_upgrade(src); - ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); + if (SvROK(check_sv)){ + if (fallback_cb == (SV*)NULL){ + fallback_cb = newSVsv(check_sv); /* First time */ + }else{ + SvSetSV(fallback_cb, check_sv); /* Been here before */ + } + check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ + }else{ + fallback_cb = (SV*)NULL; + check = SvIV(check_sv); + } + ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, + NULL, Nullsv, NULL); XSRETURN(1); } @@ -284,7 +626,7 @@ Method_needs_lines(obj) SV * obj CODE: { - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ ST(0) = &PL_sv_no; XSRETURN(1); } @@ -294,7 +636,7 @@ Method_perlio_ok(obj) SV * obj CODE: { - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ /* require_pv(PERLIO_FILENAME); */ eval_pv("require PerlIO::encoding", 0); @@ -317,7 +659,7 @@ SV * sv CODE: { SV * encoding = items == 2 ? ST(1) : Nullsv; - + if (encoding) RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); else { @@ -354,7 +696,7 @@ CODE: /* Must do things the slow way */ U8 *dest; /* We need a copy to pass to check() */ - U8 *src = (U8*)savepv((char *)s); + U8 *src = (U8*)savepv((char *)s); U8 *send = s + len; New(83, dest, len, U8); /* I think */ @@ -379,8 +721,8 @@ CODE: /* Note change to utf8.c variable naming, for variety */ while (ulen--) { - if ((*s & 0xc0) != 0x80){ - goto failure; + if ((*s & 0xc0) != 0x80){ + goto failure; } else { uv = (uv << 6) | (*s++ & 0x3f); } @@ -463,7 +805,7 @@ CODE: OUTPUT: RETVAL -int +int WARN_ON_ERR() CODE: RETVAL = ENCODE_WARN_ON_ERR; @@ -506,6 +848,13 @@ OUTPUT: RETVAL int +STOP_AT_PARTIAL() +CODE: + RETVAL = ENCODE_STOP_AT_PARTIAL; +OUTPUT: + RETVAL + +int FB_DEFAULT() CODE: RETVAL = ENCODE_FB_DEFAULT;