X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2FEncode.xs;h=4d30914995b13f9154de12b7db4eaea1cedabb42;hb=b536bf570905070470ba64c88c9fb4f28bfa54f3;hp=9806d59621bd2c67a83ed367e3f632369c5e3c1e;hpb=85982a32ef23cb53c2fae6d3861dd7dc62e3ab17;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 9806d59..4d30914 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 1.29 2002/04/19 05:36:43 dankogai Exp $ + $Id: Encode.xs,v 1.52 2002/11/18 17:28:49 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -9,23 +9,27 @@ #define U8 U8 #include "encode.h" +# define PERLIO_MODNAME "PerlIO::encoding" +# define PERLIO_FILENAME "PerlIO/encoding.pm" + /* set 1 or more to profile. t/encoding.t dumps core because of Perl_warner and PerlIO don't work well */ -#define ENCODE_XS_PROFILE 0 +#define ENCODE_XS_PROFILE 0 /* set 0 to disable floating point to calculate buffer size for encode_method(). 1 is recommended. 2 restores NI-S original */ -#define ENCODE_XS_USEFP 1 +#define ENCODE_XS_USEFP 1 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ Perl_croak(aTHX_ "panic_unimplemented"); \ return (y)0; /* fool picky compilers */ \ } /**/ + UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) - void +void Encode_XSEncoding(pTHX_ encode_t * enc) { dSP; @@ -50,6 +54,9 @@ call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) } +#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" +#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" + static SV * encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, int check) @@ -73,7 +80,7 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, goto ENCODE_END; } - while (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check)) + while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check)) ) { SvCUR_set(dst, dlen+ddone); SvPOK_only(dst); @@ -99,7 +106,7 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone - SvLEN(dst); #elif ENCODE_XS_USEFP - more = (1.0*SvLEN(dst)+1)/sdone * sleft; + more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); #else /* safe until SvLEN(dst) == MAX_INT/16 */ more = (16*SvLEN(dst)+1)/sdone/16 * sleft; @@ -119,69 +126,81 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, } case ENCODE_NOREP: /* encoding */ - if (dir == enc->f_utf8) { + if (dir == enc->f_utf8) { STRLEN clen; UV ch = - utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), &clen, 0); + utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), + &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); if (check & ENCODE_DIE_ON_ERR) { - Perl_croak( - aTHX_ "\"\\N{U+%" UVxf "}\" does not map to %s, %d", - ch, enc->name[0], __LINE__); - }else{ - if (check & ENCODE_RETURN_ON_ERR){ - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner( - aTHX_ packWARN(WARN_UTF8), - "\"\\N{U+%" UVxf "}\" does not map to %s", - ch,enc->name[0]); - } - goto ENCODE_SET_SRC; - }else if (check & ENCODE_PERLQQ){ - SV* perlqq = - sv_2mortal(newSVpvf("\\x{%04x}", ch)); - sdone += slen + clen; - ddone += dlen + SvCUR(perlqq); - sv_catsv(dst, perlqq); - } else { - /* fallback char */ - sdone += slen + clen; - ddone += dlen + enc->replen; - sv_catpvn(dst, (char*)enc->rep, enc->replen); - } - } + Perl_croak(aTHX_ ERR_ENCODE_NOMAP, + (UV)ch, enc->name[0]); + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & ENCODE_PERLQQ){ + SV* perlqq = + sv_2mortal(newSVpvf("\\x{%04"UVxf"}", (UV)ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(perlqq); + sv_catsv(dst, perlqq); + }else if (check & ENCODE_HTMLCREF){ + SV* htmlcref = + sv_2mortal(newSVpvf("&#%" UVuf ";", (UV)ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(htmlcref); + sv_catsv(dst, htmlcref); + }else if (check & ENCODE_XMLCREF){ + SV* xmlcref = + sv_2mortal(newSVpvf("&#x%" UVxf ";", (UV)ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(xmlcref); + sv_catsv(dst, xmlcref); + } else { + /* fallback char */ + sdone += slen + clen; + ddone += dlen + enc->replen; + sv_catpvn(dst, (char*)enc->rep, enc->replen); + } } /* decoding */ - else { + else { if (check & ENCODE_DIE_ON_ERR){ - Perl_croak( - aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)", - enc->name[0], (U8) s[slen], code); - }else{ - if (check & ENCODE_RETURN_ON_ERR){ - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner( - aTHX_ packWARN(WARN_UTF8), - "%s \"\\x%02X\" does not map to Unicode (%d)", - enc->name[0], (U8) s[slen], code); - } - goto ENCODE_SET_SRC; - }else if (check & ENCODE_PERLQQ){ - SV* perlqq = - sv_2mortal(newSVpvf("\\x%02X", s[slen])); - sdone += slen + 1; - ddone += dlen + SvCUR(perlqq); - sv_catsv(dst, perlqq); - } else { - sdone += slen + 1; - ddone += dlen + strlen(FBCHAR_UTF8); - sv_catpv(dst, FBCHAR_UTF8); - } + Perl_croak(aTHX_ ERR_DECODE_NOMAP, + enc->name[0], (UV)s[slen]); + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner( + aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, + enc->name[0], (UV)s[slen]); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & + (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* perlqq = + sv_2mortal(newSVpvf("\\x%02" UVXf, (UV)s[slen])); + sdone += slen + 1; + ddone += dlen + SvCUR(perlqq); + sv_catsv(dst, perlqq); + } else { + sdone += slen + 1; + ddone += dlen + strlen(FBCHAR_UTF8); + sv_catpv(dst, FBCHAR_UTF8); } } /* settle variables when fallback */ - dlen = SvCUR(dst); - d = (U8*)SvPVX(dst) + dlen; - s = (U8*)SvPVX(src) + sdone; + d = (U8 *)SvEND(dst); + dlen = SvLEN(dst) - ddone - 1; + s = (U8*)SvPVX(src) + sdone; slen = tlen - sdone; break; @@ -193,21 +212,18 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, } } ENCODE_SET_SRC: - if (check & ~ENCODE_LEAVE_SRC){ - sdone = SvCUR(src) - (slen+sdone); + if (check && !(check & ENCODE_LEAVE_SRC)){ + sdone = SvCUR(src) - (slen+sdone); if (sdone) { sv_setpvn(src, (char*)s+slen, sdone); } SvCUR_set(src, sdone); } /* warn("check = 0x%X, code = 0x%d\n", check, code); */ - if (code && !(check & ENCODE_RETURN_ON_ERR)) { - return &PL_sv_undef; - } - + SvCUR_set(dst, dlen+ddone); SvPOK_only(dst); - + #if ENCODE_XS_PROFILE if (SvCUR(dst) > SvCUR(src)){ Perl_warn(aTHX_ @@ -216,12 +232,140 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); } #endif - + ENCODE_END: *SvEND(dst) = '\0'; return dst; } +MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ + +void +Method_decode_xs(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ + SvPOK_only(dst); + SvCUR_set(dst,0); + if (SvUTF8(src)) { + s = utf8_to_bytes(s,&slen); + if (s) { + SvCUR_set(src,slen); + SvUTF8_off(src); + e = s+slen; + } + else { + croak("Cannot decode string with wide characters"); + } + } + while (s < e) { + if (UTF8_IS_INVARIANT(*s) || UTF8_IS_START(*s)) { + U8 skip = UTF8SKIP(s); + if ((s + skip) > e) { + /* Partial character - done */ + break; + } + else if (is_utf8_char(s)) { + /* Whole char is good */ + sv_catpvn(dst,(char *)s,skip); + s += skip; + continue; + } + else { + /* starts ok but isn't "good" */ + } + } + else { + /* Invalid start byte */ + } + /* If we get here there is something wrong with alleged UTF-8 */ + if (check & ENCODE_DIE_ON_ERR){ + Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", (UV)*s); + XSRETURN(0); + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, "utf8", (UV)*s); + } + if (check & ENCODE_RETURN_ON_ERR) { + break; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* perlqq = newSVpvf("\\x%02" UVXf, (UV)*s); + sv_catsv(dst, perlqq); + SvREFCNT_dec(perlqq); + } else { + sv_catpv(dst, FBCHAR_UTF8); + } + s++; + } + *SvEND(dst) = '\0'; + + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); + } + SvUTF8_on(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); +} + +void +Method_encode_xs(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ + if (SvUTF8(src)) { + /* Already encoded - trust it and just copy the octets */ + sv_setpvn(dst,(char *)s,(e-s)); + s = e; + } + else { + /* Native bytes - can always encode */ + U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ + while (s < e) { + UV uv = NATIVE_TO_UNI((UV) *s++); + if (UNI_IS_INVARIANT(uv)) + *d++ = (U8)UTF_TO_NATIVE(uv); + else { + *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); + *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); + } + } + SvCUR_set(dst, d- (U8 *)SvPVX(dst)); + *SvEND(dst) = '\0'; + } + + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); + } + SvPOK_only(dst); + SvUTF8_off(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); +} + MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ PROTOTYPES: ENABLE @@ -244,6 +388,9 @@ int check CODE: { encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + if (SvUTF8(src)) { + sv_utf8_downgrade(src, FALSE); + } ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); SvUTF8_on(ST(0)); XSRETURN(1); @@ -262,6 +409,34 @@ CODE: XSRETURN(1); } +void +Method_needs_lines(obj) +SV * obj +CODE: +{ + /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ + ST(0) = &PL_sv_no; + XSRETURN(1); +} + +void +Method_perlio_ok(obj) +SV * obj +CODE: +{ + /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ + /* require_pv(PERLIO_FILENAME); */ + + eval_pv("require PerlIO::encoding", 0); + + if (SvTRUE(get_sv("@", 0))) { + ST(0) = &PL_sv_no; + }else{ + ST(0) = &PL_sv_yes; + } + XSRETURN(1); +} + MODULE = Encode PACKAGE = Encode PROTOTYPES: ENABLE @@ -272,7 +447,7 @@ SV * sv CODE: { SV * encoding = items == 2 ? ST(1) : Nullsv; - + if (encoding) RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); else { @@ -309,7 +484,7 @@ CODE: /* Must do things the slow way */ U8 *dest; /* We need a copy to pass to check() */ - U8 *src = (U8*)savepv((char *)s); + U8 *src = (U8*)savepv((char *)s); U8 *send = s + len; New(83, dest, len, U8); /* I think */ @@ -334,8 +509,8 @@ CODE: /* Note change to utf8.c variable naming, for variety */ while (ulen--) { - if ((*s & 0xc0) != 0x80){ - goto failure; + if ((*s & 0xc0) != 0x80){ + goto failure; } else { uv = (uv << 6) | (*s++ & 0x3f); } @@ -411,9 +586,6 @@ CODE: OUTPUT: RETVAL -PROTOTYPES: DISABLE - - int DIE_ON_ERR() CODE: @@ -421,7 +593,7 @@ CODE: OUTPUT: RETVAL -int +int WARN_ON_ERR() CODE: RETVAL = ENCODE_WARN_ON_ERR; @@ -450,6 +622,20 @@ OUTPUT: RETVAL int +HTMLCREF() +CODE: + RETVAL = ENCODE_HTMLCREF; +OUTPUT: + RETVAL + +int +XMLCREF() +CODE: + RETVAL = ENCODE_XMLCREF; +OUTPUT: + RETVAL + +int FB_DEFAULT() CODE: RETVAL = ENCODE_FB_DEFAULT; @@ -484,6 +670,20 @@ CODE: OUTPUT: RETVAL +int +FB_HTMLCREF() +CODE: + RETVAL = ENCODE_FB_HTMLCREF; +OUTPUT: + RETVAL + +int +FB_XMLCREF() +CODE: + RETVAL = ENCODE_FB_XMLCREF; +OUTPUT: + RETVAL + BOOT: { #include "def_t.h"