X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=doop.c;h=823c88d18e4e9373294f977ef260584522c48eb2;hb=fe9745bfc279b016730696fd0a6abca4d493be60;hp=db5eeaf5a621232bbb2053f88e9425193515a351;hpb=94472101973f2669f5034174c504c45df6a04c85;p=p5sagit%2Fp5-mst-13.2.git diff --git a/doop.c b/doop.c index db5eeaf..823c88d 100644 --- a/doop.c +++ b/doop.c @@ -66,10 +66,10 @@ S_do_trans_simple(pTHX_ SV *sv) UV c; /* Need to check this, otherwise 128..255 won't match */ - c = utf8_to_uv(s, send - s, &ulen, 0); + c = utf8n_to_uvchr(s, send - s, &ulen, 0); if (c < 0x100 && (ch = tbl[c]) >= 0) { matches++; - d = uv_to_utf8(d, ch); + d = uvchr_to_utf8(d, ch); s += ulen; } else { /* No match -> copy */ @@ -117,7 +117,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ while (s < send) { UV c; STRLEN ulen; - c = utf8_to_uv(s, send - s, &ulen, 0); + c = utf8n_to_uvchr(s, send - s, &ulen, 0); if (c < 0x100) { if (tbl[c] >= 0) matches++; @@ -203,7 +203,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ UV pch = 0xfeedface; while (s < send) { STRLEN len; - UV comp = utf8_to_uv_simple(s, &len); + UV comp = utf8_to_uvchr(s, &len); if (comp > 0xff) { if (!complement) { @@ -213,10 +213,11 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ else { matches++; if (!del) { - ch = (comp - 0x100 < rlen) ? + ch = (rlen == 0) ? comp : + (comp - 0x100 < rlen) ? tbl[comp+1] : tbl[0x100+rlen]; if (ch != pch) { - d = uv_to_utf8(d, ch); + d = uvchr_to_utf8(d, ch); pch = ch; } s += len; @@ -227,7 +228,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ else if ((ch = tbl[comp]) >= 0) { matches++; if (ch != pch) { - d = uv_to_utf8(d, ch); + d = uvchr_to_utf8(d, ch); pch = ch; } s += len; @@ -246,7 +247,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ else { while (s < send) { STRLEN len; - UV comp = utf8_to_uv_simple(s, &len); + UV comp = utf8_to_uvchr(s, &len); if (comp > 0xff) { if (!complement) { Copy(s, d, len, U8); @@ -255,15 +256,15 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ else { matches++; if (!del) { - if (comp - 0x100 < rlen) - d = uv_to_utf8(d, tbl[comp+1]); + if (comp - 0x100 < rlen) + d = uvchr_to_utf8(d, tbl[comp+1]); else - d = uv_to_utf8(d, tbl[0x100+rlen]); + d = uvchr_to_utf8(d, tbl[0x100+rlen]); } } } else if ((ch = tbl[comp]) >= 0) { - d = uv_to_utf8(d, ch); + d = uvchr_to_utf8(d, ch); matches++; } else if (ch == -1) { /* -1 is unmapped character */ @@ -315,9 +316,11 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ isutf8 = SvUTF8(sv); if (!isutf8) { U8 *t = s, *e = s + len; - while (t < e) - if ((hibit = UTF8_IS_CONTINUED(*t++))) + while (t < e) { + U8 ch = *t++; + if ((hibit = !NATIVE_IS_INVARIANT(ch))) break; + } if (hibit) s = bytes_to_utf8(s, &len); } @@ -343,7 +346,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ if ((uv = swash_fetch(rv, s)) < none) { s += UTF8SKIP(s); matches++; - d = uv_to_utf8(d, uv); + d = uvchr_to_utf8(d, uv); } else if (uv == none) { int i = UTF8SKIP(s); @@ -355,7 +358,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ int i = UTF8SKIP(s); s += i; matches++; - d = uv_to_utf8(d, final); + d = uvchr_to_utf8(d, final); } else s += UTF8SKIP(s); @@ -382,6 +385,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ } SvSETMAGIC(sv); SvUTF8_on(sv); + /* Downgrading just 'cos it will is suspect - NI-S */ if (!isutf8 && !(PL_hints & HINT_UTF8)) sv_utf8_downgrade(sv, TRUE); @@ -400,22 +404,25 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ HV* hv = (HV*)SvRV(rv); SV** svp = hv_fetch(hv, "NONE", 4, FALSE); UV none = svp ? SvUV(*svp) : 0x7fffffff; + UV extra = none + 1; UV uv; U8 hibit = 0; s = (U8*)SvPV(sv, len); if (!SvUTF8(sv)) { U8 *t = s, *e = s + len; - while (t < e) - if ((hibit = !UTF8_IS_ASCII(*t++))) + while (t < e) { + U8 ch = *t++; + if ((hibit = !NATIVE_IS_INVARIANT(ch))) break; + } if (hibit) start = s = bytes_to_utf8(s, &len); } send = s + len; while (s < send) { - if ((uv = swash_fetch(rv, s)) < none) + if ((uv = swash_fetch(rv, s)) < none || uv == extra) matches++; s += UTF8SKIP(s); } @@ -441,6 +448,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ UV none = svp ? SvUV(*svp) : 0x7fffffff; UV extra = none + 1; UV final; + bool havefinal = FALSE; UV uv; STRLEN len; U8 *dstart, *dend; @@ -451,9 +459,11 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ isutf8 = SvUTF8(sv); if (!isutf8) { U8 *t = s, *e = s + len; - while (t < e) - if ((hibit = !UTF8_IS_ASCII(*t++))) + while (t < e) { + U8 ch = *t++; + if ((hibit = !NATIVE_IS_INVARIANT(ch))) break; + } if (hibit) s = bytes_to_utf8(s, &len); } @@ -461,8 +471,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ start = s; svp = hv_fetch(hv, "FINAL", 5, FALSE); - if (svp) + if (svp) { final = SvUV(*svp); + havefinal = TRUE; + } if (grows) { /* d needs to be bigger than s, in case e.g. upgrading is required */ @@ -479,7 +491,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ UV puv = 0xfeedface; while (s < send) { uv = swash_fetch(rv, s); - + if (d > dend) { STRLEN clen = d - dstart; STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; @@ -493,7 +505,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ matches++; s += UTF8SKIP(s); if (uv != puv) { - d = uv_to_utf8(d, uv); + d = uvchr_to_utf8(d, uv); puv = uv; } continue; @@ -508,10 +520,22 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else if (uv == extra && !del) { matches++; - s += UTF8SKIP(s); - if (uv != puv) { - d = uv_to_utf8(d, final); - puv = final; + if (havefinal) { + s += UTF8SKIP(s); + if (puv != final) { + d = uvchr_to_utf8(d, final); + puv = final; + } + } + else { + STRLEN len; + uv = utf8_to_uvchr(s, &len); + if (uv != puv) { + Copy(s, d, len, U8); + d += len; + puv = uv; + } + s += len; } continue; } @@ -534,7 +558,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ if (uv < none) { matches++; s += UTF8SKIP(s); - d = uv_to_utf8(d, uv); + d = uvchr_to_utf8(d, uv); continue; } else if (uv == none) { /* "none" is unmapped character */ @@ -547,7 +571,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ else if (uv == extra && !del) { matches++; s += UTF8SKIP(s); - d = uv_to_utf8(d, final); + d = uvchr_to_utf8(d, final); continue; } matches++; /* "none+1" is delete character */ @@ -600,6 +624,7 @@ Perl_do_trans(pTHX_ SV *sv) return do_trans_simple(sv); case OPpTRANS_IDENTICAL: + case OPpTRANS_IDENTICAL|OPpTRANS_COMPLEMENT: if (hasutf) return do_trans_count_utf8(sv); else @@ -934,7 +959,7 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv) s = send - 1; while (s > start && UTF8_IS_CONTINUATION(*s)) s--; - if (utf8_to_uv_simple((U8*)s, 0)) { + if (utf8_to_uvchr((U8*)s, 0)) { sv_setpvn(astr, s, send - s); *s = '\0'; SvCUR_set(sv, s - start); @@ -1100,14 +1125,14 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) switch (optype) { case OP_BIT_AND: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); + luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); + ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); rc += ulen; rulen -= ulen; duc = luc & ruc; - dc = (char*)uv_to_utf8((U8*)dc, duc); + dc = (char*)uvchr_to_utf8((U8*)dc, duc); } if (sv == left || sv == right) (void)sv_usepvn(sv, dcsave, needlen); @@ -1115,26 +1140,26 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) break; case OP_BIT_XOR: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); + luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); + ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); rc += ulen; rulen -= ulen; duc = luc ^ ruc; - dc = (char*)uv_to_utf8((U8*)dc, duc); + dc = (char*)uvchr_to_utf8((U8*)dc, duc); } goto mop_up_utf; case OP_BIT_OR: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); + luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); + ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); rc += ulen; rulen -= ulen; duc = luc | ruc; - dc = (char*)uv_to_utf8((U8*)dc, duc); + dc = (char*)uvchr_to_utf8((U8*)dc, duc); } mop_up_utf: if (sv == left || sv == right)