From: Inaba Hiroto Date: Tue, 9 Jan 2001 01:04:32 +0000 (+0900) Subject: One more patch for UTF8 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9b877dbba0196ef7d4c6e2b0fcfc6e6f4955d526;p=p5sagit%2Fp5-mst-13.2.git One more patch for UTF8 Message-ID: <3A59E510.52BAB5B9@st.rim.or.jp> UTF-8 fixes for 'x' and tr///. p4raw-id: //depot/perl@8378 --- diff --git a/doop.c b/doop.c index 3b0ddc1..55962a0 100644 --- a/doop.c +++ b/doop.c @@ -29,6 +29,7 @@ S_do_trans_simple(pTHX_ SV *sv) U8 *send; U8 *dstart; I32 matches = 0; + I32 grows = PL_op->op_private & OPpTRANS_GROWS; STRLEN len; short *tbl; I32 ch; @@ -55,27 +56,36 @@ S_do_trans_simple(pTHX_ SV *sv) } /* Allow for expansion: $_="a".chr(400); tr/a/\xFE/, FE needs encoding */ - Newz(0, d, len*2+1, U8); + if (grows) + New(0, d, len*2+1, U8); + else + d = s; dstart = d; while (s < send) { STRLEN ulen; - short c; + UV c; - ulen = 1; /* Need to check this, otherwise 128..255 won't match */ c = utf8_to_uv(s, send - s, &ulen, 0); - if (c < 0x100 && (ch = tbl[(short)c]) >= 0) { + if (c < 0x100 && (ch = tbl[c]) >= 0) { matches++; d = uv_to_utf8(d, ch); s += ulen; } else { /* No match -> copy */ - while (ulen--) - *d++ = *s++; + Copy(s, d, ulen, U8); + d += ulen; + s += ulen; } } - *d = '\0'; - sv_setpvn(sv, (char*)dstart, d - dstart); + if (grows) { + sv_setpvn(sv, (char*)dstart, d - dstart); + Safefree(dstart); + } + else { + *d = '\0'; + SvCUR_set(sv, d - dstart); + } SvUTF8_on(sv); SvSETMAGIC(sv); return matches; @@ -124,6 +134,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ U8 *dstart; I32 isutf8; I32 matches = 0; + I32 grows = PL_op->op_private & OPpTRANS_GROWS; STRLEN len; short *tbl; I32 ch; @@ -170,7 +181,10 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ SvCUR_set(sv, d - dstart); } else { /* isutf8 */ - Newz(0, d, len*2+1, U8); + if (grows) + New(0, d, len*2+1, U8); + else + d = s; dstart = d; if (PL_op->op_private & OPpTRANS_SQUASH) { @@ -180,8 +194,10 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ STRLEN len; UV comp = utf8_to_uv_simple(s, &len); - if (comp > 0xff) - d = uv_to_utf8(d, comp); /* always unmapped */ + if (comp > 0xff) { /* always unmapped */ + Copy(s, d, len, U8); + d += len; + } else if ((ch = tbl[comp]) >= 0) { matches++; if (ch != pch) { @@ -191,8 +207,10 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ s += len; continue; } - else if (ch == -1) /* -1 is unmapped character */ - d = uv_to_utf8(d, comp); + else if (ch == -1) { /* -1 is unmapped character */ + Copy(s, d, len, U8); + d += len; + } else if (ch == -2) /* -2 is delete character */ matches++; s += len; @@ -203,22 +221,31 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ while (s < send) { STRLEN len; UV comp = utf8_to_uv_simple(s, &len); - if (comp > 0xff) - d = uv_to_utf8(d, comp); /* always unmapped */ + if (comp > 0xff) { /* always unmapped */ + Copy(s, d, len, U8); + d += len; + } else if ((ch = tbl[comp]) >= 0) { d = uv_to_utf8(d, ch); matches++; } else if (ch == -1) { /* -1 is unmapped character */ - d = uv_to_utf8(d, comp); + Copy(s, d, len, U8); + d += len; } else if (ch == -2) /* -2 is delete character */ matches++; s += len; } } - *d = '\0'; - sv_setpvn(sv, (char*)dstart, d - dstart); + if (grows) { + sv_setpvn(sv, (char*)dstart, d - dstart); + Safefree(dstart); + } + else { + *d = '\0'; + SvCUR_set(sv, d - dstart); + } SvUTF8_on(sv); } SvSETMAGIC(sv); @@ -234,6 +261,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ U8 *start; U8 *dstart, *dend; I32 matches = 0; + I32 grows = PL_op->op_private & OPpTRANS_GROWS; STRLEN len; SV* rv = (SV*)cSVOP->op_sv; @@ -263,10 +291,16 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ if (svp) final = SvUV(*svp); - /* d needs to be bigger than s, in case e.g. upgrading is required */ - New(0, d, len*3+UTF8_MAXLEN, U8); - dend = d + len * 3; - dstart = d; + if (grows) { + /* d needs to be bigger than s, in case e.g. upgrading is required */ + New(0, d, len*3+UTF8_MAXLEN, U8); + dend = d + len * 3; + dstart = d; + } + else { + dstart = d = s; + dend = d + len; + } while (s < send) { if ((uv = swash_fetch(rv, s)) < none) { @@ -276,8 +310,9 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ } else if (uv == none) { int i = UTF8SKIP(s); - while(i--) - *d++ = *s++; + Copy(s, d, i, U8); + d += i; + s += i; } else if (uv == extra) { int i = UTF8SKIP(s); @@ -288,16 +323,24 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ else s += UTF8SKIP(s); - if (d >= dend) { + if (d > dend) { STRLEN clen = d - dstart; STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; + if (!grows) + Perl_croak(aTHX_ "panic: do_trans_complex_utf8"); Renew(dstart, nlen+UTF8_MAXLEN, U8); d = dstart + clen; dend = dstart + nlen; } } - *d = '\0'; - sv_setpvn(sv, (char*)dstart, d - dstart); + if (grows) { + sv_setpvn(sv, (char*)dstart, d - dstart); + Safefree(dstart); + } + else { + *d = '\0'; + SvCUR_set(sv, d - dstart); + } SvSETMAGIC(sv); SvUTF8_on(sv); if (hibit) @@ -354,6 +397,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ I32 matches = 0; I32 squash = PL_op->op_private & OPpTRANS_SQUASH; I32 del = PL_op->op_private & OPpTRANS_DELETE; + I32 grows = PL_op->op_private & OPpTRANS_GROWS; SV* rv = (SV*)cSVOP->op_sv; HV* hv = (HV*)SvRV(rv); SV** svp = hv_fetch(hv, "NONE", 4, FALSE); @@ -383,17 +427,27 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ if (svp) final = SvUV(*svp); - New(0, d, len*3+UTF8_MAXLEN, U8); - dend = d + len * 3; - dstart = d; + if (grows) { + /* d needs to be bigger than s, in case e.g. upgrading is required */ + New(0, d, len*3+UTF8_MAXLEN, U8); + dend = d + len * 3; + dstart = d; + } + else { + dstart = d = s; + dend = d + len; + } if (squash) { UV puv = 0xfeedface; while (s < send) { uv = swash_fetch(rv, s); - if (d >= dend) { - STRLEN clen = d - dstart, nlen = dend - dstart + len; + if (d > dend) { + STRLEN clen = d - dstart; + STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; + if (!grows) + Perl_croak(aTHX_ "panic: do_trans_complex_utf8"); Renew(dstart, nlen+UTF8_MAXLEN, U8); d = dstart + clen; dend = dstart + nlen; @@ -409,8 +463,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else if (uv == none) { /* "none" is unmapped character */ int i = UTF8SKIP(s); - while(i--) - *d++ = *s++; + Copy(s, d, i, U8); + d += i; + s += i; puv = 0xfeedface; continue; } @@ -430,8 +485,11 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ else { while (s < send) { uv = swash_fetch(rv, s); - if (d >= dend) { - STRLEN clen = d - dstart, nlen = dend - dstart + len; + if (d > dend) { + STRLEN clen = d - dstart; + STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; + if (!grows) + Perl_croak(aTHX_ "panic: do_trans_complex_utf8"); Renew(dstart, nlen+UTF8_MAXLEN, U8); d = dstart + clen; dend = dstart + nlen; @@ -444,8 +502,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else if (uv == none) { /* "none" is unmapped character */ int i = UTF8SKIP(s); - while(i--) - *d++ = *s++; + Copy(s, d, i, U8); + d += i; + s += i; continue; } else if (uv == extra && !del) { @@ -458,8 +517,14 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ s += UTF8SKIP(s); } } - *d = '\0'; - sv_setpvn(sv, (char*)dstart, d - dstart); + if (grows) { + sv_setpvn(sv, (char*)dstart, d - dstart); + Safefree(dstart); + } + else { + *d = '\0'; + SvCUR_set(sv, d - dstart); + } SvUTF8_on(sv); if (hibit) Safefree(start); diff --git a/embed.h b/embed.h index 24320e9..76bb2d8 100644 --- a/embed.h +++ b/embed.h @@ -898,6 +898,7 @@ #define scalarboolean S_scalarboolean #define too_few_arguments S_too_few_arguments #define too_many_arguments S_too_many_arguments +#define trlist_upgrade S_trlist_upgrade #define op_clear S_op_clear #define null S_null #define pad_addlex S_pad_addlex @@ -2364,6 +2365,7 @@ #define scalarboolean(a) S_scalarboolean(aTHX_ a) #define too_few_arguments(a,b) S_too_few_arguments(aTHX_ a,b) #define too_many_arguments(a,b) S_too_many_arguments(aTHX_ a,b) +#define trlist_upgrade(a,b) S_trlist_upgrade(aTHX_ a,b) #define op_clear(a) S_op_clear(aTHX_ a) #define null(a) S_null(aTHX_ a) #define pad_addlex(a) S_pad_addlex(aTHX_ a) @@ -4621,6 +4623,8 @@ #define too_few_arguments S_too_few_arguments #define S_too_many_arguments CPerlObj::S_too_many_arguments #define too_many_arguments S_too_many_arguments +#define S_trlist_upgrade CPerlObj::S_trlist_upgrade +#define trlist_upgrade S_trlist_upgrade #define S_op_clear CPerlObj::S_op_clear #define op_clear S_op_clear #define S_null CPerlObj::S_null diff --git a/embed.pl b/embed.pl index d834e4f..3b5b6df 100755 --- a/embed.pl +++ b/embed.pl @@ -2261,6 +2261,7 @@ s |OP* |no_fh_allowed |OP *o s |OP* |scalarboolean |OP *o s |OP* |too_few_arguments|OP *o|char* name s |OP* |too_many_arguments|OP *o|char* name +s |U8* |trlist_upgrade |U8** sp|U8** ep s |void |op_clear |OP* o s |void |null |OP* o s |PADOFFSET|pad_addlex |SV* name diff --git a/op.c b/op.c index 379b0b9..5e2439c 100644 --- a/op.c +++ b/op.c @@ -102,6 +102,30 @@ S_no_bareword_allowed(pTHX_ OP *o) SvPV_nolen(cSVOPo_sv))); } +STATIC U8* +S_trlist_upgrade(pTHX_ U8** sp, U8** ep) +{ + U8 *s = *sp; + U8 *e = *ep; + U8 *d; + + Newz(801, d, (e - s) * 2, U8); + *sp = d; + + while (s < e) { + if (*s < 0x80 || *s == 0xff) + *d++ = *s++; + else { + U8 c = *s++; + *d++ = ((c >> 6) | 0xc0); + *d++ = ((c & 0x3f) | 0x80); + } + } + *ep = d; + return *sp; +} + + /* "register" allocation */ PADOFFSET @@ -2608,13 +2632,14 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) SV *rstr = ((SVOP*)repl)->op_sv; STRLEN tlen; STRLEN rlen; - register U8 *t = (U8*)SvPV(tstr, tlen); - register U8 *r = (U8*)SvPV(rstr, rlen); + U8 *t = (U8*)SvPV(tstr, tlen); + U8 *r = (U8*)SvPV(rstr, rlen); register I32 i; register I32 j; I32 del; I32 complement; I32 squash; + I32 grows = 0; register short *tbl; complement = o->op_private & OPpTRANS_COMPLEMENT; @@ -2643,11 +2668,12 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) I32 none = 0; U32 max = 0; I32 bits; - I32 grows = 0; I32 havefinal = 0; U32 final; I32 from_utf = o->op_private & OPpTRANS_FROM_UTF; I32 to_utf = o->op_private & OPpTRANS_TO_UTF; + U8* tsave = from_utf ? NULL : trlist_upgrade(&t, &tend); + U8* rsave = to_utf ? NULL : trlist_upgrade(&r, &rend); if (complement) { U8 tmpbuf[UTF8_MAXLEN+1]; @@ -2769,20 +2795,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) if (rfirst + diff > max) max = rfirst + diff; rfirst += diff + 1; - if (!grows) { - if (rfirst <= 0x80) - ; - else if (rfirst <= 0x800) - grows |= (tfirst < 0x80); - else if (rfirst <= 0x10000) - grows |= (tfirst < 0x800); - else if (rfirst <= 0x200000) - grows |= (tfirst < 0x10000); - else if (rfirst <= 0x4000000) - grows |= (tfirst < 0x200000); - else if (rfirst <= 0x80000000) - grows |= (tfirst < 0x4000000); - } + if (!grows) + grows = (UNISKIP(tfirst) < UNISKIP(rfirst)); } tfirst += diff + 1; } @@ -2807,9 +2821,14 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) (void)hv_store((HV*)SvRV((cSVOPo->op_sv)), "FINAL", 5, newSVuv((UV)final), 0); - if (grows && to_utf) + if (grows) o->op_private |= OPpTRANS_GROWS; + if (tsave) + Safefree(tsave); + if (rsave) + Safefree(rsave); + op_free(expr); op_free(repl); return o; @@ -2830,8 +2849,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) else tbl[i] = i; } - else + else { + if (i < 128 && r[j] >= 128) + grows = 1; tbl[i] = r[j++]; + } } } } @@ -2852,10 +2874,15 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } --j; } - if (tbl[t[i]] == -1) + if (tbl[t[i]] == -1) { + if (t[i] < 128 && r[j] >= 128) + grows = 1; tbl[t[i]] = r[j]; + } } } + if (grows) + o->op_private |= OPpTRANS_GROWS; op_free(expr); op_free(repl); diff --git a/pp.c b/pp.c index 6bb1618..f970cde 100644 --- a/pp.c +++ b/pp.c @@ -1202,10 +1202,11 @@ PP(pp_repeat) else { /* Note: mark already snarfed by pp_list */ SV *tmpstr = POPs; STRLEN len; - bool isutf = DO_UTF8(tmpstr); + bool isutf; SvSetSV(TARG, tmpstr); SvPV_force(TARG, len); + isutf = DO_UTF8(TARG); if (count != 1) { if (count < 1) SvCUR_set(TARG, 0); diff --git a/proto.h b/proto.h index 55ee5aa..e83d8fd 100644 --- a/proto.h +++ b/proto.h @@ -1008,6 +1008,7 @@ STATIC OP* S_no_fh_allowed(pTHX_ OP *o); STATIC OP* S_scalarboolean(pTHX_ OP *o); STATIC OP* S_too_few_arguments(pTHX_ OP *o, char* name); STATIC OP* S_too_many_arguments(pTHX_ OP *o, char* name); +STATIC U8* S_trlist_upgrade(pTHX_ U8** sp, U8** ep); STATIC void S_op_clear(pTHX_ OP* o); STATIC void S_null(pTHX_ OP* o); STATIC PADOFFSET S_pad_addlex(pTHX_ SV* name); diff --git a/regcomp.c b/regcomp.c index 19d8e8e..c85eb5e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -845,11 +845,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (compat) ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; + if (uc < 0x100) + data->start_class->flags &= ~ANYOF_UNICODE_ALL; } else if (flags & SCF_DO_STCLASS_OR) { /* false positive possible if the class is case-folded */ if (uc < 0x100) - ANYOF_BITMAP_SET(data->start_class, uc); + ANYOF_BITMAP_SET(data->start_class, uc); + else + data->start_class->flags |= ANYOF_UNICODE_ALL; data->start_class->flags &= ~ANYOF_EOS; cl_and(data->start_class, &and_with); } diff --git a/regexec.c b/regexec.c index 0b65d11..5990ea3 100644 --- a/regexec.c +++ b/regexec.c @@ -443,6 +443,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, #endif restart: + other_last = Nullch; + /* Find a possible match in the region s..strend by looking for the "check" substring in the region corrected by start/end_shift. */ if (flags & REXEC_SCREAM) { diff --git a/toke.c b/toke.c index e0d6f07..c6ef8e9 100644 --- a/toke.c +++ b/toke.c @@ -1383,7 +1383,8 @@ S_scan_const(pTHX_ char *start) else { STRLEN len = 1; /* allow underscores */ uv = (UV)scan_hex(s + 1, e - s - 1, &len); - to_be_utf8 = TRUE; + if (PL_hints & HINT_UTF8) + to_be_utf8 = TRUE; } s = e + 1; } @@ -1425,8 +1426,6 @@ S_scan_const(pTHX_ char *start) if (hicount) { char *old_pvx = SvPVX(sv); char *src, *dst; - U8 tmpbuf[UTF8_MAXLEN+1]; - U8 *tmpend; d = SvGROW(sv, SvCUR(sv) + hicount + 1) + @@ -1438,10 +1437,8 @@ S_scan_const(pTHX_ char *start) while (src < dst) { if (UTF8_IS_CONTINUED(*src)) { - tmpend = uv_to_utf8(tmpbuf, (U8)*src--); - dst -= tmpend - tmpbuf; - Copy((char *)tmpbuf, dst+1, - tmpend - tmpbuf, char); + *dst-- = UTF8_EIGHT_BIT_LO(*src); + *dst-- = UTF8_EIGHT_BIT_HI(*src--); } else { *dst-- = *src--; @@ -1450,7 +1447,7 @@ S_scan_const(pTHX_ char *start) } } - if (to_be_utf8 || (has_utf8 && uv > 127) || uv > 255) { + if (to_be_utf8 || has_utf8 || uv > 255) { d = (char*)uv_to_utf8((U8*)d, uv); has_utf8 = TRUE; } diff --git a/utf8.c b/utf8.c index e82725e..65f1096 100644 --- a/utf8.c +++ b/utf8.c @@ -31,20 +31,17 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */ { if (uv < 0x80) { *d++ = uv; - *d = 0; return d; } if (uv < 0x800) { *d++ = (( uv >> 6) | 0xc0); *d++ = (( uv & 0x3f) | 0x80); - *d = 0; return d; } if (uv < 0x10000) { *d++ = (( uv >> 12) | 0xe0); *d++ = (((uv >> 6) & 0x3f) | 0x80); *d++ = (( uv & 0x3f) | 0x80); - *d = 0; return d; } if (uv < 0x200000) { @@ -52,7 +49,6 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */ *d++ = (((uv >> 12) & 0x3f) | 0x80); *d++ = (((uv >> 6) & 0x3f) | 0x80); *d++ = (( uv & 0x3f) | 0x80); - *d = 0; return d; } if (uv < 0x4000000) { @@ -61,7 +57,6 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */ *d++ = (((uv >> 12) & 0x3f) | 0x80); *d++ = (((uv >> 6) & 0x3f) | 0x80); *d++ = (( uv & 0x3f) | 0x80); - *d = 0; return d; } if (uv < 0x80000000) { @@ -71,7 +66,6 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */ *d++ = (((uv >> 12) & 0x3f) | 0x80); *d++ = (((uv >> 6) & 0x3f) | 0x80); *d++ = (( uv & 0x3f) | 0x80); - *d = 0; return d; } #ifdef HAS_QUAD @@ -85,7 +79,6 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */ *d++ = (((uv >> 12) & 0x3f) | 0x80); *d++ = (((uv >> 6) & 0x3f) | 0x80); *d++ = (( uv & 0x3f) | 0x80); - *d = 0; return d; } #ifdef HAS_QUAD @@ -103,7 +96,6 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */ *d++ = (((uv >> 12) & 0x3f) | 0x80); *d++ = (((uv >> 6) & 0x3f) | 0x80); *d++ = (( uv & 0x3f) | 0x80); - *d = 0; return d; } #endif