From: Jarkko Hietaniemi Date: Mon, 15 Jan 2001 05:02:24 +0000 (+0000) Subject: More UTF-8 patches from Inaba Hiroto. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9aa983d27b0af31badfcbbb76567f6e557076b41;p=p5sagit%2Fp5-mst-13.2.git More UTF-8 patches from Inaba Hiroto. - The substr lval was still not okay. - Now pp_stringify and sv_setsv copies source's UTF8 flag even if IN_BYTE. pp_stringify is called from fold_constants at optimization phase and "\x{100}" was made SvUTF8_off under use bytes (the bytes pragma is for "byte semantics" and not for "do not produce UTF8 data") - New `qu' operator to generate UTF8 string explicitly. Though I agree with the policy "0x00-0xff always produce bytes", sometimes want to such a string to be coded in UTF8. I can use pack"U0a*" but it requires more typing and has runtime overhead. - Fix pp_regcomp bug uncovered by "0x00-0xff always produce bytes" change, the bug appears if a pm has PMdf_UTF8 flag but interpolated string is not UTF8_on and has char 0x80-0xff. TODO: document and test qu. p4raw-id: //depot/perl@8439 --- diff --git a/keywords.pl b/keywords.pl index 46dd53d..06ee8f3 100755 --- a/keywords.pl +++ b/keywords.pl @@ -181,6 +181,7 @@ q qq qr quotemeta +qu qw qx rand diff --git a/mg.c b/mg.c index b5cae86..4f183b0 100644 --- a/mg.c +++ b/mg.c @@ -1404,12 +1404,14 @@ Perl_magic_getsubstr(pTHX_ SV *sv, MAGIC *mg) I32 offs = LvTARGOFF(sv); I32 rem = LvTARGLEN(sv); + if (SvUTF8(lsv)) + sv_pos_u2b(lsv, &offs, &rem); if (offs > len) offs = len; if (rem + offs > len) rem = len - offs; sv_setpvn(sv, tmps + offs, (STRLEN)rem); - if (DO_UTF8(lsv)) + if (SvUTF8(lsv)) SvUTF8_on(sv); return 0; } @@ -1417,25 +1419,26 @@ Perl_magic_getsubstr(pTHX_ SV *sv, MAGIC *mg) int Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg) { - STRLEN littlelen; - char *tmps = SvPV(sv, littlelen); + STRLEN len; + char *tmps = SvPV(sv, len); + SV *lsv = LvTARG(sv); + I32 lvoff = LvTARGOFF(sv); + I32 lvlen = LvTARGLEN(sv); if (DO_UTF8(sv)) { - I32 bigoff = LvTARGOFF(sv); - I32 biglen = LvTARGLEN(sv); - U8 *s, *a, *b; - - sv_utf8_upgrade(LvTARG(sv)); - /* sv_utf8_upgrade() might have moved and/or resized - * the string to be replaced, we must rediscover it. --jhi */ - s = (U8*)SvPVX(LvTARG(sv)); - a = utf8_hop(s, bigoff); - b = utf8_hop(a, biglen); - sv_insert(LvTARG(sv), a - s, b - a, tmps, littlelen); - SvUTF8_on(LvTARG(sv)); + sv_utf8_upgrade(lsv); + sv_pos_u2b(lsv, &lvoff, &lvlen); + sv_insert(lsv, lvoff, lvlen, tmps, len); + SvUTF8_on(lsv); + } + else if (SvUTF8(lsv)) { + sv_pos_u2b(lsv, &lvoff, &lvlen); + tmps = bytes_to_utf8(tmps, &len); + sv_insert(lsv, lvoff, lvlen, tmps, len); + Safefree(tmps); } else - sv_insert(LvTARG(sv), LvTARGOFF(sv), LvTARGLEN(sv), tmps, littlelen); + sv_insert(lsv, lvoff, lvlen, tmps, len); return 0; } diff --git a/pp.c b/pp.c index ba6c17a..87e459e 100644 --- a/pp.c +++ b/pp.c @@ -2792,6 +2792,8 @@ PP(pp_substr) RETPUSHUNDEF; } else { + I32 upos = pos; + I32 urem = rem; if (utfcurlen) sv_pos_u2b(sv, &pos, &rem); tmps += pos; @@ -2826,8 +2828,8 @@ PP(pp_substr) SvREFCNT_dec(LvTARG(TARG)); LvTARG(TARG) = SvREFCNT_inc(sv); } - LvTARGOFF(TARG) = pos; - LvTARGLEN(TARG) = rem; + LvTARGOFF(TARG) = upos; + LvTARGLEN(TARG) = urem; } } SPAGAIN; @@ -2970,11 +2972,9 @@ PP(pp_chr) (void)SvUPGRADE(TARG,SVt_PV); - if ((value > 255 && !IN_BYTE) || - (UTF8_IS_CONTINUED(value) && (PL_hints & HINT_UTF8)) ) { - SvGROW(TARG, UTF8_MAXLEN+1); - tmps = SvPVX(TARG); - tmps = (char*)uv_to_utf8((U8*)tmps, (UV)value); + if (value > 255 && !IN_BYTE) { + SvGROW(TARG, UNISKIP(value)+1); + tmps = (char*)uv_to_utf8((U8*)SvPVX(TARG), value); SvCUR_set(TARG, tmps - SvPVX(TARG)); *tmps = '\0'; (void)SvPOK_only(TARG); @@ -2982,9 +2982,6 @@ PP(pp_chr) XPUSHs(TARG); RETURN; } - else { - SvUTF8_off(TARG); - } SvGROW(TARG,2); SvCUR_set(TARG, 1); diff --git a/pp_ctl.c b/pp_ctl.c index 07545dc..5490221 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -116,9 +116,14 @@ PP(pp_regcomp) pm->op_pmflags = pm->op_pmpermflags; /* reset case sensitivity */ if (DO_UTF8(tmpstr)) pm->op_pmdynflags |= PMdf_DYN_UTF8; - else + else { pm->op_pmdynflags &= ~PMdf_DYN_UTF8; + if (pm->op_pmdynflags & PMdf_UTF8) + t = bytes_to_utf8(t, &len); + } pm->op_pmregexp = CALLREGCOMP(aTHX_ t, t + len, pm); + if (!DO_UTF8(tmpstr) && (pm->op_pmdynflags & PMdf_UTF8)) + Safefree(t); PL_reginterp_cnt = 0; /* XXXX Be extra paranoid - needed inside tie/overload accessors. */ } diff --git a/pp_hot.c b/pp_hot.c index 3a1e08d..0f1fee9 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -76,7 +76,7 @@ PP(pp_stringify) char *s; s = SvPV(TOPs,len); sv_setpvn(TARG,s,len); - if (SvUTF8(TOPs) && !IN_BYTE) + if (SvUTF8(TOPs)) SvUTF8_on(TARG); else SvUTF8_off(TARG); diff --git a/sv.c b/sv.c index 58c6434..3417924 100644 --- a/sv.c +++ b/sv.c @@ -3440,7 +3440,7 @@ Perl_sv_setsv(pTHX_ SV *dstr, register SV *sstr) *SvEND(dstr) = '\0'; (void)SvPOK_only(dstr); } - if ((sflags & SVf_UTF8) && !IN_BYTE) + if (sflags & SVf_UTF8) SvUTF8_on(dstr); /*SUPPRESS 560*/ if (sflags & SVp_NOK) { diff --git a/t/lib/charnames.t b/t/lib/charnames.t index 1d08ad0..14da2e0 100644 --- a/t/lib/charnames.t +++ b/t/lib/charnames.t @@ -45,7 +45,7 @@ $encoded_bet = "\327\221"; sub to_bytes { use bytes; - my $bytes = shift; + "".shift; } { diff --git a/t/op/length.t b/t/op/length.t index aec6a52..46f0c59 100644 --- a/t/op/length.t +++ b/t/op/length.t @@ -33,8 +33,7 @@ print "ok 3\n"; } { - use utf8; # make "\x{80}" to produce UTF-8 - my $a = "\x{80}"; + my $a = qu"\x{80}"; # make "\x{80}" to produce UTF-8 print "not " unless length($a) == 1; print "ok 6\n"; diff --git a/t/op/substr.t b/t/op/substr.t index d3937fb..12bcd00 100755 --- a/t/op/substr.t +++ b/t/op/substr.t @@ -1,6 +1,6 @@ #!./perl -print "1..149\n"; +print "1..162\n"; #P = start of string Q = start of substr R = end of substr S = end of string @@ -429,3 +429,122 @@ ok 149, length($x) == 5 && substr($x, 3, 1) eq "\x{FF}" && substr($x, 4, 1) eq "\x{F3}"; +# And tests for already-UTF8 one + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 0, 1) = "\x{100}"; +ok 150, length($x) == 3 && + $x eq "\x{100}\xF2\xF3" && + substr($x, 0, 1) eq "\x{100}" && + substr($x, 1, 1) eq "\x{F2}" && + substr($x, 2, 1) eq "\x{F3}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 0, 1) = "\x{100}\x{FF}"; +ok 151, length($x) == 4 && + $x eq "\x{100}\x{FF}\xF2\xF3" && + substr($x, 0, 1) eq "\x{100}" && + substr($x, 1, 1) eq "\x{FF}" && + substr($x, 2, 1) eq "\x{F2}" && + substr($x, 3, 1) eq "\x{F3}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 0, 2) = "\x{100}\xFF"; +ok 152, length($x) == 3 && + $x eq "\x{100}\xFF\xF3" && + substr($x, 0, 1) eq "\x{100}" && + substr($x, 1, 1) eq "\x{FF}" && + substr($x, 2, 1) eq "\x{F3}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 1, 1) = "\x{100}\xFF"; +ok 153, length($x) == 4 && + $x eq "\x{101}\x{100}\xFF\xF3" && + substr($x, 0, 1) eq "\x{101}" && + substr($x, 1, 1) eq "\x{100}" && + substr($x, 2, 1) eq "\x{FF}" && + substr($x, 3, 1) eq "\x{F3}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 2, 1) = "\x{100}\xFF"; +ok 154, length($x) == 4 && + $x eq "\x{101}\xF2\x{100}\xFF" && + substr($x, 0, 1) eq "\x{101}" && + substr($x, 1, 1) eq "\x{F2}" && + substr($x, 2, 1) eq "\x{100}" && + substr($x, 3, 1) eq "\x{FF}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 3, 1) = "\x{100}\xFF"; +ok 155, length($x) == 5 && + $x eq "\x{101}\x{F2}\x{F3}\x{100}\xFF" && + substr($x, 0, 1) eq "\x{101}" && + substr($x, 1, 1) eq "\x{F2}" && + substr($x, 2, 1) eq "\x{F3}" && + substr($x, 3, 1) eq "\x{100}" && + substr($x, 4, 1) eq "\x{FF}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, -1, 1) = "\x{100}\xFF"; +ok 156, length($x) == 4 && + $x eq "\x{101}\xF2\x{100}\xFF" && + substr($x, 0, 1) eq "\x{101}" && + substr($x, 1, 1) eq "\x{F2}" && + substr($x, 2, 1) eq "\x{100}" && + substr($x, 3, 1) eq "\x{FF}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, -1, 0) = "\x{100}\xFF"; +ok 157, length($x) == 5 && + $x eq "\x{101}\xF2\x{100}\xFF\xF3" && + substr($x, 0, 1) eq "\x{101}" && + substr($x, 1, 1) eq "\x{F2}" && + substr($x, 2, 1) eq "\x{100}" && + substr($x, 3, 1) eq "\x{FF}" && + substr($x, 4, 1) eq "\x{F3}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 0, -1) = "\x{100}\xFF"; +ok 158, length($x) == 3 && + $x eq "\x{100}\xFF\xF3" && + substr($x, 0, 1) eq "\x{100}" && + substr($x, 1, 1) eq "\x{FF}" && + substr($x, 2, 1) eq "\x{F3}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 0, -2) = "\x{100}\xFF"; +ok 159, length($x) == 4 && + $x eq "\x{100}\xFF\xF2\xF3" && + substr($x, 0, 1) eq "\x{100}" && + substr($x, 1, 1) eq "\x{FF}" && + substr($x, 2, 1) eq "\x{F2}" && + substr($x, 3, 1) eq "\x{F3}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 0, -3) = "\x{100}\xFF"; +ok 160, length($x) == 5 && + $x eq "\x{100}\xFF\x{101}\x{F2}\x{F3}" && + substr($x, 0, 1) eq "\x{100}" && + substr($x, 1, 1) eq "\x{FF}" && + substr($x, 2, 1) eq "\x{101}" && + substr($x, 3, 1) eq "\x{F2}" && + substr($x, 4, 1) eq "\x{F3}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, 1, -1) = "\x{100}\xFF"; +ok 161, length($x) == 4 && + $x eq "\x{101}\x{100}\xFF\xF3" && + substr($x, 0, 1) eq "\x{101}" && + substr($x, 1, 1) eq "\x{100}" && + substr($x, 2, 1) eq "\x{FF}" && + substr($x, 3, 1) eq "\x{F3}"; + +$x = "\x{101}\x{F2}\x{F3}"; +substr($x, -1, -1) = "\x{100}\xFF"; +ok 162, length($x) == 5 && + $x eq "\x{101}\xF2\x{100}\xFF\xF3" && + substr($x, 0, 1) eq "\x{101}" && + substr($x, 1, 1) eq "\x{F2}" && + substr($x, 2, 1) eq "\x{100}" && + substr($x, 3, 1) eq "\x{FF}" && + substr($x, 4, 1) eq "\x{F3}"; diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index e369979..546b217 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -283,7 +283,7 @@ sub nok_bytes { { use utf8; - ok_bytes chr(0xe2), pack("C*", 0xc3, 0xa2); + ok_bytes chr(0x1e2), pack("C*", 0xc7, 0xa2); $test++; # 65 } diff --git a/toke.c b/toke.c index ea32115..398253c 100644 --- a/toke.c +++ b/toke.c @@ -1045,8 +1045,11 @@ STATIC I32 S_sublex_done(pTHX) { if (!PL_lex_starts++) { + SV *sv = newSVpvn("",0); + if (SvUTF8(PL_linestr)) + SvUTF8_on(sv); PL_expect = XOPERATOR; - yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpvn("",0)); + yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv); return THING; } @@ -1173,7 +1176,8 @@ S_scan_const(pTHX_ char *start) register char *d = SvPVX(sv); /* destination for copies */ bool dorange = FALSE; /* are we in a translit range? */ bool didrange = FALSE; /* did we just finish a range? */ - bool has_utf8 = FALSE; /* embedded \x{} */ + bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr)); + /* the constant is UTF8 */ UV uv; I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) @@ -1313,8 +1317,6 @@ S_scan_const(pTHX_ char *start) /* backslashes */ if (*s == '\\' && s+1 < send) { - bool to_be_utf8 = FALSE; - s++; /* some backslashes we leave behind */ @@ -1383,8 +1385,6 @@ S_scan_const(pTHX_ char *start) else { STRLEN len = 1; /* allow underscores */ uv = (UV)scan_hex(s + 1, e - s - 1, &len); - if (PL_hints & HINT_UTF8) - to_be_utf8 = TRUE; } s = e + 1; } @@ -1408,7 +1408,7 @@ S_scan_const(pTHX_ char *start) * repertoire. --jhi */ if (uv > 127) { - if (!has_utf8 && (to_be_utf8 || uv > 255)) { + if (!has_utf8 && uv > 255) { /* Might need to recode whatever we have * accumulated so far if it contains any * hibit chars. @@ -1447,7 +1447,7 @@ S_scan_const(pTHX_ char *start) } } - if (to_be_utf8 || has_utf8 || uv > 255) { + if (has_utf8 || uv > 255) { d = (char*)uv_to_utf8((U8*)d, uv); has_utf8 = TRUE; } @@ -4711,7 +4711,10 @@ Perl_yylex(pTHX) TOKEN('('); case KEY_qq: + case KEY_qu: s = scan_str(s,FALSE,FALSE); + if (tmp == KEY_qu && is_utf8_string((U8*)s, SvCUR(PL_lex_stuff))) + SvUTF8_on(PL_lex_stuff); if (!s) missingterm((char*)0); yylval.ival = OP_STRINGIFY; @@ -5548,6 +5551,7 @@ Perl_keyword(pTHX_ register char *d, I32 len) if (strEQ(d,"q")) return KEY_q; if (strEQ(d,"qr")) return KEY_qr; if (strEQ(d,"qq")) return KEY_qq; + if (strEQ(d,"qu")) return KEY_qu; if (strEQ(d,"qw")) return KEY_qw; if (strEQ(d,"qx")) return KEY_qx; } @@ -7204,10 +7208,9 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; if (!isALPHA(*pos)) { - UV rev; + UV rev, revmax = 0; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tmpend; - bool utf8 = FALSE; s++; /* get past 'v' */ sv = NEWSV(92,5); @@ -7234,7 +7237,8 @@ vstring: } } tmpend = uv_to_utf8(tmpbuf, rev); - utf8 = utf8 || rev > 127; + if (rev > revmax) + revmax = rev; sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); if (*pos == '.' && isDIGIT(pos[1])) s = ++pos; @@ -7248,9 +7252,9 @@ vstring: SvPOK_on(sv); SvREADONLY_on(sv); - if (utf8) { + if (revmax > 127) { SvUTF8_on(sv); - if (!UTF||IN_BYTE) + if (revmax < 256) sv_utf8_downgrade(sv, TRUE); } }