From: Jarkko Hietaniemi Date: Sat, 3 Mar 2001 19:19:42 +0000 (+0000) Subject: UTF8 tr/// fixes from Inaba Hiroto. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=8973db79328a885c91b9dfdcafdb28dbe9e65a88;p=p5sagit%2Fp5-mst-13.2.git UTF8 tr/// fixes from Inaba Hiroto. p4raw-id: //depot/perl@9008 --- diff --git a/doop.c b/doop.c index 9bc6d56..7e2b52f 100644 --- a/doop.c +++ b/doop.c @@ -99,6 +99,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ I32 matches = 0; STRLEN len; short *tbl; + I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT; tbl = (short*)cPVOP->op_pv; if (!tbl) @@ -117,7 +118,10 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ UV c; STRLEN ulen; c = utf8_to_uv(s, send - s, &ulen, 0); - if (c < 0x100 && tbl[c] >= 0) + if (c < 0x100) { + if (tbl[c] >= 0) + matches++; + } else if (complement) matches++; s += ulen; } @@ -135,7 +139,9 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ I32 isutf8; I32 matches = 0; I32 grows = PL_op->op_private & OPpTRANS_GROWS; - STRLEN len; + I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT; + I32 del = PL_op->op_private & OPpTRANS_DELETE; + STRLEN len, rlen; short *tbl; I32 ch; @@ -186,6 +192,8 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ else d = s; dstart = d; + if (complement && !del) + rlen = tbl[0x100]; #ifdef MACOS_TRADITIONAL #define comp CoMP /* "comp" is a keyword in some compilers ... */ @@ -197,9 +205,24 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ STRLEN len; UV comp = utf8_to_uv_simple(s, &len); - if (comp > 0xff) { /* always unmapped */ - Copy(s, d, len, U8); - d += len; + if (comp > 0xff) { + if (!complement) { + Copy(s, d, len, U8); + d += len; + } + else { + matches++; + if (!del) { + ch = (comp - 0x100 < rlen) ? + tbl[comp+1] : tbl[0x100+rlen]; + if (ch != pch) { + d = uv_to_utf8(d, ch); + pch = ch; + } + s += len; + continue; + } + } } else if ((ch = tbl[comp]) >= 0) { matches++; @@ -224,9 +247,20 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ while (s < send) { STRLEN len; UV comp = utf8_to_uv_simple(s, &len); - if (comp > 0xff) { /* always unmapped */ - Copy(s, d, len, U8); - d += len; + if (comp > 0xff) { + if (!complement) { + Copy(s, d, len, U8); + d += len; + } + else { + matches++; + if (!del) { + if (comp - 0x100 < rlen) + d = uv_to_utf8(d, tbl[comp+1]); + else + d = uv_to_utf8(d, tbl[0x100+rlen]); + } + } } else if ((ch = tbl[comp]) >= 0) { d = uv_to_utf8(d, ch); @@ -499,8 +533,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } if (uv < none) { matches++; - d = uv_to_utf8(d, uv); s += UTF8SKIP(s); + d = uv_to_utf8(d, uv); continue; } else if (uv == none) { /* "none" is unmapped character */ diff --git a/op.c b/op.c index eb60121..4c5dd13 100644 --- a/op.c +++ b/op.c @@ -2871,6 +2871,15 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } } } + if (!del) { + if (j >= rlen) + j = rlen - 1; + else + cPVOPo->op_pv = (char*)Renew(tbl, 0x101+rlen-j, short); + tbl[0x100] = rlen - j; + for (i=0; i < rlen - j; i++) + tbl[0x101+i] = r[j+i]; + } } else { if (!rlen && !del) { diff --git a/t/op/tr.t b/t/op/tr.t index 75887ab..b10f4f2 100755 --- a/t/op/tr.t +++ b/t/op/tr.t @@ -5,7 +5,7 @@ BEGIN { @INC = '../lib'; } -print "1..51\n"; +print "1..55\n"; $_ = "abcdefghijklmnopqrstuvwxyz"; @@ -84,7 +84,7 @@ if (ord("\t") == 9) { # ASCII use utf8; } # 11 - changing UTF8 characters in a UTF8 string, same length. -$l = chr(300); $r = chr(400); +my $l = chr(300); my $r = chr(400); $x = 200.300.400; $x =~ tr/\x{12c}/\x{190}/; printf "not (%vd) ", $x if $x ne 200.400.400 or length $x != 3; @@ -287,7 +287,7 @@ print "ok 48\n"; print "not " unless sprintf("%vd", $a) eq '196.172.200'; print "ok 49\n"; -# UTF8 range +# UTF8 range tests from Inaba Hiroto ($a = v300.196.172.302.197.172) =~ tr/\x{12c}-\x{130}/\xc0-\xc4/; print "not " unless $a eq v192.196.172.194.197.172; @@ -296,3 +296,22 @@ print "ok 50\n"; ($a = v300.196.172.302.197.172) =~ tr/\xc4-\xc8/\x{12c}-\x{130}/; print "not " unless $a eq v300.300.172.302.301.172; print "ok 51\n"; + +# UTF8 range tests from Karsten Sperling (patch #9008 required) + +($a = "\x{0100}") =~ tr/\x00-\x{100}/X/; +print "not " unless $a eq "X"; +print "ok 52\n"; + +($a = "\x{0100}") =~ tr/\x{0000}-\x{00ff}/X/c; +print "not " unless $a eq "X"; +print "ok 53\n"; + +($a = "\x{0100}") =~ tr/\x{0000}-\x{00ff}\x{0101}/X/c; +print "not " unless $a eq "X"; +print "ok 54\n"; + +($a = v256) =~ tr/\x{0000}-\x{00ff}\x{0101}/X/c; +print "not " unless $a eq "X"; +print "ok 55\n"; + diff --git a/toke.c b/toke.c index 33915ed..daa0d52 100644 --- a/toke.c +++ b/toke.c @@ -1240,6 +1240,17 @@ S_scan_const(pTHX_ char *start) I32 min; /* first character in range */ I32 max; /* last character in range */ + if (utf) { + char *c = (char*)utf8_hop((U8*)d, -1); + char *e = d++; + while (e-- > c) + *(e + 1) = *e; + *c = 0xff; + /* mark the range as done, and continue */ + dorange = FALSE; + didrange = TRUE; + continue; + } i = d - SvPVX(sv); /* remember current offset */ SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */ d = SvPVX(sv) + i; /* refresh d after realloc */ @@ -1466,7 +1477,7 @@ S_scan_const(pTHX_ char *start) char *src, *dst; d = SvGROW(sv, - SvCUR(sv) + hicount + 1) + + SvLEN(sv) + hicount + 1) + (d - old_pvx); src = d - 1; @@ -1539,7 +1550,7 @@ S_scan_const(pTHX_ char *start) if (len > e - s + 4) { char *odest = SvPVX(sv); - SvGROW(sv, (SvCUR(sv) + len - (e - s + 4))); + SvGROW(sv, (SvLEN(sv) + len - (e - s + 4))); d = SvPVX(sv) + (d - odest); } Copy(str, d, len, char); @@ -6303,9 +6314,6 @@ S_scan_trans(pTHX_ char *start) Perl_croak(aTHX_ "Transliteration replacement not terminated"); } - New(803,tbl,256,short); - o = newPVOP(OP_TRANS, 0, (char*)tbl); - complement = del = squash = 0; while (strchr("cds", *s)) { if (*s == 'c') @@ -6316,6 +6324,9 @@ S_scan_trans(pTHX_ char *start) squash = OPpTRANS_SQUASH; s++; } + + New(803, tbl, complement&&!del?258:256, short); + o = newPVOP(OP_TRANS, 0, (char*)tbl); o->op_private = del|squash|complement| (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)| (DO_UTF8(PL_lex_repl) ? OPpTRANS_TO_UTF : 0);