UTF8 tr/// fixes from Inaba Hiroto.
Jarkko Hietaniemi [Sat, 3 Mar 2001 19:19:42 +0000 (19:19 +0000)]
p4raw-id: //depot/perl@9008

doop.c
op.c
t/op/tr.t
toke.c

diff --git a/doop.c b/doop.c
index 9bc6d56..7e2b52f 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -99,6 +99,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
     I32 matches = 0;
     STRLEN len;
     short *tbl;
+    I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT;
 
     tbl = (short*)cPVOP->op_pv;
     if (!tbl)
@@ -117,7 +118,10 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
            UV c;
            STRLEN ulen;
            c = utf8_to_uv(s, send - s, &ulen, 0);
-           if (c < 0x100 && tbl[c] >= 0)
+           if (c < 0x100) {
+               if (tbl[c] >= 0)
+                   matches++;
+           } else if (complement)
                matches++;
            s += ulen;
        }
@@ -135,7 +139,9 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
     I32 isutf8;
     I32 matches = 0;
     I32 grows = PL_op->op_private & OPpTRANS_GROWS;
-    STRLEN len;
+    I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT;
+    I32 del = PL_op->op_private & OPpTRANS_DELETE;
+    STRLEN len, rlen;
     short *tbl;
     I32 ch;
 
@@ -186,6 +192,8 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
        else
            d = s;
        dstart = d;
+       if (complement && !del)
+           rlen = tbl[0x100];
 
 #ifdef MACOS_TRADITIONAL
 #define comp CoMP   /* "comp" is a keyword in some compilers ... */
@@ -197,9 +205,24 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
                STRLEN len;
                UV comp = utf8_to_uv_simple(s, &len);
 
-               if (comp > 0xff) {      /* always unmapped */   
-                   Copy(s, d, len, U8);
-                   d += len;
+               if (comp > 0xff) {
+                   if (!complement) {
+                       Copy(s, d, len, U8);
+                       d += len;
+                   }
+                   else {
+                       matches++;
+                       if (!del) {
+                           ch = (comp - 0x100 < rlen) ?
+                               tbl[comp+1] : tbl[0x100+rlen];
+                           if (ch != pch) {
+                               d = uv_to_utf8(d, ch);
+                               pch = ch;
+                           }
+                           s += len;
+                           continue;
+                       }
+                   }
                }
                else if ((ch = tbl[comp]) >= 0) {
                    matches++;
@@ -224,9 +247,20 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
            while (s < send) {
                STRLEN len;
                UV comp = utf8_to_uv_simple(s, &len);
-               if (comp > 0xff) {      /* always unmapped */
-                   Copy(s, d, len, U8);
-                   d += len;
+               if (comp > 0xff) {
+                   if (!complement) {
+                       Copy(s, d, len, U8);
+                       d += len;
+                   }
+                   else {
+                       matches++;
+                       if (!del) {
+                           if (comp - 0x100 < rlen) 
+                               d = uv_to_utf8(d, tbl[comp+1]);
+                           else
+                               d = uv_to_utf8(d, tbl[0x100+rlen]);
+                       }
+                   }
                }
                else if ((ch = tbl[comp]) >= 0) {
                    d = uv_to_utf8(d, ch);
@@ -499,8 +533,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            if (uv < none) {
                matches++;
-               d = uv_to_utf8(d, uv);
                s += UTF8SKIP(s);
+               d = uv_to_utf8(d, uv);
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
diff --git a/op.c b/op.c
index eb60121..4c5dd13 100644 (file)
--- a/op.c
+++ b/op.c
@@ -2871,6 +2871,15 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                }
            }
        }
+       if (!del) {
+           if (j >= rlen)
+               j = rlen - 1;
+           else
+               cPVOPo->op_pv = (char*)Renew(tbl, 0x101+rlen-j, short);
+           tbl[0x100] = rlen - j;
+           for (i=0; i < rlen - j; i++)
+               tbl[0x101+i] = r[j+i];
+       }
     }
     else {
        if (!rlen && !del) {
index 75887ab..b10f4f2 100755 (executable)
--- a/t/op/tr.t
+++ b/t/op/tr.t
@@ -5,7 +5,7 @@ BEGIN {
     @INC = '../lib';
 }
 
-print "1..51\n";
+print "1..55\n";
 
 $_ = "abcdefghijklmnopqrstuvwxyz";
 
@@ -84,7 +84,7 @@ if (ord("\t") == 9) { # ASCII
     use utf8;
 }
 # 11 - changing UTF8 characters in a UTF8 string, same length.
-$l = chr(300); $r = chr(400);
+my $l = chr(300); my $r = chr(400);
 $x = 200.300.400;
 $x =~ tr/\x{12c}/\x{190}/;
 printf "not (%vd) ", $x if $x ne 200.400.400 or length $x != 3;
@@ -287,7 +287,7 @@ print "ok 48\n";
 print "not " unless sprintf("%vd", $a) eq '196.172.200';
 print "ok 49\n";
 
-# UTF8 range
+# UTF8 range tests from Inaba Hiroto
 
 ($a = v300.196.172.302.197.172) =~ tr/\x{12c}-\x{130}/\xc0-\xc4/;
 print "not " unless $a eq v192.196.172.194.197.172;
@@ -296,3 +296,22 @@ print "ok 50\n";
 ($a = v300.196.172.302.197.172) =~ tr/\xc4-\xc8/\x{12c}-\x{130}/;
 print "not " unless $a eq v300.300.172.302.301.172;
 print "ok 51\n";
+
+# UTF8 range tests from Karsten Sperling (patch #9008 required)
+
+($a = "\x{0100}") =~ tr/\x00-\x{100}/X/;
+print "not " unless $a eq "X";
+print "ok 52\n";
+
+($a = "\x{0100}") =~ tr/\x{0000}-\x{00ff}/X/c;
+print "not " unless $a eq "X";
+print "ok 53\n";
+
+($a = "\x{0100}") =~ tr/\x{0000}-\x{00ff}\x{0101}/X/c;
+print "not " unless $a eq "X";
+print "ok 54\n";
+($a = v256) =~ tr/\x{0000}-\x{00ff}\x{0101}/X/c;
+print "not " unless $a eq "X";
+print "ok 55\n"; 
+
diff --git a/toke.c b/toke.c
index 33915ed..daa0d52 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1240,6 +1240,17 @@ S_scan_const(pTHX_ char *start)
                I32 min;                        /* first character in range */
                I32 max;                        /* last character in range */
 
+               if (utf) {
+                   char *c = (char*)utf8_hop((U8*)d, -1);
+                   char *e = d++;
+                   while (e-- > c)
+                       *(e + 1) = *e;
+                   *c = 0xff;
+                   /* mark the range as done, and continue */
+                   dorange = FALSE;
+                   didrange = TRUE;
+                   continue;
+               }
                i = d - SvPVX(sv);              /* remember current offset */
                SvGROW(sv, SvLEN(sv) + 256);    /* never more than 256 chars in a range */
                d = SvPVX(sv) + i;              /* refresh d after realloc */
@@ -1466,7 +1477,7 @@ S_scan_const(pTHX_ char *start)
                            char *src, *dst;
                          
                            d = SvGROW(sv,
-                                      SvCUR(sv) + hicount + 1) +
+                                      SvLEN(sv) + hicount + 1) +
                                         (d - old_pvx);
 
                            src = d - 1;
@@ -1539,7 +1550,7 @@ S_scan_const(pTHX_ char *start)
                    if (len > e - s + 4) {
                        char *odest = SvPVX(sv);
 
-                       SvGROW(sv, (SvCUR(sv) + len - (e - s + 4)));
+                       SvGROW(sv, (SvLEN(sv) + len - (e - s + 4)));
                        d = SvPVX(sv) + (d - odest);
                    }
                    Copy(str, d, len, char);
@@ -6303,9 +6314,6 @@ S_scan_trans(pTHX_ char *start)
        Perl_croak(aTHX_ "Transliteration replacement not terminated");
     }
 
-    New(803,tbl,256,short);
-    o = newPVOP(OP_TRANS, 0, (char*)tbl);
-
     complement = del = squash = 0;
     while (strchr("cds", *s)) {
        if (*s == 'c')
@@ -6316,6 +6324,9 @@ S_scan_trans(pTHX_ char *start)
            squash = OPpTRANS_SQUASH;
        s++;
     }
+
+    New(803, tbl, complement&&!del?258:256, short);
+    o = newPVOP(OP_TRANS, 0, (char*)tbl);
     o->op_private = del|squash|complement|
       (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)|
       (DO_UTF8(PL_lex_repl) ? OPpTRANS_TO_UTF   : 0);