tr/// logic was hosed under utf8
Larry Wall [Sat, 5 Sep 1998 23:48:24 +0000 (23:48 +0000)]
p4raw-id: //depot/perl@1781

doop.c
op.c
op.h
pp.c
proto.h

diff --git a/doop.c b/doop.c
index 8ebbd83..c6270e4 100644 (file)
--- a/doop.c
+++ b/doop.c
 #include <signal.h>
 #endif
 
-I32
-do_trans(SV *sv, OP *arg)
+static I32
+do_trans_CC_simple(SV *sv)
 {
     dTHR;
-    register U8 *s;
-    register U8 *send;
-    register U8 *d;
-    register I32 matches = 0;
-    register I32 squash = PL_op->op_private & OPpTRANS_SQUASH;
+    U8 *s;
+    U8 *send;
+    I32 matches = 0;
     STRLEN len;
+    short *tbl;
+    I32 ch;
 
-    if (SvREADONLY(sv) && !(PL_op->op_private & OPpTRANS_COUNTONLY))
-       croak(no_modify);
+    tbl = (short*)cPVOP->op_pv;
+    if (!tbl)
+       croak("panic: do_trans");
 
-    if (PL_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) {
-       SV* rv = (SV*)cSVOP->op_sv;
-       HV* hv = (HV*)SvRV(rv);
-       SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
-       UV none = svp ? SvUV(*svp) : 0x7fffffff;
-       UV extra = none + 1;
-       I32 del = PL_op->op_private & OPpTRANS_DELETE;
-       UV final;
-       register UV uv;
-       UV puv;
-       register I32 from_utf = PL_op->op_private & OPpTRANS_FROM_UTF;
-       register I32 to_utf = PL_op->op_private & OPpTRANS_TO_UTF;
-
-       s = (U8*)SvPV(sv, len);
-       if (!len)
-           return 0;
-       if (!SvPOKp(sv))
-           s = (U8*)SvPV_force(sv, len);
-       (void)SvPOK_only(sv);
-       send = s + len;
-       DEBUG_t( deb("2.TBL\n"));
-       if (PL_op->op_private == (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) { /* no other flags */
-           svp = hv_fetch(hv, "FINAL", 5, FALSE);
-           if (svp)
-               final = SvUV(*svp);
-
-           d = s;
-           while (s < send) {
-               if ((uv = swash_fetch(rv, s)) < none) {
-                   s += UTF8SKIP(s);
-                   matches++;
-                   d = uv_to_utf8(d, uv);
-               }
-               else if (uv == none) {
-                   int i;
-                   for (i = UTF8SKIP(s); i; i--)
-                       *d++ = *s++;
-               }
-               else if (uv == extra) {
-                   s += UTF8SKIP(s);
-                   matches++;
-                   d = uv_to_utf8(d, final);
-               }
-               else
-                   s += UTF8SKIP(s);
-           }
-           *d = '\0';
-           SvCUR_set(sv, d - (U8*)SvPVX(sv));
-           SvSETMAGIC(sv);
-       }
-       else if (PL_op->op_private == OPpTRANS_FROM_UTF) {      /* no other flags */
-           svp = hv_fetch(hv, "FINAL", 5, FALSE);
-           if (svp)
-               final = SvUV(*svp);
-
-           d = s;
-           while (s < send) {
-               if ((uv = swash_fetch(rv, s)) < none) {
-                   s += UTF8SKIP(s);
-                   matches++;
-                   *d++ = (U8)uv;
-               }
-               else if (uv == none) {
-                   I32 ulen;
-                   uv = utf8_to_uv(s, &ulen);
-                   s += ulen;
-                   *d++ = (U8)uv;
-               }
-               else if (uv == extra) {
-                   s += UTF8SKIP(s);
-                   matches++;
-                   *d++ = (U8)final;
-               }
-               else
-                   s += UTF8SKIP(s);
-           }
-           *d = '\0';
-           SvCUR_set(sv, d - (U8*)SvPVX(sv));
-           SvSETMAGIC(sv);
-       }
-       else if (PL_op->op_private == OPpTRANS_TO_UTF) {        /* no other flags */
-           svp = hv_fetch(hv, "FINAL", 5, FALSE);
-           if (svp)
-               final = SvUV(*svp);
-
-           d = s;
-           while (s < send) {
-               U8 tmpbuf[10];
-               uv_to_utf8(tmpbuf, *s);         /* XXX suboptimal */
-               if ((uv = swash_fetch(rv, tmpbuf)) < none) {
-                   s += UTF8SKIP(s);
-                   matches++;
-                   d = uv_to_utf8(d, uv);
-               }
-               else if (uv == none) {
-                   I32 ulen;
-                   uv = utf8_to_uv(s, &ulen);
-                   s += ulen;
-                   d = uv_to_utf8(d, uv);
-               }
-               else if (uv == extra) {
-                   s += UTF8SKIP(s);
-                   matches++;
-                   d = uv_to_utf8(d, final);
-               }
+    s = (U8*)SvPV(sv, len);
+    send = s + len;
+
+    while (s < send) {
+       if ((ch = tbl[*s]) >= 0) {
+           matches++;
+           *s = ch;
+       }
+       s++;
+    }
+    SvSETMAGIC(sv);
+
+    return matches;
+}
+
+/*
+ * Count-only tr/// on a byte string.  Looks every byte of sv up in the
+ * op's 256-entry short table and returns how many bytes have a
+ * non-negative entry.  The string itself is not written to.
+ * Croaks with "panic: do_trans" if the op carries no table.
+ */
+static I32
+do_trans_CC_count(SV *sv)
+{
+    dTHR;
+    short *map = (short*)cPVOP->op_pv;
+    STRLEN nbytes;
+    U8 *cur;
+    U8 *end;
+    I32 hits = 0;
+
+    if (!map)
+       croak("panic: do_trans");
+
+    cur = (U8*)SvPV(sv, nbytes);
+    for (end = cur + nbytes; cur < end; cur++) {
+       if (map[*cur] >= 0)     /* negative entries mean "not in the list" */
+           hits++;
+    }
+
+    return hits;
+}
+
+static I32
+do_trans_CC_complex(SV *sv)
+{
+    dTHR;
+    U8 *s;
+    U8 *send;
+    U8 *d;
+    I32 matches = 0;
+    STRLEN len;
+    short *tbl;
+    I32 ch;
+
+    tbl = (short*)cPVOP->op_pv;
+    if (!tbl)
+       croak("panic: do_trans");
+
+    s = (U8*)SvPV(sv, len);
+    send = s + len;
+
+    d = s;
+    if (PL_op->op_private & OPpTRANS_SQUASH) {
+       U8* p = send;
+
+       while (s < send) {
+           if ((ch = tbl[*s]) >= 0) {
+               *d = ch;
+               matches++;
+               if (p == d - 1 && *p == *d)
+                   matches--;
                else
-                   s += UTF8SKIP(s);
+                   p = d++;
            }
-           *d = '\0';
-           SvCUR_set(sv, d - (U8*)SvPVX(sv));
-           SvSETMAGIC(sv);
+           else if (ch == -1)          /* -1 is unmapped character */
+               *d++ = *s;              /* -2 is delete character */
+           s++;
        }
-       else if (PL_op->op_private & OPpTRANS_COUNTONLY) {
-           if (from_utf) {
-               while (s < send) {
-                   if (swash_fetch(rv, s) < none)
-                       matches++;
-                   s += UTF8SKIP(s);
-               }
-           }
-           else {
-               while (s < send) {
-                   U8 tmpbuf[10];
-                   uv_to_utf8(tmpbuf, *s);     /* XXX suboptimal */
-                   if (swash_fetch(rv, tmpbuf) < none)
-                       matches++;
-                   s += UTF8SKIP(s);
-               }
+    }
+    else {
+       while (s < send) {
+           if ((ch = tbl[*s]) >= 0) {
+               *d = ch;
+               matches++;
+               d++;
            }
+           else if (ch == -1)          /* -1 is unmapped character */
+               *d++ = *s;              /* -2 is delete character */
+           s++;
+       }
+    }
+    matches += send - d;       /* account for disappeared chars */
+    *d = '\0';
+    SvCUR_set(sv, d - (U8*)SvPVX(sv));
+    SvSETMAGIC(sv);
+
+    return matches;
+}
+
+/*
+ * tr/// with UTF-8 on both sides and no other flags.  Every character of
+ * sv is looked up in the swash held by the op:
+ *   - a result below NONE is the replacement code point (counted);
+ *   - NONE means the character is not in the search list: copied as is;
+ *   - NONE+1 means the replacement list ran out: FINAL is written (counted);
+ *   - any other result drops the character without counting it.
+ * The result is written back over the input buffer and the SV's length
+ * is updated.  Returns the number of counted characters.
+ *
+ * NOTE(review): because output overwrites input, this presumably relies
+ * on no replacement encoding longer than the character it replaces --
+ * confirm against the callers that set OPpTRANS_GROWS.
+ */
+static I32
+do_trans_UU_simple(SV *sv)
+{
+    dTHR;
+    U8 *s;
+    U8 *send;
+    U8 *d;
+    I32 matches = 0;
+    STRLEN len;
+
+    SV* rv = (SV*)cSVOP->op_sv;
+    HV* hv = (HV*)SvRV(rv);
+    SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
+    UV none = svp ? SvUV(*svp) : 0x7fffffff;
+    UV extra = none + 1;
+    UV final = 0;      /* stays defined even if the swash has no FINAL */
+    UV uv;
+
+    s = (U8*)SvPV(sv, len);
+    send = s + len;
+
+    svp = hv_fetch(hv, "FINAL", 5, FALSE);
+    if (svp)
+       final = SvUV(*svp);
+
+    d = s;
+    while (s < send) {
+       if ((uv = swash_fetch(rv, s)) < none) {
+           s += UTF8SKIP(s);
+           matches++;
+           d = uv_to_utf8(d, uv);
+       }
+       else if (uv == none) {
+           /* untouched character: copy all of its bytes */
+           int i;
+           for (i = UTF8SKIP(s); i; i--)
+               *d++ = *s++;
+       }
+       else if (uv == extra) {
+           s += UTF8SKIP(s);
+           matches++;
+           d = uv_to_utf8(d, final);
+       }
+       else
+           s += UTF8SKIP(s);
+    }
+    *d = '\0';
+    SvCUR_set(sv, d - (U8*)SvPVX(sv));
+    SvSETMAGIC(sv);
+
+    return matches;
+}
+
+/*
+ * Count-only tr/// on a UTF-8 string.  Looks each character of sv up in
+ * the swash held by the op and returns how many lookups come back below
+ * the swash's NONE value (0x7fffffff when the swash has no "NONE" key).
+ * The string itself is not written to.
+ */
+static I32
+do_trans_UU_count(SV *sv)
+{
+    dTHR;
+    U8 *s;
+    U8 *send;
+    I32 matches = 0;
+    STRLEN len;
+
+    SV* rv = (SV*)cSVOP->op_sv;
+    HV* hv = (HV*)SvRV(rv);
+    SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
+    UV none = svp ? SvUV(*svp) : 0x7fffffff;
+
+    s = (U8*)SvPV(sv, len);
+    send = s + len;
+
+    while (s < send) {
+       if (swash_fetch(rv, s) < none)
+           matches++;
+       /* Advance past every character, matched or not; advancing only
+        * on a match would never get past the first non-matching one. */
+       s += UTF8SKIP(s);
+    }
+
+    return matches;
+}
+
+/*
+ * tr/// with a UTF-8 search list and a byte replacement list, no other
+ * flags.  Every UTF-8 character of sv is looked up in the swash held by
+ * the op and a single byte is written for it:
+ *   - a result below NONE is the replacement, truncated to U8 (counted);
+ *   - NONE means not in the search list: the character's own code point
+ *     is written, truncated to U8;
+ *   - NONE+1 means the replacement list ran out: FINAL is written,
+ *     truncated to U8 (counted);
+ *   - any other result drops the character without counting it.
+ * The result is written back over the input buffer and the SV's length
+ * is updated.  Returns the number of counted characters.
+ */
+static I32
+do_trans_UC_simple(SV *sv)
+{
+    dTHR;
+    U8 *s;
+    U8 *send;
+    U8 *d;
+    I32 matches = 0;
+    STRLEN len;
+
+    SV* rv = (SV*)cSVOP->op_sv;
+    HV* hv = (HV*)SvRV(rv);
+    SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
+    UV none = svp ? SvUV(*svp) : 0x7fffffff;
+    UV extra = none + 1;
+    UV final = 0;      /* stays defined even if the swash has no FINAL */
+    UV uv;
+
+    s = (U8*)SvPV(sv, len);
+    send = s + len;
+
+    svp = hv_fetch(hv, "FINAL", 5, FALSE);
+    if (svp)
+       final = SvUV(*svp);
+
+    d = s;
+    while (s < send) {
+       if ((uv = swash_fetch(rv, s)) < none) {
+           s += UTF8SKIP(s);
+           matches++;
+           *d++ = (U8)uv;
+       }
+       else if (uv == none) {
+           /* untouched character: decode it and keep its low byte */
+           I32 ulen;
+           uv = utf8_to_uv(s, &ulen);
+           s += ulen;
+           *d++ = (U8)uv;
        }
+       else if (uv == extra) {
+           s += UTF8SKIP(s);
+           matches++;
+           *d++ = (U8)final;
+       }
+       else
+           s += UTF8SKIP(s);
+    }
+    *d = '\0';
+    SvCUR_set(sv, d - (U8*)SvPVX(sv));
+    SvSETMAGIC(sv);
+
+    return matches;
+}
+
+static I32
+do_trans_CU_simple(SV *sv)
+{
+    dTHR;
+    U8 *s;
+    U8 *send;
+    U8 *d;
+    U8 *dst;
+    I32 matches = 0;
+    STRLEN len;
+
+    SV* rv = (SV*)cSVOP->op_sv;
+    HV* hv = (HV*)SvRV(rv);
+    SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
+    UV none = svp ? SvUV(*svp) : 0x7fffffff;
+    UV extra = none + 1;
+    UV final;
+    UV uv;
+    U8 tmpbuf[10];
+    I32 bits = 16;
+
+    s = (U8*)SvPV(sv, len);
+    send = s + len;
+
+    svp = hv_fetch(hv, "BITS", 4, FALSE);
+    if (svp)
+       bits = (I32)SvIV(*svp);
+
+    svp = hv_fetch(hv, "FINAL", 5, FALSE);
+    if (svp)
+       final = SvUV(*svp);
+
+    Newz(801, d, len * (bits >> 3) + 1, U8);
+    dst = d;
+
+    while (s < send) {
+       uv = *s++;
+       if (uv < 0x80)
+           tmpbuf[0] = uv;
        else {
-           I32 bits = 16;
-           U8 *dst;
+           tmpbuf[0] = (( uv >>  6)         | 0xc0);
+           tmpbuf[1] = (( uv        & 0x3f) | 0x80);
+       }
 
-           svp = hv_fetch(hv, "BITS", 4, FALSE);
-           if (svp)
-               bits = (I32)SvIV(*svp);
+       if ((uv = swash_fetch(rv, tmpbuf)) < none) {
+           matches++;
+           d = uv_to_utf8(d, uv);
+       }
+       else if (uv == none)
+           d = uv_to_utf8(d, s[-1]);
+       else if (uv == extra) {
+           matches++;
+           d = uv_to_utf8(d, final);
+       }
+    }
+    *d = '\0';
+    sv_usepvn_mg(sv, (char*)dst, d - dst);
 
-           svp = hv_fetch(hv, "FINAL", 5, FALSE);
-           if (svp)
-               final = SvUV(*svp);
+    return matches;
+}
 
-           Newz(801, d, len * (bits >> 3) + 1, U8);
-           dst = d;
+/* utf-8 to latin-1 */
 
-           puv = 0xfeedface;
-           if (squash) {
-               while (s < send) {
-                   if (from_utf)
-                       uv = swash_fetch(rv, s);
-                   else {
-                       U8 tmpbuf[10];
-                       uv_to_utf8(tmpbuf, *s); /* XXX suboptimal */
-                       uv = swash_fetch(rv, tmpbuf);
-                   }
-                   if (uv < none) {
-                       matches++;
-                       if (uv != puv) {
-                           if (to_utf)
-                               d = uv_to_utf8(d, uv);
-                           else
-                               *d++ = (U8)uv;
-                       }
-                       puv = uv;
-                       s += UTF8SKIP(s);
-                       continue;
-                   }
-                   else if (uv == none) {      /* "none" is unmapped character */
-                       int i;
-                       if (to_utf) {
-                           for (i = UTF8SKIP(s); i; --i)
-                               *d++ = *s++;
-                       }
-                       else {
-                           I32 ulen;
-                           *d++ = (U8)utf8_to_uv(s, &ulen);
-                           s += ulen;
-                       }
-                       puv = 0xfeedface;
-                       continue;
-                   }
-                   else if (uv == extra && !del) {
-                       matches++;
-                       if (to_utf)
-                           d = uv_to_utf8(d, final);
-                       else
-                           *d++ = (U8)final;
-                       s += UTF8SKIP(s);
-                       puv = 0xfeedface;
-                       continue;
-                   }
-                   matches++;          /* "none+1" is delete character */
-                   s += UTF8SKIP(s);
-               }
+/*
+ * Straight utf-8 to latin-1 conversion, done in place.  Bytes below 0x80
+ * are copied unchanged; any other character is decoded with utf8_to_uv()
+ * and its code point is stored truncated to one byte.  The SV's length is
+ * then set to the number of bytes written, and that new length is
+ * the return value.
+ */
+static I32
+do_trans_UC_trivial(SV *sv)
+{
+    dTHR;
+    STRLEN nbytes;
+    U8 *src = (U8*)SvPV(sv, nbytes);
+    U8 *src_end = src + nbytes;
+    U8 *out = src;     /* output never runs ahead of the read position */
+
+    while (src < src_end) {
+       if (*src >= 0x80) {
+           I32 width;
+           UV code = utf8_to_uv(src, &width);
+           *out++ = (U8)code;
+           src += width;
+       }
+       else
+           *out++ = *src++;
+    }
+    *out = '\0';
+    SvCUR_set(sv, out - (U8*)SvPVX(sv));
+    SvSETMAGIC(sv);
+
+    return SvCUR(sv);
+}
+
+/* latin-1 to utf-8 */
+
+/*
+ * Straight latin-1 to utf-8 conversion.  Builds the converted string in
+ * a freshly allocated buffer of twice the input length plus one, then
+ * hands that buffer to sv with sv_usepvn_mg().  Bytes below 0x80 are
+ * copied; every other byte becomes the two-byte sequence
+ * (0xc0 | byte >> 6), (0x80 | byte & 0x3f).
+ * Returns the number of input bytes.
+ */
+static I32
+do_trans_CU_trivial(SV *sv)
+{
+    dTHR;
+    STRLEN nbytes;
+    U8 *src = (U8*)SvPV(sv, nbytes);
+    U8 *src_end = src + nbytes;
+    I32 count = src_end - src;
+    U8 *buf;
+    U8 *out;
+
+    Newz(801, buf, nbytes * 2 + 1, U8);
+    out = buf;
+
+    for (; src < src_end; src++) {
+       UV byte = *src;
+
+       if (byte >= 0x80) {
+           *out++ = (( byte >>  6)         | 0xc0);
+           *out++ = (( byte        & 0x3f) | 0x80);
+       }
+       else
+           *out++ = (U8)byte;
+    }
+    *out = '\0';
+    sv_usepvn_mg(sv, (char*)buf, out - buf);
+
+    return count;
+}
+
+static I32
+do_trans_UU_complex(SV *sv)
+{
+    dTHR;
+    U8 *s;
+    U8 *send;
+    U8 *d;
+    I32 matches = 0;
+    I32 squash   = PL_op->op_private & OPpTRANS_SQUASH;
+    I32 from_utf = PL_op->op_private & OPpTRANS_FROM_UTF;
+    I32 to_utf   = PL_op->op_private & OPpTRANS_TO_UTF;
+    I32 del      = PL_op->op_private & OPpTRANS_DELETE;
+    SV* rv = (SV*)cSVOP->op_sv;
+    HV* hv = (HV*)SvRV(rv);
+    SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
+    UV none = svp ? SvUV(*svp) : 0x7fffffff;
+    UV extra = none + 1;
+    UV final;
+    UV uv;
+    STRLEN len;
+    U8 *dst;
+
+    s = (U8*)SvPV(sv, len);
+    send = s + len;
+
+    svp = hv_fetch(hv, "FINAL", 5, FALSE);
+    if (svp)
+       final = SvUV(*svp);
+
+    if (PL_op->op_private & OPpTRANS_GROWS) {
+       I32 bits = 16;
+
+       svp = hv_fetch(hv, "BITS", 4, FALSE);
+       if (svp)
+           bits = (I32)SvIV(*svp);
+
+       Newz(801, d, len * (bits >> 3) + 1, U8);
+       dst = d;
+    }
+    else {
+       d = s;
+       dst = 0;
+    }
+
+    if (squash) {
+       UV puv = 0xfeedface;
+       while (s < send) {
+           if (from_utf) {
+               uv = swash_fetch(rv, s);
            }
            else {
-               while (s < send) {
-                   if (from_utf)
-                       uv = swash_fetch(rv, s);
-                   else {
-                       U8 tmpbuf[10];
-                       uv_to_utf8(tmpbuf, *s); /* XXX suboptimal */
-                       uv = swash_fetch(rv, tmpbuf);
-                   }
-                   if (uv < none) {
-                       if (to_utf)
-                           d = uv_to_utf8(d, uv);
-                       else
-                           *d++ = (U8)uv;
-                       matches++;
-                       s += UTF8SKIP(s);
-                       continue;
-                   }
-                   else if (uv == none) {      /* "none" is unmapped character */
+               U8 tmpbuf[2];
+               uv = *s++;
+               if (uv < 0x80)
+                   tmpbuf[0] = uv;
+               else {
+                   tmpbuf[0] = (( uv >>  6)         | 0xc0);
+                   tmpbuf[1] = (( uv        & 0x3f) | 0x80);
+               }
+               uv = swash_fetch(rv, tmpbuf);
+           }
+           if (uv < none) {
+               matches++;
+               if (uv != puv) {
+                   if (uv >= 0x80 && to_utf)
+                       d = uv_to_utf8(d, uv);
+                   else
+                       *d++ = (U8)uv;
+                   puv = uv;
+               }
+               if (from_utf)
+                   s += UTF8SKIP(s);
+               continue;
+           }
+           else if (uv == none) {      /* "none" is unmapped character */
+               if (from_utf) {
+                   if (*s < 0x80)
+                       *d++ = *s++;
+                   else if (to_utf) {
                        int i;
-                       if (to_utf) {
-                           for (i = UTF8SKIP(s); i; --i)
-                               *d++ = *s++;
-                       }
-                       else {
-                           I32 ulen;
-                           *d++ = (U8)utf8_to_uv(s, &ulen);
-                           s += ulen;
-                       }
-                       continue;
+                       for (i = UTF8SKIP(s); i; --i)
+                           *d++ = *s++;
                    }
-                   else if (uv == extra && !del) {
-                       matches++;
-                       if (to_utf)
-                           d = uv_to_utf8(d, final);
-                       else
-                           *d++ = (U8)final;
-                       s += UTF8SKIP(s);
-                       continue;
+                   else {
+                       I32 ulen;
+                       *d++ = (U8)utf8_to_uv(s, &ulen);
+                       s += ulen;
                    }
-                   matches++;          /* "none+1" is delete character */
-                   s += UTF8SKIP(s);
                }
+               else {  /* must be to_utf only */
+                   d = uv_to_utf8(d, s[-1]);
+               }
+               puv = 0xfeedface;
+               continue;
            }
-           sv_usepvn_mg(sv, (char*)dst, d - dst);
+           else if (uv == extra && !del) {
+               matches++;
+               if (uv != puv) {
+                   if (final >= 0x80 && to_utf)
+                       d = uv_to_utf8(d, final);
+                   else
+                       *d++ = (U8)final;
+                   puv = final;
+               }
+               if (from_utf)
+                   s += UTF8SKIP(s);
+               continue;
+           }
+           matches++;          /* "none+1" is delete character */
+           if (from_utf)
+               s += UTF8SKIP(s);
        }
-       return matches;
     }
     else {
-       register short *tbl;
-       register I32 ch;
-       register U8 *p;
-
-       tbl = (short*)cPVOP->op_pv;
-       s = (U8*)SvPV(sv, len);
-       if (!len)
-           return 0;
-       if (!SvPOKp(sv))
-           s = (U8*)SvPV_force(sv, len);
-       (void)SvPOK_only(sv);
-       send = s + len;
-       if (!tbl || !s)
-           croak("panic: do_trans");
-       DEBUG_t( deb("2.TBL\n"));
-       if (!PL_op->op_private) {
-           while (s < send) {
-               if ((ch = tbl[*s]) >= 0) {
-                   matches++;
-                   *s = ch;
+       while (s < send) {
+           if (from_utf) {
+               uv = swash_fetch(rv, s);
+           }
+           else {
+               U8 tmpbuf[2];
+               uv = *s++;
+               if (uv < 0x80)
+                   tmpbuf[0] = uv;
+               else {
+                   tmpbuf[0] = (( uv >>  6)         | 0xc0);
+                   tmpbuf[1] = (( uv        & 0x3f) | 0x80);
                }
-               s++;
+               uv = swash_fetch(rv, tmpbuf);
            }
-           SvSETMAGIC(sv);
-       }
-       else if (PL_op->op_private & OPpTRANS_COUNTONLY) {
-           while (s < send) {
-               if (tbl[*s] >= 0)
-                   matches++;
-               s++;
+           if (uv < none) {
+               matches++;
+               if (uv >= 0x80 && to_utf)
+                   d = uv_to_utf8(d, uv);
+               else
+                   *d++ = (U8)uv;
+               if (from_utf)
+                   s += UTF8SKIP(s);
+               continue;
            }
-       }
-       else {
-           d = s;
-           p = send;
-           while (s < send) {
-               if ((ch = tbl[*s]) >= 0) {
-                   *d = ch;
-                   matches++;
-                   if (squash) {
-                       if (p == d - 1 && *p == *d)
-                           matches--;
-                       else
-                           p = d++;
+           else if (uv == none) {      /* "none" is unmapped character */
+               if (from_utf) {
+                   if (*s < 0x80)
+                       *d++ = *s++;
+                   else if (to_utf) {
+                       int i;
+                       for (i = UTF8SKIP(s); i; --i)
+                           *d++ = *s++;
+                   }
+                   else {
+                       I32 ulen;
+                       *d++ = (U8)utf8_to_uv(s, &ulen);
+                       s += ulen;
                    }
-                   else
-                       d++;
                }
-               else if (ch == -1)              /* -1 is unmapped character */
-                   *d++ = *s;          /* -2 is delete character */
-               s++;
+               else {  /* must be to_utf only */
+                   d = uv_to_utf8(d, s[-1]);
+               }
+               continue;
            }
-           matches += send - d;        /* account for disappeared chars */
-           *d = '\0';
-           SvCUR_set(sv, d - (U8*)SvPVX(sv));
-           SvSETMAGIC(sv);
+           else if (uv == extra && !del) {
+               matches++;
+               if (final >= 0x80 && to_utf)
+                   d = uv_to_utf8(d, final);
+               else
+                   *d++ = (U8)final;
+               if (from_utf)
+                   s += UTF8SKIP(s);
+               continue;
+           }
+           matches++;          /* "none+1" is delete character */
+           if (from_utf)
+               s += UTF8SKIP(s);
        }
-       return matches;
+    }
+    if (dst)
+       sv_usepvn(sv, (char*)dst, d - dst);
+    else {
+       *d = '\0';
+       SvCUR_set(sv, d - (U8*)SvPVX(sv));
+    }
+    SvSETMAGIC(sv);
+
+    return matches;
+}
+
+/*
+ * Entry point for the tr/// op: applies the current op (PL_op) to sv and
+ * returns the count produced by the helper selected from the op's
+ * private flags.  Returns 0 without dispatching when sv is empty.
+ * Croaks with no_modify when sv is read-only and the selected operation
+ * would write to it.
+ */
+I32
+do_trans(SV *sv)
+{
+    STRLEN len;
+    I32 utf = PL_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
+
+    /*
+     * A read-only value may only be counted.  IDENTICAL selects a
+     * counting helper only when both UTF flags are clear or both are
+     * set; combined with exactly one of them it selects the
+     * latin-1 <-> utf-8 conversions below, which rewrite the string.
+     */
+    if (SvREADONLY(sv)
+       && (!(PL_op->op_private & OPpTRANS_IDENTICAL)
+           || utf == OPpTRANS_FROM_UTF
+           || utf == OPpTRANS_TO_UTF))
+       croak(no_modify);
+
+    (void)SvPV(sv, len);
+    if (!len)
+       return 0;
+    if (!SvPOKp(sv))
+       (void)SvPV_force(sv, len);
+    (void)SvPOK_only(sv);
+
+    DEBUG_t( deb("2.TBL\n"));
+
+    /* "& 63" keeps only the flag bits below 64, so bit 64 and above
+     * never take part in the dispatch. */
+    switch (PL_op->op_private & 63) {
+    case 0:
+       return do_trans_CC_simple(sv);
+
+    case OPpTRANS_FROM_UTF:
+       return do_trans_UC_simple(sv);
+
+    case OPpTRANS_TO_UTF:
+       return do_trans_CU_simple(sv);
+
+    case OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF:
+       return do_trans_UU_simple(sv);
+
+    case OPpTRANS_IDENTICAL:
+       return do_trans_CC_count(sv);
+
+    case OPpTRANS_FROM_UTF|OPpTRANS_IDENTICAL:
+       return do_trans_UC_trivial(sv);
+
+    case OPpTRANS_TO_UTF|OPpTRANS_IDENTICAL:
+       return do_trans_CU_trivial(sv);
+
+    case OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF|OPpTRANS_IDENTICAL:
+       return do_trans_UU_count(sv);
+
+    default:
+       /* any remaining flag combination goes to a "complex" helper */
+       if (PL_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
+           return do_trans_UU_complex(sv); /* could be UC or CU too */
+       else
+           return do_trans_CC_complex(sv);
     }
 }
 
diff --git a/op.c b/op.c
index 53fb8c1..ca89229 100644 (file)
--- a/op.c
+++ b/op.c
@@ -2156,8 +2156,17 @@ pmtrans(OP *o, OP *expr, OP *repl)
        }
        else if (!rlen && !del) {
            r = t; rlen = tlen; rend = tend;
-           if (!squash && to_utf && from_utf)
-               o->op_private |= OPpTRANS_COUNTONLY;
+       }
+       if (!squash) {
+           if (to_utf && from_utf) {   /* only counting characters */
+               if (t == r || (tlen == rlen && memEQ(t, r, tlen)))
+                   o->op_private |= OPpTRANS_IDENTICAL;
+           }
+           else {      /* straight latin-1 translation */
+               if (tlen == 4 && memEQ(t, "\0\377\303\277", 4) &&
+                   rlen == 4 && memEQ(r, "\0\377\303\277", 4))
+                   o->op_private |= OPpTRANS_IDENTICAL;
+           }
        }
 
        while (t < tend || tfirst <= tlast) {
@@ -2286,7 +2295,7 @@ pmtrans(OP *o, OP *expr, OP *repl)
        if (!rlen && !del) {
            r = t; rlen = tlen;
            if (!squash)
-               o->op_private |= OPpTRANS_COUNTONLY;
+               o->op_private |= OPpTRANS_IDENTICAL;
        }
        for (i = 0; i < 256; i++)
            tbl[i] = -1;
diff --git a/op.h b/op.h
index cbb2ac3..0b186a8 100644 (file)
--- a/op.h
+++ b/op.h
@@ -103,13 +103,15 @@ typedef U32 PADOFFSET;
 #define OPpRUNTIME             64      /* Pattern coming in on the stack */
 
 /* Private for OP_TRANS */
-#define OPpTRANS_GROWS         1
-#define OPpTRANS_FROM_UTF      2
-#define OPpTRANS_TO_UTF                4
-#define OPpTRANS_COUNTONLY     8
-#define OPpTRANS_SQUASH                16
-#define OPpTRANS_DELETE                32
-#define OPpTRANS_COMPLEMENT    64
+#define OPpTRANS_FROM_UTF      1
+#define OPpTRANS_TO_UTF                2
+#define OPpTRANS_IDENTICAL     4
+       /* When CU or UC, means straight latin-1 to utf-8 or vice versa */
+       /* Otherwise, IDENTICAL means the right side is the same as the left */
+#define OPpTRANS_SQUASH                8
+#define OPpTRANS_DELETE                16
+#define OPpTRANS_COMPLEMENT    32
+#define OPpTRANS_GROWS         64
 
 /* Private for OP_REPEAT */
 #define OPpREPEAT_DOLIST       64      /* List replication. */
diff --git a/pp.c b/pp.c
index 9c08e2e..a4f7828 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -669,7 +669,7 @@ PP(pp_trans)
        EXTEND(SP,1);
     }
     TARG = sv_newmortal();
-    PUSHi(do_trans(sv, PL_op));
+    PUSHi(do_trans(sv));
     RETURN;
 }
 
diff --git a/proto.h b/proto.h
index 5b71f63..96bb15c 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -126,7 +126,7 @@ I32 do_shmio _((I32 optype, SV** mark, SV** sp));
 VIRTUAL void   do_sprintf _((SV* sv, I32 len, SV** sarg));
 VIRTUAL long   do_sysseek _((GV* gv, long pos, int whence));
 VIRTUAL long   do_tell _((GV* gv));
-VIRTUAL I32    do_trans _((SV* sv, OP* arg));
+VIRTUAL I32    do_trans _((SV* sv));
 VIRTUAL void   do_vecset _((SV* sv));
 VIRTUAL void   do_vop _((I32 optype, SV* sv, SV* left, SV* right));
 VIRTUAL I32    dowantarray _((void));