Continue 4-arg substr() UTF-8 fixage.
[p5sagit/p5-mst-13.2.git] / pp.c
diff --git a/pp.c b/pp.c
index 6ff39fa..a2ca097 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -745,9 +745,10 @@ PP(pp_schop)
 
 PP(pp_chop)
 {
-    dSP; dMARK; dTARGET;
-    while (SP > MARK)
-       do_chop(TARG, POPs);
+    dSP; dMARK; dTARGET; dORIGMARK;
+    while (MARK < SP)
+       do_chop(TARG, *++MARK);
+    SP = ORIGMARK;
     PUSHTARG;
     RETURN;
 }
@@ -1280,7 +1281,7 @@ PP(pp_subtract)
            UV result;
            register UV buv;
            bool buvok = SvUOK(TOPs);
-           
+       
            if (buvok)
                buv = SvUVX(TOPs);
            else {
@@ -2137,7 +2138,7 @@ PP(pp_complement)
 
          send = tmps + len;
          while (tmps < send) {
-           UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
+           UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
            tmps += UTF8SKIP(tmps);
            targlen += UNISKIP(~c);
            nchar++;
@@ -2151,9 +2152,9 @@ PP(pp_complement)
          if (nwide) {
              Newz(0, result, targlen + 1, U8);
              while (tmps < send) {
-                 UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
+                 UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
                  tmps += UTF8SKIP(tmps);
-                 result = uv_to_utf8(result, ~c);
+                 result = uvchr_to_utf8(result, ~c);
              }
              *result = '\0';
              result -= targlen;
@@ -2163,7 +2164,7 @@ PP(pp_complement)
          else {
              Newz(0, result, nchar + 1, U8);
              while (tmps < send) {
-                 U8 c = (U8)utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
+                 U8 c = (U8)utf8n_to_uvchr(tmps, 0, &l, UTF8_ALLOW_ANY);
                  tmps += UTF8SKIP(tmps);
                  *result++ = ~c;
              }
@@ -2690,39 +2691,55 @@ PP(pp_substr)
     SV *sv;
     I32 len;
     STRLEN curlen;
-    STRLEN utfcurlen;
+    STRLEN utf8_curlen;
     I32 pos;
     I32 rem;
     I32 fail;
     I32 lvalue = PL_op->op_flags & OPf_MOD || LVRET;
     char *tmps;
     I32 arybase = PL_curcop->cop_arybase;
+    SV *repl_sv = NULL;
+    SV *repl_sv_copy = NULL;
     char *repl = 0;
     STRLEN repl_len;
     int num_args = PL_op->op_private & 7;
+    bool repl_is_utf8 = FALSE;
 
     SvTAINTED_off(TARG);                       /* decontaminate */
     SvUTF8_off(TARG);                          /* decontaminate */
     if (num_args > 2) {
        if (num_args > 3) {
-           sv = POPs;
-           repl = SvPV(sv, repl_len);
+           repl_sv = POPs;
+           repl = SvPV(repl_sv, repl_len);
+           repl_is_utf8 = DO_UTF8(repl_sv) && SvCUR(repl_sv);
        }
        len = POPi;
     }
     pos = POPi;
     sv = POPs;
     PUTBACK;
+    if (repl_sv) {
+       if (repl_is_utf8) {
+           if (!DO_UTF8(sv))
+               sv_utf8_upgrade(sv);
+       }
+       else if (DO_UTF8(sv)) {
+           repl_sv_copy = newSVsv(repl_sv);
+           sv_utf8_upgrade(repl_sv_copy);
+           repl = SvPV(repl_sv_copy, repl_len);
+           repl_is_utf8 = DO_UTF8(repl_sv_copy) && SvCUR(sv);
+       }
+    }
     tmps = SvPV(sv, curlen);
     if (DO_UTF8(sv)) {
-        utfcurlen = sv_len_utf8(sv);
-       if (utfcurlen == curlen)
-           utfcurlen = 0;
+        utf8_curlen = sv_len_utf8(sv);
+       if (utf8_curlen == curlen)
+           utf8_curlen = 0;
        else
-           curlen = utfcurlen;
+           curlen = utf8_curlen;
     }
     else
-       utfcurlen = 0;
+       utf8_curlen = 0;
 
     if (pos >= arybase) {
        pos -= arybase;
@@ -2767,14 +2784,19 @@ PP(pp_substr)
     else {
        I32 upos = pos;
        I32 urem = rem;
-       if (utfcurlen)
+       if (utf8_curlen)
            sv_pos_u2b(sv, &pos, &rem);
        tmps += pos;
        sv_setpvn(TARG, tmps, rem);
-       if (utfcurlen)
+       if (utf8_curlen)
            SvUTF8_on(TARG);
-       if (repl)
+       if (repl) {
            sv_insert(sv, pos, rem, repl, repl_len);
+           if (repl_is_utf8)
+               SvUTF8_on(sv);
+           if (repl_sv_copy)
+               SvREFCNT_dec(repl_sv_copy);
+       }
        else if (lvalue) {              /* it's an lvalue! */
            if (!SvGMAGICAL(sv)) {
                if (SvROK(sv)) {
@@ -2933,7 +2955,7 @@ PP(pp_ord)
     STRLEN len;
     U8 *s = (U8*)SvPVx(argsv, len);
 
-    XPUSHu(DO_UTF8(argsv) ? utf8_to_uv_simple(s, 0) : (*s & 0xff));
+    XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff));
     RETURN;
 }
 
@@ -2947,7 +2969,7 @@ PP(pp_chr)
 
     if (value > 255 && !IN_BYTE) {
        SvGROW(TARG, UNISKIP(value)+1);
-       tmps = (char*)uv_to_utf8((U8*)SvPVX(TARG), value);
+       tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value);
        SvCUR_set(TARG, tmps - SvPVX(TARG));
        *tmps = '\0';
        (void)SvPOK_only(TARG);
@@ -2996,17 +3018,17 @@ PP(pp_ucfirst)
        STRLEN ulen;
        U8 tmpbuf[UTF8_MAXLEN+1];
        U8 *tend;
-       UV uv = utf8_to_uv(s, slen, &ulen, 0);
+       UV uv;
 
        if (PL_op->op_private & OPpLOCALE) {
            TAINT;
            SvTAINTED_on(sv);
-           uv = toTITLE_LC_uni(uv);
+           uv = toTITLE_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0));
        }
        else
            uv = toTITLE_utf8(s);
        
-       tend = uv_to_utf8(tmpbuf, uv);
+       tend = uvchr_to_utf8(tmpbuf, uv);
 
        if (!SvPADTMP(sv) || tend - tmpbuf != ulen || SvREADONLY(sv)) {
            dTARGET;
@@ -3055,17 +3077,17 @@ PP(pp_lcfirst)
        STRLEN ulen;
        U8 tmpbuf[UTF8_MAXLEN+1];
        U8 *tend;
-       UV uv = utf8_to_uv(s, slen, &ulen, 0);
+       UV uv;
 
        if (PL_op->op_private & OPpLOCALE) {
            TAINT;
            SvTAINTED_on(sv);
-           uv = toLOWER_LC_uni(uv);
+           uv = toLOWER_LC_uvchr(utf8n_to_uvchr(s, slen, &ulen, 0));
        }
        else
            uv = toLOWER_utf8(s);
        
-       tend = uv_to_utf8(tmpbuf, uv);
+       tend = uvchr_to_utf8(tmpbuf, uv);
 
        if (!SvPADTMP(sv) || tend - tmpbuf != ulen || SvREADONLY(sv)) {
            dTARGET;
@@ -3132,13 +3154,13 @@ PP(pp_uc)
                TAINT;
                SvTAINTED_on(TARG);
                while (s < send) {
-                   d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, len, &ulen, 0)));
+                   d = uvchr_to_utf8(d, toUPPER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0)));
                    s += ulen;
                }
            }
            else {
                while (s < send) {
-                   d = uv_to_utf8(d, toUPPER_utf8( s ));
+                   d = uvchr_to_utf8(d, toUPPER_utf8( s ));
                    s += UTF8SKIP(s);
                }
            }
@@ -3206,13 +3228,13 @@ PP(pp_lc)
                TAINT;
                SvTAINTED_on(TARG);
                while (s < send) {
-                   d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, len, &ulen, 0)));
+                   d = uvchr_to_utf8(d, toLOWER_LC_uvchr( utf8n_to_uvchr(s, len, &ulen, 0)));
                    s += ulen;
                }
            }
            else {
                while (s < send) {
-                   d = uv_to_utf8(d, toLOWER_utf8(s));
+                   d = uvchr_to_utf8(d, toLOWER_utf8(s));
                    s += UTF8SKIP(s);
                }
            }
@@ -3961,12 +3983,12 @@ PP(pp_reverse)
                U8* s = (U8*)SvPVX(TARG);
                U8* send = (U8*)(s + len);
                while (s < send) {
-                   if (UTF8_IS_ASCII(*s)) {
+                   if (UTF8_IS_INVARIANT(*s)) {
                        s++;
                        continue;
                    }
                    else {
-                       if (!utf8_to_uv_simple(s, 0))
+                       if (!utf8_to_uvchr(s, 0))
                            break;
                        up = (char*)s;
                        s += UTF8SKIP(s);
@@ -4035,6 +4057,7 @@ S_mul128(pTHX_ SV *sv, U8 m)
 #define ISUUCHAR(ch)    (memchr(PL_uuemap, (ch), sizeof(PL_uuemap)-1) || (ch) == ' ')
 #endif
 
+
 PP(pp_unpack)
 {
     dSP;
@@ -4045,7 +4068,14 @@ PP(pp_unpack)
     STRLEN llen;
     STRLEN rlen;
     register char *pat = SvPV(left, llen);
+#ifdef PACKED_IS_OCTETS
+    /* Packed side is assumed to be octets - so force downgrade if it
+       has been UTF-8 encoded by accident
+     */
+    register char *s = SvPVbyte(right, rlen);
+#else
     register char *s = SvPV(right, rlen);
+#endif
     char *strend = s + rlen;
     char *strbeg = s;
     register char *patend = pat + llen;
@@ -4064,7 +4094,6 @@ PP(pp_unpack)
     U16 aushort;
     unsigned int auint;
     U32 aulong;
-    UV auv;
 #ifdef HAS_QUAD
     Uquad_t auquad;
 #endif
@@ -4332,46 +4361,20 @@ PP(pp_unpack)
            if (len > strend - s)
                len = strend - s;
            if (checksum) {
-               if (DO_UTF8(right)) {
-                   while (len > 0) {
-                       STRLEN l;
-                       auv = utf8_to_uv((U8*)s, strend - s,
-                                        &l, UTF8_ALLOW_ANYUV);
-                       culong += auv;
-                       s += l;
-                       len -= l;
-                   }
-               }
-               else {
-               uchar_checksum:
-                   while (len-- > 0) {
-                       auint = *s++ & 0xFF;
-                       culong += auint;
-                   }
+             uchar_checksum:
+               while (len-- > 0) {
+                   auint = *s++ & 255;
+                   culong += auint;
                }
            }
            else {
                EXTEND(SP, len);
                EXTEND_MORTAL(len);
-               if (DO_UTF8(right)) {
-                   while (len > 0) {
-                       STRLEN l;
-                       auv = utf8_to_uv((U8*)s, strend - s,
-                                        &l, UTF8_ALLOW_ANYUV);
-                       sv = NEWSV(37, 0);
-                       sv_setuv(sv, auv);
-                       PUSHs(sv_2mortal(sv));
-                       s += l;
-                       len -= l;
-                   }
-               }
-               else {
-                   while (len-- > 0) {
-                       auint = *s++ & 0xFF;
-                       sv = NEWSV(37, 0);
-                       sv_setuv(sv, auint);
-                       PUSHs(sv_2mortal(sv));
-                   }
+               while (len-- > 0) {
+                   auint = *s++ & 255;
+                   sv = NEWSV(37, 0);
+                   sv_setiv(sv, (IV)auint);
+                   PUSHs(sv_2mortal(sv));
                }
            }
            break;
@@ -4381,7 +4384,7 @@ PP(pp_unpack)
            if (checksum) {
                while (len-- > 0 && s < strend) {
                    STRLEN alen;
-                   auint = utf8_to_uv((U8*)s, strend - s, &alen, 0);
+                   auint = utf8n_to_uvchr((U8*)s, strend - s, &alen, 0);
                    along = alen;
                    s += along;
                    if (checksum > 32)
@@ -4395,7 +4398,7 @@ PP(pp_unpack)
                EXTEND_MORTAL(len);
                while (len-- > 0 && s < strend) {
                    STRLEN alen;
-                   auint = utf8_to_uv((U8*)s, strend - s, &alen, 0);
+                   auint = utf8n_to_uvchr((U8*)s, strend - s, &alen, 0);
                    along = alen;
                    s += along;
                    sv = NEWSV(37, 0);
@@ -4807,7 +4810,8 @@ PP(pp_unpack)
                
                while ((len > 0) && (s < strend)) {
                    auv = (auv << 7) | (*s & 0x7f);
-                   if (UTF8_IS_ASCII(*s++)) {
+                   /* UTF8_IS_XXXXX not right here - using constant 0x80 */
+                   if ((U8)(*s++) < 0x80) {
                        bytes = 0;
                        sv = NEWSV(40, 0);
                        sv_setuv(sv, auv);
@@ -5172,7 +5176,6 @@ PP(pp_pack)
     unsigned int auint;
     I32 along;
     U32 aulong;
-    UV auv;
 #ifdef HAS_QUAD
     Quad_t aquad;
     Uquad_t auquad;
@@ -5184,7 +5187,6 @@ PP(pp_pack)
 #ifdef PERL_NATINT_PACK
     int natint;                /* native integer */
 #endif
-    bool has_utf8;
 
     items = SP - MARK;
     MARK++;
@@ -5201,8 +5203,10 @@ PP(pp_pack)
            patcopy++;
            continue;
         }
+#ifndef PACKED_IS_OCTETS
        if (datumtype == 'U' && pat == patcopy+1)
            SvUTF8_on(cat);
+#endif
        if (datumtype == '#') {
            while (pat < patend && *pat != '\n')
                pat++;
@@ -5421,6 +5425,7 @@ PP(pp_pack)
                items = saveitems;
            }
            break;
+       case 'C':
        case 'c':
            while (len-- > 0) {
                fromstr = NEXTFROM;
@@ -5429,41 +5434,12 @@ PP(pp_pack)
                sv_catpvn(cat, &achar, sizeof(char));
            }
            break;
-       case 'C':
-           has_utf8 = SvUTF8(cat);
-           while (len-- > 0) {
-               fromstr = NEXTFROM;
-               auv = SvUV(fromstr);
-               if (!has_utf8 && auv > 0xFF && !IN_BYTE) {
-                   has_utf8 = TRUE;
-                   if (SvCUR(cat))
-                       sv_utf8_upgrade(cat);
-                   else
-                       SvUTF8_on(cat); /* There will be UTF8. */
-               }
-               if (has_utf8) {
-                   SvGROW(cat, SvCUR(cat) + UNISKIP(auv) + 1);
-                   SvCUR_set(cat, (char*)uv_to_utf8((U8*)SvEND(cat),auv)
-                                  - SvPVX(cat));
-               }
-               else {
-                   achar = auv;
-                   sv_catpvn(cat, &achar, sizeof(char));
-               }
-           }
-           *SvEND(cat) = '\0';
-           break;
        case 'U':
-           has_utf8 = SvUTF8(cat);
            while (len-- > 0) {
                fromstr = NEXTFROM;
-               auv = SvUV(fromstr);
-               if (!has_utf8 && auv > 0x80) {
-                   has_utf8 = TRUE;
-                   sv_utf8_upgrade(cat);
-               }
-               SvGROW(cat, SvCUR(cat) + UNISKIP(auv) + 1);
-               SvCUR_set(cat, (char*)uv_to_utf8((U8*)SvEND(cat),auv)
+               auint = SvUV(fromstr);
+               SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1);
+               SvCUR_set(cat, (char*)uvchr_to_utf8((U8*)SvEND(cat),auint)
                               - SvPVX(cat));
            }
            *SvEND(cat) = '\0';