Batch of UTF-8 patches from Simon Cozens.
Jarkko Hietaniemi [Thu, 14 Sep 2000 14:40:40 +0000 (14:40 +0000)]
p4raw-id: //depot/perl@7075

15 files changed:
doop.c
embed.h
embed.pl
ext/Encode/Encode.xs
handy.h
op.c
pod/perlapi.pod
pp.c
pp_ctl.c
proto.h
regcomp.c
regexec.c
sv.c
toke.c
utf8.c

diff --git a/doop.c b/doop.c
index 77c7324..80cc0f6 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -77,7 +77,7 @@ S_do_trans_simple(pTHX_ SV *sv)
 
         ulen = 1;
         /* Need to check this, otherwise 128..255 won't match */
-       c = utf8_to_uv(s, &ulen);
+       c = utf8_to_uv(s, &ulen, 0);
         if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
             matches++;
             if (ch < 0x80) 
@@ -125,7 +125,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
             I32 ulen;
             ulen = 1;
             if (hasutf)
-                c = utf8_to_uv(s,&ulen);
+                c = utf8_to_uv(s,&ulen, 0);
             else
                 c = *s;
             if (c < 0x100 && tbl[c] >= 0)
@@ -364,7 +364,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            else if (uv == none) {      /* "none" is unmapped character */
                I32 ulen;
-               *d++ = (U8)utf8_to_uv(s, &ulen);
+               *d++ = (U8)utf8_to_uv(s, &ulen, 0);
                s += ulen;
                puv = 0xfeedface;
                continue;
@@ -405,7 +405,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            else if (uv == none) {      /* "none" is unmapped character */
                I32 ulen;
-               *d++ = (U8)utf8_to_uv(s, &ulen);
+               *d++ = (U8)utf8_to_uv(s, &ulen, 0);
                s += ulen;
                continue;
            }
@@ -969,10 +969,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
        switch (optype) {
        case OP_BIT_AND:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, &ulen);
+               luc = utf8_to_uv((U8*)lc, &ulen, 0);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, &ulen);
+               ruc = utf8_to_uv((U8*)rc, &ulen, 0);
                rc += ulen;
                rulen -= ulen;
                duc = luc & ruc;
@@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            break;
        case OP_BIT_XOR:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, &ulen);
+               luc = utf8_to_uv((U8*)lc, &ulen, 0);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, &ulen);
+               ruc = utf8_to_uv((U8*)rc, &ulen, 0);
                rc += ulen;
                rulen -= ulen;
                duc = luc ^ ruc;
@@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            goto mop_up_utf;
        case OP_BIT_OR:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, &ulen);
+               luc = utf8_to_uv((U8*)lc, &ulen, 0);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, &ulen);
+               ruc = utf8_to_uv((U8*)rc, &ulen, 0);
                rc += ulen;
                rulen -= ulen;
                duc = luc | ruc;
diff --git a/embed.h b/embed.h
index 9bd72ad..7e030a9 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define utf8_hop(a,b)          Perl_utf8_hop(aTHX_ a,b)
 #define utf8_to_bytes(a,b)     Perl_utf8_to_bytes(aTHX_ a,b)
 #define bytes_to_utf8(a,b)     Perl_bytes_to_utf8(aTHX_ a,b)
-#define utf8_to_uv(a,b)                Perl_utf8_to_uv(aTHX_ a,b)
+#define utf8_to_uv(a,b,c)      Perl_utf8_to_uv(aTHX_ a,b,c)
 #define uv_to_utf8(a,b)                Perl_uv_to_utf8(aTHX_ a,b)
 #define vivify_defelem(a)      Perl_vivify_defelem(aTHX_ a)
 #define vivify_ref(a,b)                Perl_vivify_ref(aTHX_ a,b)
index c8e83f8..23214a3 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -2074,7 +2074,7 @@ Ap        |I32    |utf8_distance  |U8 *a|U8 *b
 Ap     |U8*    |utf8_hop       |U8 *s|I32 off
 ApM    |U8*    |utf8_to_bytes  |U8 *s|STRLEN *len
 ApM    |U8*    |bytes_to_utf8  |U8 *s|STRLEN *len
-Ap     |UV     |utf8_to_uv     |U8 *s|I32* retlen
+Ap     |UV     |utf8_to_uv     |U8 *s|I32* retlen|bool checking
 Ap     |U8*    |uv_to_utf8     |U8 *d|UV uv
 p      |void   |vivify_defelem |SV* sv
 p      |void   |vivify_ref     |SV* sv|U32 to_what
index cc0a86a..5f4a77e 100644 (file)
 #include "perl.h"
 #include "XSUB.h"
 
-MODULE = Encode                PACKAGE = Encode
+#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {   \
+                         Perl_croak("panic_unimplemented"); \
+                         } 
+UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
+UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
+
+void call_failure (SV *routine, U8* done, U8* dest, U8* orig);
+
+MODULE = Encode         PACKAGE = Encode
 
 PROTOTYPES: ENABLE
 
-SV *
+I32
 _bytes_to_utf8(sv, ...)
-       SV *    sv
+        SV *    sv
       CODE:
-       {
-         SV * encoding = 2 ? ST(1) : Nullsv;
-         RETVAL = &PL_sv_undef;
-       }
+        {
+          SV * encoding = items == 2 ? ST(1) : Nullsv;
+
+          if (encoding)
+            RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
+          else {
+            STRLEN len;
+            U8*    s = SvPV(sv, len);
+            U8*    converted;
+
+            converted = bytes_to_utf8(s, &len); /* This allocs */
+            sv_setpvn(sv, converted, len);
+            SvUTF8_on(sv); /* XXX Should we? */
+            Safefree(converted);                /* ... so free it */
+            RETVAL = len;
+          }
+        }
       OUTPUT:
-       RETVAL
+        RETVAL
 
-SV *
+I32
 _utf8_to_bytes(sv, ...)
-       SV *    sv
+        SV *    sv
       CODE:
-       {
-         SV * to    = items > 1 ? ST(1) : Nullsv;
-         SV * check = items > 2 ? ST(2) : Nullsv;
-         RETVAL = &PL_sv_undef;
+        {
+          SV * to    = items > 1 ? ST(1) : Nullsv;
+          SV * check = items > 2 ? ST(2) : Nullsv;
+          
+          if (to)
+            RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
+          else {
+            U8 *s;
+            STRLEN len;
+            s = SvPV(sv, len);
+
+            if (SvTRUE(check)) {
+              /* Must do things the slow way */
+              U8 *dest;
+              U8 *src  = savepv(s); /* We need a copy to pass to check() */ 
+              U8 *send = s + len;
+
+              New(83, dest, len, U8); /* I think */
+
+              while (s < send) {
+                if (*s < 0x80)
+                  *dest++ = *s++;
+                else {
+                  I32 ulen;
+                  I32 byte;
+                 I32 uv = *s++;
+                  
+                  /* Have to do it all ourselves because of error routine,
+                    aargh. */
+                 if (!(uv & 0x40))
+                   goto failure;
+                 if      (!(uv & 0x20)) { ulen = 2;  uv &= 0x1f; }
+                 else if (!(uv & 0x10)) { ulen = 3;  uv &= 0x0f; }
+                 else if (!(uv & 0x08)) { ulen = 4;  uv &= 0x07; }
+                 else if (!(uv & 0x04)) { ulen = 5;  uv &= 0x03; }
+                 else if (!(uv & 0x02)) { ulen = 6;  uv &= 0x01; }
+                 else if (!(uv & 0x01)) { ulen = 7;  uv = 0; }
+                 else                   { ulen = 13; uv = 0; }
+                 
+                 /* Note change to utf8.c variable naming, for variety */
+                 while (ulen--) {
+                   if ((*s & 0xc0) != 0x80)
+                     goto failure;
+                   
+                   else
+                     uv = (uv << 6) | (*s++ & 0x3f);
+                 } 
+                 if (uv > 256) {
+                 failure:
+                   call_failure(check, s, dest, src);
+                   /* Now what happens? */
+                 }
+                 *dest++ = (U8)uv;
+               }
+               }
+           } else
+             RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
+         }
        }
       OUTPUT:
        RETVAL
diff --git a/handy.h b/handy.h
index d82b1c6..c240c42 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -448,21 +448,21 @@ Converts the specified character to lowercase.
 #define isPSXSPC_utf8(c)       (isSPACE_utf8(c) ||(c) == '\f')
 #define isBLANK_utf8(c)                isBLANK(c) /* could be wrong */
 
-#define isALNUM_LC_utf8(p)     isALNUM_LC_uni(utf8_to_uv(p, 0))
-#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uni(utf8_to_uv(p, 0))
-#define isALPHA_LC_utf8(p)     isALPHA_LC_uni(utf8_to_uv(p, 0))
-#define isSPACE_LC_utf8(p)     isSPACE_LC_uni(utf8_to_uv(p, 0))
-#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uni(utf8_to_uv(p, 0))
-#define isUPPER_LC_utf8(p)     isUPPER_LC_uni(utf8_to_uv(p, 0))
-#define isLOWER_LC_utf8(p)     isLOWER_LC_uni(utf8_to_uv(p, 0))
-#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uni(utf8_to_uv(p, 0))
-#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uni(utf8_to_uv(p, 0))
-#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uni(utf8_to_uv(p, 0))
-#define isPRINT_LC_utf8(p)     isPRINT_LC_uni(utf8_to_uv(p, 0))
-#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uni(utf8_to_uv(p, 0))
-#define toUPPER_LC_utf8(p)     toUPPER_LC_uni(utf8_to_uv(p, 0))
-#define toTITLE_LC_utf8(p)     toTITLE_LC_uni(utf8_to_uv(p, 0))
-#define toLOWER_LC_utf8(p)     toLOWER_LC_uni(utf8_to_uv(p, 0))
+#define isALNUM_LC_utf8(p)     isALNUM_LC_uni(utf8_to_uv(p, 0, 0))
+#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uni(utf8_to_uv(p, 0, 0))
+#define isALPHA_LC_utf8(p)     isALPHA_LC_uni(utf8_to_uv(p, 0, 0))
+#define isSPACE_LC_utf8(p)     isSPACE_LC_uni(utf8_to_uv(p, 0, 0))
+#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uni(utf8_to_uv(p, 0, 0))
+#define isUPPER_LC_utf8(p)     isUPPER_LC_uni(utf8_to_uv(p, 0, 0))
+#define isLOWER_LC_utf8(p)     isLOWER_LC_uni(utf8_to_uv(p, 0, 0))
+#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uni(utf8_to_uv(p, 0, 0))
+#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uni(utf8_to_uv(p, 0, 0))
+#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uni(utf8_to_uv(p, 0, 0))
+#define isPRINT_LC_utf8(p)     isPRINT_LC_uni(utf8_to_uv(p, 0, 0))
+#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uni(utf8_to_uv(p, 0, 0))
+#define toUPPER_LC_utf8(p)     toUPPER_LC_uni(utf8_to_uv(p, 0, 0))
+#define toTITLE_LC_utf8(p)     toTITLE_LC_uni(utf8_to_uv(p, 0, 0))
+#define toLOWER_LC_utf8(p)     toLOWER_LC_uni(utf8_to_uv(p, 0, 0))
 
 #define isPSXSPC_LC_utf8(c)    (isSPACE_LC_utf8(c) ||(c) == '\f')
 #define isBLANK_LC_utf8(c)     isBLANK(c) /* could be wrong */
diff --git a/op.c b/op.c
index 74d67e3..d24396a 100644 (file)
--- a/op.c
+++ b/op.c
@@ -2656,7 +2656,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            qsort(cp, i, sizeof(U8*), utf8compare);
            for (j = 0; j < i; j++) {
                U8 *s = cp[j];
-               UV val = utf8_to_uv(s, &ulen);
+               UV val = utf8_to_uv(s, &ulen, 0);
                s += ulen;
                diff = val - nextmin;
                if (diff > 0) {
@@ -2669,7 +2669,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                    }
                }
                if (*s == 0xff)
-                   val = utf8_to_uv(s+1, &ulen);
+                   val = utf8_to_uv(s+1, &ulen, 0);
                if (val >= nextmin)
                    nextmin = val + 1;
            }
@@ -2696,10 +2696,10 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
        while (t < tend || tfirst <= tlast) {
            /* see if we need more "t" chars */
            if (tfirst > tlast) {
-               tfirst = (I32)utf8_to_uv(t, &ulen);
+               tfirst = (I32)utf8_to_uv(t, &ulen, 0);
                t += ulen;
                if (t < tend && *t == 0xff) {   /* illegal utf8 val indicates range */
-                   tlast = (I32)utf8_to_uv(++t, &ulen);
+                   tlast = (I32)utf8_to_uv(++t, &ulen, 0);
                    t += ulen;
                }
                else
@@ -2709,10 +2709,10 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            /* now see if we need more "r" chars */
            if (rfirst > rlast) {
                if (r < rend) {
-                   rfirst = (I32)utf8_to_uv(r, &ulen);
+                   rfirst = (I32)utf8_to_uv(r, &ulen, 0);
                    r += ulen;
                    if (r < rend && *r == 0xff) {       /* illegal utf8 val indicates range */
-                       rlast = (I32)utf8_to_uv(++r, &ulen);
+                       rlast = (I32)utf8_to_uv(++r, &ulen, 0);
                        r += ulen;
                    }
                    else
index b1feed3..ca2ba7c 100644 (file)
@@ -3182,10 +3182,18 @@ Found in file handy.h
 
 =item U8 *s
 
-Returns true if first C<len> bytes of the given string form valid a UTF8
-string, false otherwise.
+Returns the character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character, and the pointer C<s> will be
+advanced to the end of the character.
 
-       is_utf8_string  U8 *s(STRLEN len)
+If C<s> does not point to a well-formed UTF8 character, the behaviour
+is dependent on the value of C<checking>: if this is true, it is
+assumed that the caller will raise a warning, and this function will
+set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
+warning is produced.
+
+       utf8_to_uv      U8 *s(I32 *retlen, I32 checking)
 
 =for hackers
 Found in file utf8.c
@@ -3195,7 +3203,7 @@ Found in file utf8.c
 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 updates len to contain the new length.
-Returns zero on failure leaving the string and len unchanged
+Returns zero on failure, setting C<len> to -1.
 
        U8 *    utf8_to_bytes(U8 *s, STRLEN *len)
 
diff --git a/pp.c b/pp.c
index d5d5dd8..1c5a963 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -2195,7 +2195,7 @@ PP(pp_ord)
     I32 retlen;
 
     if ((*tmps & 0x80) && DO_UTF8(tmpsv))
-       value = utf8_to_uv(tmps, &retlen);
+       value = utf8_to_uv(tmps, &retlen, 0);
     else
        value = (UV)(*tmps & 255);
     XPUSHu(value);
@@ -2262,7 +2262,7 @@ PP(pp_ucfirst)
        I32 ulen;
        U8 tmpbuf[UTF8_MAXLEN];
        U8 *tend;
-       UV uv = utf8_to_uv(s, &ulen);
+       UV uv = utf8_to_uv(s, &ulen, 0);
 
        if (PL_op->op_private & OPpLOCALE) {
            TAINT;
@@ -2321,7 +2321,7 @@ PP(pp_lcfirst)
        I32 ulen;
        U8 tmpbuf[UTF8_MAXLEN];
        U8 *tend;
-       UV uv = utf8_to_uv(s, &ulen);
+       UV uv = utf8_to_uv(s, &ulen, 0);
 
        if (PL_op->op_private & OPpLOCALE) {
            TAINT;
@@ -2398,7 +2398,7 @@ PP(pp_uc)
                TAINT;
                SvTAINTED_on(TARG);
                while (s < send) {
-                   d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, &ulen)));
+                   d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, &ulen, 0)));
                    s += ulen;
                }
            }
@@ -2472,7 +2472,7 @@ PP(pp_lc)
                TAINT;
                SvTAINTED_on(TARG);
                while (s < send) {
-                   d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, &ulen)));
+                   d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, &ulen, 0)));
                    s += ulen;
                }
            }
@@ -3614,7 +3614,7 @@ PP(pp_unpack)
                len = strend - s;
            if (checksum) {
                while (len-- > 0 && s < strend) {
-                   auint = utf8_to_uv((U8*)s, &along);
+                   auint = utf8_to_uv((U8*)s, &along, 0);
                    s += along;
                    if (checksum > 32)
                        cdouble += (NV)auint;
@@ -3626,7 +3626,7 @@ PP(pp_unpack)
                EXTEND(SP, len);
                EXTEND_MORTAL(len);
                while (len-- > 0 && s < strend) {
-                   auint = utf8_to_uv((U8*)s, &along);
+                   auint = utf8_to_uv((U8*)s, &along, 0);
                    s += along;
                    sv = NEWSV(37, 0);
                    sv_setuv(sv, (UV)auint);
index 8981bb8..3cc74e5 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -2959,13 +2959,13 @@ PP(pp_require)
            U8 *s = (U8*)SvPVX(sv);
            U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
            if (s < end) {
-               rev = utf8_to_uv(s, &len);
+               rev = utf8_to_uv(s, &len, 0);
                s += len;
                if (s < end) {
-                   ver = utf8_to_uv(s, &len);
+                   ver = utf8_to_uv(s, &len, 0);
                    s += len;
                    if (s < end)
-                       sver = utf8_to_uv(s, &len);
+                       sver = utf8_to_uv(s, &len, 0);
                }
            }
            if (PERL_REVISION < rev
diff --git a/proto.h b/proto.h
index ed08b20..6a0229a 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -817,7 +817,7 @@ PERL_CALLCONV I32   Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
 PERL_CALLCONV U8*      Perl_utf8_hop(pTHX_ U8 *s, I32 off);
 PERL_CALLCONV U8*      Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
 PERL_CALLCONV U8*      Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV       Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
+PERL_CALLCONV UV       Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen, bool checking);
 PERL_CALLCONV U8*      Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
 PERL_CALLCONV void     Perl_vivify_defelem(pTHX_ SV* sv);
 PERL_CALLCONV void     Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
index 723cbbe..c60ab84 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2881,7 +2881,7 @@ tryagain:
                default:
                  normal_default:
                    if ((*p & 0xc0) == 0xc0 && UTF) {
-                       ender = utf8_to_uv((U8*)p, &numlen);
+                       ender = utf8_to_uv((U8*)p, &numlen, 0);
                        p += numlen;
                    }
                    else
@@ -3635,12 +3635,12 @@ S_regclassutf8(pTHX)
        namedclass = OOB_NAMEDCLASS;
        if (!range)
            rangebegin = PL_regcomp_parse;
-       value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
+       value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen, 0);
        PL_regcomp_parse += numlen;
        if (value == '[')
            namedclass = regpposixcc(value);
        else if (value == '\\') {
-           value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
+           value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen, 0);
            PL_regcomp_parse += numlen;
            /* Some compilers cannot handle switching on 64-bit integer
             * values, therefore value cannot be an UV.  Yes, this will
index 6401710..990791b 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -914,7 +914,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            PL_reg_flags |= RF_tainted;
            /* FALL THROUGH */
        case BOUNDUTF8:
-           tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
+           tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0, 0) : '\n';
            tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
            while (s < strend) {
                if (tmp == !(OP(c) == BOUNDUTF8 ?
@@ -950,7 +950,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            PL_reg_flags |= RF_tainted;
            /* FALL THROUGH */
        case NBOUNDUTF8:
-           tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
+           tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0, 0) : '\n';
            tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
            while (s < strend) {
                if (tmp == !(OP(c) == NBOUNDUTF8 ?
@@ -1995,7 +1995,7 @@ S_regmatch(pTHX_ regnode *prog)
                while (s < e) {
                    if (l >= PL_regeol)
                        sayNO;
-                   if (utf8_to_uv((U8*)s, 0) != (c1 ?
+                   if (utf8_to_uv((U8*)s, 0, 0) != (c1 ?
                                                  toLOWER_utf8((U8*)l) :
                                                  toLOWER_LC_utf8((U8*)l)))
                    {
@@ -2133,7 +2133,7 @@ S_regmatch(pTHX_ regnode *prog)
        case NBOUNDUTF8:
            /* was last char in word? */
            ln = (locinput != PL_regbol)
-               ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
+               ? utf8_to_uv(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
            if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
                ln = isALNUM_uni(ln);
                n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
diff --git a/sv.c b/sv.c
index 4da49cc..2c45cae 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -2398,6 +2398,7 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
 {
     int hicount;
     char *c;
+    char *s;
 
     if (!sv || !SvPOK(sv) || SvUTF8(sv))
        return;
@@ -2406,30 +2407,16 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
      * to signal if there are any hibit chars in the string
      */
     hicount = 0;
-    for (c = SvPVX(sv); c < SvEND(sv); c++) {
+    for (c = s = SvPVX(sv); c < SvEND(sv); c++) {
        if (*c & 0x80)
            hicount++;
     }
 
     if (hicount) {
-       char *src, *dst;
-       SvGROW(sv, SvCUR(sv) + hicount + 1);
-
-       src = SvEND(sv) - 1;
-       SvCUR_set(sv, SvCUR(sv) + hicount);
-       dst = SvEND(sv) - 1;
-
-       while (src < dst) {
-           if (*src & 0x80) {
-               dst--;
-               uv_to_utf8((U8*)dst, (U8)*src--);
-               dst--;
-           }
-           else {
-               *dst-- = *src--;
-           }
-       }
-
+       STRLEN len = SvCUR(sv) + 1; /* Plus the \0 */
+       SvPVX(sv) = bytes_to_utf8(s, &len);
+       SvCUR(sv) = len - 1;
+       Safefree(s); /* No longer using what was there before */
        SvUTF8_on(sv);
     }
 }
@@ -2450,46 +2437,14 @@ Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok)
 {
     if (SvPOK(sv) && SvUTF8(sv)) {
         char *c = SvPVX(sv);
-        char *first_hi = 0;
-        /* need to figure out if this is possible at all first */
-        while (c < SvEND(sv)) {
-            if (*c & 0x80) {
-                I32 len;
-                UV uv = utf8_to_uv((U8*)c, &len);
-                if (uv >= 256) {
-                   if (fail_ok)
-                       return FALSE;
-                   else {
-                       /* XXX might want to make a callback here instead */
-                       Perl_croak(aTHX_ "Big byte");
-                   }
-               }
-                if (!first_hi)
-                    first_hi = c;
-                c += len;
-            }
-            else {
-                c++;
-            }
-        }
-
-        if (first_hi) {
-            char *src = first_hi;
-            char *dst = first_hi;
-            while (src < SvEND(sv)) {
-                if (*src & 0x80) {
-                    I32 len;
-                    U8 u = (U8)utf8_to_uv((U8*)src, &len);
-                    *dst++ = u;
-                    src += len;
-                }
-                else {
-                    *dst++ = *src++;
-                }
-            }
-            SvCUR_set(sv, dst - SvPVX(sv));
-        }
-        SvUTF8_off(sv);
+       STRLEN len = SvCUR(sv);
+        if (!utf8_to_bytes(c, &len)) {
+           if (fail_ok)
+               return FALSE;
+           else
+               Perl_croak("big byte");
+       }
+       SvCUR(sv) = len - 1;
     }
     return TRUE;
 }
@@ -2523,24 +2478,15 @@ Perl_sv_utf8_decode(pTHX_ register SV *sv)
          * we want to make sure everything inside is valid utf8 first.
          */
         c = SvPVX(sv);
+       if (!is_utf8_string(c,SvCUR(c)+1))
+           return FALSE;
+
         while (c < SvEND(sv)) {
-            if (*c & 0x80) {
-                I32 len;
-                (void)utf8_to_uv((U8*)c, &len);
-                if (len == 1) {
-                    /* bad utf8 */
-                    return FALSE;
-                }
-                c += len;
-                has_utf = TRUE;
-            }
-            else {
-                c++;
-            }
+            if (*c++ & 0x80) {
+               SvUTF8_on(sv);
+               break;
+           }
         }
-
-        if (has_utf)
-            SvUTF8_on(sv);
     }
     return TRUE;
 }
@@ -6373,7 +6319,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
                    break;
                }
                if (utf)
-                   iv = (IV)utf8_to_uv(vecstr, &ulen);
+                   iv = (IV)utf8_to_uv(vecstr, &ulen, 0);
                else {
                    iv = *vecstr;
                    ulen = 1;
@@ -6455,7 +6401,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
                    break;
                }
                if (utf)
-                   uv = utf8_to_uv(vecstr, &ulen);
+                   uv = utf8_to_uv(vecstr, &ulen, 0);
                else {
                    uv = *vecstr;
                    ulen = 1;
diff --git a/toke.c b/toke.c
index e5f737b..31f5f0a 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -812,7 +812,7 @@ Perl_str_to_version(pTHX_ SV *sv)
        I32 skip;
        UV n;
        if (utf)
-           n = utf8_to_uv((U8*)start, &skip);
+           n = utf8_to_uv((U8*)start, &skip, 0);
        else {
            n = *(U8*)start;
            skip = 1;
@@ -1323,7 +1323,7 @@ S_scan_const(pTHX_ char *start)
        /* (now in tr/// code again) */
 
        if (*s & 0x80 && thisutf) {
-          (void)utf8_to_uv((U8*)s, &len);
+          (void)utf8_to_uv((U8*)s, &len, 0);
           if (len == 1) {
               /* illegal UTF8, make it valid */
               char *old_pvx = SvPVX(sv);
diff --git a/utf8.c b/utf8.c
index a9600e9..d97a8b0 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -158,8 +158,25 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
     return 1;
 }
 
+/*
+=for apidoc Am|utf8_to_uv|U8 *s|I32 *retlen|I32 checking
+
+Returns the character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character, and the pointer C<s> will be
+advanced to the end of the character.
+
+If C<s> does not point to a well-formed UTF8 character, the behaviour
+is dependent on the value of C<checking>: if this is true, it is
+assumed that the caller will raise a warning, and this function will
+set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
+warning is produced.
+
+=cut
+*/
+
 UV
-Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
+Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen, bool checking)
 {
     UV uv = *s;
     int len;
@@ -170,6 +187,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
     }
     if (!(uv & 0x40)) {
         dTHR;
+       if (checking && retlen) {
+           *retlen = -1;
+           return 0;
+       }
+
        if (ckWARN_d(WARN_UTF8))     
            Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
        if (retlen)
@@ -192,6 +214,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
     while (len--) {
        if ((*s & 0xc0) != 0x80) {
             dTHR;
+           if (checking && retlen) {
+               *retlen = -1;
+               return 0;
+            }
+
            if (ckWARN_d(WARN_UTF8))     
                Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
            if (retlen)
@@ -253,7 +280,7 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 updates len to contain the new length.
-Returns zero on failure leaving the string and len unchanged
+Returns zero on failure, setting C<len> to -1.
 
 =cut
 */
@@ -273,8 +300,10 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
     while (s < send) {
        U8 c = *s++;
         if (c >= 0x80 &&
-           ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2)))
+           ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
+           *len = -1;
            return 0;    
+       }
     }
     s = save;
     while (s < send) {
@@ -282,7 +311,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
             *d++ = *s++;
         else {
             I32 ulen;
-            *d++ = (U8)utf8_to_uv(s, &ulen);
+            *d++ = (U8)utf8_to_uv(s, &ulen, 0);
             s += ulen;
         }
     }
@@ -810,7 +839,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
     if (!PL_utf8_toupper)
        PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_toupper, p);
-    return uv ? uv : utf8_to_uv(p,0);
+    return uv ? uv : utf8_to_uv(p,0,0);
 }
 
 UV
@@ -821,7 +850,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
     if (!PL_utf8_totitle)
        PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_totitle, p);
-    return uv ? uv : utf8_to_uv(p,0);
+    return uv ? uv : utf8_to_uv(p,0,0);
 }
 
 UV
@@ -832,7 +861,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
     if (!PL_utf8_tolower)
        PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_tolower, p);
-    return uv ? uv : utf8_to_uv(p,0);
+    return uv ? uv : utf8_to_uv(p,0,0);
 }
 
 /* a "swash" is a swatch hash */
@@ -922,7 +951,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
            PUSHMARK(SP);
            EXTEND(SP,3);
            PUSHs((SV*)sv);
-           PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0) & ~(needents - 1))));
+           PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0, 0) & ~(needents - 1))));
            PUSHs(sv_2mortal(newSViv(needents)));
            PUTBACK;
            if (call_method("SWASHGET", G_SCALAR))