Make the UTF-8 decoding stricter and more verbose when
Jarkko Hietaniemi [Tue, 24 Oct 2000 02:55:33 +0000 (02:55 +0000)]
malformation happens.  This involved adding an argument
to utf8_to_uv_chk(), which involved changing its prototype,
and prefer STRLEN over I32 for the UTF-8 length, which as
a domino effect necessitated changing the prototypes of
scan_bin(), scan_oct(), scan_hex(), and reg_uni().
The stricter UTF-8 decoding checking uses Markus Kuhn's
UTF-8 Decode Stress Tester from
http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

p4raw-id: //depot/perl@7416

22 files changed:
doop.c
embed.h
embed.pl
handy.h
op.c
perl.c
perlapi.c
pod/perlapi.pod
pod/perldiag.pod
pod/perlunicode.pod
pp.c
pp_ctl.c
proto.h
regcomp.c
regexec.c
sv.c
t/pragma/utf8.t
t/pragma/warn/utf8
toke.c
utf8.c
utf8.h
util.c

diff --git a/doop.c b/doop.c
index b75ffaa..3cd8f07 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -72,12 +72,12 @@ S_do_trans_simple(pTHX_ SV *sv)
     Newz(0, d, len*2+1, U8);
     dstart = d;
     while (s < send) {
-        I32 ulen;
+        STRLEN ulen;
         short c;
 
         ulen = 1;
         /* Need to check this, otherwise 128..255 won't match */
-       c = utf8_to_uv_chk(s, &ulen, 0);
+       c = utf8_to_uv_chk(s, send - s, &ulen, 0);
         if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
             matches++;
             if (ch < 0x80)
@@ -122,10 +122,10 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
             s += UTF8SKIP(s);
         else {
             UV c;
-            I32 ulen;
+            STRLEN ulen;
             ulen = 1;
             if (hasutf)
-                c = utf8_to_uv_chk(s,&ulen, 0);
+                c = utf8_to_uv_chk(s, send - s, &ulen, 0);
             else
                 c = *s;
             if (c < 0x100 && tbl[c] >= 0)
@@ -363,8 +363,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
-               I32 ulen;
-               *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+               STRLEN ulen;
+               *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
                s += ulen;
                puv = 0xfeedface;
                continue;
@@ -404,8 +404,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
-               I32 ulen;
-               *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+               STRLEN ulen;
+               *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
                s += ulen;
                continue;
            }
@@ -964,15 +964,15 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
        char *dcsave = dc;
        STRLEN lulen = leftlen;
        STRLEN rulen = rightlen;
-       I32 ulen;
+       STRLEN ulen;
 
        switch (optype) {
        case OP_BIT_AND:
            while (lulen && rulen) {
-               luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+               luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+               ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
                rc += ulen;
                rulen -= ulen;
                duc = luc & ruc;
@@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            break;
        case OP_BIT_XOR:
            while (lulen && rulen) {
-               luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+               luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+               ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
                rc += ulen;
                rulen -= ulen;
                duc = luc ^ ruc;
@@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            goto mop_up_utf;
        case OP_BIT_OR:
            while (lulen && rulen) {
-               luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+               luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+               ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
                rc += ulen;
                rulen -= ulen;
                duc = luc | ruc;
diff --git a/embed.h b/embed.h
index b4c8f6a..eab037f 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define utf8_to_bytes(a,b)     Perl_utf8_to_bytes(aTHX_ a,b)
 #define bytes_to_utf8(a,b)     Perl_bytes_to_utf8(aTHX_ a,b)
 #define utf8_to_uv(a,b)                Perl_utf8_to_uv(aTHX_ a,b)
-#define utf8_to_uv_chk(a,b,c)  Perl_utf8_to_uv_chk(aTHX_ a,b,c)
+#define utf8_to_uv_chk(a,b,c,d)        Perl_utf8_to_uv_chk(aTHX_ a,b,c,d)
 #define uv_to_utf8(a,b)                Perl_uv_to_utf8(aTHX_ a,b)
 #define vivify_defelem(a)      Perl_vivify_defelem(aTHX_ a)
 #define vivify_ref(a,b)                Perl_vivify_ref(aTHX_ a,b)
index f685042..6adb275 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1941,10 +1941,10 @@ p       |OP*    |scalar         |OP* o
 p      |OP*    |scalarkids     |OP* o
 p      |OP*    |scalarseq      |OP* o
 p      |OP*    |scalarvoid     |OP* o
-Ap     |NV     |scan_bin       |char* start|I32 len|I32* retlen
-Ap     |NV     |scan_hex       |char* start|I32 len|I32* retlen
+Ap     |NV     |scan_bin       |char* start|STRLEN len|STRLEN* retlen
+Ap     |NV     |scan_hex       |char* start|STRLEN len|STRLEN* retlen
 Ap     |char*  |scan_num       |char* s|YYSTYPE *lvalp
-Ap     |NV     |scan_oct       |char* start|I32 len|I32* retlen
+Ap     |NV     |scan_oct       |char* start|STRLEN len|STRLEN* retlen
 p      |OP*    |scope          |OP* o
 Ap     |char*  |screaminstr    |SV* bigsv|SV* littlesv|I32 start_shift \
                                |I32 end_shift|I32 *state|I32 last
@@ -2074,8 +2074,8 @@ Ap        |I32    |utf8_distance  |U8 *a|U8 *b
 Ap     |U8*    |utf8_hop       |U8 *s|I32 off
 ApM    |U8*    |utf8_to_bytes  |U8 *s|STRLEN *len
 ApM    |U8*    |bytes_to_utf8  |U8 *s|STRLEN *len
-Ap     |UV     |utf8_to_uv     |U8 *s|I32* retlen
-Ap     |UV     |utf8_to_uv_chk |U8 *s|I32* retlen|bool checking
+Ap     |UV     |utf8_to_uv     |U8 *s|STRLEN* retlen
+Ap     |UV     |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|bool checking
 Ap     |U8*    |uv_to_utf8     |U8 *d|UV uv
 p      |void   |vivify_defelem |SV* sv
 p      |void   |vivify_ref     |SV* sv|U32 to_what
@@ -2358,7 +2358,7 @@ s |regnode*|reg           |I32|I32 *
 s      |regnode*|reganode      |U8|U32
 s      |regnode*|regatom       |I32 *
 s      |regnode*|regbranch     |I32 *|I32
-s      |void   |reguni         |UV|char *|I32*
+s      |void   |reguni         |UV|char *|STRLEN*
 s      |regnode*|regclass
 s      |regnode*|regclassutf8
 s      |I32    |regcurly       |char *
diff --git a/handy.h b/handy.h
index f0e39af..7341012 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -448,21 +448,23 @@ Converts the specified character to lowercase.
 #define isPSXSPC_utf8(c)       (isSPACE_utf8(c) ||(c) == '\f')
 #define isBLANK_utf8(c)                isBLANK(c) /* could be wrong */
 
-#define isALNUM_LC_utf8(p)     isALNUM_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALPHA_LC_utf8(p)     isALPHA_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isSPACE_LC_utf8(p)     isSPACE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isUPPER_LC_utf8(p)     isUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isLOWER_LC_utf8(p)     isLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPRINT_LC_utf8(p)     isPRINT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toUPPER_LC_utf8(p)     toUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toTITLE_LC_utf8(p)     toTITLE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toLOWER_LC_utf8(p)     toLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
+#define STRLEN_MAX     ((STRLEN)-1)
+
+#define isALNUM_LC_utf8(p)     isALNUM_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALPHA_LC_utf8(p)     isALPHA_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isSPACE_LC_utf8(p)     isSPACE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isUPPER_LC_utf8(p)     isUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isLOWER_LC_utf8(p)     isLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPRINT_LC_utf8(p)     isPRINT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toUPPER_LC_utf8(p)     toUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toTITLE_LC_utf8(p)     toTITLE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toLOWER_LC_utf8(p)     toLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
 
 #define isPSXSPC_LC_utf8(c)    (isSPACE_LC_utf8(c) ||(c) == '\f')
 #define isBLANK_LC_utf8(c)     isBLANK(c) /* could be wrong */
diff --git a/op.c b/op.c
index 6ef4bfe..9e256a3 100644 (file)
--- a/op.c
+++ b/op.c
@@ -2621,7 +2621,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
        SV* transv = 0;
        U8* tend = t + tlen;
        U8* rend = r + rlen;
-       I32 ulen;
+       STRLEN ulen;
        U32 tfirst = 1;
        U32 tlast = 0;
        I32 tdiff;
@@ -2641,6 +2641,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
        if (complement) {
            U8 tmpbuf[UTF8_MAXLEN];
            U8** cp;
+           I32* cl;
            UV nextmin = 0;
            New(1109, cp, tlen, U8*);
            i = 0;
@@ -2656,7 +2657,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            qsort(cp, i, sizeof(U8*), utf8compare);
            for (j = 0; j < i; j++) {
                U8 *s = cp[j];
-               UV val = utf8_to_uv_chk(s, &ulen, 0);
+               I32 cur = j < i ? cp[j+1] - s : tend - s;
+               UV  val = utf8_to_uv_chk(s, cur, &ulen, 0);
                s += ulen;
                diff = val - nextmin;
                if (diff > 0) {
@@ -2669,7 +2671,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                    }
                }
                if (*s == 0xff)
-                   val = utf8_to_uv_chk(s+1, &ulen, 0);
+                   val = utf8_to_uv_chk(s+1, cur - 1, &ulen, 0);
                if (val >= nextmin)
                    nextmin = val + 1;
            }
@@ -2696,10 +2698,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
        while (t < tend || tfirst <= tlast) {
            /* see if we need more "t" chars */
            if (tfirst > tlast) {
-               tfirst = (I32)utf8_to_uv_chk(t, &ulen, 0);
+               tfirst = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
                t += ulen;
                if (t < tend && *t == 0xff) {   /* illegal utf8 val indicates range */
-                   tlast = (I32)utf8_to_uv_chk(++t, &ulen, 0);
+                   t++;
+                   tlast = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
                    t += ulen;
                }
                else
@@ -2709,10 +2712,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            /* now see if we need more "r" chars */
            if (rfirst > rlast) {
                if (r < rend) {
-                   rfirst = (I32)utf8_to_uv_chk(r, &ulen, 0);
+                   rfirst = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
                    r += ulen;
                    if (r < rend && *r == 0xff) {       /* illegal utf8 val indicates range */
-                       rlast = (I32)utf8_to_uv_chk(++r, &ulen, 0);
+                       r++;
+                       rlast = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
                        r += ulen;
                    }
                    else
diff --git a/perl.c b/perl.c
index cb2cb14..3d874ca 100644 (file)
--- a/perl.c
+++ b/perl.c
@@ -2025,7 +2025,7 @@ NULL
 char *
 Perl_moreswitches(pTHX_ char *s)
 {
-    I32 numlen;
+    STRLEN numlen;
     U32 rschar;
 
     switch (*s) {
index 3cfe4e0..1f1343d 100644 (file)
--- a/perlapi.c
+++ b/perlapi.c
@@ -2638,14 +2638,14 @@ Perl_save_threadsv(pTHXo_ PADOFFSET i)
 
 #undef  Perl_scan_bin
 NV
-Perl_scan_bin(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_bin(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_scan_bin(start, len, retlen);
 }
 
 #undef  Perl_scan_hex
 NV
-Perl_scan_hex(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_hex(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_scan_hex(start, len, retlen);
 }
@@ -2659,7 +2659,7 @@ Perl_scan_num(pTHXo_ char* s, YYSTYPE *lvalp)
 
 #undef  Perl_scan_oct
 NV
-Perl_scan_oct(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_oct(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_scan_oct(start, len, retlen);
 }
@@ -3380,16 +3380,16 @@ Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len)
 
 #undef  Perl_utf8_to_uv
 UV
-Perl_utf8_to_uv(pTHXo_ U8 *s, I32* retlen)
+Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, retlen);
 }
 
 #undef  Perl_utf8_to_uv_chk
 UV
-Perl_utf8_to_uv_chk(pTHXo_ U8 *s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking)
 {
-    return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, retlen, checking);
+    return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, curlen, retlen, checking);
 }
 
 #undef  Perl_uv_to_utf8
index a5178e8..730d89f 100644 (file)
@@ -3225,7 +3225,7 @@ advanced to the end of the character.
 If C<s> does not point to a well-formed UTF8 character, an optional UTF8
 warning is produced.
 
-       U8* s   utf8_to_uv(I32 *retlen)
+       U8* s   utf8_to_uv(STRLEN *retlen)
 
 =for hackers
 Found in file utf8.c
@@ -3233,9 +3233,9 @@ Found in file utf8.c
 =item utf8_to_uv_chk
 
 Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
 
 If C<s> does not point to a well-formed UTF8 character, the behaviour
 is dependent on the value of C<checking>: if this is true, it is
@@ -3243,7 +3243,7 @@ assumed that the caller will raise a warning, and this function will
 set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
 warning is produced.
 
-       U8* s   utf8_to_uv_chk(I32 *retlen, I32 checking)
+       U8* s   utf8_to_uv_chk(STRLEN curlen, I32 *retlen, I32 checking)
 
 =for hackers
 Found in file utf8.c
index 480ab84..139bab9 100644 (file)
@@ -1789,6 +1789,10 @@ a builtin library search path, prefix2 is substituted.  The error may
 appear if components are not found, or are too long.  See
 "PERLLIB_PREFIX" in L<perlos2>.
 
+=item Malformed UTF-8 character (%s)
+
+Perl detected something that didn't comply with UTF-8 encoding rules.
+
 =item Malformed UTF-16 surrogate
 
 Perl thought it was reading UTF-16 encoded character data but while
index c9954d8..145c953 100644 (file)
@@ -71,6 +71,11 @@ on Windows.
 Regardless of the above, the C<bytes> pragma can always be used to force
 byte semantics in a particular lexical scope.  See L<bytes>.
 
+One effect of the C<utf8> pragma is that the internal UTF-8 decoding
+becomes stricter so that the character 0xFFFF (UTF-8 bytes 0xEF 0xBF
+0xBF), and the bytes 0xFE and 0xFF, start to cause warnings if they
+appear in the data.
+
 The C<utf8> pragma is primarily a compatibility device that enables
 recognition of UTF-8 in literals encountered by the parser.  It may also
 be used for enabling some of the more experimental Unicode support features.
diff --git a/pp.c b/pp.c
index 98d31cb..35f5956 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -1480,7 +1480,7 @@ PP(pp_complement)
          STRLEN targlen = 0;
          U8 *result;
          U8 *send;
-         I32 l;
+         STRLEN l;
 
          send = tmps + len;
          while (tmps < send) {
@@ -1944,7 +1944,7 @@ PP(pp_hex)
 {
     djSP; dTARGET;
     char *tmps;
-    I32 argtype;
+    STRLEN argtype;
     STRLEN n_a;
 
     tmps = POPpx;
@@ -1957,7 +1957,7 @@ PP(pp_oct)
 {
     djSP; dTARGET;
     NV value;
-    I32 argtype;
+    STRLEN argtype;
     char *tmps;
     STRLEN n_a;
 
@@ -2234,13 +2234,13 @@ PP(pp_ord)
 {
     djSP; dTARGET;
     UV value;
-    STRLEN n_a;
     SV *tmpsv = POPs;
-    U8 *tmps = (U8*)SvPVx(tmpsv,n_a);
-    I32 retlen;
+    STRLEN len;
+    U8 *tmps = (U8*)SvPVx(tmpsv, len);
+    STRLEN retlen;
 
     if ((*tmps & 0x80) && DO_UTF8(tmpsv))
-       value = utf8_to_uv_chk(tmps, &retlen, 0);
+       value = utf8_to_uv_chk(tmps, len, &retlen, 0);
     else
        value = (UV)(*tmps & 255);
     XPUSHu(value);
@@ -2304,10 +2304,10 @@ PP(pp_ucfirst)
     STRLEN slen;
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
-       I32 ulen;
+       STRLEN ulen;
        U8 tmpbuf[UTF8_MAXLEN];
        U8 *tend;
-       UV uv = utf8_to_uv_chk(s, &ulen, 0);
+       UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
 
        if (PL_op->op_private & OPpLOCALE) {
            TAINT;
@@ -2363,10 +2363,10 @@ PP(pp_lcfirst)
     STRLEN slen;
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
-       I32 ulen;
+       STRLEN ulen;
        U8 tmpbuf[UTF8_MAXLEN];
        U8 *tend;
-       UV uv = utf8_to_uv_chk(s, &ulen, 0);
+       UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
 
        if (PL_op->op_private & OPpLOCALE) {
            TAINT;
@@ -2423,7 +2423,7 @@ PP(pp_uc)
 
     if (DO_UTF8(sv)) {
        dTARGET;
-       I32 ulen;
+       STRLEN ulen;
        register U8 *d;
        U8 *send;
 
@@ -2443,7 +2443,7 @@ PP(pp_uc)
                TAINT;
                SvTAINTED_on(TARG);
                while (s < send) {
-                   d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+                   d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
                    s += ulen;
                }
            }
@@ -2497,7 +2497,7 @@ PP(pp_lc)
 
     if (DO_UTF8(sv)) {
        dTARGET;
-       I32 ulen;
+       STRLEN ulen;
        register U8 *d;
        U8 *send;
 
@@ -2517,7 +2517,7 @@ PP(pp_lc)
                TAINT;
                SvTAINTED_on(TARG);
                while (s < send) {
-                   d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+                   d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
                    s += ulen;
                }
            }
@@ -3363,7 +3363,7 @@ PP(pp_unpack)
     /* These must not be in registers: */
     I16 ashort;
     int aint;
-    I32 along;
+    STRLEN along;
 #ifdef HAS_QUAD
     Quad_t aquad;
 #endif
@@ -3659,7 +3659,7 @@ PP(pp_unpack)
                len = strend - s;
            if (checksum) {
                while (len-- > 0 && s < strend) {
-                   auint = utf8_to_uv_chk((U8*)s, &along, 0);
+                   auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
                    s += along;
                    if (checksum > 32)
                        cdouble += (NV)auint;
@@ -3671,7 +3671,7 @@ PP(pp_unpack)
                EXTEND(SP, len);
                EXTEND_MORTAL(len);
                while (len-- > 0 && s < strend) {
-                   auint = utf8_to_uv_chk((U8*)s, &along, 0);
+                   auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
                    s += along;
                    sv = NEWSV(37, 0);
                    sv_setuv(sv, (UV)auint);
index cf2000e..33f91ee 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -2971,17 +2971,17 @@ PP(pp_require)
     if (SvNIOKp(sv)) {
        if (SvPOK(sv) && SvNOK(sv)) {           /* require v5.6.1 */
            UV rev = 0, ver = 0, sver = 0;
-           I32 len;
+           STRLEN len;
            U8 *s = (U8*)SvPVX(sv);
            U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
            if (s < end) {
-               rev = utf8_to_uv_chk(s, &len, 0);
+               rev = utf8_to_uv_chk(s, end - s, &len, 0);
                s += len;
                if (s < end) {
-                   ver = utf8_to_uv_chk(s, &len, 0);
+                   ver = utf8_to_uv_chk(s, end - s, &len, 0);
                    s += len;
                    if (s < end)
-                       sver = utf8_to_uv_chk(s, &len, 0);
+                       sver = utf8_to_uv_chk(s, end - s, &len, 0);
                }
            }
            if (PERL_REVISION < rev
diff --git a/proto.h b/proto.h
index 2713916..7624255 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -677,10 +677,10 @@ PERL_CALLCONV OP* Perl_scalar(pTHX_ OP* o);
 PERL_CALLCONV OP*      Perl_scalarkids(pTHX_ OP* o);
 PERL_CALLCONV OP*      Perl_scalarseq(pTHX_ OP* o);
 PERL_CALLCONV OP*      Perl_scalarvoid(pTHX_ OP* o);
-PERL_CALLCONV NV       Perl_scan_bin(pTHX_ char* start, I32 len, I32* retlen);
-PERL_CALLCONV NV       Perl_scan_hex(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV       Perl_scan_bin(pTHX_ char* start, STRLEN len, STRLEN* retlen);
+PERL_CALLCONV NV       Perl_scan_hex(pTHX_ char* start, STRLEN len, STRLEN* retlen);
 PERL_CALLCONV char*    Perl_scan_num(pTHX_ char* s, YYSTYPE *lvalp);
-PERL_CALLCONV NV       Perl_scan_oct(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV       Perl_scan_oct(pTHX_ char* start, STRLEN len, STRLEN* retlen);
 PERL_CALLCONV OP*      Perl_scope(pTHX_ OP* o);
 PERL_CALLCONV char*    Perl_screaminstr(pTHX_ SV* bigsv, SV* littlesv, I32 start_shift, I32 end_shift, I32 *state, I32 last);
 #if !defined(VMS)
@@ -809,8 +809,8 @@ PERL_CALLCONV I32   Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
 PERL_CALLCONV U8*      Perl_utf8_hop(pTHX_ U8 *s, I32 off);
 PERL_CALLCONV U8*      Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
 PERL_CALLCONV U8*      Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV       Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
-PERL_CALLCONV UV       Perl_utf8_to_uv_chk(pTHX_ U8 *s, I32* retlen, bool checking);
+PERL_CALLCONV UV       Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen);
+PERL_CALLCONV UV       Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking);
 PERL_CALLCONV U8*      Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
 PERL_CALLCONV void     Perl_vivify_defelem(pTHX_ SV* sv);
 PERL_CALLCONV void     Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
@@ -1103,7 +1103,7 @@ STATIC regnode*   S_reg(pTHX_ I32, I32 *);
 STATIC regnode*        S_reganode(pTHX_ U8, U32);
 STATIC regnode*        S_regatom(pTHX_ I32 *);
 STATIC regnode*        S_regbranch(pTHX_ I32 *, I32);
-STATIC void    S_reguni(pTHX_ UV, char *, I32*);
+STATIC void    S_reguni(pTHX_ UV, char *, STRLEN*);
 STATIC regnode*        S_regclass(pTHX);
 STATIC regnode*        S_regclassutf8(pTHX);
 STATIC I32     S_regcurly(pTHX_ char *);
index e7042ea..3f2b10c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2742,11 +2742,11 @@ tryagain:
        /* FALL THROUGH */
 
     default: {
-           register I32 len;
+           register STRLEN len;
            register UV ender;
            register char *p;
            char *oldp, *s;
-           I32 numlen;
+           STRLEN numlen;
 
            PL_regcomp_parse++;
 
@@ -2884,7 +2884,8 @@ tryagain:
                default:
                  normal_default:
                    if ((*p & 0xc0) == 0xc0 && UTF) {
-                       ender = utf8_to_uv_chk((U8*)p, &numlen, 0);
+                       ender = utf8_to_uv_chk((U8*)p, PL_regxend - p,
+                                              &numlen, 0);
                        p += numlen;
                    }
                    else
@@ -3128,7 +3129,7 @@ S_regclass(pTHX)
     register I32 lastvalue = OOB_CHAR8;
     register I32 range = 0;
     register regnode *ret;
-    I32 numlen;
+    STRLEN numlen;
     I32 namedclass;
     char *rangebegin;
     bool need_class = 0;
@@ -3606,7 +3607,7 @@ S_regclassutf8(pTHX)
     register U32 lastvalue = OOB_UTF8;
     register I32 range = 0;
     register regnode *ret;
-    I32 numlen;
+    STRLEN numlen;
     I32 n;
     SV *listsv;
     U8 flags = 0;
@@ -3638,12 +3639,16 @@ S_regclassutf8(pTHX)
        namedclass = OOB_NAMEDCLASS;
        if (!range)
            rangebegin = PL_regcomp_parse;
-       value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+       value = utf8_to_uv_chk((U8*)PL_regcomp_parse,
+                              PL_regxend - PL_regcomp_parse,
+                              &numlen, 0);
        PL_regcomp_parse += numlen;
        if (value == '[')
            namedclass = regpposixcc(value);
        else if (value == '\\') {
-           value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+           value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse,
+                                       PL_regxend - PL_regcomp_parse,
+                                       &numlen, 0);
            PL_regcomp_parse += numlen;
            /* Some compilers cannot handle switching on 64-bit integer
             * values, therefore value cannot be an UV.  Yes, this will
@@ -3937,7 +3942,7 @@ S_reganode(pTHX_ U8 op, U32 arg)
 - reguni - emit (if appropriate) a Unicode character
 */
 STATIC void
-S_reguni(pTHX_ UV uv, char* s, I32* lenp)
+S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp)
 {
     dTHR;
     if (SIZE_ONLY) {
index 6e046f3..350f432 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -917,7 +917,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            PL_reg_flags |= RF_tainted;
            /* FALL THROUGH */
        case BOUNDUTF8:
-           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+                                                       strend - s,
+                                                       0, 0) : '\n';
            tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
            while (s < strend) {
                if (tmp == !(OP(c) == BOUNDUTF8 ?
@@ -953,7 +955,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            PL_reg_flags |= RF_tainted;
            /* FALL THROUGH */
        case NBOUNDUTF8:
-           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+                                                       strend - s,
+                                                       0, 0) : '\n';
            tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
            while (s < strend) {
                if (tmp == !(OP(c) == NBOUNDUTF8 ?
@@ -1998,7 +2002,7 @@ S_regmatch(pTHX_ regnode *prog)
                while (s < e) {
                    if (l >= PL_regeol)
                        sayNO;
-                   if (utf8_to_uv_chk((U8*)s, 0, 0) != (c1 ?
+                   if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ?
                                                  toLOWER_utf8((U8*)l) :
                                                  toLOWER_LC_utf8((U8*)l)))
                    {
@@ -2136,7 +2140,8 @@ S_regmatch(pTHX_ regnode *prog)
        case NBOUNDUTF8:
            /* was last char in word? */
            ln = (locinput != PL_regbol)
-               ? utf8_to_uv_chk(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
+               ? utf8_to_uv_chk(reghop((U8*)locinput, -1),
+                                PL_regeol - locinput, 0, 0) : PL_regprev;
            if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
                ln = isALNUM_uni(ln);
                n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
diff --git a/sv.c b/sv.c
index 1fac162..2790cfd 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -6358,13 +6358,13 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
        case 'd':
        case 'i':
            if (vectorize) {
-               I32 ulen;
+               STRLEN ulen;
                if (!veclen) {
                    vectorize = FALSE;
                    break;
                }
                if (utf)
-                   iv = (IV)utf8_to_uv_chk(vecstr, &ulen, 0);
+                   iv = (IV)utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
                else {
                    iv = *vecstr;
                    ulen = 1;
@@ -6440,14 +6440,14 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 
        uns_integer:
            if (vectorize) {
-               I32 ulen;
+               STRLEN ulen;
        vector:
                if (!veclen) {
                    vectorize = FALSE;
                    break;
                }
                if (utf)
-                   uv = utf8_to_uv_chk(vecstr, &ulen, 0);
+                   uv = utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
                else {
                    uv = *vecstr;
                    ulen = 1;
index 7224a74..e61baad 100755 (executable)
@@ -10,7 +10,7 @@ BEGIN {
     }
 }
 
-print "1..103\n";
+print "1..181\n";
 
 my $test = 1;
 
@@ -559,3 +559,170 @@ sub nok_bytes {
     print "ok $test\n";
     $test++;
 }
+
+# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
+# version dated 2000-09-02. 
+
+my @MK = split(/\n/, <<__EOMK__);
+1      Correct UTF-8
+1.1.1 y "κόσμε"  -               11      ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5        5
+2      Boundary conditions 
+2.1    First possible sequence of certain length
+2.1.1 y "\0"                    0               1       00      1
+2.1.2 y "\80"                   80              2       c2:80   1
+2.1.3 y "ࠀ"          800             3       e0:a0:80        1
+2.1.4 y "𐀀"         10000           4       f0:90:80:80     1
+2.1.5 y ""        200000          5       f8:88:80:80:80  1
+2.1.6 y ""       4000000         6       fc:84:80:80:80:80       1
+2.2    Last possible sequence of certain length
+2.2.1 y "\7f"                    7f              1       7f      1
+2.2.2 y "߿"                   7ff             2       df:bf   1
+# The ffff is legal unless under use utf8

Software error:

Malformed UTF-8 character (fatal) at /var/www/git.shadowcat.co.uk/docroot/gitweb/gitweb.cgi line 1024, <$fd> line 809.

For help, please send mail to the webmaster (chrisj@shadowcatsystems.co.uk), giving this error message and the time and date of the error.