From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Tue, 24 Oct 2000 02:55:33 +0000 (+0000)
Subject: Make the UTF-8 decoding stricter and more verbose when
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ba210ebec161cde003bc967e8e460c72f71fb70c;p=p5sagit%2Fp5-mst-13.2.git

Make the UTF-8 decoding stricter and more verbose when
malformation happens.  This involved adding an argument
to utf8_to_uv_chk(), which involved changing its prototype,
and prefer STRLEN over I32 for the UTF-8 length, which as
a domino effect necessitated changing the prototypes of
scan_bin(), scan_oct(), scan_hex(), and reg_uni().
The stricter UTF-8 decoding checking uses Markus Kuhn's
UTF-8 Decode Stress Tester from
http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

p4raw-id: //depot/perl@7416
---

diff --git a/doop.c b/doop.c
index b75ffaa..3cd8f07 100644
--- a/doop.c
+++ b/doop.c
@@ -72,12 +72,12 @@ S_do_trans_simple(pTHX_ SV *sv)
     Newz(0, d, len*2+1, U8);
     dstart = d;
     while (s < send) {
-        I32 ulen;
+        STRLEN ulen;
         short c;
 
         ulen = 1;
         /* Need to check this, otherwise 128..255 won't match */
-	c = utf8_to_uv_chk(s, &ulen, 0);
+	c = utf8_to_uv_chk(s, send - s, &ulen, 0);
         if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
             matches++;
             if (ch < 0x80)
@@ -122,10 +122,10 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
             s += UTF8SKIP(s);
         else {
             UV c;
-            I32 ulen;
+            STRLEN ulen;
             ulen = 1;
             if (hasutf)
-                c = utf8_to_uv_chk(s,&ulen, 0);
+                c = utf8_to_uv_chk(s, send - s, &ulen, 0);
             else
                 c = *s;
             if (c < 0x100 && tbl[c] >= 0)
@@ -363,8 +363,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 		continue;
 	    }
 	    else if (uv == none) {	/* "none" is unmapped character */
-		I32 ulen;
-		*d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+		STRLEN ulen;
+		*d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
 		s += ulen;
 		puv = 0xfeedface;
 		continue;
@@ -404,8 +404,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 		continue;
 	    }
 	    else if (uv == none) {	/* "none" is unmapped character */
-		I32 ulen;
-		*d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+		STRLEN ulen;
+		*d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
 		s += ulen;
 		continue;
 	    }
@@ -964,15 +964,15 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
 	char *dcsave = dc;
 	STRLEN lulen = leftlen;
 	STRLEN rulen = rightlen;
-	I32 ulen;
+	STRLEN ulen;
 
 	switch (optype) {
 	case OP_BIT_AND:
 	    while (lulen && rulen) {
-		luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+		luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
 		lc += ulen;
 		lulen -= ulen;
-		ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+		ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
 		rc += ulen;
 		rulen -= ulen;
 		duc = luc & ruc;
@@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
 	    break;
 	case OP_BIT_XOR:
 	    while (lulen && rulen) {
-		luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+		luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
 		lc += ulen;
 		lulen -= ulen;
-		ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+		ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
 		rc += ulen;
 		rulen -= ulen;
 		duc = luc ^ ruc;
@@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
 	    goto mop_up_utf;
 	case OP_BIT_OR:
 	    while (lulen && rulen) {
-		luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+		luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
 		lc += ulen;
 		lulen -= ulen;
-		ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+		ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
 		rc += ulen;
 		rulen -= ulen;
 		duc = luc | ruc;
diff --git a/embed.h b/embed.h
index b4c8f6a..eab037f 100644
--- a/embed.h
+++ b/embed.h
@@ -2190,7 +2190,7 @@
 #define utf8_to_bytes(a,b)	Perl_utf8_to_bytes(aTHX_ a,b)
 #define bytes_to_utf8(a,b)	Perl_bytes_to_utf8(aTHX_ a,b)
 #define utf8_to_uv(a,b)		Perl_utf8_to_uv(aTHX_ a,b)
-#define utf8_to_uv_chk(a,b,c)	Perl_utf8_to_uv_chk(aTHX_ a,b,c)
+#define utf8_to_uv_chk(a,b,c,d)	Perl_utf8_to_uv_chk(aTHX_ a,b,c,d)
 #define uv_to_utf8(a,b)		Perl_uv_to_utf8(aTHX_ a,b)
 #define vivify_defelem(a)	Perl_vivify_defelem(aTHX_ a)
 #define vivify_ref(a,b)		Perl_vivify_ref(aTHX_ a,b)
diff --git a/embed.pl b/embed.pl
index f685042..6adb275 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1941,10 +1941,10 @@ p	|OP*	|scalar		|OP* o
 p	|OP*	|scalarkids	|OP* o
 p	|OP*	|scalarseq	|OP* o
 p	|OP*	|scalarvoid	|OP* o
-Ap	|NV	|scan_bin	|char* start|I32 len|I32* retlen
-Ap	|NV	|scan_hex	|char* start|I32 len|I32* retlen
+Ap	|NV	|scan_bin	|char* start|STRLEN len|STRLEN* retlen
+Ap	|NV	|scan_hex	|char* start|STRLEN len|STRLEN* retlen
 Ap	|char*	|scan_num	|char* s|YYSTYPE *lvalp
-Ap	|NV	|scan_oct	|char* start|I32 len|I32* retlen
+Ap	|NV	|scan_oct	|char* start|STRLEN len|STRLEN* retlen
 p	|OP*	|scope		|OP* o
 Ap	|char*	|screaminstr	|SV* bigsv|SV* littlesv|I32 start_shift \
 				|I32 end_shift|I32 *state|I32 last
@@ -2074,8 +2074,8 @@ Ap	|I32	|utf8_distance	|U8 *a|U8 *b
 Ap	|U8*	|utf8_hop	|U8 *s|I32 off
 ApM	|U8*	|utf8_to_bytes	|U8 *s|STRLEN *len
 ApM	|U8*	|bytes_to_utf8	|U8 *s|STRLEN *len
-Ap	|UV	|utf8_to_uv	|U8 *s|I32* retlen
-Ap	|UV	|utf8_to_uv_chk	|U8 *s|I32* retlen|bool checking
+Ap	|UV	|utf8_to_uv	|U8 *s|STRLEN* retlen
+Ap	|UV	|utf8_to_uv_chk	|U8 *s|STRLEN curlen|STRLEN* retlen|bool checking
 Ap	|U8*	|uv_to_utf8	|U8 *d|UV uv
 p	|void	|vivify_defelem	|SV* sv
 p	|void	|vivify_ref	|SV* sv|U32 to_what
@@ -2358,7 +2358,7 @@ s	|regnode*|reg		|I32|I32 *
 s	|regnode*|reganode	|U8|U32
 s	|regnode*|regatom	|I32 *
 s	|regnode*|regbranch	|I32 *|I32
-s	|void	|reguni		|UV|char *|I32*
+s	|void	|reguni		|UV|char *|STRLEN*
 s	|regnode*|regclass
 s	|regnode*|regclassutf8
 s	|I32	|regcurly	|char *
diff --git a/handy.h b/handy.h
index f0e39af..7341012 100644
--- a/handy.h
+++ b/handy.h
@@ -448,21 +448,23 @@ Converts the specified character to lowercase.
 #define isPSXSPC_utf8(c)	(isSPACE_utf8(c) ||(c) == '\f')
 #define isBLANK_utf8(c)		isBLANK(c) /* could be wrong */
 
-#define isALNUM_LC_utf8(p)	isALNUM_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isIDFIRST_LC_utf8(p)	isIDFIRST_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALPHA_LC_utf8(p)	isALPHA_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isSPACE_LC_utf8(p)	isSPACE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isDIGIT_LC_utf8(p)	isDIGIT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isUPPER_LC_utf8(p)	isUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isLOWER_LC_utf8(p)	isLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALNUMC_LC_utf8(p)	isALNUMC_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isCNTRL_LC_utf8(p)	isCNTRL_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isGRAPH_LC_utf8(p)	isGRAPH_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPRINT_LC_utf8(p)	isPRINT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPUNCT_LC_utf8(p)	isPUNCT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toUPPER_LC_utf8(p)	toUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toTITLE_LC_utf8(p)	toTITLE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toLOWER_LC_utf8(p)	toLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
+#define STRLEN_MAX	((STRLEN)-1)
+
+#define isALNUM_LC_utf8(p)	isALNUM_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isIDFIRST_LC_utf8(p)	isIDFIRST_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALPHA_LC_utf8(p)	isALPHA_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isSPACE_LC_utf8(p)	isSPACE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isDIGIT_LC_utf8(p)	isDIGIT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isUPPER_LC_utf8(p)	isUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isLOWER_LC_utf8(p)	isLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALNUMC_LC_utf8(p)	isALNUMC_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isCNTRL_LC_utf8(p)	isCNTRL_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isGRAPH_LC_utf8(p)	isGRAPH_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPRINT_LC_utf8(p)	isPRINT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPUNCT_LC_utf8(p)	isPUNCT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toUPPER_LC_utf8(p)	toUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toTITLE_LC_utf8(p)	toTITLE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toLOWER_LC_utf8(p)	toLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
 
 #define isPSXSPC_LC_utf8(c)	(isSPACE_LC_utf8(c) ||(c) == '\f')
 #define isBLANK_LC_utf8(c)	isBLANK(c) /* could be wrong */
diff --git a/op.c b/op.c
index 6ef4bfe..9e256a3 100644
--- a/op.c
+++ b/op.c
@@ -2621,7 +2621,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	SV* transv = 0;
 	U8* tend = t + tlen;
 	U8* rend = r + rlen;
-	I32 ulen;
+	STRLEN ulen;
 	U32 tfirst = 1;
 	U32 tlast = 0;
 	I32 tdiff;
@@ -2641,6 +2641,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	if (complement) {
 	    U8 tmpbuf[UTF8_MAXLEN];
 	    U8** cp;
+	    I32* cl;
 	    UV nextmin = 0;
 	    New(1109, cp, tlen, U8*);
 	    i = 0;
@@ -2656,7 +2657,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    qsort(cp, i, sizeof(U8*), utf8compare);
 	    for (j = 0; j < i; j++) {
 		U8 *s = cp[j];
-		UV val = utf8_to_uv_chk(s, &ulen, 0);
+		I32 cur = j < i ? cp[j+1] - s : tend - s;
+		UV  val = utf8_to_uv_chk(s, cur, &ulen, 0);
 		s += ulen;
 		diff = val - nextmin;
 		if (diff > 0) {
@@ -2669,7 +2671,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 		    }
 	        }
 		if (*s == 0xff)
-		    val = utf8_to_uv_chk(s+1, &ulen, 0);
+		    val = utf8_to_uv_chk(s+1, cur - 1, &ulen, 0);
 		if (val >= nextmin)
 		    nextmin = val + 1;
 	    }
@@ -2696,10 +2698,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	while (t < tend || tfirst <= tlast) {
 	    /* see if we need more "t" chars */
 	    if (tfirst > tlast) {
-		tfirst = (I32)utf8_to_uv_chk(t, &ulen, 0);
+		tfirst = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
 		t += ulen;
 		if (t < tend && *t == 0xff) {	/* illegal utf8 val indicates range */
-		    tlast = (I32)utf8_to_uv_chk(++t, &ulen, 0);
+		    t++;
+		    tlast = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
 		    t += ulen;
 		}
 		else
@@ -2709,10 +2712,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    /* now see if we need more "r" chars */
 	    if (rfirst > rlast) {
 		if (r < rend) {
-		    rfirst = (I32)utf8_to_uv_chk(r, &ulen, 0);
+		    rfirst = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
 		    r += ulen;
 		    if (r < rend && *r == 0xff) {	/* illegal utf8 val indicates range */
-			rlast = (I32)utf8_to_uv_chk(++r, &ulen, 0);
+			r++;
+			rlast = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
 			r += ulen;
 		    }
 		    else
diff --git a/perl.c b/perl.c
index cb2cb14..3d874ca 100644
--- a/perl.c
+++ b/perl.c
@@ -2025,7 +2025,7 @@ NULL
 char *
 Perl_moreswitches(pTHX_ char *s)
 {
-    I32 numlen;
+    STRLEN numlen;
     U32 rschar;
 
     switch (*s) {
diff --git a/perlapi.c b/perlapi.c
index 3cfe4e0..1f1343d 100644
--- a/perlapi.c
+++ b/perlapi.c
@@ -2638,14 +2638,14 @@ Perl_save_threadsv(pTHXo_ PADOFFSET i)
 
 #undef  Perl_scan_bin
 NV
-Perl_scan_bin(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_bin(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_scan_bin(start, len, retlen);
 }
 
 #undef  Perl_scan_hex
 NV
-Perl_scan_hex(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_hex(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_scan_hex(start, len, retlen);
 }
@@ -2659,7 +2659,7 @@ Perl_scan_num(pTHXo_ char* s, YYSTYPE *lvalp)
 
 #undef  Perl_scan_oct
 NV
-Perl_scan_oct(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_oct(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_scan_oct(start, len, retlen);
 }
@@ -3380,16 +3380,16 @@ Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len)
 
 #undef  Perl_utf8_to_uv
 UV
-Perl_utf8_to_uv(pTHXo_ U8 *s, I32* retlen)
+Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, retlen);
 }
 
 #undef  Perl_utf8_to_uv_chk
 UV
-Perl_utf8_to_uv_chk(pTHXo_ U8 *s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking)
 {
-    return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, retlen, checking);
+    return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, curlen, retlen, checking);
 }
 
 #undef  Perl_uv_to_utf8
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index a5178e8..730d89f 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -3225,7 +3225,7 @@ advanced to the end of the character.
 If C<s> does not point to a well-formed UTF8 character, an optional UTF8
 warning is produced.
 
-	U8* s	utf8_to_uv(I32 *retlen)
+	U8* s	utf8_to_uv(STRLEN *retlen)
 
 =for hackers
 Found in file utf8.c
@@ -3233,9 +3233,9 @@ Found in file utf8.c
 =item utf8_to_uv_chk
 
 Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
 
 If C<s> does not point to a well-formed UTF8 character, the behaviour
 is dependent on the value of C<checking>: if this is true, it is
@@ -3243,7 +3243,7 @@ assumed that the caller will raise a warning, and this function will
 set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
 warning is produced.
 
-	U8* s	utf8_to_uv_chk(I32 *retlen, I32 checking)
+	U8* s	utf8_to_uv_chk(STRLEN curlen, I32 *retlen, I32 checking)
 
 =for hackers
 Found in file utf8.c
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 480ab84..139bab9 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1789,6 +1789,10 @@ a builtin library search path, prefix2 is substituted.  The error may
 appear if components are not found, or are too long.  See
 "PERLLIB_PREFIX" in L<perlos2>.
 
+=item Malformed UTF-8 character (%s)
+
+Perl detected something that didn't comply with UTF-8 encoding rules.
+
 =item Malformed UTF-16 surrogate
 
 Perl thought it was reading UTF-16 encoded character data but while
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index c9954d8..145c953 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -71,6 +71,11 @@ on Windows.
 Regardless of the above, the C<bytes> pragma can always be used to force
 byte semantics in a particular lexical scope.  See L<bytes>.
 
+One effect of the C<utf8> pragma is that the internal UTF-8 decoding
+becomes stricter so that the character 0xFFFF (UTF-8 bytes 0xEF 0xBF
+0xBF), and the bytes 0xFE and 0xFF, start to cause warnings if they
+appear in the data.
+
 The C<utf8> pragma is primarily a compatibility device that enables
 recognition of UTF-8 in literals encountered by the parser.  It may also
 be used for enabling some of the more experimental Unicode support features.
diff --git a/pp.c b/pp.c
index 98d31cb..35f5956 100644
--- a/pp.c
+++ b/pp.c
@@ -1480,7 +1480,7 @@ PP(pp_complement)
 	  STRLEN targlen = 0;
 	  U8 *result;
 	  U8 *send;
-	  I32 l;
+	  STRLEN l;
 
 	  send = tmps + len;
 	  while (tmps < send) {
@@ -1944,7 +1944,7 @@ PP(pp_hex)
 {
     djSP; dTARGET;
     char *tmps;
-    I32 argtype;
+    STRLEN argtype;
     STRLEN n_a;
 
     tmps = POPpx;
@@ -1957,7 +1957,7 @@ PP(pp_oct)
 {
     djSP; dTARGET;
     NV value;
-    I32 argtype;
+    STRLEN argtype;
     char *tmps;
     STRLEN n_a;
 
@@ -2234,13 +2234,13 @@ PP(pp_ord)
 {
     djSP; dTARGET;
     UV value;
-    STRLEN n_a;
     SV *tmpsv = POPs;
-    U8 *tmps = (U8*)SvPVx(tmpsv,n_a);
-    I32 retlen;
+    STRLEN len;
+    U8 *tmps = (U8*)SvPVx(tmpsv, len);
+    STRLEN retlen;
 
     if ((*tmps & 0x80) && DO_UTF8(tmpsv))
-	value = utf8_to_uv_chk(tmps, &retlen, 0);
+	value = utf8_to_uv_chk(tmps, len, &retlen, 0);
     else
 	value = (UV)(*tmps & 255);
     XPUSHu(value);
@@ -2304,10 +2304,10 @@ PP(pp_ucfirst)
     STRLEN slen;
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
-	I32 ulen;
+	STRLEN ulen;
 	U8 tmpbuf[UTF8_MAXLEN];
 	U8 *tend;
-	UV uv = utf8_to_uv_chk(s, &ulen, 0);
+	UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
 
 	if (PL_op->op_private & OPpLOCALE) {
 	    TAINT;
@@ -2363,10 +2363,10 @@ PP(pp_lcfirst)
     STRLEN slen;
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
-	I32 ulen;
+	STRLEN ulen;
 	U8 tmpbuf[UTF8_MAXLEN];
 	U8 *tend;
-	UV uv = utf8_to_uv_chk(s, &ulen, 0);
+	UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
 
 	if (PL_op->op_private & OPpLOCALE) {
 	    TAINT;
@@ -2423,7 +2423,7 @@ PP(pp_uc)
 
     if (DO_UTF8(sv)) {
 	dTARGET;
-	I32 ulen;
+	STRLEN ulen;
 	register U8 *d;
 	U8 *send;
 
@@ -2443,7 +2443,7 @@ PP(pp_uc)
 		TAINT;
 		SvTAINTED_on(TARG);
 		while (s < send) {
-		    d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+		    d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
 		    s += ulen;
 		}
 	    }
@@ -2497,7 +2497,7 @@ PP(pp_lc)
 
     if (DO_UTF8(sv)) {
 	dTARGET;
-	I32 ulen;
+	STRLEN ulen;
 	register U8 *d;
 	U8 *send;
 
@@ -2517,7 +2517,7 @@ PP(pp_lc)
 		TAINT;
 		SvTAINTED_on(TARG);
 		while (s < send) {
-		    d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+		    d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
 		    s += ulen;
 		}
 	    }
@@ -3363,7 +3363,7 @@ PP(pp_unpack)
     /* These must not be in registers: */
     I16 ashort;
     int aint;
-    I32 along;
+    STRLEN along;
 #ifdef HAS_QUAD
     Quad_t aquad;
 #endif
@@ -3659,7 +3659,7 @@ PP(pp_unpack)
 		len = strend - s;
 	    if (checksum) {
 		while (len-- > 0 && s < strend) {
-		    auint = utf8_to_uv_chk((U8*)s, &along, 0);
+		    auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
 		    s += along;
 		    if (checksum > 32)
 			cdouble += (NV)auint;
@@ -3671,7 +3671,7 @@ PP(pp_unpack)
 		EXTEND(SP, len);
 		EXTEND_MORTAL(len);
 		while (len-- > 0 && s < strend) {
-		    auint = utf8_to_uv_chk((U8*)s, &along, 0);
+		    auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
 		    s += along;
 		    sv = NEWSV(37, 0);
 		    sv_setuv(sv, (UV)auint);
diff --git a/pp_ctl.c b/pp_ctl.c
index cf2000e..33f91ee 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -2971,17 +2971,17 @@ PP(pp_require)
     if (SvNIOKp(sv)) {
 	if (SvPOK(sv) && SvNOK(sv)) {		/* require v5.6.1 */
 	    UV rev = 0, ver = 0, sver = 0;
-	    I32 len;
+	    STRLEN len;
 	    U8 *s = (U8*)SvPVX(sv);
 	    U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
 	    if (s < end) {
-		rev = utf8_to_uv_chk(s, &len, 0);
+		rev = utf8_to_uv_chk(s, end - s, &len, 0);
 		s += len;
 		if (s < end) {
-		    ver = utf8_to_uv_chk(s, &len, 0);
+		    ver = utf8_to_uv_chk(s, end - s, &len, 0);
 		    s += len;
 		    if (s < end)
-			sver = utf8_to_uv_chk(s, &len, 0);
+			sver = utf8_to_uv_chk(s, end - s, &len, 0);
 		}
 	    }
 	    if (PERL_REVISION < rev
diff --git a/proto.h b/proto.h
index 2713916..7624255 100644
--- a/proto.h
+++ b/proto.h
@@ -677,10 +677,10 @@ PERL_CALLCONV OP*	Perl_scalar(pTHX_ OP* o);
 PERL_CALLCONV OP*	Perl_scalarkids(pTHX_ OP* o);
 PERL_CALLCONV OP*	Perl_scalarseq(pTHX_ OP* o);
 PERL_CALLCONV OP*	Perl_scalarvoid(pTHX_ OP* o);
-PERL_CALLCONV NV	Perl_scan_bin(pTHX_ char* start, I32 len, I32* retlen);
-PERL_CALLCONV NV	Perl_scan_hex(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV	Perl_scan_bin(pTHX_ char* start, STRLEN len, STRLEN* retlen);
+PERL_CALLCONV NV	Perl_scan_hex(pTHX_ char* start, STRLEN len, STRLEN* retlen);
 PERL_CALLCONV char*	Perl_scan_num(pTHX_ char* s, YYSTYPE *lvalp);
-PERL_CALLCONV NV	Perl_scan_oct(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV	Perl_scan_oct(pTHX_ char* start, STRLEN len, STRLEN* retlen);
 PERL_CALLCONV OP*	Perl_scope(pTHX_ OP* o);
 PERL_CALLCONV char*	Perl_screaminstr(pTHX_ SV* bigsv, SV* littlesv, I32 start_shift, I32 end_shift, I32 *state, I32 last);
 #if !defined(VMS)
@@ -809,8 +809,8 @@ PERL_CALLCONV I32	Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
 PERL_CALLCONV U8*	Perl_utf8_hop(pTHX_ U8 *s, I32 off);
 PERL_CALLCONV U8*	Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
 PERL_CALLCONV U8*	Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV	Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
-PERL_CALLCONV UV	Perl_utf8_to_uv_chk(pTHX_ U8 *s, I32* retlen, bool checking);
+PERL_CALLCONV UV	Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen);
+PERL_CALLCONV UV	Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking);
 PERL_CALLCONV U8*	Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
 PERL_CALLCONV void	Perl_vivify_defelem(pTHX_ SV* sv);
 PERL_CALLCONV void	Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
@@ -1103,7 +1103,7 @@ STATIC regnode*	S_reg(pTHX_ I32, I32 *);
 STATIC regnode*	S_reganode(pTHX_ U8, U32);
 STATIC regnode*	S_regatom(pTHX_ I32 *);
 STATIC regnode*	S_regbranch(pTHX_ I32 *, I32);
-STATIC void	S_reguni(pTHX_ UV, char *, I32*);
+STATIC void	S_reguni(pTHX_ UV, char *, STRLEN*);
 STATIC regnode*	S_regclass(pTHX);
 STATIC regnode*	S_regclassutf8(pTHX);
 STATIC I32	S_regcurly(pTHX_ char *);
diff --git a/regcomp.c b/regcomp.c
index e7042ea..3f2b10c 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2742,11 +2742,11 @@ tryagain:
 	/* FALL THROUGH */
 
     default: {
-	    register I32 len;
+	    register STRLEN len;
 	    register UV ender;
 	    register char *p;
 	    char *oldp, *s;
-	    I32 numlen;
+	    STRLEN numlen;
 
 	    PL_regcomp_parse++;
 
@@ -2884,7 +2884,8 @@ tryagain:
 		default:
 		  normal_default:
 		    if ((*p & 0xc0) == 0xc0 && UTF) {
-			ender = utf8_to_uv_chk((U8*)p, &numlen, 0);
+			ender = utf8_to_uv_chk((U8*)p, PL_regxend - p,
+					       &numlen, 0);
 			p += numlen;
 		    }
 		    else
@@ -3128,7 +3129,7 @@ S_regclass(pTHX)
     register I32 lastvalue = OOB_CHAR8;
     register I32 range = 0;
     register regnode *ret;
-    I32 numlen;
+    STRLEN numlen;
     I32 namedclass;
     char *rangebegin;
     bool need_class = 0;
@@ -3606,7 +3607,7 @@ S_regclassutf8(pTHX)
     register U32 lastvalue = OOB_UTF8;
     register I32 range = 0;
     register regnode *ret;
-    I32 numlen;
+    STRLEN numlen;
     I32 n;
     SV *listsv;
     U8 flags = 0;
@@ -3638,12 +3639,16 @@ S_regclassutf8(pTHX)
 	namedclass = OOB_NAMEDCLASS;
 	if (!range)
 	    rangebegin = PL_regcomp_parse;
-	value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+	value = utf8_to_uv_chk((U8*)PL_regcomp_parse,
+			       PL_regxend - PL_regcomp_parse,
+			       &numlen, 0);
 	PL_regcomp_parse += numlen;
 	if (value == '[')
 	    namedclass = regpposixcc(value);
 	else if (value == '\\') {
-	    value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+	    value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse,
+					PL_regxend - PL_regcomp_parse,
+					&numlen, 0);
 	    PL_regcomp_parse += numlen;
 	    /* Some compilers cannot handle switching on 64-bit integer
 	     * values, therefore value cannot be an UV.  Yes, this will
@@ -3937,7 +3942,7 @@ S_reganode(pTHX_ U8 op, U32 arg)
 - reguni - emit (if appropriate) a Unicode character
 */
 STATIC void
-S_reguni(pTHX_ UV uv, char* s, I32* lenp)
+S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp)
 {
     dTHR;
     if (SIZE_ONLY) {
diff --git a/regexec.c b/regexec.c
index 6e046f3..350f432 100644
--- a/regexec.c
+++ b/regexec.c
@@ -917,7 +917,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    PL_reg_flags |= RF_tainted;
 	    /* FALL THROUGH */
 	case BOUNDUTF8:
-	    tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+	    tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+							strend - s,
+							0, 0) : '\n';
 	    tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
 	    while (s < strend) {
 		if (tmp == !(OP(c) == BOUNDUTF8 ?
@@ -953,7 +955,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    PL_reg_flags |= RF_tainted;
 	    /* FALL THROUGH */
 	case NBOUNDUTF8:
-	    tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+	    tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+							strend - s,
+							0, 0) : '\n';
 	    tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
 	    while (s < strend) {
 		if (tmp == !(OP(c) == NBOUNDUTF8 ?
@@ -1998,7 +2002,7 @@ S_regmatch(pTHX_ regnode *prog)
 		while (s < e) {
 		    if (l >= PL_regeol)
 			sayNO;
-		    if (utf8_to_uv_chk((U8*)s, 0, 0) != (c1 ?
+		    if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ?
 						  toLOWER_utf8((U8*)l) :
 						  toLOWER_LC_utf8((U8*)l)))
 		    {
@@ -2136,7 +2140,8 @@ S_regmatch(pTHX_ regnode *prog)
 	case NBOUNDUTF8:
 	    /* was last char in word? */
 	    ln = (locinput != PL_regbol)
-		? utf8_to_uv_chk(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
+		? utf8_to_uv_chk(reghop((U8*)locinput, -1),
+				 PL_regeol - locinput, 0, 0) : PL_regprev;
 	    if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
 		ln = isALNUM_uni(ln);
 		n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
diff --git a/sv.c b/sv.c
index 1fac162..2790cfd 100644
--- a/sv.c
+++ b/sv.c
@@ -6358,13 +6358,13 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 	case 'd':
 	case 'i':
 	    if (vectorize) {
-		I32 ulen;
+		STRLEN ulen;
 		if (!veclen) {
 		    vectorize = FALSE;
 		    break;
 		}
 		if (utf)
-		    iv = (IV)utf8_to_uv_chk(vecstr, &ulen, 0);
+		    iv = (IV)utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
 		else {
 		    iv = *vecstr;
 		    ulen = 1;
@@ -6440,14 +6440,14 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 
 	uns_integer:
 	    if (vectorize) {
-		I32 ulen;
+		STRLEN ulen;
 	vector:
 		if (!veclen) {
 		    vectorize = FALSE;
 		    break;
 		}
 		if (utf)
-		    uv = utf8_to_uv_chk(vecstr, &ulen, 0);
+		    uv = utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
 		else {
 		    uv = *vecstr;
 		    ulen = 1;
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t
index 7224a74..e61baad 100755
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
     }
 }
 
-print "1..103\n";
+print "1..181\n";
 
 my $test = 1;
 
@@ -559,3 +559,170 @@ sub nok_bytes {
     print "ok $test\n";
     $test++;
 }
+
+# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
+# version dated 2000-09-02. 
+
+my @MK = split(/\n/, <<__EOMK__);
+1	Correct UTF-8
+1.1.1 y "Îºá½¹ÏÎ¼Îµ"	-		11	ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5	5
+2	Boundary conditions 
+2.1	First possible sequence of certain length
+2.1.1 y " "			0		1	00	1
+2.1.2 y "Â"			80		2	c2:80	1
+2.1.3 y "à "		800		3	e0:a0:80	1
+2.1.4 y "ð"		10000		4	f0:90:80:80	1
+2.1.5 y "ø"	200000		5	f8:88:80:80:80	1
+2.1.6 y "ü"	4000000		6	fc:84:80:80:80:80	1
+2.2	Last possible sequence of certain length
+2.2.1 y ""			7f		1	7f	1
+2.2.2 y "ß¿"			7ff		2	df:bf	1
+# The ffff is legal unless under use utf8
+2.2.3 y "ï¿¿"			ffff		3	ef:bf:bf	1
+2.2.4 y "÷¿¿¿"			1fffff		4	f7:bf:bf:bf	1
+2.2.5 y "û¿¿¿¿"			3ffffff		5	fb:bf:bf:bf:bf	1
+2.2.6 y "ý¿¿¿¿¿"		7fffffff	6	fd:bf:bf:bf:bf:bf	1
+2.3	Other boundary conditions
+2.3.1 y "í¿"		d7ff		3	ed:9f:bf	1
+2.3.2 y "î"		e000		3	ee:80:80	1
+2.3.3 y "ï¿½"			fffd		3	ef:bf:bd	1
+2.3.4 y "ô¿¿"		10ffff		4	f4:8f:bf:bf	1
+2.3.5 y "ô"		110000		4	f4:90:80:80	1
+3	Malformed sequences
+3.1	Unexpected continuation bytes
+3.1.1 n ""			-		1	80
+3.1.2 n "¿"			-		1	bf
+3.1.3 n "¿"			-		2	80:bf
+3.1.4 n "¿"		-		3	80:bf:80
+3.1.5 n "¿¿"		-		4	80:bf:80:bf
+3.1.6 n "¿¿"	-		5	80:bf:80:bf:80
+3.1.7 n "¿¿¿"	-		6	80:bf:80:bf:80:bf
+3.1.8 n "¿¿¿"	-		7	80:bf:80:bf:80:bf:80
+3.1.9 n " ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿"				-	64	80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf
+3.2	Lonely start characters
+3.2.1 n "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß "	-	64 	c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20
+3.2.2 n "à á â ã ä å æ ç è é ê ë ì í î ï "	-	32	e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20
+3.2.3 n "ð ñ ò ó ô õ ö ÷ "	-	16	f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20
+3.2.4 n "ø ù ú û "		-	8	f8:20:f9:20:fa:20:fb:20
+3.2.5 n "ü ý "			-	4	fc:20:fd:20
+3.3	Sequences with last continuation byte missing
+3.3.1 n "À"			-	1	c0
+3.3.2 n "à"			-	2	e0:80
+3.3.3 n "ð"		-	3	f0:80:80
+3.3.4 n "ø"		-	4	f8:80:80:80
+3.3.5 n "ü"	-	5	fc:80:80:80:80
+3.3.6 n "ß"			-	1	df
+3.3.7 n "ï¿"			-	2	ef:bf
+3.3.8 n "÷¿¿"			-	3	f7:bf:bf
+3.3.9 n "û¿¿¿"			-	4	fb:bf:bf:bf
+3.3.10 n "ý¿¿¿¿"		-	5	fd:bf:bf:bf:bf
+3.4	Concatenation of incomplete sequences
+3.4.1 n "Ààðøüßï¿÷¿¿û¿¿¿ý¿¿¿¿"	-	30	c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf
+3.5	Impossible bytes
+3.5.1 n "þ"			-	1	fe
+3.5.2 n "ÿ"			-	1	ff
+3.5.3 n "þþÿÿ"			-	4	fe:fe:ff:ff
+4	Overlong sequences
+4.1	Examples of an overlong ASCII character
+4.1.1 n "À¯"			-	2	c0:af
+4.1.2 n "à¯"		-	3	e0:80:af
+4.1.3 n "ð¯"		-	4	f0:80:80:af
+4.1.4 n "ø¯"	-	5	f8:80:80:80:af
+4.1.5 n "ü¯"	-	6	fc:80:80:80:80:af
+4.2	Maximum overlong sequences
+4.2.1 n "Á¿"			-	2	c1:bf
+4.2.2 n "à¿"		-	3	e0:9f:bf
+4.2.3 n "ð¿¿"		-	4	f0:8f:bf:bf
+4.2.4 n "ø¿¿¿"		-	5	f8:87:bf:bf:bf
+4.2.5 n "ü¿¿¿¿"		-	6	fc:83:bf:bf:bf:bf
+4.3	Overlong representation of the NUL character
+4.3.1 n "À"			-	2	c0:80
+4.3.2 n "à"		-	3	e0:80:80
+4.3.3 n "ð"		-	4	f0:80:80:80
+4.3.4 n "ø"	-	5	f8:80:80:80:80
+4.3.5 n "ü"	-	6	fc:80:80:80:80:80
+5	Illegal code positions
+5.1	Single UTF-16 surrogates
+5.1.1 n "í "		-	3	ed:a0:80
+5.1.2 n "í­¿"			-	3	ed:ad:bf
+5.1.3 n "í®"		-	3	ed:ae:80
+5.1.4 n "í¯¿"			-	3	ed:af:bf
+5.1.5 n "í°"		-	3	ed:b0:80
+5.1.6 n "í¾"		-	3	ed:be:80
+5.1.7 n "í¿¿"			-	3	ed:bf:bf
+5.2	Paired UTF-16 surrogates
+5.2.1 n "í í°"		-	6	ed:a0:80:ed:b0:80
+5.2.2 n "í í¿¿"		-	6	ed:a0:80:ed:bf:bf
+5.2.3 n "í­¿í°"		-	6	ed:ad:bf:ed:b0:80
+5.2.4 n "í­¿í¿¿"		-	6	ed:ad:bf:ed:bf:bf
+5.2.5 n "í®í°"		-	6	ed:ae:80:ed:b0:80
+5.2.6 n "í®í¿¿"		-	6	ed:ae:80:ed:bf:bf
+5.2.7 n "í¯¿í°"		-	6	ed:af:bf:ed:b0:80
+5.2.8 n "í¯¿í¿¿"		-	6	ed:af:bf:ed:bf:bf
+5.3	Other illegal code positions
+5.3.1 n "ï¿¾"			-	3	ef:bf:be
+# The ffff is legal unless under use utf8
+5.3.2 y "ï¿¿"			-	3	ef:bf:bf
+__EOMK__
+
+# 104..181
+{
+    my $WARN;
+    my $id;
+
+    local $SIG{__WARN__} =
+	sub {
+	    # print "# $id: @_";
+	    $WARN++;
+	};
+
+    sub moan {
+	print "$id: @_";
+    }
+    
+    sub test_unpack_U {
+	$WARN = 0;
+	unpack('U*', $_[0]);
+    }
+
+    for (@MK) {
+	if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) {
+	    # print "# $_\n";
+	} elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+(\d+))?$/) {
+	    $id = $1;
+	    my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen) =
+		($2, $3, $4, $5, $6, $7);
+	    my @hex = split(/:/, $hex);
+	    unless (@hex == $byteslen) {
+		my $nhex = @hex;
+		moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
+	    }
+	    {
+		use bytes;
+		my $bytesbyteslen = length($bytes);
+		unless ($bytesbyteslen == $byteslen) {
+		    moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
+		}
+	    }
+	    if ($okay eq 'y') {
+		test_unpack_U($bytes);
+		unless ($WARN == 0) {
+		    moan "unpack('U*') false negative\n";
+		    print "not ";
+		}
+	    } elsif ($okay eq 'n') {
+		test_unpack_U($bytes);
+		unless ($WARN) {
+		    moan "unpack('U*') false positive\n";
+		    print "not ";
+		}
+	    }
+	    print "ok $test\n";
+	    $test++;
+ 	} else {
+	    moan "unknown format\n";
+	}
+    }
+}
+
diff --git a/t/pragma/warn/utf8 b/t/pragma/warn/utf8
index 6a2fe54..012c655 100644
--- a/t/pragma/warn/utf8
+++ b/t/pragma/warn/utf8
@@ -24,6 +24,6 @@ my $a = "sn
     my $a = "snøstorm";
 }
 EXPECT
-Malformed UTF-8 character at - line 3.
-Malformed UTF-8 character at - line 8.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 3.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 8.
 ########
diff --git a/toke.c b/toke.c
index 2ec1f8c..32073a5 100644
--- a/toke.c
+++ b/toke.c
@@ -813,10 +813,10 @@ Perl_str_to_version(pTHX_ SV *sv)
     bool utf = SvUTF8(sv) ? TRUE : FALSE;
     char *end = start + len;
     while (start < end) {
-	I32 skip;
+	STRLEN skip;
 	UV n;
 	if (utf)
-	    n = utf8_to_uv_chk((U8*)start, &skip, 0);
+	    n = utf8_to_uv_chk((U8*)start, len, &skip, 0);
 	else {
 	    n = *(U8*)start;
 	    skip = 1;
@@ -1188,7 +1188,6 @@ S_scan_const(pTHX_ char *start)
     bool dorange = FALSE;			/* are we in a translit range? */
     bool didrange = FALSE;		        /* did we just finish a range? */
     bool has_utf = FALSE;			/* embedded \x{} */
-    I32 len;					/* ? */
     UV uv;
 
     I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
@@ -1329,20 +1328,23 @@ S_scan_const(pTHX_ char *start)
 	/* (now in tr/// code again) */
 
 	if (*s & 0x80 && thisutf) {
-	   (void)utf8_to_uv_chk((U8*)s, &len, 0);
-	   if (len == 1) {
-	       /* illegal UTF8, make it valid */
-	       char *old_pvx = SvPVX(sv);
-	       /* need space for one extra char (NOTE: SvCUR() not set here) */
-	       d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
-	       d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
-	   }
-	   else {
-	       while (len--)
-		   *d++ = *s++;
-	   }
-	   has_utf = TRUE;
-	   continue;
+	    STRLEN len;
+	    UV uv;
+
+	    uv = utf8_to_uv_chk((U8*)s, send - s, &len, 1);
+	    if (len == 1) {
+		/* illegal UTF8, make it valid */
+		char *old_pvx = SvPVX(sv);
+		/* need space for one extra char (NOTE: SvCUR() not set here) */
+		d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
+		d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+	    }
+	    else {
+		while (len--)
+		    *d++ = *s++;
+	    }
+	    has_utf = TRUE;
+	    continue;
 	}
 
 	/* backslashes */
@@ -1398,9 +1400,11 @@ S_scan_const(pTHX_ char *start)
 	    /* \132 indicates an octal constant */
 	    case '0': case '1': case '2': case '3':
 	    case '4': case '5': case '6': case '7':
-		len = 0;	/* disallow underscores */
-		uv = (UV)scan_oct(s, 3, &len);
-		s += len;
+		{
+		    STRLEN len = 0;	/* disallow underscores */
+		    uv = (UV)scan_oct(s, 3, &len);
+		    s += len;
+		}
 		goto NUM_ESCAPE_INSERT;
 
 	    /* \x24 indicates a hex constant */
@@ -1412,14 +1416,18 @@ S_scan_const(pTHX_ char *start)
 			yyerror("Missing right brace on \\x{}");
 			e = s;
 		    }
-		    len = 1;		/* allow underscores */
-                    uv = (UV)scan_hex(s + 1, e - s - 1, &len);
-                    s = e + 1;
+		    {
+			STRLEN len = 1;		/* allow underscores */
+			uv = (UV)scan_hex(s + 1, e - s - 1, &len);
+		    }
+		    s = e + 1;
 		}
 		else {
-		    len = 0;		/* disallow underscores */
-		    uv = (UV)scan_hex(s, 2, &len);
-		    s += len;
+		    {
+			STRLEN len = 0;		/* disallow underscores */
+			uv = (UV)scan_hex(s, 2, &len);
+			s += len;
+		    }
 		}
 
 	      NUM_ESCAPE_INSERT:
@@ -1528,8 +1536,10 @@ S_scan_const(pTHX_ char *start)
 		*d = toCTRL(*d); 
 		d++;
 #else
-		len = *s++;
-		*d++ = toCTRL(len);
+		{
+		    U8 c = *s++;
+		    *d++ = toCTRL(c);
+		}
 #endif
 		continue;
 
diff --git a/utf8.c b/utf8.c
index a713ea1..98236ed 100644
--- a/utf8.c
+++ b/utf8.c
@@ -153,12 +153,12 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 }
 
 /*
-=for apidoc Am|U8* s|utf8_to_uv_chk|I32 *retlen|I32 checking
+=for apidoc Am|U8* s|utf8_to_uv_chk|STRLEN curlen|I32 *retlen|I32 checking
 
 Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
 
 If C<s> does not point to a well-formed UTF8 character, the behaviour
 is dependent on the value of C<checking>: if this is true, it is
@@ -170,79 +170,150 @@ warning is produced.
 */
 
 UV
-Perl_utf8_to_uv_chk(pTHX_ U8* s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking)
 {
-    UV uv = *s;
-    int len;
-    if (!(uv & 0x80)) {
+    dTHR;
+    UV uv = *s, ouv;
+    STRLEN len = 1;
+    bool dowarn = ckWARN_d(WARN_UTF8);
+    STRLEN expectlen = 0;
+    
+    if (uv <= 0x7f) { /* Pure ASCII. */
 	if (retlen)
 	    *retlen = 1;
 	return *s;
     }
-    if (!(uv & 0x40)) {
-        dTHR;
-	if (checking && retlen) {
-	    *retlen = -1;
-	    return 0;
-	}
 
-	if (ckWARN_d(WARN_UTF8))
-	    Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
-	if (retlen)
-	    *retlen = 1;
-	return *s;
+    if (uv >= 0x80 && uv <= 0xbf) {
+	if (dowarn)
+	    Perl_warner(aTHX_ WARN_UTF8,
+			"Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+			uv);
+	goto malformed;
+    }
+
+    if (uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) {
+	if (dowarn)
+	    Perl_warner(aTHX_ WARN_UTF8,
+			"Malformed UTF-8 character (unexpected non-continuation byte 0x%02x after byte 0x%02x)",
+			s[1], uv);
+	goto malformed;
+    }
+
+    if ((uv == 0xfe || uv == 0xff) && IN_UTF8){
+	if (dowarn)
+	    Perl_warner(aTHX_ WARN_UTF8,
+			"Malformed UTF-8 character (impossible byte 0x%02x)",
+			uv);
+	goto malformed;
     }
 
-    if      (!(uv & 0x20))	{ len = 2; uv &= 0x1f; }
-    else if (!(uv & 0x10))	{ len = 3; uv &= 0x0f; }
-    else if (!(uv & 0x08))	{ len = 4; uv &= 0x07; }
-    else if (!(uv & 0x04))	{ len = 5; uv &= 0x03; }
-    else if (!(uv & 0x02))	{ len = 6; uv &= 0x01; }
-    else if (!(uv & 0x01))	{ len = 7;  uv = 0; }
+    if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
+    else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
+    else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
+    else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
+    else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
+    else if (!(uv & 0x01))	{ len =  7; uv = 0; }
     else 			{ len = 13; uv = 0; } /* whoa! */
 
     if (retlen)
 	*retlen = len;
-    --len;
+    
+    expectlen = len;
+
+    if (curlen < expectlen) {
+	if (dowarn)
+	    Perl_warner(aTHX_ WARN_UTF8,
+			"Malformed UTF-8 character (%d byte%s, need %d)",
+			curlen, curlen > 1 ? "s" : "", expectlen);
+	goto malformed;
+    }
+
+    len--;
     s++;
+    ouv = uv;
+
     while (len--) {
 	if ((*s & 0xc0) != 0x80) {
-            dTHR;
-	    if (checking && retlen) {
-		*retlen = -1;
-		return 0;
-            }
-
-	    if (ckWARN_d(WARN_UTF8))
-	        Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
-	    if (retlen)
-		*retlen -= len + 1;
-	    return 0xfffd;
+	    if (dowarn)
+		Perl_warner(aTHX_ WARN_UTF8,
+			    "Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+			    *s);
+	    goto malformed;
 	}
 	else
-	    uv = (uv << 6) | (*s++ & 0x3f);
+	    uv = (uv << 6) | (*s & 0x3f);
+	if (uv < ouv) {
+	    if (dowarn)
+		Perl_warner(aTHX_ WARN_UTF8,
+			    "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)",
+			    ouv, *s);
+	    goto malformed;
+	}
+	s++;
+	ouv = uv;
+    }
+
+    if (uv >= 0xd800 && uv <= 0xdfff) {
+	if (dowarn)
+	    Perl_warner(aTHX_ WARN_UTF8,
+			"Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")",
+			uv);
+	goto malformed;
+    } else if (uv == 0xfffe) {
+	if (dowarn)
+	    Perl_warner(aTHX_ WARN_UTF8,
+			"Malformed UTF-8 character (byte order mark 0x%04"UVxf")",
+			uv);
+	goto malformed;
+    } else if (uv == 0xffff && IN_UTF8) {
+	if (dowarn)
+	    Perl_warner(aTHX_ WARN_UTF8,
+			"Malformed UTF-8 character (impossible character 0x%04"UVxf")",
+			uv);
+	goto malformed;
+    } else if (expectlen > UTF8LEN(uv)) {
+	if (dowarn)
+	    Perl_warner(aTHX_ WARN_UTF8,
+			"Malformed UTF-8 character (%d byte%s, need %d)",
+			expectlen, expectlen > 1 ? "s": "", UTF8LEN(uv));
+	goto malformed;
     }
+
     return uv;
+
+malformed:
+
+    if (checking) {
+	if (retlen)
+	    *retlen = len;
+	return 0;
+    }
+
+    if (retlen)
+	*retlen = -1;
+
+    return UNICODE_REPLACEMENT_CHARACTER;
 }
 
 /*
-=for apidoc Am|U8* s|utf8_to_uv|I32 *retlen
+=for apidoc Am|U8* s|utf8_to_uv|STRLEN *retlen
 
 Returns the character value of the first character in the string C<s>
 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 length, in bytes, of that character, and the pointer C<s> will be
 advanced to the end of the character.
 
-If C<s> does not point to a well-formed UTF8 character, an optional UTF8
-warning is produced.
+If C<s> does not point to a well-formed UTF8 character, zero is
+returned and retlen is set, if possible, to -1.
 
 =cut
 */
 
 UV
-Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
+Perl_utf8_to_uv(pTHX_ U8* s, STRLEN* retlen)
 {
- return Perl_utf8_to_uv_chk(aTHX_ s, retlen, 0);
+    return Perl_utf8_to_uv_chk(aTHX_ s, (STRLEN)-1, retlen, 0);
 }
 
 /* utf8_distance(a,b) returns the number of UTF8 characters between
@@ -324,7 +395,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
         if (*s < 0x80)
             *d++ = *s++;
         else {
-            I32 ulen;
+            STRLEN ulen;
             *d++ = (U8)utf8_to_uv(s, &ulen);
             s += ulen;
         }
@@ -853,7 +924,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
     if (!PL_utf8_toupper)
 	PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_toupper, p);
-    return uv ? uv : utf8_to_uv_chk(p,0,0);
+    return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
 }
 
 UV
@@ -864,7 +935,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
     if (!PL_utf8_totitle)
 	PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_totitle, p);
-    return uv ? uv : utf8_to_uv_chk(p,0,0);
+    return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
 }
 
 UV
@@ -875,7 +946,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
     if (!PL_utf8_tolower)
 	PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_tolower, p);
-    return uv ? uv : utf8_to_uv_chk(p,0,0);
+    return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
 }
 
 /* a "swash" is a swatch hash */
@@ -965,7 +1036,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 	    PUSHMARK(SP);
 	    EXTEND(SP,3);
 	    PUSHs((SV*)sv);
-	    PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, 0, 0) & ~(needents - 1))));
+	    PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, STRLEN_MAX, 0, 0) & ~(needents - 1))));
 	    PUSHs(sv_2mortal(newSViv(needents)));
 	    PUTBACK;
 	    if (call_method("SWASHGET", G_SCALAR))
diff --git a/utf8.h b/utf8.h
index 7407335..bb494ab 100644
--- a/utf8.h
+++ b/utf8.h
@@ -29,7 +29,7 @@ END_EXTERN_C
 
 #define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */
 
-/*#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)*/
+#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)
 #define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
 #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
 
@@ -53,6 +53,8 @@ END_EXTERN_C
 		      (uv) < 0x80000000     ? 6 : 7 )
 #endif
 
+#define UNICODE_REPLACEMENT_CHARACTER	0xfffd
+
 /*
  * Note: we try to be careful never to call the isXXX_utf8() functions
  * unless we're pretty sure we've seen the beginning of a UTF-8 character
diff --git a/util.c b/util.c
index 6c949c7..2122d4e 100644
--- a/util.c
+++ b/util.c
@@ -2933,7 +2933,7 @@ Perl_same_dirent(pTHX_ char *a, char *b)
 #endif /* !HAS_RENAME */
 
 NV
-Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_bin(pTHX_ char *start, STRLEN len, STRLEN *retlen)
 {
     register char *s = start;
     register NV rnv = 0.0;
@@ -3004,7 +3004,7 @@ Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen)
 }
 
 NV
-Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_oct(pTHX_ char *start, STRLEN len, STRLEN *retlen)
 {
     register char *s = start;
     register NV rnv = 0.0;
@@ -3074,7 +3074,7 @@ Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen)
 }
 
 NV
-Perl_scan_hex(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_hex(pTHX_ char *start, STRLEN len, STRLEN *retlen)
 {
     register char *s = start;
     register NV rnv = 0.0;