From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Fri, 9 Jul 2004 00:35:22 +0000 (+0300)
Subject: Re: Segfault using HTML::Entities
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=078c425b721ea35d568a7ec980e3b95d036a25c3;p=p5sagit%2Fp5-mst-13.2.git

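Re: Segfault using HTML::Entities
Message-ID: <40EDBE1A.6080205@iki.fi>
Date: Fri, 09 Jul 2004 00:35:22 +0300

pp_substcont now asserts that s has not passed cx->sb_strend and
appends the remaining tail to dstr only when that tail is non-empty.
The UTF-8 scanning loops in S_find_byclass stop as soon as the
character at s would extend past strend, and advance by the skip
length they have already computed.  A fresh_perl test covers the
reported case: a one-byte string flagged as UTF-8 going through s///ge.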

p4raw-id: //depot/perl@23074
---

diff --git a/pp_ctl.c b/pp_ctl.c
index a4c93f1..a9bc3e5 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -187,10 +187,13 @@ PP(pp_substcont)
 	{
 	    SV *targ = cx->sb_targ;
 
-	    if (DO_UTF8(dstr) && !SvUTF8(targ))
-		sv_catpvn_utf8_upgrade(dstr, s, cx->sb_strend - s, nsv);
-	    else
-		sv_catpvn(dstr, s, cx->sb_strend - s);
+	    assert(cx->sb_strend >= s);
+	    if(cx->sb_strend > s) {
+		 if (DO_UTF8(dstr) && !SvUTF8(targ))
+		      sv_catpvn_utf8_upgrade(dstr, s, cx->sb_strend - s, nsv);
+		 else
+		      sv_catpvn(dstr, s, cx->sb_strend - s);
+	    }
 	    cx->sb_rxtainted |= RX_MATCH_TAINTED(rx);
 
 #ifdef PERL_COPY_ON_WRITE
diff --git a/regexec.c b/regexec.c
index 728b1ae..60276cb 100644
--- a/regexec.c
+++ b/regexec.c
@@ -954,6 +954,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	char *m;
 	STRLEN ln;
 	STRLEN lnc;
+	register STRLEN uskip;
 	unsigned int c1;
 	unsigned int c2;
 	char *e;
@@ -964,7 +965,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	switch (OP(c)) {
 	case ANYOF:
 	    if (do_utf8) {
-		 while (s < strend) {
+		 while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		      if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
 			  !UTF8_IS_INVARIANT((U8)s[0]) ?
 			  reginclass(c, (U8*)s, 0, do_utf8) :
@@ -976,7 +977,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		      }
 		      else 
 			   tmp = 1;
-		      s += UTF8SKIP(s);
+		      s += uskip;
 		 }
 	    }
 	    else {
@@ -1172,7 +1173,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		tmp = ((OP(c) == BOUND ?
 			isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
 		LOAD_UTF8_CHARCLASS(alnum,"a");
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (tmp == !(OP(c) == BOUND ?
 				 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
 				 isALNUM_LC_utf8((U8*)s)))
@@ -1181,7 +1182,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 			if ((norun || regtry(prog, s)))
 			    goto got_it;
 		    }
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1215,14 +1216,14 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		tmp = ((OP(c) == NBOUND ?
 			isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
 		LOAD_UTF8_CHARCLASS(alnum,"a");
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (tmp == !(OP(c) == NBOUND ?
 				 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
 				 isALNUM_LC_utf8((U8*)s)))
 			tmp = !tmp;
 		    else if ((norun || regtry(prog, s)))
 			goto got_it;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1244,7 +1245,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case ALNUM:
 	    if (do_utf8) {
 		LOAD_UTF8_CHARCLASS(alnum,"a");
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1253,7 +1254,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1273,7 +1274,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case ALNUML:
 	    PL_reg_flags |= RF_tainted;
 	    if (do_utf8) {
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (isALNUM_LC_utf8((U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1282,7 +1283,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1302,7 +1303,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case NALNUM:
 	    if (do_utf8) {
 		LOAD_UTF8_CHARCLASS(alnum,"a");
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1311,7 +1312,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1331,7 +1332,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case NALNUML:
 	    PL_reg_flags |= RF_tainted;
 	    if (do_utf8) {
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (!isALNUM_LC_utf8((U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1340,7 +1341,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1360,7 +1361,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case SPACE:
 	    if (do_utf8) {
 		LOAD_UTF8_CHARCLASS(space," ");
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1369,7 +1370,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1389,7 +1390,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case SPACEL:
 	    PL_reg_flags |= RF_tainted;
 	    if (do_utf8) {
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1398,7 +1399,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1418,7 +1419,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case NSPACE:
 	    if (do_utf8) {
 		LOAD_UTF8_CHARCLASS(space," ");
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1427,7 +1428,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1447,7 +1448,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case NSPACEL:
 	    PL_reg_flags |= RF_tainted;
 	    if (do_utf8) {
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1456,7 +1457,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1476,7 +1477,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case DIGIT:
 	    if (do_utf8) {
 		LOAD_UTF8_CHARCLASS(digit,"0");
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1485,7 +1486,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1505,7 +1506,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case DIGITL:
 	    PL_reg_flags |= RF_tainted;
 	    if (do_utf8) {
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (isDIGIT_LC_utf8((U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1514,7 +1515,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1534,7 +1535,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case NDIGIT:
 	    if (do_utf8) {
 		LOAD_UTF8_CHARCLASS(digit,"0");
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1543,7 +1544,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
@@ -1563,7 +1564,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	case NDIGITL:
 	    PL_reg_flags |= RF_tainted;
 	    if (do_utf8) {
-		while (s < strend) {
+		while (s + (uskip = UTF8SKIP(s)) <= strend) {
 		    if (!isDIGIT_LC_utf8((U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
 			    goto got_it;
@@ -1572,7 +1573,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    }
 		    else
 			tmp = 1;
-		    s += UTF8SKIP(s);
+		    s += uskip;
 		}
 	    }
 	    else {
diff --git a/t/run/fresh_perl.t b/t/run/fresh_perl.t
index d47f27f..dbbe6a5 100644
--- a/t/run/fresh_perl.t
+++ b/t/run/fresh_perl.t
@@ -855,3 +855,19 @@ print glob(q(./"TEST"));
 EXPECT
 ./"TEST"
 ./"TEST"
+######## "Segfault using HTML::Entities", Richard Jolly <richardjolly@mac.com>, <A3C7D27E-C9F4-11D8-B294-003065AE00B6@mac.com> in perl-unicode@perl.org
+-lw
+BEGIN {
+  eval 'require Encode';
+  if ($@) { exit 0 } # Encode could not be loaded (running minitest?): exit quietly
+}
+# Test case cut down by jhi: a one-byte string flagged as UTF-8 goes through s///ge.
+$SIG{__WARN__} = sub { $@ = shift }; # keep the most recent warning text in $@
+use Encode;
+my $t = "\xE9";
+Encode::_utf8_on($t); # flag $t as UTF-8 while leaving its single byte as it is
+$t =~ s/([^a])//ge;
+$@ =~ s/ at .*/ at/; # cut everything after " at" so EXPECT needs no file or line
+print $@
+EXPECT
+Malformed UTF-8 character (unexpected end of string) at