From: SADAHIRO Tomoyuki <BQW10602@nifty.com>
Date: Sun, 2 Apr 2006 22:48:44 +0000 (+0900)
Subject: Re: [perl #38293] chr(65535) should be allowed in regexes
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9f7f391326e967b539b86ed051c163bbf8f6e7de;p=p5sagit%2Fp5-mst-13.2.git

Re: [perl #38293] chr(65535) should be allowed in regexes
Message-Id: <20060402224657.B942.BQW10602@nifty.com>

p4raw-id: //depot/perl@27688
---

diff --git a/doop.c b/doop.c
index 2c1ce81..3e60665 100644
--- a/doop.c
+++ b/doop.c
@@ -69,7 +69,7 @@ S_do_trans_simple(pTHX_ SV *sv)
 	I32 ch;
 
         /* Need to check this, otherwise 128..255 won't match */
-	const UV c = utf8n_to_uvchr(s, send - s, &ulen, 0);
+	const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT);
         if (c < 0x100 && (ch = tbl[c]) >= 0) {
             matches++;
 	    d = uvchr_to_utf8(d, ch);
@@ -119,7 +119,7 @@ S_do_trans_count(pTHX_ SV *sv)
 	const I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT;
 	while (s < send) {
 	    STRLEN ulen;
-	    const UV c = utf8n_to_uvchr(s, send - s, &ulen, 0);
+	    const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT);
 	    if (c < 0x100) {
 		if (tbl[c] >= 0)
 		    matches++;
@@ -209,7 +209,8 @@ S_do_trans_complex(pTHX_ SV *sv)
 	    UV pch = 0xfeedface;
 	    while (s < send) {
 		STRLEN len;
-		const UV comp = utf8_to_uvchr(s, &len);
+		const UV comp = utf8n_to_uvchr(s, send - s, &len,
+					       UTF8_ALLOW_DEFAULT);
 		I32 ch;
 
 		if (comp > 0xff) {
@@ -254,7 +255,8 @@ S_do_trans_complex(pTHX_ SV *sv)
 	else {
 	    while (s < send) {
 		STRLEN len;
-		const UV comp = utf8_to_uvchr(s, &len);
+		const UV comp = utf8n_to_uvchr(s, send - s, &len,
+					       UTF8_ALLOW_DEFAULT);
 		I32 ch;
 		if (comp > 0xff) {
 		    if (!complement) {
@@ -540,7 +542,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
 		}
 		else {
 		    STRLEN len;
-		    uv = utf8_to_uvuni(s, &len);
+		    uv = utf8n_to_uvuni(s, send - s, &len, UTF8_ALLOW_DEFAULT);
 		    if (uv != puv) {
 			Move(s, d, len, U8);
 			d += len;
diff --git a/op.c b/op.c
index fa84b93..64a5b7d 100644
--- a/op.c
+++ b/op.c
@@ -2842,6 +2842,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	const I32 to_utf    = o->op_private & OPpTRANS_TO_UTF;
 	U8* tsave = NULL;
 	U8* rsave = NULL;
+	const U32 flags = UTF8_ALLOW_DEFAULT;
 
 	if (!from_utf) {
 	    STRLEN len = tlen;
@@ -2868,11 +2869,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    i = 0;
 	    transv = newSVpvs("");
 	    while (t < tend) {
-		cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, 0);
+		cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
 		t += ulen;
 		if (t < tend && NATIVE_TO_UTF(*t) == 0xff) {
 		    t++;
-		    cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, 0);
+		    cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
 		    t += ulen;
 		}
 		else {
@@ -2926,11 +2927,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	while (t < tend || tfirst <= tlast) {
 	    /* see if we need more "t" chars */
 	    if (tfirst > tlast) {
-		tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
+		tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
 		t += ulen;
 		if (t < tend && NATIVE_TO_UTF(*t) == 0xff) {	/* illegal utf8 val indicates range */
 		    t++;
-		    tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
+		    tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
 		    t += ulen;
 		}
 		else
@@ -2940,11 +2941,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    /* now see if we need more "r" chars */
 	    if (rfirst > rlast) {
 		if (r < rend) {
-		    rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
+		    rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
 		    r += ulen;
 		    if (r < rend && NATIVE_TO_UTF(*r) == 0xff) {	/* illegal utf8 val indicates range */
 			r++;
-			rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
+			rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
 			r += ulen;
 		    }
 		    else
diff --git a/regcomp.c b/regcomp.c
index 7f5507d..c236a73 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -806,7 +806,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     /* first pass, loop through and scan words */
     reg_trie_data *trie;
     regnode *cur;
-    const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+    const U32 uniflags = UTF8_ALLOW_DEFAULT;
     STRLEN len = 0;
     UV uvc = 0;
     U16 curword = 0;
@@ -4274,7 +4274,7 @@ tryagain:
 		    if (UTF8_IS_START(*p) && UTF) {
 			STRLEN numlen;
 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
-					       &numlen, 0);
+					       &numlen, UTF8_ALLOW_DEFAULT);
 			p += numlen;
 		    }
 		    else
@@ -4699,7 +4699,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 	if (UTF) {
 	    value = utf8n_to_uvchr((U8*)RExC_parse,
 				   RExC_end - RExC_parse,
-				   &numlen, 0);
+				   &numlen, UTF8_ALLOW_DEFAULT);
 	    RExC_parse += numlen;
 	}
 	else
@@ -4711,7 +4711,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 	    if (UTF) {
 		value = utf8n_to_uvchr((U8*)RExC_parse,
 				   RExC_end - RExC_parse,
-				   &numlen, 0);
+				   &numlen, UTF8_ALLOW_DEFAULT);
 		RExC_parse += numlen;
 	    }
 	    else
diff --git a/regexec.c b/regexec.c
index 6a5a3bd..9cb15b8 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1017,7 +1017,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32
 		U8 *sm = (U8 *) m;
 		U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
 		U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
-		const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+		const U32 uniflags = UTF8_ALLOW_DEFAULT;
 
 		to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
 		to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
@@ -1064,7 +1064,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32
 	        UV c, f;
 	        U8 tmpbuf [UTF8_MAXBYTES+1];
 		STRLEN len, foldlen;
-		const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+		const U32 uniflags = UTF8_ALLOW_DEFAULT;
 		if (c1 == c2) {
 		    /* Upper and lower of 1st char are equal -
 		     * probably not a "letter". */
@@ -1166,7 +1166,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32
 		    tmp = '\n';
 		else {
 		    U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
-		    tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
+		    tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
 		}
 		tmp = ((OP(c) == BOUND ?
 			isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
@@ -1208,7 +1208,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32
 		    tmp = '\n';
 		else {
 		    U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
-		    tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
+		    tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
 		}
 		tmp = ((OP(c) == NBOUND ?
 			isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
@@ -2430,7 +2430,7 @@ S_regmatch(pTHX_ regnode *prog)
 {
     dVAR;
     register const bool do_utf8 = PL_reg_match_utf8;
-    const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+    const U32 uniflags = UTF8_ALLOW_DEFAULT;
 
     regmatch_slab  *orig_slab;
     regmatch_state *orig_state;
@@ -3046,7 +3046,7 @@ S_regmatch(pTHX_ regnode *prog)
 		else {
 		    const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
 		
-		    st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
+		    st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
 		}
 		if (OP(scan) == BOUND || OP(scan) == NBOUND) {
 		    st->ln = isALNUM_uni(st->ln);
@@ -4887,8 +4887,8 @@ S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp
 
     if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
 	c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
-			    ckWARN(WARN_UTF8) ? UTF8_CHECK_ONLY :
-					UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY);
+		(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
+		/* see [perl #37836] for UTF8_ALLOW_ANYUV */
 	if (len == (STRLEN)-1)
 	    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
     }
diff --git a/t/op/pat.t b/t/op/pat.t
index df5f3e0..44070b4 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..1200\n";
+print "1..1208\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -3465,6 +3465,53 @@ ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
     ok($f eq "ab", "pos retained between calls # TODO") or print "# $@\n";
 }
 
+# [perl #37836] Simple Regex causes SEGV when run on specific data
+if ($ordA == 193) {
+    print "ok $test # Skip: in EBCDIC\n"; $test++;
+} else {
+    no warnings 'utf8';
+    $_ = pack('U0C2', 0xa2, 0xf8); # ill-formed UTF-8
+    my $ret = 0;
+    eval { $ret = s/[\0]+//g };
+    ok($ret == 0, "ill-formed UTF-8 doesn't match NUL in class");
+}
+
+{ # [perl #38293] chr(65535) should be allowed in regexes
+    no warnings 'utf8'; # to allow non-characters
+    my($c, $r, $s);
+
+    $c = chr 0xffff;
+    $c =~ s/$c//g;
+    ok($c eq "", "U+FFFF, parsed as atom");
+
+    $c = chr 0xffff;
+    $r = "\\$c";
+    $c =~ s/$r//g;
+    ok($c eq "", "U+FFFF backslashed, parsed as atom");
+
+    $c = chr 0xffff;
+    $c =~ s/[$c]//g;
+    ok($c eq "", "U+FFFF, parsed in class");
+
+    $c = chr 0xffff;
+    $r = "[\\$c]";
+    $c =~ s/$r//g;
+    ok($c eq "", "U+FFFF backslashed, parsed in class");
+
+    $s = "A\x{ffff}B";
+    $s =~ s/\x{ffff}//i;
+    ok($s eq "AB", "U+FFFF, EXACTF");
+
+    $s = "\x{ffff}A";
+    $s =~ s/\bA//;
+    ok($s eq "\x{ffff}", "U+FFFF, BOUND");
+
+    $s = "\x{ffff}!";
+    $s =~ s/\B!//;
+    ok($s eq "\x{ffff}", "U+FFFF, NBOUND");
+} # non-characters end
+
+
 # Keep the following test last -- it may crash perl
 
 ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274")
diff --git a/t/op/tr.t b/t/op/tr.t
index 2a3d29c..796f96a 100755
--- a/t/op/tr.t
+++ b/t/op/tr.t
@@ -6,7 +6,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 100;
+plan tests => 116;
 
 my $Is_EBCDIC = (ord('i') == 0x89 & ord('J') == 0xd1);
 
@@ -384,3 +384,74 @@ is( ref $x, 'SCALAR', "    doesn't stringify its argument" );
 # rt.perl.org 36622.  Perl didn't like a y/// at end of file.  No trailing
 # newline allowed.
 fresh_perl_is(q[$_ = "foo"; y/A-Z/a-z/], '');
+
+
+{ # [perl #38293] chr(65535) should be allowed in regexes
+no warnings 'utf8'; # to allow non-characters
+
+$s = "\x{d800}\x{ffff}";
+$s =~ tr/\0/A/;
+is($s, "\x{d800}\x{ffff}", "do_trans_simple");
+
+$s = "\x{d800}\x{ffff}";
+$i = $s =~ tr/\0//;
+is($i, 0, "do_trans_count");
+
+$s = "\x{d800}\x{ffff}";
+$s =~ tr/\0/A/s;
+is($s, "\x{d800}\x{ffff}", "do_trans_complex, SQUASH");
+
+$s = "\x{d800}\x{ffff}";
+$s =~ tr/\0/A/c;
+is($s, "AA", "do_trans_complex, COMPLEMENT");
+
+$s = "A\x{ffff}B";
+$s =~ tr/\x{ffff}/\x{1ffff}/;
+is($s, "A\x{1ffff}B", "utf8, SEARCHLIST");
+
+$s = "\x{fffd}\x{fffe}\x{ffff}";
+$s =~ tr/\x{fffd}-\x{ffff}/ABC/;
+is($s, "ABC", "utf8, SEARCHLIST range");
+
+$s = "ABC";
+$s =~ tr/ABC/\x{ffff}/;
+is($s, "\x{ffff}"x3, "utf8, REPLACEMENTLIST");
+
+$s = "ABC";
+$s =~ tr/ABC/\x{fffd}-\x{ffff}/;
+is($s, "\x{fffd}\x{fffe}\x{ffff}", "utf8, REPLACEMENTLIST range");
+
+$s = "A\x{ffff}B\x{100}\0\x{fffe}\x{ffff}";
+$i = $s =~ tr/\x{ffff}//;
+is($i, 2, "utf8, count");
+
+$s = "A\x{ffff}\x{ffff}C";
+$s =~ tr/\x{ffff}/\x{100}/s;
+is($s, "A\x{100}C", "utf8, SQUASH");
+
+$s = "A\x{ffff}\x{ffff}\x{fffe}\x{fffe}\x{fffe}C";
+$s =~ tr/\x{fffe}\x{ffff}//s;
+is($s, "A\x{ffff}\x{fffe}C", "utf8, SQUASH");
+
+$s = "xAABBBy";
+$s =~ tr/AB/\x{ffff}/s;
+is($s, "x\x{ffff}y", "utf8, SQUASH");
+
+$s = "xAABBBy";
+$s =~ tr/AB/\x{fffe}\x{ffff}/s;
+is($s, "x\x{fffe}\x{ffff}y", "utf8, SQUASH");
+
+$s = "A\x{ffff}B\x{fffe}C";
+$s =~ tr/\x{fffe}\x{ffff}/x/c;
+is($s, "x\x{ffff}x\x{fffe}x", "utf8, COMPLEMENT");
+
+$s = "A\x{10000}B\x{2abcd}C";
+$s =~ tr/\0-\x{ffff}/x/c;
+is($s, "AxBxC", "utf8, COMPLEMENT range");
+
+$s = "A\x{fffe}B\x{ffff}C";
+$s =~ tr/\x{fffe}\x{ffff}/x/d;
+is($s, "AxBC", "utf8, DELETE");
+
+} # non-characters end
+
diff --git a/utf8.h b/utf8.h
index 9bf3928..9f6e4e8 100644
--- a/utf8.h
+++ b/utf8.h
@@ -194,15 +194,17 @@ encoded character.
 #define UTF8_ALLOW_EMPTY		0x0001
 #define UTF8_ALLOW_CONTINUATION		0x0002
 #define UTF8_ALLOW_NON_CONTINUATION	0x0004
-#define UTF8_ALLOW_FE_FF		0x0008
+#define UTF8_ALLOW_FE_FF		0x0008 /* Allow above 0x7fffFFFF */
 #define UTF8_ALLOW_SHORT		0x0010
 #define UTF8_ALLOW_SURROGATE		0x0020
-#define UTF8_ALLOW_FFFF			0x0040 /* Allows also FFFE. */
+#define UTF8_ALLOW_FFFF			0x0040 /* Allow UNICODE_ILLEGAL */
 #define UTF8_ALLOW_LONG			0x0080
 #define UTF8_ALLOW_ANYUV		(UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
 					 UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
 #define UTF8_ALLOW_ANY			0x00FF
 #define UTF8_CHECK_ONLY			0x0200
+#define UTF8_ALLOW_DEFAULT		(ckWARN(WARN_UTF8) ? 0 : \
+					 UTF8_ALLOW_ANYUV)
 
 #define UNICODE_SURROGATE_FIRST		0xD800
 #define UNICODE_SURROGATE_LAST		0xDFFF
@@ -216,8 +218,8 @@ encoded character.
 
 #define UNICODE_ALLOW_SURROGATE 0x0001	/* Allow UTF-16 surrogates (EVIL) */
 #define UNICODE_ALLOW_FDD0	0x0002	/* Allow the U+FDD0...U+FDEF */
-#define UNICODE_ALLOW_FFFF	0x0004	/* Allow 0xFFF[EF], 0x1FFF[EF], ... */
-#define UNICODE_ALLOW_SUPER	0x0008	/* Allow past 10xFFFF */
+#define UNICODE_ALLOW_FFFF	0x0004	/* Allow U+FFF[EF], U+1FFF[EF], ... */
+#define UNICODE_ALLOW_SUPER	0x0008	/* Allow past 0x10FFFF */
 #define UNICODE_ALLOW_ANY	0x000F
 
 #define UNICODE_IS_SURROGATE(c)		((c) >= UNICODE_SURROGATE_FIRST && \