From: SADAHIRO Tomoyuki Date: Sun, 2 Apr 2006 22:48:44 +0000 (+0900) Subject: Re: [perl #38293] chr(65535) should be allowed in regexes X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9f7f391326e967b539b86ed051c163bbf8f6e7de;p=p5sagit%2Fp5-mst-13.2.git Re: [perl #38293] chr(65535) should be allowed in regexes Message-Id: <20060402224657.B942.BQW10602@nifty.com> p4raw-id: //depot/perl@27688 --- diff --git a/doop.c b/doop.c index 2c1ce81..3e60665 100644 --- a/doop.c +++ b/doop.c @@ -69,7 +69,7 @@ S_do_trans_simple(pTHX_ SV *sv) I32 ch; /* Need to check this, otherwise 128..255 won't match */ - const UV c = utf8n_to_uvchr(s, send - s, &ulen, 0); + const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT); if (c < 0x100 && (ch = tbl[c]) >= 0) { matches++; d = uvchr_to_utf8(d, ch); @@ -119,7 +119,7 @@ S_do_trans_count(pTHX_ SV *sv) const I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT; while (s < send) { STRLEN ulen; - const UV c = utf8n_to_uvchr(s, send - s, &ulen, 0); + const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT); if (c < 0x100) { if (tbl[c] >= 0) matches++; @@ -209,7 +209,8 @@ S_do_trans_complex(pTHX_ SV *sv) UV pch = 0xfeedface; while (s < send) { STRLEN len; - const UV comp = utf8_to_uvchr(s, &len); + const UV comp = utf8n_to_uvchr(s, send - s, &len, + UTF8_ALLOW_DEFAULT); I32 ch; if (comp > 0xff) { @@ -254,7 +255,8 @@ S_do_trans_complex(pTHX_ SV *sv) else { while (s < send) { STRLEN len; - const UV comp = utf8_to_uvchr(s, &len); + const UV comp = utf8n_to_uvchr(s, send - s, &len, + UTF8_ALLOW_DEFAULT); I32 ch; if (comp > 0xff) { if (!complement) { @@ -540,7 +542,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) } else { STRLEN len; - uv = utf8_to_uvuni(s, &len); + uv = utf8n_to_uvuni(s, send - s, &len, UTF8_ALLOW_DEFAULT); if (uv != puv) { Move(s, d, len, U8); d += len; diff --git a/op.c b/op.c index fa84b93..64a5b7d 100644 --- a/op.c +++ b/op.c @@ -2842,6 +2842,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) const I32 to_utf = o->op_private & OPpTRANS_TO_UTF; U8* tsave = NULL; U8* rsave = NULL; + const U32 flags = UTF8_ALLOW_DEFAULT; if (!from_utf) { STRLEN len = tlen; @@ -2868,11 +2869,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) i = 0; transv = newSVpvs(""); while (t < tend) { - cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, 0); + cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, flags); t += ulen; if (t < tend && NATIVE_TO_UTF(*t) == 0xff) { t++; - cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, 0); + cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, flags); t += ulen; } else { @@ -2926,11 +2927,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) while (t < tend || tfirst <= tlast) { /* see if we need more "t" chars */ if (tfirst > tlast) { - tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0); + tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags); t += ulen; if (t < tend && NATIVE_TO_UTF(*t) == 0xff) { /* illegal utf8 val indicates range */ t++; - tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0); + tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags); t += ulen; } else @@ -2940,11 +2941,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) /* now see if we need more "r" chars */ if (rfirst > rlast) { if (r < rend) { - rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0); + rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags); r += ulen; if (r < rend && NATIVE_TO_UTF(*r) == 0xff) { /* illegal utf8 val indicates range */ r++; - rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0); + rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags); r += ulen; } else diff --git a/regcomp.c b/regcomp.c index 7f5507d..c236a73 100644 --- a/regcomp.c +++ b/regcomp.c @@ -806,7 +806,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* first pass, loop through and scan words */ reg_trie_data *trie; regnode *cur; - const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; + const U32 uniflags = UTF8_ALLOW_DEFAULT; STRLEN len = 0; UV uvc = 0; U16 curword = 0; @@ -4274,7 +4274,7 @@ tryagain: if (UTF8_IS_START(*p) && UTF) { STRLEN numlen; ender = utf8n_to_uvchr((U8*)p, RExC_end - p, - &numlen, 0); + &numlen, UTF8_ALLOW_DEFAULT); p += numlen; } else @@ -4699,7 +4699,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (UTF) { value = utf8n_to_uvchr((U8*)RExC_parse, RExC_end - RExC_parse, - &numlen, 0); + &numlen, UTF8_ALLOW_DEFAULT); RExC_parse += numlen; } else @@ -4711,7 +4711,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (UTF) { value = utf8n_to_uvchr((U8*)RExC_parse, RExC_end - RExC_parse, - &numlen, 0); + &numlen, UTF8_ALLOW_DEFAULT); RExC_parse += numlen; } else diff --git a/regexec.c b/regexec.c index 6a5a3bd..9cb15b8 100644 --- a/regexec.c +++ b/regexec.c @@ -1017,7 +1017,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 U8 *sm = (U8 *) m; U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; - const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; + const U32 uniflags = UTF8_ALLOW_DEFAULT; to_utf8_lower((U8*)m, tmpbuf1, &ulen1); to_utf8_upper((U8*)m, tmpbuf2, &ulen2); @@ -1064,7 +1064,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 UV c, f; U8 tmpbuf [UTF8_MAXBYTES+1]; STRLEN len, foldlen; - const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; + const U32 uniflags = UTF8_ALLOW_DEFAULT; if (c1 == c2) { /* Upper and lower of 1st char are equal - * probably not a "letter". */ @@ -1166,7 +1166,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 tmp = '\n'; else { U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); + tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); } tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); @@ -1208,7 +1208,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 tmp = '\n'; else { U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); + tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); } tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); @@ -2430,7 +2430,7 @@ S_regmatch(pTHX_ regnode *prog) { dVAR; register const bool do_utf8 = PL_reg_match_utf8; - const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; + const U32 uniflags = UTF8_ALLOW_DEFAULT; regmatch_slab *orig_slab; regmatch_state *orig_state; @@ -3046,7 +3046,7 @@ S_regmatch(pTHX_ regnode *prog) else { const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr); - st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); + st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); } if (OP(scan) == BOUND || OP(scan) == NBOUND) { st->ln = isALNUM_uni(st->ln); @@ -4887,8 +4887,8 @@ S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp if (do_utf8 && !UTF8_IS_INVARIANT(c)) { c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len, - ckWARN(WARN_UTF8) ? UTF8_CHECK_ONLY : - UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY); + (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY); + /* see [perl #37836] for UTF8_ALLOW_ANYUV */ if (len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); } diff --git a/t/op/pat.t b/t/op/pat.t index df5f3e0..44070b4 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..1200\n"; +print "1..1208\n"; BEGIN { chdir 't' if -d 't'; @@ -3465,6 +3465,53 @@ ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i) ok($f eq "ab", "pos retained between calls # TODO") or print "# $@\n"; } +# [perl #37836] Simple Regex causes SEGV when run on specific data +if ($ordA == 193) { + print "ok $test # Skip: in EBCDIC\n"; $test++; +} else { + no warnings 'utf8'; + $_ = pack('U0C2', 0xa2, 0xf8); # ill-formed UTF-8 + my $ret = 0; + eval { $ret = s/[\0]+//g }; + ok($ret == 0, "ill-formed UTF-8 doesn't match NUL in class"); +} + +{ # [perl #38293] chr(65535) should be allowed in regexes + no warnings 'utf8'; # to allow non-characters + my($c, $r, $s); + + $c = chr 0xffff; + $c =~ s/$c//g; + ok($c eq "", "U+FFFF, parsed as atom"); + + $c = chr 0xffff; + $r = "\\$c"; + $c =~ s/$r//g; + ok($c eq "", "U+FFFF backslashed, parsed as atom"); + + $c = chr 0xffff; + $c =~ s/[$c]//g; + ok($c eq "", "U+FFFF, parsed in class"); + + $c = chr 0xffff; + $r = "[\\$c]"; + $c =~ s/$r//g; + ok($c eq "", "U+FFFF backslashed, parsed in class"); + + $s = "A\x{ffff}B"; + $s =~ s/\x{ffff}//i; + ok($s eq "AB", "U+FFFF, EXACTF"); + + $s = "\x{ffff}A"; + $s =~ s/\bA//; + ok($s eq "\x{ffff}", "U+FFFF, BOUND"); + + $s = "\x{ffff}!"; + $s =~ s/\B!//; + ok($s eq "\x{ffff}", "U+FFFF, NBOUND"); +} # non-characters end + + # Keep the following test last -- it may crash perl ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274") diff --git a/t/op/tr.t b/t/op/tr.t index 2a3d29c..796f96a 100755 --- a/t/op/tr.t +++ b/t/op/tr.t @@ -6,7 +6,7 @@ BEGIN { require './test.pl'; } -plan tests => 100; +plan tests => 116; my $Is_EBCDIC = (ord('i') == 0x89 & ord('J') == 0xd1); @@ -384,3 +384,74 @@ is( ref $x, 'SCALAR', " doesn't stringify its argument" ); # rt.perl.org 36622. Perl didn't like a y/// at end of file. No trailing # newline allowed. fresh_perl_is(q[$_ = "foo"; y/A-Z/a-z/], ''); + + +{ # [perl #38293] chr(65535) should be allowed in regexes +no warnings 'utf8'; # to allow non-characters + +$s = "\x{d800}\x{ffff}"; +$s =~ tr/\0/A/; +is($s, "\x{d800}\x{ffff}", "do_trans_simple"); + +$s = "\x{d800}\x{ffff}"; +$i = $s =~ tr/\0//; +is($i, 0, "do_trans_count"); + +$s = "\x{d800}\x{ffff}"; +$s =~ tr/\0/A/s; +is($s, "\x{d800}\x{ffff}", "do_trans_complex, SQUASH"); + +$s = "\x{d800}\x{ffff}"; +$s =~ tr/\0/A/c; +is($s, "AA", "do_trans_complex, COMPLEMENT"); + +$s = "A\x{ffff}B"; +$s =~ tr/\x{ffff}/\x{1ffff}/; +is($s, "A\x{1ffff}B", "utf8, SEARCHLIST"); + +$s = "\x{fffd}\x{fffe}\x{ffff}"; +$s =~ tr/\x{fffd}-\x{ffff}/ABC/; +is($s, "ABC", "utf8, SEARCHLIST range"); + +$s = "ABC"; +$s =~ tr/ABC/\x{ffff}/; +is($s, "\x{ffff}"x3, "utf8, REPLACEMENTLIST"); + +$s = "ABC"; +$s =~ tr/ABC/\x{fffd}-\x{ffff}/; +is($s, "\x{fffd}\x{fffe}\x{ffff}", "utf8, REPLACEMENTLIST range"); + +$s = "A\x{ffff}B\x{100}\0\x{fffe}\x{ffff}"; +$i = $s =~ tr/\x{ffff}//; +is($i, 2, "utf8, count"); + +$s = "A\x{ffff}\x{ffff}C"; +$s =~ tr/\x{ffff}/\x{100}/s; +is($s, "A\x{100}C", "utf8, SQUASH"); + +$s = "A\x{ffff}\x{ffff}\x{fffe}\x{fffe}\x{fffe}C"; +$s =~ tr/\x{fffe}\x{ffff}//s; +is($s, "A\x{ffff}\x{fffe}C", "utf8, SQUASH"); + +$s = "xAABBBy"; +$s =~ tr/AB/\x{ffff}/s; +is($s, "x\x{ffff}y", "utf8, SQUASH"); + +$s = "xAABBBy"; +$s =~ tr/AB/\x{fffe}\x{ffff}/s; +is($s, "x\x{fffe}\x{ffff}y", "utf8, SQUASH"); + +$s = "A\x{ffff}B\x{fffe}C"; +$s =~ tr/\x{fffe}\x{ffff}/x/c; +is($s, "x\x{ffff}x\x{fffe}x", "utf8, COMPLEMENT"); + +$s = "A\x{10000}B\x{2abcd}C"; +$s =~ tr/\0-\x{ffff}/x/c; +is($s, "AxBxC", "utf8, COMPLEMENT range"); + +$s = "A\x{fffe}B\x{ffff}C"; +$s =~ tr/\x{fffe}\x{ffff}/x/d; +is($s, "AxBC", "utf8, DELETE"); + +} # non-characters end + diff --git a/utf8.h b/utf8.h index 9bf3928..9f6e4e8 100644 --- a/utf8.h +++ b/utf8.h @@ -194,15 +194,17 @@ encoded character. #define UTF8_ALLOW_EMPTY 0x0001 #define UTF8_ALLOW_CONTINUATION 0x0002 #define UTF8_ALLOW_NON_CONTINUATION 0x0004 -#define UTF8_ALLOW_FE_FF 0x0008 +#define UTF8_ALLOW_FE_FF 0x0008 /* Allow above 0x7fffFFFF */ #define UTF8_ALLOW_SHORT 0x0010 #define UTF8_ALLOW_SURROGATE 0x0020 -#define UTF8_ALLOW_FFFF 0x0040 /* Allows also FFFE. */ +#define UTF8_ALLOW_FFFF 0x0040 /* Allow UNICODE_ILLEGAL */ #define UTF8_ALLOW_LONG 0x0080 #define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\ UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) #define UTF8_ALLOW_ANY 0x00FF #define UTF8_CHECK_ONLY 0x0200 +#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \ + UTF8_ALLOW_ANYUV) #define UNICODE_SURROGATE_FIRST 0xD800 #define UNICODE_SURROGATE_LAST 0xDFFF @@ -216,8 +218,8 @@ encoded character. #define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */ #define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */ -#define UNICODE_ALLOW_FFFF 0x0004 /* Allow 0xFFF[EF], 0x1FFF[EF], ... */ -#define UNICODE_ALLOW_SUPER 0x0008 /* Allow past 10xFFFF */ +#define UNICODE_ALLOW_FFFF 0x0004 /* Allow U+FFF[EF], U+1FFF[EF], ... */ +#define UNICODE_ALLOW_SUPER 0x0008 /* Allow past 0x10FFFF */ #define UNICODE_ALLOW_ANY 0x000F #define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \