From: Jarkko Hietaniemi Date: Sun, 18 Feb 2001 22:11:20 +0000 (+0000) Subject: Misapplied regex optimizations when \C is present. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3baa4c62cda542368be1e7e1f7af8bd8257c2ff4;p=p5sagit%2Fp5-mst-13.2.git Misapplied regex optimizations when \C is present. Fixes 20001230.002. What still remains broken is that the submatches that have \C in them get their UTF8 flag on because their parent SV has it on. This will result in malformed UTF8 if a \C happened to match a non-ASCII byte. p4raw-id: //depot/perl@8836 --- diff --git a/regcomp.c b/regcomp.c index 547f756..69d114e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1930,6 +1930,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch |= ROPT_LOOKBEHIND_SEEN; if (RExC_seen & REG_SEEN_EVAL) r->reganch |= ROPT_EVAL_SEEN; + if (RExC_seen & REG_SEEN_SANY) + r->reganch |= ROPT_SANY_SEEN; Newz(1002, r->startp, RExC_npar, I32); Newz(1002, r->endp, RExC_npar, I32); PL_regdata = r->data; /* for regprop() */ @@ -2638,6 +2640,7 @@ tryagain: break; case 'C': ret = reg_node(pRExC_state, SANY); + RExC_seen |= REG_SEEN_SANY; *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); break; diff --git a/regcomp.h b/regcomp.h index 066e31f..ee9be39 100644 --- a/regcomp.h +++ b/regcomp.h @@ -285,10 +285,11 @@ struct regnode_charclass_class { #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode)) -#define REG_SEEN_ZERO_LEN 1 -#define REG_SEEN_LOOKBEHIND 2 -#define REG_SEEN_GPOS 4 -#define REG_SEEN_EVAL 8 +#define REG_SEEN_ZERO_LEN 1 +#define REG_SEEN_LOOKBEHIND 2 +#define REG_SEEN_GPOS 4 +#define REG_SEEN_EVAL 8 +#define REG_SEEN_SANY 16 START_EXTERN_C diff --git a/regexec.c b/regexec.c index c70d1b1..5d9e8ac 100644 --- a/regexec.c +++ b/regexec.c @@ -398,7 +398,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; } - if (prog->check_offset_min == prog->check_offset_max) { + if (prog->check_offset_min == prog->check_offset_max && + !(prog->reganch & ROPT_SANY_SEEN)) { /* Substring at constant offset from beg-of-str... */ I32 slen; @@ -474,6 +475,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (data) *data->scream_olds = s; } + else if (prog->reganch & ROPT_SANY_SEEN) + s = fbm_instr((U8*)(s + start_shift), + (U8*)(strend - end_shift), + check, PL_multiline ? FBMrf_MULTILINE : 0); else s = fbm_instr(HOP3(s, start_shift, strend), HOP3(strend, -end_shift, strbeg), @@ -1407,7 +1412,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * minlen = prog->minlen; if (do_utf8) { - if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey; + if (!(prog->reganch & ROPT_SANY_SEEN)) + if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey; } else { if (strend - startpos < minlen) goto phooey; @@ -2075,13 +2081,6 @@ S_regmatch(pTHX_ regnode *prog) sayNO; break; case SANY: - if (do_utf8) { - locinput += PL_utf8skip[nextchr]; - if (locinput > PL_regeol) - sayNO; - nextchr = UCHARAT(locinput); - break; - } if (!nextchr && locinput >= PL_regeol) sayNO; nextchr = UCHARAT(++locinput); @@ -3563,15 +3562,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case SANY: - if (do_utf8) { - loceol = PL_regeol; - while (hardcount < max && scan < loceol) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - scan = loceol; - } + scan = loceol; break; case EXACT: /* length of string is 1 */ c = (U8)*STRING(p); diff --git a/regexp.h b/regexp.h index 3c71060..33ace40 100644 --- a/regexp.h +++ b/regexp.h @@ -54,6 +54,7 @@ typedef struct regexp { #define ROPT_CHECK_ALL 0x00100 #define ROPT_LOOKBEHIND_SEEN 0x00200 #define ROPT_EVAL_SEEN 0x00400 +#define ROPT_SANY_SEEN 0x00800 /* 0xf800 of reganch is used by PMf_COMPILETIME */ diff --git a/t/op/pat.t b/t/op/pat.t index 17df867..d7eb9f8 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4,7 +4,7 @@ # the format supported by op/regexp.t. If you want to add a test # that does fit that format, add it to op/re_tests, not here. -print "1..231\n"; +print "1..240\n"; BEGIN { chdir 't' if -d 't'; @@ -1129,3 +1129,56 @@ print "not " unless "A \x{263a} B z C" =~ /A . B (??{ "z" }) C/; print "ok $test\n"; $test++; +$_ = "a\x{100}b"; +if (/(.)(\C)(\C)(.)/) { + print "ok 232\n"; + if ($1 eq "a") { + print "ok 233\n"; + } else { + print "not ok 233\n"; + } + if ($2 eq "\xC4") { + print "ok 234\n"; + } else { + print "not ok 234\n"; + } + if ($3 eq "\x80") { + print "ok 235\n"; + } else { + print "not ok 235\n"; + } + if ($4 eq "b") { + print "ok 236\n"; + } else { + print "not ok 236\n"; + } +} else { + for (232..236) { + print "not ok $_\n"; + } +} +$_ = "\x{100}"; +if (/(\C)/g) { + print "ok 237\n"; + if ($1 eq "\xC4") { + print "ok 238\n"; + } else { + print "not ok 238\n"; + } +} else { + for (237..238) { + print "not ok $_\n"; + } +} +if (/(\C)/g) { + print "ok 239\n"; + if ($1 eq "\x80") { + print "ok 240\n"; + } else { + print "not ok 240\n"; + } +} else { + for (239..240) { + print "not ok $_\n"; + } +} diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index 577e6b4..60e6c6e 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -10,7 +10,7 @@ BEGIN { } } -print "1..107\n"; +print "1..109\n"; my $test = 1; @@ -577,3 +577,16 @@ sub nok_bytes { $test++; # 107 } +{ + # bug id 20001230.002 + + use utf8; + + print "not " unless "École" =~ /^\C\C(.)/ && $1 eq 'c'; + print "ok $test\n"; + $test++; # 108 + + print "not " unless "École" =~ /^\C\C(c)/; + print "ok $test\n"; + $test++; # 109 +}