Misapplied regex optimizations when \C is present.

diff --git a/regcomp.c b/regcomp.c

index 547f756..69d114e 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1930,6 +1930,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
        r->reganch |= ROPT_LOOKBEHIND_SEEN;
     if (RExC_seen & REG_SEEN_EVAL)
        r->reganch |= ROPT_EVAL_SEEN;
+    if (RExC_seen & REG_SEEN_SANY)
+       r->reganch |= ROPT_SANY_SEEN;
     Newz(1002, r->startp, RExC_npar, I32);
     Newz(1002, r->endp, RExC_npar, I32);
     PL_regdata = r->data; /* for regprop() */
@@ -2638,6 +2640,7 @@ tryagain:
            break;
        case 'C':
            ret = reg_node(pRExC_state, SANY);
+           RExC_seen |= REG_SEEN_SANY;
            *flagp |= HASWIDTH|SIMPLE;
            nextchar(pRExC_state);
            break;
diff --git a/regcomp.h b/regcomp.h

index 066e31f..ee9be39 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -285,10 +285,11 @@ struct regnode_charclass_class {
 
 #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
 
-#define REG_SEEN_ZERO_LEN      1
-#define REG_SEEN_LOOKBEHIND    2
-#define REG_SEEN_GPOS          4
-#define REG_SEEN_EVAL          8
+#define REG_SEEN_ZERO_LEN       1
+#define REG_SEEN_LOOKBEHIND     2
+#define REG_SEEN_GPOS           4
+#define REG_SEEN_EVAL           8
+#define REG_SEEN_SANY          16
 
 START_EXTERN_C
 
diff --git a/regexec.c b/regexec.c

index c70d1b1..5d9e8ac 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -398,7 +398,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
              DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
              goto fail;
          }
-         if (prog->check_offset_min == prog->check_offset_max) {
+         if (prog->check_offset_min == prog->check_offset_max &&
+             !(prog->reganch & ROPT_SANY_SEEN)) {
            /* Substring at constant offset from beg-of-str... */
            I32 slen;
 
@@ -474,6 +475,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        if (data)
            *data->scream_olds = s;
     }
+    else if (prog->reganch & ROPT_SANY_SEEN)
+       s = fbm_instr((U8*)(s + start_shift),
+                     (U8*)(strend - end_shift),
+                     check, PL_multiline ? FBMrf_MULTILINE : 0);
     else
        s = fbm_instr(HOP3(s, start_shift, strend),
                      HOP3(strend, -end_shift, strbeg),
@@ -1407,7 +1412,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
 
     minlen = prog->minlen;
     if (do_utf8) {
-      if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
+      if (!(prog->reganch & ROPT_SANY_SEEN))
+        if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
     }
     else {
       if (strend - startpos < minlen) goto phooey;
@@ -2075,13 +2081,6 @@ S_regmatch(pTHX_ regnode *prog)
                sayNO;
            break;
        case SANY:
-           if (do_utf8) {
-               locinput += PL_utf8skip[nextchr];
-               if (locinput > PL_regeol)
-                   sayNO;
-               nextchr = UCHARAT(locinput);
-               break;
-           }
            if (!nextchr && locinput >= PL_regeol)
                sayNO;
            nextchr = UCHARAT(++locinput);
@@ -3563,15 +3562,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
        }
        break;
     case SANY:
-       if (do_utf8) {
-           loceol = PL_regeol;
-           while (hardcount < max && scan < loceol) {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           scan = loceol;
-       }
+       scan = loceol;
        break;
     case EXACT:                /* length of string is 1 */
        c = (U8)*STRING(p);
diff --git a/regexp.h b/regexp.h

index 3c71060..33ace40 100644 (file)
--- a/regexp.h
+++ b/regexp.h
@@ -54,6 +54,7 @@ typedef struct regexp {
 #define ROPT_CHECK_ALL         0x00100
 #define ROPT_LOOKBEHIND_SEEN   0x00200
 #define ROPT_EVAL_SEEN         0x00400
+#define ROPT_SANY_SEEN         0x00800
 
 /* 0xf800 of reganch is used by PMf_COMPILETIME */
 
diff --git a/t/op/pat.t b/t/op/pat.t

index 17df867..d7eb9f8 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4,7 +4,7 @@
 # the format supported by op/regexp.t.  If you want to add a test
 # that does fit that format, add it to op/re_tests, not here.
 
-print "1..231\n";
+print "1..240\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -1129,3 +1129,56 @@ print "not " unless "A \x{263a} B z C" =~ /A . B (??{ "z" }) C/;
 print "ok $test\n";
 $test++;
 
+$_ = "a\x{100}b";
+if (/(.)(\C)(\C)(.)/) {
+  print "ok 232\n";
+  if ($1 eq "a") {
+    print "ok 233\n";
+  } else {
+    print "not ok 233\n";
+  }
+  if ($2 eq "\xC4") {
+    print "ok 234\n";
+  } else {
+    print "not ok 234\n";
+  }
+  if ($3 eq "\x80") {
+    print "ok 235\n";
+  } else {
+    print "not ok 235\n";
+  }
+  if ($4 eq "b") {
+    print "ok 236\n";
+  } else {
+    print "not ok 236\n";
+  }
+} else {
+  for (232..236) {
+    print "not ok $_\n";
+  }
+}
+$_ = "\x{100}";
+if (/(\C)/g) {
+  print "ok 237\n";
+  if ($1 eq "\xC4") {
+    print "ok 238\n";
+  } else {
+    print "not ok 238\n";
+  }
+} else {
+  for (237..238) {
+    print "not ok $_\n";
+  }
+}
+if (/(\C)/g) {
+  print "ok 239\n";
+  if ($1 eq "\x80") {
+    print "ok 240\n";
+  } else {
+    print "not ok 240\n";
+  }
+} else {
+  for (239..240) {
+    print "not ok $_\n";
+  }
+}
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t

index 577e6b4..60e6c6e 100755 (executable)
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
     }
 }
 
-print "1..107\n";
+print "1..109\n";
 
 my $test = 1;
 
@@ -577,3 +577,16 @@ sub nok_bytes {
     $test++;                                   # 107
 }
 
+{
+    # bug id 20001230.002
+
+    use utf8;
+
+    print "not " unless "École" =~ /^\C\C(.)/ && $1 eq 'c';
+    print "ok $test\n";
+    $test++;                                   # 108
+
+    print "not " unless "École" =~ /^\C\C(c)/;
+    print "ok $test\n";
+    $test++;                                   # 109
+}
regcomp.c		patch \| blob \| blame \| history
regcomp.h		patch \| blob \| blame \| history
regexec.c		patch \| blob \| blame \| history
regexp.h		patch \| blob \| blame \| history
t/op/pat.t		patch \| blob \| blame \| history
t/pragma/utf8.t		patch \| blob \| blame \| history