Add a commit verb to regex engine to allow fine tuning of backtracking control.
Yves Orton [Tue, 31 Oct 2006 22:49:57 +0000 (23:49 +0100)]
Message-ID: <9b18b3110610311349n5947cc8fsf0b2e6ddd9a7ee01@mail.gmail.com>

p4raw-id: //depot/perl@29183

pod/perlre.pod
regcomp.c
regcomp.sym
regexec.c
regnodes.h
t/op/pat.t

index b46b381..4e683a3 100644 (file)
@@ -1046,6 +1046,54 @@ to inside of one of these constructs. The following equivalences apply:
     PAT?+               (?>PAT?)
     PAT{min,max}+       (?>PAT{min,max})
 
+=item C<(?COMMIT)>
+X<(?COMMIT)>
+
+This zero-width pattern commits the match at the current point, preventing
+the engine from back-tracking on failure to the left of the commit point.
+Consider the pattern C<A (?COMMIT) B>, where A and B are complex patterns.
+Until the C<(?COMMIT)> is reached, A may backtrack as necessary to match.
+Once it is reached, matching continues in B, which may also backtrack as
+necessary; however, should B not match, then no further backtracking will
+take place, and the pattern will fail outright at that starting position.
+
+The following example counts all the possible matching strings in a
+pattern (without actually matching any of them).
+
+    'aaab'=~/a+b?(?{print "$&\n"; $count++})(?FAIL)/;
+    print "Count=$count\n";
+
+which produces:
+
+    aaab
+    aaa
+    aa
+    a
+    aab
+    aa
+    a
+    ab
+    a
+    Count=9
+
+If we add a C<(?COMMIT)> before the count like the following
+
+    'aaab'=~/a+b?(?COMMIT)(?{print "$&\n"; $count++})(?FAIL)/;
+    print "Count=$count\n";
+
+we prevent backtracking and find the count of the longest matching
+at each matching startpoint like so:
+
+    aaab
+    aab
+    ab
+    Count=3
+
+Any number of C<(?COMMIT)> assertions may be used in a pattern.
+
+See also C<< (?>pattern) >> and possessive quantifiers for other
+ways to control backtracking.
+
 =item C<(?(condition)yes-pattern|no-pattern)>
 X<(?()>
 
index 4e933a9..6938954 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4717,6 +4717,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            case ':':           /* (?:...) */
            case '>':           /* (?>...) */
                break;
+           case 'C':
+               if (RExC_parse[0] == 'O' &&
+                   RExC_parse[1] == 'M' &&
+                   RExC_parse[2] == 'M' &&
+                   RExC_parse[3] == 'I' &&
+                   RExC_parse[4] == 'T' &&
+                   RExC_parse[5] == ')')
+               {
+                   RExC_parse+=5;
+                   ret = reg_node(pRExC_state, COMMIT);
+               } else {
+                   vFAIL("Sequence (?C... not terminated");
+               }
+               nextchar(pRExC_state);
+               return ret;
+               break;
             case 'F':
                 if (RExC_parse[0] == 'A' &&
                     RExC_parse[1] == 'I' &&
index 072b969..f60368c 100644 (file)
@@ -170,6 +170,7 @@ DEFINEP             DEFINEP,   none 1       Never execute directly.
 
 #*Bactracking 
 OPFAIL         OPFAIL, none            Same as (?!)
+COMMIT         COMMIT, node            Pattern fails if backtracking through this 
 
 # NEW STUFF ABOVE THIS LINE -- Please update counts below. 
 
@@ -206,4 +207,4 @@ BRANCH      next:FAIL
 CURLYM         A,B:FAIL        
 IFMATCH        A:FAIL  
 CURLY          B_min_known,B_min,B_max:FAIL    
-       
+COMMIT         next:FAIL       
index 9f28e3e..55a90d0 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -2571,6 +2571,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
     U32 state_num;
+    bool no_final = 0;      /* if true then we dont backtrack on failure */
 
     /* these three flags are set by various ops to signal information to
      * the very next op. They have a useful lifetime of exactly one loop
@@ -4614,6 +4615,12 @@ NULL
            if (next == scan)
                next = NULL;
            break;
+       case COMMIT:
+           PUSH_STATE_GOTO(COMMIT_next,next);
+           /* NOTREACHED */
+       case COMMIT_next_fail:
+           no_final = 1;    
+           /* FALLTHROUGH */       
        case OPFAIL:
            sayNO;
        default:
@@ -4672,7 +4679,13 @@ yes:
                PL_regmatch_slab = PL_regmatch_slab->prev;
                st = SLAB_LAST(PL_regmatch_slab);
            }
-           DEBUG_STATE_pp("pop (yes)");
+            DEBUG_STATE_r(
+               if (no_final) {
+                   DEBUG_STATE_pp("pop (no final)");        
+               } else {
+                   DEBUG_STATE_pp("pop (yes)");
+               }
+           ); 
            depth--;
        }
 #else
@@ -4690,7 +4703,7 @@ yes:
        yes_state = st->u.yes.prev_yes_state;
        PL_regmatch_state = st;
 
-       state_num = st->resume_state;
+       state_num = st->resume_state + no_final;
        goto reenter_switch;
     }
 
@@ -4709,6 +4722,13 @@ no:
        );
 
 no_silent:
+    if (no_final) {
+        if (yes_state) {
+            goto yes;
+        } else {
+            goto final_exit;
+        }
+    }    
     if (depth) {
        /* there's a previous state to backtrack to */
        st--;
index 766dcff..c42fcf8 100644 (file)
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX            75
-#define REGMATCH_STATE_MAX     105
+#define REGNODE_MAX            76
+#define REGMATCH_STATE_MAX     108
 
 #define        END                     0       /* 0000 End of program. */
 #define        SUCCEED                 1       /* 0x01 Return from a subroutine, basically. */
 #define        INSUBP                  71      /* 0x47 Whether we are in a specific recurse. */
 #define        DEFINEP                 72      /* 0x48 Never execute directly. */
 #define        OPFAIL                  73      /* 0x49 Same as (?!) */
-#define        OPTIMIZED               74      /* 0x4a Placeholder for dump. */
-#define        PSEUDO                  75      /* 0x4b Pseudo opcode for internal use. */
+#define        COMMIT                  74      /* 0x4a Pattern fails if backtracking through this */
+#define        OPTIMIZED               75      /* 0x4b Placeholder for dump. */
+#define        PSEUDO                  76      /* 0x4c Pseudo opcode for internal use. */
 
        /* ------------ States ------------- */
 
-#define        TRIE_next               76      /* 0x4c Regmatch state for TRIE */
-#define        TRIE_next_fail          77      /* 0x4d Regmatch state for TRIE */
-#define        EVAL_AB                 78      /* 0x4e Regmatch state for EVAL */
-#define        EVAL_AB_fail            79      /* 0x4f Regmatch state for EVAL */
-#define        CURLYX_end              80      /* 0x50 Regmatch state for CURLYX */
-#define        CURLYX_end_fail         81      /* 0x51 Regmatch state for CURLYX */
-#define        WHILEM_A_pre            82      /* 0x52 Regmatch state for WHILEM */
-#define        WHILEM_A_pre_fail       83      /* 0x53 Regmatch state for WHILEM */
-#define        WHILEM_A_min            84      /* 0x54 Regmatch state for WHILEM */
-#define        WHILEM_A_min_fail       85      /* 0x55 Regmatch state for WHILEM */
-#define        WHILEM_A_max            86      /* 0x56 Regmatch state for WHILEM */
-#define        WHILEM_A_max_fail       87      /* 0x57 Regmatch state for WHILEM */
-#define        WHILEM_B_min            88      /* 0x58 Regmatch state for WHILEM */
-#define        WHILEM_B_min_fail       89      /* 0x59 Regmatch state for WHILEM */
-#define        WHILEM_B_max            90      /* 0x5a Regmatch state for WHILEM */
-#define        WHILEM_B_max_fail       91      /* 0x5b Regmatch state for WHILEM */
-#define        BRANCH_next             92      /* 0x5c Regmatch state for BRANCH */
-#define        BRANCH_next_fail        93      /* 0x5d Regmatch state for BRANCH */
-#define        CURLYM_A                94      /* 0x5e Regmatch state for CURLYM */
-#define        CURLYM_A_fail           95      /* 0x5f Regmatch state for CURLYM */
-#define        CURLYM_B                96      /* 0x60 Regmatch state for CURLYM */
-#define        CURLYM_B_fail           97      /* 0x61 Regmatch state for CURLYM */
-#define        IFMATCH_A               98      /* 0x62 Regmatch state for IFMATCH */
-#define        IFMATCH_A_fail          99      /* 0x63 Regmatch state for IFMATCH */
-#define        CURLY_B_min_known       100     /* 0x64 Regmatch state for CURLY */
-#define        CURLY_B_min_known_fail  101     /* 0x65 Regmatch state for CURLY */
-#define        CURLY_B_min             102     /* 0x66 Regmatch state for CURLY */
-#define        CURLY_B_min_fail        103     /* 0x67 Regmatch state for CURLY */
-#define        CURLY_B_max             104     /* 0x68 Regmatch state for CURLY */
-#define        CURLY_B_max_fail        105     /* 0x69 Regmatch state for CURLY */
+#define        TRIE_next               77      /* 0x4d Regmatch state for TRIE */
+#define        TRIE_next_fail          78      /* 0x4e Regmatch state for TRIE */
+#define        EVAL_AB                 79      /* 0x4f Regmatch state for EVAL */
+#define        EVAL_AB_fail            80      /* 0x50 Regmatch state for EVAL */
+#define        CURLYX_end              81      /* 0x51 Regmatch state for CURLYX */
+#define        CURLYX_end_fail         82      /* 0x52 Regmatch state for CURLYX */
+#define        WHILEM_A_pre            83      /* 0x53 Regmatch state for WHILEM */
+#define        WHILEM_A_pre_fail       84      /* 0x54 Regmatch state for WHILEM */
+#define        WHILEM_A_min            85      /* 0x55 Regmatch state for WHILEM */
+#define        WHILEM_A_min_fail       86      /* 0x56 Regmatch state for WHILEM */
+#define        WHILEM_A_max            87      /* 0x57 Regmatch state for WHILEM */
+#define        WHILEM_A_max_fail       88      /* 0x58 Regmatch state for WHILEM */
+#define        WHILEM_B_min            89      /* 0x59 Regmatch state for WHILEM */
+#define        WHILEM_B_min_fail       90      /* 0x5a Regmatch state for WHILEM */
+#define        WHILEM_B_max            91      /* 0x5b Regmatch state for WHILEM */
+#define        WHILEM_B_max_fail       92      /* 0x5c Regmatch state for WHILEM */
+#define        BRANCH_next             93      /* 0x5d Regmatch state for BRANCH */
+#define        BRANCH_next_fail        94      /* 0x5e Regmatch state for BRANCH */
+#define        CURLYM_A                95      /* 0x5f Regmatch state for CURLYM */
+#define        CURLYM_A_fail           96      /* 0x60 Regmatch state for CURLYM */
+#define        CURLYM_B                97      /* 0x61 Regmatch state for CURLYM */
+#define        CURLYM_B_fail           98      /* 0x62 Regmatch state for CURLYM */
+#define        IFMATCH_A               99      /* 0x63 Regmatch state for IFMATCH */
+#define        IFMATCH_A_fail          100     /* 0x64 Regmatch state for IFMATCH */
+#define        CURLY_B_min_known       101     /* 0x65 Regmatch state for CURLY */
+#define        CURLY_B_min_known_fail  102     /* 0x66 Regmatch state for CURLY */
+#define        CURLY_B_min             103     /* 0x67 Regmatch state for CURLY */
+#define        CURLY_B_min_fail        104     /* 0x68 Regmatch state for CURLY */
+#define        CURLY_B_max             105     /* 0x69 Regmatch state for CURLY */
+#define        CURLY_B_max_fail        106     /* 0x6a Regmatch state for CURLY */
+#define        COMMIT_next             107     /* 0x6b Regmatch state for COMMIT */
+#define        COMMIT_next_fail        108     /* 0x6c Regmatch state for COMMIT */
 
 /* PL_regkind[] What type of regop or state is this. */
 
@@ -199,6 +202,7 @@ EXTCONST U8 PL_regkind[] = {
        INSUBP,         /* INSUBP                 */
        DEFINEP,        /* DEFINEP                */
        OPFAIL,         /* OPFAIL                 */
+       COMMIT,         /* COMMIT                 */
        NOTHING,        /* OPTIMIZED              */
        PSEUDO,         /* PSEUDO                 */
        /* ------------ States ------------- */
@@ -232,6 +236,8 @@ EXTCONST U8 PL_regkind[] = {
        CURLY,          /* CURLY_B_min_fail       */
        CURLY,          /* CURLY_B_max            */
        CURLY,          /* CURLY_B_max_fail       */
+       COMMIT,         /* COMMIT_next            */
+       COMMIT,         /* COMMIT_next_fail       */
 };
 #endif
 
@@ -313,6 +319,7 @@ static const U8 regarglen[] = {
        EXTRA_SIZE(struct regnode_1),           /* INSUBP       */
        EXTRA_SIZE(struct regnode_1),           /* DEFINEP      */
        0,                                      /* OPFAIL       */
+       0,                                      /* COMMIT       */
        0,                                      /* OPTIMIZED    */
        0,                                      /* PSEUDO       */
 };
@@ -394,6 +401,7 @@ static const char reg_off_by_arg[] = {
        0,      /* INSUBP       */
        0,      /* DEFINEP      */
        0,      /* OPFAIL       */
+       0,      /* COMMIT       */
        0,      /* OPTIMIZED    */
        0,      /* PSEUDO       */
 };
@@ -476,39 +484,42 @@ const char * reg_name[] = {
        "INSUBP",                       /* 0x47 */
        "DEFINEP",                      /* 0x48 */
        "OPFAIL",                       /* 0x49 */
-       "OPTIMIZED",                    /* 0x4a */
-       "PSEUDO",                       /* 0x4b */
+       "COMMIT",                       /* 0x4a */
+       "OPTIMIZED",                    /* 0x4b */
+       "PSEUDO",                       /* 0x4c */
        /* ------------ States ------------- */
-       "TRIE_next",                    /* 0x4c */
-       "TRIE_next_fail",               /* 0x4d */
-       "EVAL_AB",                      /* 0x4e */
-       "EVAL_AB_fail",                 /* 0x4f */
-       "CURLYX_end",                   /* 0x50 */
-       "CURLYX_end_fail",              /* 0x51 */
-       "WHILEM_A_pre",                 /* 0x52 */
-       "WHILEM_A_pre_fail",            /* 0x53 */
-       "WHILEM_A_min",                 /* 0x54 */
-       "WHILEM_A_min_fail",            /* 0x55 */
-       "WHILEM_A_max",                 /* 0x56 */
-       "WHILEM_A_max_fail",            /* 0x57 */
-       "WHILEM_B_min",                 /* 0x58 */
-       "WHILEM_B_min_fail",            /* 0x59 */
-       "WHILEM_B_max",                 /* 0x5a */
-       "WHILEM_B_max_fail",            /* 0x5b */
-       "BRANCH_next",                  /* 0x5c */
-       "BRANCH_next_fail",             /* 0x5d */
-       "CURLYM_A",                     /* 0x5e */
-       "CURLYM_A_fail",                /* 0x5f */
-       "CURLYM_B",                     /* 0x60 */
-       "CURLYM_B_fail",                /* 0x61 */
-       "IFMATCH_A",                    /* 0x62 */
-       "IFMATCH_A_fail",               /* 0x63 */
-       "CURLY_B_min_known",            /* 0x64 */
-       "CURLY_B_min_known_fail",       /* 0x65 */
-       "CURLY_B_min",                  /* 0x66 */
-       "CURLY_B_min_fail",             /* 0x67 */
-       "CURLY_B_max",                  /* 0x68 */
-       "CURLY_B_max_fail",             /* 0x69 */
+       "TRIE_next",                    /* 0x4d */
+       "TRIE_next_fail",               /* 0x4e */
+       "EVAL_AB",                      /* 0x4f */
+       "EVAL_AB_fail",                 /* 0x50 */
+       "CURLYX_end",                   /* 0x51 */
+       "CURLYX_end_fail",              /* 0x52 */
+       "WHILEM_A_pre",                 /* 0x53 */
+       "WHILEM_A_pre_fail",            /* 0x54 */
+       "WHILEM_A_min",                 /* 0x55 */
+       "WHILEM_A_min_fail",            /* 0x56 */
+       "WHILEM_A_max",                 /* 0x57 */
+       "WHILEM_A_max_fail",            /* 0x58 */
+       "WHILEM_B_min",                 /* 0x59 */
+       "WHILEM_B_min_fail",            /* 0x5a */
+       "WHILEM_B_max",                 /* 0x5b */
+       "WHILEM_B_max_fail",            /* 0x5c */
+       "BRANCH_next",                  /* 0x5d */
+       "BRANCH_next_fail",             /* 0x5e */
+       "CURLYM_A",                     /* 0x5f */
+       "CURLYM_A_fail",                /* 0x60 */
+       "CURLYM_B",                     /* 0x61 */
+       "CURLYM_B_fail",                /* 0x62 */
+       "IFMATCH_A",                    /* 0x63 */
+       "IFMATCH_A_fail",               /* 0x64 */
+       "CURLY_B_min_known",            /* 0x65 */
+       "CURLY_B_min_known_fail",       /* 0x66 */
+       "CURLY_B_min",                  /* 0x67 */
+       "CURLY_B_min_fail",             /* 0x68 */
+       "CURLY_B_max",                  /* 0x69 */
+       "CURLY_B_max_fail",             /* 0x6a */
+       "COMMIT_next",                  /* 0x6b */
+       "COMMIT_next_fail",             /* 0x6c */
 };
 #endif /* DEBUGGING */
 #else
index 3cb8e8f..1686234 100755 (executable)
@@ -3719,8 +3719,14 @@ sub iseq($$;$) {
     ';
     ok(!$@,'lvalue $+{...} should not throw an exception');
 }
-
-
+{
+    our $count = 0;
+    'aaab'=~/a+b?(?{$count++})(?FAIL)/;
+    iseq($count,9,"expect 9 for no (?COMMIT)");
+    $count = 0;
+    'aaab'=~/a+b?(?COMMIT)(?{$count++})(?FAIL)/;
+    iseq($count,3,"expect 3 with (?COMMIT)");
+}
 # stress test CURLYX/WHILEM.
 #
 # This test includes varying levels of nesting, and according to
@@ -3860,5 +3866,5 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
     or print "# Unexpected outcome: should pass or crash perl\n";
 
 # Don't forget to update this!
-BEGIN{print "1..1287\n"};
+BEGIN{print "1..1289\n"};