From: Yves Orton Date: Tue, 31 Oct 2006 22:49:57 +0000 (+0100) Subject: Add a commit verb to regex engine to allow fine tuning of backtracking control. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=5461259206276a3618e115d5d68776273bb41ca6;p=p5sagit%2Fp5-mst-13.2.git Add a commit verb to regex engine to allow fine tuning of backtracking control. Message-ID: <9b18b3110610311349n5947cc8fsf0b2e6ddd9a7ee01@mail.gmail.com> p4raw-id: //depot/perl@29183 --- diff --git a/pod/perlre.pod b/pod/perlre.pod index b46b381..4e683a3 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -1046,6 +1046,54 @@ to inside of one of these constructs. The following equivalences apply: PAT?+ (?>PAT?) PAT{min,max}+ (?>PAT{min,max}) +=item C<(?COMMIT)> +X<(?COMMIT)> + +This zero-width pattern commits the match at the current point, preventing +the engine from back-tracking on failure to the left of the commit point. +Consider the pattern C, where A and B are complex patterns. +Until the C<(?COMMIT)> is reached, A may backtrack as necessary to match. +Once it is reached, matching continues in B, which may also backtrack as +necessary; however, should B not match, then no further backtracking will +take place, and the pattern will fail outright at that starting position. + +The following example counts all the possible matching strings in a +pattern (without actually matching any of them). + + 'aaab'=~/a+b?(?{print "$&\n"; $count++})(?FAIL)/; + print "Count=$count\n"; + +which produces: + + aaab + aaa + aa + a + aab + aa + a + ab + a + Count=9 + +If we add a C<(?COMMIT)> before the count like the following + + 'aaab'=~/a+b?(?COMMIT)(?{print "$&\n"; $count++})(?FAIL)/; + print "Count=$count\n"; + +we prevent backtracking and find the count of the longest matching +at each matching startpoint like so: + + aaab + aab + ab + Count=3 + +Any number of C<(?COMMIT)> assertions may be used in a pattern. + +See also C<< (?>pattern) >> and possessive quantifiers for other +ways to control backtracking. + =item C<(?(condition)yes-pattern|no-pattern)> X<(?()> diff --git a/regcomp.c b/regcomp.c index 4e933a9..6938954 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4717,6 +4717,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case ':': /* (?:...) */ case '>': /* (?>...) */ break; + case 'C': + if (RExC_parse[0] == 'O' && + RExC_parse[1] == 'M' && + RExC_parse[2] == 'M' && + RExC_parse[3] == 'I' && + RExC_parse[4] == 'T' && + RExC_parse[5] == ')') + { + RExC_parse+=5; + ret = reg_node(pRExC_state, COMMIT); + } else { + vFAIL("Sequence (?C... not terminated"); + } + nextchar(pRExC_state); + return ret; + break; case 'F': if (RExC_parse[0] == 'A' && RExC_parse[1] == 'I' && diff --git a/regcomp.sym b/regcomp.sym index 072b969..f60368c 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -170,6 +170,7 @@ DEFINEP DEFINEP, none 1 Never execute directly. #*Bactracking OPFAIL OPFAIL, none Same as (?!) +COMMIT COMMIT, node Pattern fails if backtracking through this # NEW STUFF ABOVE THIS LINE -- Please update counts below. @@ -206,4 +207,4 @@ BRANCH next:FAIL CURLYM A,B:FAIL IFMATCH A:FAIL CURLY B_min_known,B_min,B_max:FAIL - +COMMIT next:FAIL diff --git a/regexec.c b/regexec.c index 9f28e3e..55a90d0 100644 --- a/regexec.c +++ b/regexec.c @@ -2571,6 +2571,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ U32 state_num; + bool no_final = 0; /* if true then we dont backtrack on failure */ /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop @@ -4614,6 +4615,12 @@ NULL if (next == scan) next = NULL; break; + case COMMIT: + PUSH_STATE_GOTO(COMMIT_next,next); + /* NOTREACHED */ + case COMMIT_next_fail: + no_final = 1; + /* FALLTHROUGH */ case OPFAIL: sayNO; default: @@ -4672,7 +4679,13 @@ yes: PL_regmatch_slab = PL_regmatch_slab->prev; st = SLAB_LAST(PL_regmatch_slab); } - DEBUG_STATE_pp("pop (yes)"); + DEBUG_STATE_r( + if (no_final) { + DEBUG_STATE_pp("pop (no final)"); + } else { + DEBUG_STATE_pp("pop (yes)"); + } + ); depth--; } #else @@ -4690,7 +4703,7 @@ yes: yes_state = st->u.yes.prev_yes_state; PL_regmatch_state = st; - state_num = st->resume_state; + state_num = st->resume_state + no_final; goto reenter_switch; } @@ -4709,6 +4722,13 @@ no: ); no_silent: + if (no_final) { + if (yes_state) { + goto yes; + } else { + goto final_exit; + } + } if (depth) { /* there's a previous state to backtrack to */ st--; diff --git a/regnodes.h b/regnodes.h index 766dcff..c42fcf8 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 75 -#define REGMATCH_STATE_MAX 105 +#define REGNODE_MAX 76 +#define REGMATCH_STATE_MAX 108 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -83,41 +83,44 @@ #define INSUBP 71 /* 0x47 Whether we are in a specific recurse. */ #define DEFINEP 72 /* 0x48 Never execute directly. */ #define OPFAIL 73 /* 0x49 Same as (?!) */ -#define OPTIMIZED 74 /* 0x4a Placeholder for dump. */ -#define PSEUDO 75 /* 0x4b Pseudo opcode for internal use. */ +#define COMMIT 74 /* 0x4a Pattern fails if backtracking through this */ +#define OPTIMIZED 75 /* 0x4b Placeholder for dump. */ +#define PSEUDO 76 /* 0x4c Pseudo opcode for internal use. */ /* ------------ States ------------- */ -#define TRIE_next 76 /* 0x4c Regmatch state for TRIE */ -#define TRIE_next_fail 77 /* 0x4d Regmatch state for TRIE */ -#define EVAL_AB 78 /* 0x4e Regmatch state for EVAL */ -#define EVAL_AB_fail 79 /* 0x4f Regmatch state for EVAL */ -#define CURLYX_end 80 /* 0x50 Regmatch state for CURLYX */ -#define CURLYX_end_fail 81 /* 0x51 Regmatch state for CURLYX */ -#define WHILEM_A_pre 82 /* 0x52 Regmatch state for WHILEM */ -#define WHILEM_A_pre_fail 83 /* 0x53 Regmatch state for WHILEM */ -#define WHILEM_A_min 84 /* 0x54 Regmatch state for WHILEM */ -#define WHILEM_A_min_fail 85 /* 0x55 Regmatch state for WHILEM */ -#define WHILEM_A_max 86 /* 0x56 Regmatch state for WHILEM */ -#define WHILEM_A_max_fail 87 /* 0x57 Regmatch state for WHILEM */ -#define WHILEM_B_min 88 /* 0x58 Regmatch state for WHILEM */ -#define WHILEM_B_min_fail 89 /* 0x59 Regmatch state for WHILEM */ -#define WHILEM_B_max 90 /* 0x5a Regmatch state for WHILEM */ -#define WHILEM_B_max_fail 91 /* 0x5b Regmatch state for WHILEM */ -#define BRANCH_next 92 /* 0x5c Regmatch state for BRANCH */ -#define BRANCH_next_fail 93 /* 0x5d Regmatch state for BRANCH */ -#define CURLYM_A 94 /* 0x5e Regmatch state for CURLYM */ -#define CURLYM_A_fail 95 /* 0x5f Regmatch state for CURLYM */ -#define CURLYM_B 96 /* 0x60 Regmatch state for CURLYM */ -#define CURLYM_B_fail 97 /* 0x61 Regmatch state for CURLYM */ -#define IFMATCH_A 98 /* 0x62 Regmatch state for IFMATCH */ -#define IFMATCH_A_fail 99 /* 0x63 Regmatch state for IFMATCH */ -#define CURLY_B_min_known 100 /* 0x64 Regmatch state for CURLY */ -#define CURLY_B_min_known_fail 101 /* 0x65 Regmatch state for CURLY */ -#define CURLY_B_min 102 /* 0x66 Regmatch state for CURLY */ -#define CURLY_B_min_fail 103 /* 0x67 Regmatch state for CURLY */ -#define CURLY_B_max 104 /* 0x68 Regmatch state for CURLY */ -#define CURLY_B_max_fail 105 /* 0x69 Regmatch state for CURLY */ +#define TRIE_next 77 /* 0x4d Regmatch state for TRIE */ +#define TRIE_next_fail 78 /* 0x4e Regmatch state for TRIE */ +#define EVAL_AB 79 /* 0x4f Regmatch state for EVAL */ +#define EVAL_AB_fail 80 /* 0x50 Regmatch state for EVAL */ +#define CURLYX_end 81 /* 0x51 Regmatch state for CURLYX */ +#define CURLYX_end_fail 82 /* 0x52 Regmatch state for CURLYX */ +#define WHILEM_A_pre 83 /* 0x53 Regmatch state for WHILEM */ +#define WHILEM_A_pre_fail 84 /* 0x54 Regmatch state for WHILEM */ +#define WHILEM_A_min 85 /* 0x55 Regmatch state for WHILEM */ +#define WHILEM_A_min_fail 86 /* 0x56 Regmatch state for WHILEM */ +#define WHILEM_A_max 87 /* 0x57 Regmatch state for WHILEM */ +#define WHILEM_A_max_fail 88 /* 0x58 Regmatch state for WHILEM */ +#define WHILEM_B_min 89 /* 0x59 Regmatch state for WHILEM */ +#define WHILEM_B_min_fail 90 /* 0x5a Regmatch state for WHILEM */ +#define WHILEM_B_max 91 /* 0x5b Regmatch state for WHILEM */ +#define WHILEM_B_max_fail 92 /* 0x5c Regmatch state for WHILEM */ +#define BRANCH_next 93 /* 0x5d Regmatch state for BRANCH */ +#define BRANCH_next_fail 94 /* 0x5e Regmatch state for BRANCH */ +#define CURLYM_A 95 /* 0x5f Regmatch state for CURLYM */ +#define CURLYM_A_fail 96 /* 0x60 Regmatch state for CURLYM */ +#define CURLYM_B 97 /* 0x61 Regmatch state for CURLYM */ +#define CURLYM_B_fail 98 /* 0x62 Regmatch state for CURLYM */ +#define IFMATCH_A 99 /* 0x63 Regmatch state for IFMATCH */ +#define IFMATCH_A_fail 100 /* 0x64 Regmatch state for IFMATCH */ +#define CURLY_B_min_known 101 /* 0x65 Regmatch state for CURLY */ +#define CURLY_B_min_known_fail 102 /* 0x66 Regmatch state for CURLY */ +#define CURLY_B_min 103 /* 0x67 Regmatch state for CURLY */ +#define CURLY_B_min_fail 104 /* 0x68 Regmatch state for CURLY */ +#define CURLY_B_max 105 /* 0x69 Regmatch state for CURLY */ +#define CURLY_B_max_fail 106 /* 0x6a Regmatch state for CURLY */ +#define COMMIT_next 107 /* 0x6b Regmatch state for COMMIT */ +#define COMMIT_next_fail 108 /* 0x6c Regmatch state for COMMIT */ /* PL_regkind[] What type of regop or state is this. */ @@ -199,6 +202,7 @@ EXTCONST U8 PL_regkind[] = { INSUBP, /* INSUBP */ DEFINEP, /* DEFINEP */ OPFAIL, /* OPFAIL */ + COMMIT, /* COMMIT */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -232,6 +236,8 @@ EXTCONST U8 PL_regkind[] = { CURLY, /* CURLY_B_min_fail */ CURLY, /* CURLY_B_max */ CURLY, /* CURLY_B_max_fail */ + COMMIT, /* COMMIT_next */ + COMMIT, /* COMMIT_next_fail */ }; #endif @@ -313,6 +319,7 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* INSUBP */ EXTRA_SIZE(struct regnode_1), /* DEFINEP */ 0, /* OPFAIL */ + 0, /* COMMIT */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -394,6 +401,7 @@ static const char reg_off_by_arg[] = { 0, /* INSUBP */ 0, /* DEFINEP */ 0, /* OPFAIL */ + 0, /* COMMIT */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -476,39 +484,42 @@ const char * reg_name[] = { "INSUBP", /* 0x47 */ "DEFINEP", /* 0x48 */ "OPFAIL", /* 0x49 */ - "OPTIMIZED", /* 0x4a */ - "PSEUDO", /* 0x4b */ + "COMMIT", /* 0x4a */ + "OPTIMIZED", /* 0x4b */ + "PSEUDO", /* 0x4c */ /* ------------ States ------------- */ - "TRIE_next", /* 0x4c */ - "TRIE_next_fail", /* 0x4d */ - "EVAL_AB", /* 0x4e */ - "EVAL_AB_fail", /* 0x4f */ - "CURLYX_end", /* 0x50 */ - "CURLYX_end_fail", /* 0x51 */ - "WHILEM_A_pre", /* 0x52 */ - "WHILEM_A_pre_fail", /* 0x53 */ - "WHILEM_A_min", /* 0x54 */ - "WHILEM_A_min_fail", /* 0x55 */ - "WHILEM_A_max", /* 0x56 */ - "WHILEM_A_max_fail", /* 0x57 */ - "WHILEM_B_min", /* 0x58 */ - "WHILEM_B_min_fail", /* 0x59 */ - "WHILEM_B_max", /* 0x5a */ - "WHILEM_B_max_fail", /* 0x5b */ - "BRANCH_next", /* 0x5c */ - "BRANCH_next_fail", /* 0x5d */ - "CURLYM_A", /* 0x5e */ - "CURLYM_A_fail", /* 0x5f */ - "CURLYM_B", /* 0x60 */ - "CURLYM_B_fail", /* 0x61 */ - "IFMATCH_A", /* 0x62 */ - "IFMATCH_A_fail", /* 0x63 */ - "CURLY_B_min_known", /* 0x64 */ - "CURLY_B_min_known_fail", /* 0x65 */ - "CURLY_B_min", /* 0x66 */ - "CURLY_B_min_fail", /* 0x67 */ - "CURLY_B_max", /* 0x68 */ - "CURLY_B_max_fail", /* 0x69 */ + "TRIE_next", /* 0x4d */ + "TRIE_next_fail", /* 0x4e */ + "EVAL_AB", /* 0x4f */ + "EVAL_AB_fail", /* 0x50 */ + "CURLYX_end", /* 0x51 */ + "CURLYX_end_fail", /* 0x52 */ + "WHILEM_A_pre", /* 0x53 */ + "WHILEM_A_pre_fail", /* 0x54 */ + "WHILEM_A_min", /* 0x55 */ + "WHILEM_A_min_fail", /* 0x56 */ + "WHILEM_A_max", /* 0x57 */ + "WHILEM_A_max_fail", /* 0x58 */ + "WHILEM_B_min", /* 0x59 */ + "WHILEM_B_min_fail", /* 0x5a */ + "WHILEM_B_max", /* 0x5b */ + "WHILEM_B_max_fail", /* 0x5c */ + "BRANCH_next", /* 0x5d */ + "BRANCH_next_fail", /* 0x5e */ + "CURLYM_A", /* 0x5f */ + "CURLYM_A_fail", /* 0x60 */ + "CURLYM_B", /* 0x61 */ + "CURLYM_B_fail", /* 0x62 */ + "IFMATCH_A", /* 0x63 */ + "IFMATCH_A_fail", /* 0x64 */ + "CURLY_B_min_known", /* 0x65 */ + "CURLY_B_min_known_fail", /* 0x66 */ + "CURLY_B_min", /* 0x67 */ + "CURLY_B_min_fail", /* 0x68 */ + "CURLY_B_max", /* 0x69 */ + "CURLY_B_max_fail", /* 0x6a */ + "COMMIT_next", /* 0x6b */ + "COMMIT_next_fail", /* 0x6c */ }; #endif /* DEBUGGING */ #else diff --git a/t/op/pat.t b/t/op/pat.t index 3cb8e8f..1686234 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3719,8 +3719,14 @@ sub iseq($$;$) { '; ok(!$@,'lvalue $+{...} should not throw an exception'); } - - +{ + our $count = 0; + 'aaab'=~/a+b?(?{$count++})(?FAIL)/; + iseq($count,9,"expect 9 for no (?COMMIT)"); + $count = 0; + 'aaab'=~/a+b?(?COMMIT)(?{$count++})(?FAIL)/; + iseq($count,3,"expect 3 with (?COMMIT)"); +} # stress test CURLYX/WHILEM. # # This test includes varying levels of nesting, and according to @@ -3860,5 +3866,5 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, or print "# Unexpected outcome: should pass or crash perl\n"; # Don't forget to update this! -BEGIN{print "1..1287\n"}; +BEGIN{print "1..1289\n"};