From: Yves Orton Date: Thu, 2 Nov 2006 12:35:10 +0000 (+0100) Subject: Add more backtracking control verbs to regex engine (?CUT), (?ERROR) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=24b23f37fefbcc71a881f6805d87449a234dc645;p=p5sagit%2Fp5-mst-13.2.git Add more backtracking control verbs to regex engine (?CUT), (?ERROR) Message-ID: <9b18b3110611020335h7ea469a8g28ca483f6832816d@mail.gmail.com> p4raw-id: //depot/perl@29189 --- diff --git a/embed.fnc b/embed.fnc index 2d88011..7511a88 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1357,9 +1357,9 @@ Es |U8 |regtail_study |NN struct RExC_state_t *state|NN regnode *p|NN const regn #endif #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT) -ERs |I32 |regmatch |NN const regmatch_info *reginfo|NN regnode *prog +ERs |I32 |regmatch |NN regmatch_info *reginfo|NN regnode *prog ERs |I32 |regrepeat |NN const regexp *prog|NN const regnode *p|I32 max -ERs |I32 |regtry |NN const regmatch_info *reginfo|NN char *startpos +ERs |I32 |regtry |NN regmatch_info *reginfo|NN char **startpos ERs |bool |reginclass |NULLOK const regexp *prog|NN const regnode *n|NN const U8 *p|NULLOK STRLEN *lenp\ |bool do_utf8sv_is_utf8 Es |CHECKPOINT|regcppush |I32 parenfloor @@ -1369,7 +1369,7 @@ ERsn |U8* |reghop3 |NN U8 *pos|I32 off|NN const U8 *lim ERsn |U8* |reghop4 |NN U8 *pos|I32 off|NN const U8 *llim|NN const U8 *rlim #endif ERsn |U8* |reghopmaybe3 |NN U8 *pos|I32 off|NN const U8 *lim -ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN const char *strend|NULLOK const regmatch_info *reginfo +ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN const char *strend|NULLOK regmatch_info *reginfo Es |void |to_utf8_substr |NN regexp * prog Es |void |to_byte_substr |NN regexp * prog ERs |I32 |reg_check_named_buff_matched |NN const regexp *rex|NN const regnode *prog diff --git a/ext/re/re.pm b/ext/re/re.pm index 5c54ae3..ac71f0a 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -95,7 +95,7 @@ Turns on debug output related to the process of parsing the pattern. Enables output related to the optimisation phase of compilation. -=item TRIE_COMPILE +=item TRIEC Detailed info about trie compilation. @@ -103,16 +103,6 @@ Detailed info about trie compilation. Dump the final program out after it is compiled and optimised. -=item OFFSETS - -Dump offset information. This can be used to see how regops correlate -to the pattern. Output format is - - NODENUM:POSITION[LENGTH] - -Where 1 is the position of the first char in the string. Note that position -can be 0, or larger than the actual length of the pattern, likewise length -can be zero. =back @@ -128,7 +118,7 @@ Turns on all execute related debug options. Turns on debugging of the main matching loop. -=item TRIE_EXECUTE +=item TRIEE Extra debugging of how tries execute. @@ -146,12 +136,38 @@ Enable debugging of start point optimisations. Turns on all "extra" debugging options. -=item TRIE_MORE +=item TRIEM + +Enable enhanced TRIE debugging. Enhances both TRIEE +and TRIEC. + +=item STATE + +Enable debugging of states in the engine. + +=item STACK -Enable enhanced TRIE debugging. Enhances both TRIE_EXECUTE -and TRIE_COMPILE. +Enable debugging of the recursion stack in the engine. Enabling +or disabling this option automatically does the same for debugging +states as well. This output from this can be quite large. + +=item OPTIMISEM + +Enable enhanced optimisation debugging and start point optimisations. +Probably not useful except when debugging the regex engine itself. + +=item OFFSETS + +Dump offset information. This can be used to see how regops correlate +to the pattern. Output format is + + NODENUM:POSITION[LENGTH] + +Where 1 is the position of the first char in the string. Note that position +can be 0, or larger than the actual length of the pattern, likewise length +can be zero. -=item OFFSETS_DEBUG +=item OFFSETSDBG Enable debugging of offsets information. This emits copious amounts of trace information and doesn't mesh well with other @@ -182,7 +198,7 @@ Enable DUMP and all execute options. Equivalent to: =item More -Enable TRIE_MORE and all execute compile and execute options. +Enable TRIEM and all execute compile and execute options. =back @@ -239,6 +255,7 @@ my %flags = ( OFFSETSDBG => 0x040000, STATE => 0x080000, OPTIMISEM => 0x100000, + STACK => 0x280000, ); $flags{ALL} = -1; $flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod index 5d7e686..ff8efcd 100644 --- a/pod/perl595delta.pod +++ b/pod/perl595delta.pod @@ -104,6 +104,12 @@ similar to non-greedy matching, except instead of using a '?' as the modifier the '+' is used. Thus C, C<*+>, C<++>, C<{min,max}+> are now legal quantifiers. (Yves Orton) +=item Backtracking control verbs + +The regex engine now supports a number of special purpose backtrack +control verbs: (?COMMIT), (?CUT), (?ERROR) and (?FAIL). See L +for their descriptions. + =back =head2 The C<_> prototype diff --git a/pod/perlre.pod b/pod/perlre.pod index 4e683a3..bce7291 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -1094,6 +1094,48 @@ Any number of C<(?COMMIT)> assertions may be used in a pattern. See also C<< (?>pattern) >> and possessive quantifiers for other ways to control backtracking. +=item C<(?CUT)> +X<(?CUT)> + +This zero-width pattern is similar to C<(?COMMIT)>, except that on +failure it also signifies that whatever text that was matched leading +up to the C<(?CUT)> pattern cannot match, I. + +Compare the following to the examples in C<(?COMMIT)>, note the string +is twice as long: + + 'aaabaaab'=~/a+b?(?CUT)(?{print "$&\n"; $count++})(?FAIL)/; + print "Count=$count\n"; + +outputs + + aaab + aaab + Count=2 + +Once the 'aaab' at the start of the string has matched and the C<(?CUT)> +executed the next startpoint will be where the cursor was when the +C<(?CUT)> was executed. + +=item C<(?ERROR)> +X<(?ERROR)> + +This zero-width pattern is similar to C<(?CUT)> except that it causes +the match to fail outright. No attempts to match will occur again. + + 'aaabaaab'=~/a+b?(?ERROR)(?{print "$&\n"; $count++})(?FAIL)/; + print "Count=$count\n"; + +outputs + + aaab + Count=1 + +In other words, once the C<(?ERROR)> has been entered and then pattern +does not match then the regex engine will not try any further matching at +all on the rest of the string. + =item C<(?(condition)yes-pattern|no-pattern)> X<(?()> diff --git a/proto.h b/proto.h index c6f398a..47f302e 100644 --- a/proto.h +++ b/proto.h @@ -3691,7 +3691,7 @@ STATIC U8 S_regtail_study(pTHX_ struct RExC_state_t *state, regnode *p, const re #endif #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT) -STATIC I32 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) +STATIC I32 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); @@ -3701,7 +3701,7 @@ STATIC I32 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); -STATIC I32 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos) +STATIC I32 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); @@ -3733,7 +3733,7 @@ STATIC U8* S_reghopmaybe3(U8 *pos, I32 off, const U8 *lim) __attribute__nonnull__(1) __attribute__nonnull__(3); -STATIC char* S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char *strend, const regmatch_info *reginfo) +STATIC char* S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char *strend, regmatch_info *reginfo) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) diff --git a/regcomp.c b/regcomp.c index 6938954..1523fc1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4717,7 +4717,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case ':': /* (?:...) */ case '>': /* (?>...) */ break; - case 'C': + case 'C': /* (?CUT) and (?COMMIT) */ if (RExC_parse[0] == 'O' && RExC_parse[1] == 'M' && RExC_parse[2] == 'M' && @@ -4727,12 +4727,34 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) { RExC_parse+=5; ret = reg_node(pRExC_state, COMMIT); + } else if ( + RExC_parse[0] == 'U' && + RExC_parse[1] == 'T' && + RExC_parse[2] == ')') + { + RExC_parse+=2; + ret = reg_node(pRExC_state, CUT); } else { vFAIL("Sequence (?C... not terminated"); } nextchar(pRExC_state); return ret; break; + case 'E': /* (?ERROR) */ + if (RExC_parse[0] == 'R' && + RExC_parse[1] == 'R' && + RExC_parse[2] == 'O' && + RExC_parse[3] == 'R' && + RExC_parse[4] == ')') + { + RExC_parse+=4; + ret = reg_node(pRExC_state, OPERROR); + } else { + vFAIL("Sequence (?E... not terminated"); + } + nextchar(pRExC_state); + return ret; + break; case 'F': if (RExC_parse[0] == 'A' && RExC_parse[1] == 'I' && @@ -8669,7 +8691,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, (dist ? this_trie + dist : next) - start); if (dist) { if (!nextbranch) - nextbranch = this_trie + trie->jump[0]; + nextbranch= this_trie + trie->jump[0]; DUMPUNTIL(this_trie + dist, nextbranch); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) diff --git a/regcomp.h b/regcomp.h index f7082bf..360e2a9 100644 --- a/regcomp.h +++ b/regcomp.h @@ -624,6 +624,8 @@ re.pm, especially to the documentation. #define RE_DEBUG_EXTRA_OFFDEBUG 0x040000 #define RE_DEBUG_EXTRA_STATE 0x080000 #define RE_DEBUG_EXTRA_OPTIMISE 0x100000 +/* combined */ +#define RE_DEBUG_EXTRA_STACK 0x280000 #define RE_DEBUG_FLAG(x) (re_debug_flags & x) /* Compile */ @@ -657,6 +659,8 @@ re.pm, especially to the documentation. if (re_debug_flags & RE_DEBUG_EXTRA_OFFSETS) x ) #define DEBUG_STATE_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_EXTRA_STATE) x ) +#define DEBUG_STACK_r(x) DEBUG_r( \ + if (re_debug_flags & RE_DEBUG_EXTRA_STACK) x ) #define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( \ if ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \ (re_debug_flags & (RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE)) ) x ) diff --git a/regcomp.pl b/regcomp.pl index 2e84604..700268d 100644 --- a/regcomp.pl +++ b/regcomp.pl @@ -48,7 +48,7 @@ while () { $ind++; $name[$ind]="$real$suffix"; $type[$ind]=$type; - $rest[$ind]="Regmatch state for $type"; + $rest[$ind]="state for $type"; } } } @@ -92,13 +92,16 @@ EOP -$width, REGMATCH_STATE_MAX => $tot - 1 ; -$ind = 0; -while (++$ind <= $tot) { + +for ($ind=1; $ind <= $lastregop ; $ind++) { my $oind = $ind - 1; printf OUT "#define\t%*s\t%d\t/* %#04x %s */\n", -$width, $name[$ind], $ind-1, $ind-1, $rest[$ind]; - print OUT "\n\t/* ------------ States ------------- */\n\n" - if $ind == $lastregop and $lastregop != $tot; +} +print OUT "\t/* ------------ States ------------- */\n"; +for ( ; $ind <= $tot ; $ind++) { + printf OUT "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n", + -$width, $name[$ind], $ind - $lastregop, $rest[$ind]; } print OUT <reganch & ROPT_SKIP) == 0; @@ -1155,7 +1155,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* The assignment of 2 is intentional: * for the folded sharp s, the skip is 2. */ (skip = SHARP_S_SKIP))) { - if (tmp && (!reginfo || regtry(reginfo, s))) + if (tmp && (!reginfo || regtry(reginfo, &s))) goto got_it; else tmp = doevery; @@ -1168,7 +1168,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case CANY: REXEC_FBC_SCAN( - if (tmp && (!reginfo || regtry(reginfo, s))) + if (tmp && (!reginfo || regtry(reginfo, &s))) goto got_it; else tmp = doevery; @@ -1302,7 +1302,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } ); } - if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s))) + if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) goto got_it; break; case NBOUNDL: @@ -1338,7 +1338,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, else REXEC_FBC_TRYIT; ); } - if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s))) + if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s))) goto got_it; break; case ALNUM: @@ -1598,7 +1598,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, (UV)accepted_word, s - real_start ); }); - if (!reginfo || regtry(reginfo, s)) { + if (!reginfo || regtry(reginfo, &s)) { FREETMPS; LEAVE; goto got_it; @@ -1639,9 +1639,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /* nosave: For optimizations. */ { dVAR; - register char *s; + /*register*/ char *s; register regnode *c; - register char *startpos = stringarg; + /*register*/ char *startpos = stringarg; I32 minlen; /* must match at least this many chars */ I32 dontbother = 0; /* how many characters not to try at end */ I32 end_shift = 0; /* Same for the end. */ /* CC */ @@ -1744,7 +1744,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) { - if (s == startpos && regtry(®info, startpos)) + if (s == startpos && regtry(®info, &startpos)) goto got_it; else if (multiline || (prog->reganch & ROPT_IMPLICIT) || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */ @@ -1759,7 +1759,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * if (s == startpos) goto after_try; while (1) { - if (regtry(®info, s)) + if (regtry(®info, &s)) goto got_it; after_try: if (s >= end) @@ -1777,7 +1777,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * s--; while (s < end) { if (*s++ == '\n') { /* don't need PL_utf8skip here */ - if (regtry(®info, s)) + if (regtry(®info, &s)) goto got_it; } } @@ -1789,7 +1789,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /* the warning about reginfo.ganch being used without intialization is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN and we only enter this block when the same bit is set. */ - if (regtry(®info, reginfo.ganch)) + if (regtry(®info, ®info.ganch)) goto got_it; goto phooey; } @@ -1810,7 +1810,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * REXEC_FBC_SCAN( if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); - if (regtry(®info, s)) goto got_it; + if (regtry(®info, &s)) goto got_it; s += UTF8SKIP(s); while (s < strend && *s == ch) s += UTF8SKIP(s); @@ -1821,7 +1821,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * REXEC_FBC_SCAN( if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); - if (regtry(®info, s)) goto got_it; + if (regtry(®info, &s)) goto got_it; s++; while (s < strend && *s == ch) s++; @@ -1903,14 +1903,14 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } if (do_utf8) { while (s <= last1) { - if (regtry(®info, s)) + if (regtry(®info, &s)) goto got_it; s += UTF8SKIP(s); } } else { while (s <= last1) { - if (regtry(®info, s)) + if (regtry(®info, &s)) goto got_it; s++; } @@ -2004,7 +2004,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /* We don't know much -- general case. */ if (do_utf8) { for (;;) { - if (regtry(®info, s)) + if (regtry(®info, &s)) goto got_it; if (s >= strend) break; @@ -2013,7 +2013,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } else { do { - if (regtry(®info, s)) + if (regtry(®info, &s)) goto got_it; } while (s++ < strend); } @@ -2082,7 +2082,7 @@ phooey: - regtry - try match at specific point */ STATIC I32 /* 0 failure, 1 success */ -S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos) +S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) { dVAR; register I32 *sp; @@ -2090,6 +2090,7 @@ S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos) CHECKPOINT lastcp; regexp *prog = reginfo->prog; GET_RE_DEBUG_FLAGS_DECL; + reginfo->cutpoint=NULL; if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) { MAGIC *mg; @@ -2161,9 +2162,9 @@ S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos) prog->subbeg = PL_bostr; prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ } - DEBUG_EXECUTE_r(PL_reg_starttry = startpos); - prog->startp[0] = startpos - PL_bostr; - PL_reginput = startpos; + DEBUG_EXECUTE_r(PL_reg_starttry = *startpos); + prog->startp[0] = *startpos - PL_bostr; + PL_reginput = *startpos; PL_reglastparen = &prog->lastparen; PL_reglastcloseparen = &prog->lastcloseparen; prog->lastparen = 0; @@ -2209,6 +2210,8 @@ S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos) prog->endp[0] = PL_reginput - PL_bostr; return 1; } + if (reginfo->cutpoint) + *startpos= reginfo->cutpoint; REGCP_UNWIND(lastcp); return 0; } @@ -2538,7 +2541,7 @@ S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) { } STATIC I32 /* 0 failure, 1 success */ -S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) +S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) { #if PERL_VERSION < 9 dMY_CXT; @@ -2571,7 +2574,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ U32 state_num; - bool no_final = 0; /* if true then we dont backtrack on failure */ + bool no_final = 0; /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop @@ -2592,6 +2595,9 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) GET_RE_DEBUG_FLAGS_DECL; #endif + DEBUG_STACK_r( { + PerlIO_printf(Perl_debug_log,"regmatch start\n"); + }); /* on first ever call to regmatch, allocate first slab */ if (!PL_regmatch_slab) { Newx(PL_regmatch_slab, 1, regmatch_slab); @@ -3414,7 +3420,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) case GOSTART: case GOSUB: /* /(...(?1))/ */ if (cur_eval && cur_eval->locinput==locinput) { - if (cur_eval->u.eval.close_paren == ARG(scan)) + if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) Perl_croak(aTHX_ "Infinite recursion in regex"); if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH ) Perl_croak(aTHX_ @@ -3451,7 +3457,8 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) n = ARG(scan); PL_op = (OP_4tree*)rex->data->data[n]; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); + DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, + " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]); PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr; @@ -4083,7 +4090,7 @@ NULL locinput = PL_reginput; if (cur_eval && cur_eval->u.eval.close_paren && - cur_eval->u.eval.close_paren == ST.me->flags) + cur_eval->u.eval.close_paren == (U32)ST.me->flags) goto fake_end; if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) ) @@ -4095,7 +4102,7 @@ NULL if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ || (cur_eval && cur_eval->u.eval.close_paren && - cur_eval->u.eval.close_paren == ST.me->flags)) + cur_eval->u.eval.close_paren == (U32)ST.me->flags)) sayNO; curlym_do_B: /* execute the B in /A{m,n}B/ */ @@ -4149,7 +4156,7 @@ NULL else PL_regendp[paren] = -1; if (cur_eval && cur_eval->u.eval.close_paren && - cur_eval->u.eval.close_paren == ST.me->flags) + cur_eval->u.eval.close_paren == (U32)ST.me->flags) { if (ST.count) goto fake_end; @@ -4615,7 +4622,17 @@ NULL if (next == scan) next = NULL; break; + case OPERROR: + reginfo->cutpoint=PL_regeol; + goto do_commit; + /* NOTREACHED */ + case CUT: + if ( locinput > reginfo->bol ) + reginfo->cutpoint = HOPBACKc(locinput, 1); + /* FALLTHROUGH */ case COMMIT: + do_commit: + PL_reginput = locinput; PUSH_STATE_GOTO(COMMIT_next,next); /* NOTREACHED */ case COMMIT_next_fail: @@ -4643,7 +4660,27 @@ NULL { regmatch_state *newst; - DEBUG_STATE_pp("push"); + DEBUG_STACK_r({ + regmatch_state *cur = st; + regmatch_state *curyes = yes_state; + int curd = depth; + regmatch_slab *slab = PL_regmatch_slab; + for (;curd > -1;cur--,curd--) { + if (cur < SLAB_FIRST(slab)) { + slab = slab->prev; + cur = SLAB_LAST(slab); + } + PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n", + REPORT_CODE_OFF + 2 + depth * 2,"", + curd, reg_name[cur->resume_state], + (curyes == cur) ? "yes" : "" + ); + if (curyes == cur) + curyes = cur->u.yes.prev_yes_state; + } + } else + DEBUG_STATE_pp("push") + ); depth++; st->locinput = locinput; newst = st+1; @@ -4702,6 +4739,7 @@ yes: st = yes_state; yes_state = st->u.yes.prev_yes_state; PL_regmatch_state = st; + state_num = st->resume_state + no_final; goto reenter_switch; diff --git a/regexp.h b/regexp.h index 89fcea7..f13a5c5 100644 --- a/regexp.h +++ b/regexp.h @@ -202,6 +202,7 @@ typedef struct { char *till; SV *sv; char *ganch; + char *cutpoint; } regmatch_info; diff --git a/regnodes.h b/regnodes.h index c42fcf8..010b943 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 76 -#define REGMATCH_STATE_MAX 108 +#define REGNODE_MAX 78 +#define REGMATCH_STATE_MAX 110 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -84,43 +84,43 @@ #define DEFINEP 72 /* 0x48 Never execute directly. */ #define OPFAIL 73 /* 0x49 Same as (?!) */ #define COMMIT 74 /* 0x4a Pattern fails if backtracking through this */ -#define OPTIMIZED 75 /* 0x4b Placeholder for dump. */ -#define PSEUDO 76 /* 0x4c Pseudo opcode for internal use. */ - +#define CUT 75 /* 0x4b ... and restarts at the cursor point */ +#define OPERROR 76 /* 0x4c Pattern fails outright if backtracking through this */ +#define OPTIMIZED 77 /* 0x4d Placeholder for dump. */ +#define PSEUDO 78 /* 0x4e Pseudo opcode for internal use. */ /* ------------ States ------------- */ - -#define TRIE_next 77 /* 0x4d Regmatch state for TRIE */ -#define TRIE_next_fail 78 /* 0x4e Regmatch state for TRIE */ -#define EVAL_AB 79 /* 0x4f Regmatch state for EVAL */ -#define EVAL_AB_fail 80 /* 0x50 Regmatch state for EVAL */ -#define CURLYX_end 81 /* 0x51 Regmatch state for CURLYX */ -#define CURLYX_end_fail 82 /* 0x52 Regmatch state for CURLYX */ -#define WHILEM_A_pre 83 /* 0x53 Regmatch state for WHILEM */ -#define WHILEM_A_pre_fail 84 /* 0x54 Regmatch state for WHILEM */ -#define WHILEM_A_min 85 /* 0x55 Regmatch state for WHILEM */ -#define WHILEM_A_min_fail 86 /* 0x56 Regmatch state for WHILEM */ -#define WHILEM_A_max 87 /* 0x57 Regmatch state for WHILEM */ -#define WHILEM_A_max_fail 88 /* 0x58 Regmatch state for WHILEM */ -#define WHILEM_B_min 89 /* 0x59 Regmatch state for WHILEM */ -#define WHILEM_B_min_fail 90 /* 0x5a Regmatch state for WHILEM */ -#define WHILEM_B_max 91 /* 0x5b Regmatch state for WHILEM */ -#define WHILEM_B_max_fail 92 /* 0x5c Regmatch state for WHILEM */ -#define BRANCH_next 93 /* 0x5d Regmatch state for BRANCH */ -#define BRANCH_next_fail 94 /* 0x5e Regmatch state for BRANCH */ -#define CURLYM_A 95 /* 0x5f Regmatch state for CURLYM */ -#define CURLYM_A_fail 96 /* 0x60 Regmatch state for CURLYM */ -#define CURLYM_B 97 /* 0x61 Regmatch state for CURLYM */ -#define CURLYM_B_fail 98 /* 0x62 Regmatch state for CURLYM */ -#define IFMATCH_A 99 /* 0x63 Regmatch state for IFMATCH */ -#define IFMATCH_A_fail 100 /* 0x64 Regmatch state for IFMATCH */ -#define CURLY_B_min_known 101 /* 0x65 Regmatch state for CURLY */ -#define CURLY_B_min_known_fail 102 /* 0x66 Regmatch state for CURLY */ -#define CURLY_B_min 103 /* 0x67 Regmatch state for CURLY */ -#define CURLY_B_min_fail 104 /* 0x68 Regmatch state for CURLY */ -#define CURLY_B_max 105 /* 0x69 Regmatch state for CURLY */ -#define CURLY_B_max_fail 106 /* 0x6a Regmatch state for CURLY */ -#define COMMIT_next 107 /* 0x6b Regmatch state for COMMIT */ -#define COMMIT_next_fail 108 /* 0x6c Regmatch state for COMMIT */ +#define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ +#define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ +#define EVAL_AB (REGNODE_MAX + 3) /* state for EVAL */ +#define EVAL_AB_fail (REGNODE_MAX + 4) /* state for EVAL */ +#define CURLYX_end (REGNODE_MAX + 5) /* state for CURLYX */ +#define CURLYX_end_fail (REGNODE_MAX + 6) /* state for CURLYX */ +#define WHILEM_A_pre (REGNODE_MAX + 7) /* state for WHILEM */ +#define WHILEM_A_pre_fail (REGNODE_MAX + 8) /* state for WHILEM */ +#define WHILEM_A_min (REGNODE_MAX + 9) /* state for WHILEM */ +#define WHILEM_A_min_fail (REGNODE_MAX + 10) /* state for WHILEM */ +#define WHILEM_A_max (REGNODE_MAX + 11) /* state for WHILEM */ +#define WHILEM_A_max_fail (REGNODE_MAX + 12) /* state for WHILEM */ +#define WHILEM_B_min (REGNODE_MAX + 13) /* state for WHILEM */ +#define WHILEM_B_min_fail (REGNODE_MAX + 14) /* state for WHILEM */ +#define WHILEM_B_max (REGNODE_MAX + 15) /* state for WHILEM */ +#define WHILEM_B_max_fail (REGNODE_MAX + 16) /* state for WHILEM */ +#define BRANCH_next (REGNODE_MAX + 17) /* state for BRANCH */ +#define BRANCH_next_fail (REGNODE_MAX + 18) /* state for BRANCH */ +#define CURLYM_A (REGNODE_MAX + 19) /* state for CURLYM */ +#define CURLYM_A_fail (REGNODE_MAX + 20) /* state for CURLYM */ +#define CURLYM_B (REGNODE_MAX + 21) /* state for CURLYM */ +#define CURLYM_B_fail (REGNODE_MAX + 22) /* state for CURLYM */ +#define IFMATCH_A (REGNODE_MAX + 23) /* state for IFMATCH */ +#define IFMATCH_A_fail (REGNODE_MAX + 24) /* state for IFMATCH */ +#define CURLY_B_min_known (REGNODE_MAX + 25) /* state for CURLY */ +#define CURLY_B_min_known_fail (REGNODE_MAX + 26) /* state for CURLY */ +#define CURLY_B_min (REGNODE_MAX + 27) /* state for CURLY */ +#define CURLY_B_min_fail (REGNODE_MAX + 28) /* state for CURLY */ +#define CURLY_B_max (REGNODE_MAX + 29) /* state for CURLY */ +#define CURLY_B_max_fail (REGNODE_MAX + 30) /* state for CURLY */ +#define COMMIT_next (REGNODE_MAX + 31) /* state for COMMIT */ +#define COMMIT_next_fail (REGNODE_MAX + 32) /* state for COMMIT */ /* PL_regkind[] What type of regop or state is this. */ @@ -203,6 +203,8 @@ EXTCONST U8 PL_regkind[] = { DEFINEP, /* DEFINEP */ OPFAIL, /* OPFAIL */ COMMIT, /* COMMIT */ + COMMIT, /* CUT */ + OPERROR, /* OPERROR */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -320,6 +322,8 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* DEFINEP */ 0, /* OPFAIL */ 0, /* COMMIT */ + 0, /* CUT */ + 0, /* OPERROR */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -402,6 +406,8 @@ static const char reg_off_by_arg[] = { 0, /* DEFINEP */ 0, /* OPFAIL */ 0, /* COMMIT */ + 0, /* CUT */ + 0, /* OPERROR */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -485,41 +491,43 @@ const char * reg_name[] = { "DEFINEP", /* 0x48 */ "OPFAIL", /* 0x49 */ "COMMIT", /* 0x4a */ - "OPTIMIZED", /* 0x4b */ - "PSEUDO", /* 0x4c */ + "CUT", /* 0x4b */ + "OPERROR", /* 0x4c */ + "OPTIMIZED", /* 0x4d */ + "PSEUDO", /* 0x4e */ /* ------------ States ------------- */ - "TRIE_next", /* 0x4d */ - "TRIE_next_fail", /* 0x4e */ - "EVAL_AB", /* 0x4f */ - "EVAL_AB_fail", /* 0x50 */ - "CURLYX_end", /* 0x51 */ - "CURLYX_end_fail", /* 0x52 */ - "WHILEM_A_pre", /* 0x53 */ - "WHILEM_A_pre_fail", /* 0x54 */ - "WHILEM_A_min", /* 0x55 */ - "WHILEM_A_min_fail", /* 0x56 */ - "WHILEM_A_max", /* 0x57 */ - "WHILEM_A_max_fail", /* 0x58 */ - "WHILEM_B_min", /* 0x59 */ - "WHILEM_B_min_fail", /* 0x5a */ - "WHILEM_B_max", /* 0x5b */ - "WHILEM_B_max_fail", /* 0x5c */ - "BRANCH_next", /* 0x5d */ - "BRANCH_next_fail", /* 0x5e */ - "CURLYM_A", /* 0x5f */ - "CURLYM_A_fail", /* 0x60 */ - "CURLYM_B", /* 0x61 */ - "CURLYM_B_fail", /* 0x62 */ - "IFMATCH_A", /* 0x63 */ - "IFMATCH_A_fail", /* 0x64 */ - "CURLY_B_min_known", /* 0x65 */ - "CURLY_B_min_known_fail", /* 0x66 */ - "CURLY_B_min", /* 0x67 */ - "CURLY_B_min_fail", /* 0x68 */ - "CURLY_B_max", /* 0x69 */ - "CURLY_B_max_fail", /* 0x6a */ - "COMMIT_next", /* 0x6b */ - "COMMIT_next_fail", /* 0x6c */ + "TRIE_next", /* REGNODE_MAX +0x01 */ + "TRIE_next_fail", /* REGNODE_MAX +0x02 */ + "EVAL_AB", /* REGNODE_MAX +0x03 */ + "EVAL_AB_fail", /* REGNODE_MAX +0x04 */ + "CURLYX_end", /* REGNODE_MAX +0x05 */ + "CURLYX_end_fail", /* REGNODE_MAX +0x06 */ + "WHILEM_A_pre", /* REGNODE_MAX +0x07 */ + "WHILEM_A_pre_fail", /* REGNODE_MAX +0x08 */ + "WHILEM_A_min", /* REGNODE_MAX +0x09 */ + "WHILEM_A_min_fail", /* REGNODE_MAX +0x0a */ + "WHILEM_A_max", /* REGNODE_MAX +0x0b */ + "WHILEM_A_max_fail", /* REGNODE_MAX +0x0c */ + "WHILEM_B_min", /* REGNODE_MAX +0x0d */ + "WHILEM_B_min_fail", /* REGNODE_MAX +0x0e */ + "WHILEM_B_max", /* REGNODE_MAX +0x0f */ + "WHILEM_B_max_fail", /* REGNODE_MAX +0x10 */ + "BRANCH_next", /* REGNODE_MAX +0x11 */ + "BRANCH_next_fail", /* REGNODE_MAX +0x12 */ + "CURLYM_A", /* REGNODE_MAX +0x13 */ + "CURLYM_A_fail", /* REGNODE_MAX +0x14 */ + "CURLYM_B", /* REGNODE_MAX +0x15 */ + "CURLYM_B_fail", /* REGNODE_MAX +0x16 */ + "IFMATCH_A", /* REGNODE_MAX +0x17 */ + "IFMATCH_A_fail", /* REGNODE_MAX +0x18 */ + "CURLY_B_min_known", /* REGNODE_MAX +0x19 */ + "CURLY_B_min_known_fail", /* REGNODE_MAX +0x1a */ + "CURLY_B_min", /* REGNODE_MAX +0x1b */ + "CURLY_B_min_fail", /* REGNODE_MAX +0x1c */ + "CURLY_B_max", /* REGNODE_MAX +0x1d */ + "CURLY_B_max_fail", /* REGNODE_MAX +0x1e */ + "COMMIT_next", /* REGNODE_MAX +0x1f */ + "COMMIT_next_fail", /* REGNODE_MAX +0x20 */ }; #endif /* DEBUGGING */ #else diff --git a/t/op/pat.t b/t/op/pat.t index 1686234..67be900 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3719,14 +3719,7 @@ sub iseq($$;$) { '; ok(!$@,'lvalue $+{...} should not throw an exception'); } -{ - our $count = 0; - 'aaab'=~/a+b?(?{$count++})(?FAIL)/; - iseq($count,9,"expect 9 for no (?COMMIT)"); - $count = 0; - 'aaab'=~/a+b?(?COMMIT)(?{$count++})(?FAIL)/; - iseq($count,3,"expect 3 with (?COMMIT)"); -} + # stress test CURLYX/WHILEM. # # This test includes varying levels of nesting, and according to @@ -3734,7 +3727,9 @@ sub iseq($$;$) { # CURLYX and WHILEM blocks, except those related to LONGJMP, the # super-linear cache and warnings. It executes about 0.5M regexes -{ +if ($ENV{PERL_SKIP_PSYCHO_TEST}){ + printf "ok %d Skip: No psycho tests\n", $test++; +} else { my $r = qr/^ (?: ( (?:a|z+)+ ) @@ -3856,6 +3851,57 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { iseq($count,1,"should have matched once only [RT#36046]"); } +{ # Test the (?COMMIT) pattern + our $count = 0; + 'aaab'=~/a+b?(?{$count++})(?FAIL)/; + iseq($count,9,"expect 9 for no (?COMMIT)"); + $count = 0; + 'aaab'=~/a+b?(?COMMIT)(?{$count++})(?FAIL)/; + iseq($count,3,"expect 3 with (?COMMIT)"); + local $_='aaab'; + $count=0; + 1 while /.(?COMMIT)(?{$count++})(?FAIL)/g; + iseq($count,4,"/.(?COMMIT)/"); + $count = 0; + 'aaab'=~/a+b?(??{'(?COMMIT)'})(?{$count++})(?FAIL)/; + iseq($count,3,"expect 3 with (?COMMIT)"); + local $_='aaab'; + $count=0; + 1 while /.(??{'(?COMMIT)'})(?{$count++})(?FAIL)/g; + iseq($count,4,"/.(?COMMIT)/"); +} +{ # Test the (?CUT) pattern + our $count = 0; + 'aaab'=~/a+b?(?CUT)(?{$count++})(?FAIL)/; + iseq($count,1,"expect 1 with (?CUT)"); + local $_='aaab'; + $count=0; + 1 while /.(?CUT)(?{$count++})(?FAIL)/g; + iseq($count,4,"/.(?CUT)/"); + $_='aaabaaab'; + $count=0; + our @res=(); + 1 while /(a+b?)(?CUT)(?{$count++; push @res,$1})(?FAIL)/g; + iseq($count,2,"Expect 2 with (?CUT)" ); + iseq("@res","aaab aaab","adjacent (?CUT) works as expected" ); +} +{ # Test the (?ERROR) pattern + our $count = 0; + 'aaabaaab'=~/a+b?(?ERROR)(?{$count++})(?FAIL)/; + iseq($count,1,"expect 1 with (?ERROR)"); + local $_='aaab'; + $count=0; + 1 while /.(?ERROR)(?{$count++})(?FAIL)/g; + iseq($count,1,"/.(?ERROR)/"); + $_='aaabaaab'; + $count=0; + our @res=(); + 1 while /(a+b?)(?ERROR)(?{$count++; push @res,$1})(?FAIL)/g; + iseq($count,1,"Expect 1 with (?ERROR)" ); + iseq("@res","aaab","adjacent (?ERROR) works as expected" ); +} +#------------------------------------------------------------------- + # Keep the following tests last -- they may crash perl ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274") @@ -3865,6 +3911,8 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, "Regexp /^(??{'(.)'x 100})/ crashes older perls") or print "# Unexpected outcome: should pass or crash perl\n"; +# Put new tests above the line, not here. + # Don't forget to update this! -BEGIN{print "1..1289\n"}; +BEGIN{print "1..1300\n"};