From: Yves Orton Date: Mon, 6 Nov 2006 13:06:28 +0000 (+0100) Subject: New regex syntax omnibus X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e2e6a0f1870d05ddb1ce18fd8556b71330dc694c;p=p5sagit%2Fp5-mst-13.2.git New regex syntax omnibus Message-ID: <9b18b3110611060406u2fa1572as57073949a5df9e62@mail.gmail.com> Plus a portability fix (in string comparison for regex verbs) and doc tweaks / podchecker fixes p4raw-id: //depot/perl@29222 --- diff --git a/embed.fnc b/embed.fnc index 350b433..a3251a0 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1359,7 +1359,7 @@ Es |U8 |regtail_study |NN struct RExC_state_t *state|NN regnode *p|NN const regn #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT) ERs |I32 |regmatch |NN regmatch_info *reginfo|NN regnode *prog -ERs |I32 |regrepeat |NN const regexp *prog|NN const regnode *p|I32 max +ERs |I32 |regrepeat |NN const regexp *prog|NN const regnode *p|I32 max|int depth ERs |I32 |regtry |NN regmatch_info *reginfo|NN char **startpos ERs |bool |reginclass |NULLOK const regexp *prog|NN const regnode *n|NN const U8 *p|NULLOK STRLEN *lenp\ |bool do_utf8sv_is_utf8 diff --git a/embed.h b/embed.h index 22595d5..fea5b27 100644 --- a/embed.h +++ b/embed.h @@ -3555,7 +3555,7 @@ #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT) #if defined(PERL_CORE) || defined(PERL_EXT) #define regmatch(a,b) S_regmatch(aTHX_ a,b) -#define regrepeat(a,b,c) S_regrepeat(aTHX_ a,b,c) +#define regrepeat(a,b,c,d) S_regrepeat(aTHX_ a,b,c,d) #define regtry(a,b) S_regtry(aTHX_ a,b) #define reginclass(a,b,c,d,e) S_reginclass(aTHX_ a,b,c,d,e) #define regcppush(a) S_regcppush(aTHX_ a) diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t index 1ccf8b3..f586d22 100644 --- a/ext/re/t/regop.t +++ b/ext/re/t/regop.t @@ -252,7 +252,7 @@ Matching stclass EXACTF <.> against ".exe" #Guessed: match at offset 0 #%MATCHED% #Freeing REx: "[q]" -Got 100 bytes for offset annotations. -Offsets: [12] +Got 108 bytes for offset annotations. +Offsets: [13] 1:1[3] 3:4[0] %MATCHED% diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod index ff8efcd..a7e3b40 100644 --- a/pod/perl595delta.pod +++ b/pod/perl595delta.pod @@ -107,8 +107,8 @@ quantifiers. (Yves Orton) =item Backtracking control verbs The regex engine now supports a number of special purpose backtrack -control verbs: (?COMMIT), (?CUT), (?ERROR) and (?FAIL). See L -for their descriptions. +control verbs: (*COMMIT), (*MARK), (*CUT), (*ERROR), (*FAIL) and +(*ACCEPT). See L for their descriptions. =back diff --git a/pod/perldiag.pod b/pod/perldiag.pod index c20b060..e9d2326 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -4291,6 +4291,13 @@ category that is unknown to perl at this point. Note that if you want to enable a warnings category registered by a module (e.g. C), you must have imported this module + +=item Unknown verb pattern '%s' in regex; marked by <-- HERE in m/%s/ + +(F) You either made a typo or have incorrectly put a C<*> quantifier +after an open brace in your pattern. Check the pattern and review +L for details on legal verb patterns. + first. =item unmatched [ in regex; marked by <-- HERE in m/%s/ @@ -4412,6 +4419,17 @@ character to get your parentheses to balance. See L. compressed integer format and could not be converted to an integer. See L. +=item Unterminated verb pattern in regex; marked by <-- HERE in m/%s/ + +(F) You used a pattern of the form C<(*VERB)> but did not terminate +the pattern with a C<)>. Fix the pattern and retry. + +=item Unterminated verb pattern argument in regex; marked by <-- HERE in m/%s/ + +(F) You used a pattern of the form C<(*VERB:ARG)> but did not terminate +the pattern with a C<)>. Fix the pattern and retry. + + =item Unterminated <> operator (F) The lexer saw a left angle bracket in a place where it was expecting @@ -4807,6 +4825,16 @@ anonymous, using the C syntax. When inner anonymous subs that reference variables in outer subroutines are created, they are automatically rebound to the current values of such variables. +=item Verb pattern '%s' has a mandatory argument in regex; marked by <-- HERE in m/%s/ + +(F) You used a verb pattern that requires an argument. Supply an argument +or check that you are using the right verb. + +=item Verb pattern '%s' may not have an argument in regex; marked by <-- HERE in m/%s/ + +(F) You used a verb pattern that is not allowed an argument. Remove the +argument or check that you are using the right verb. + =item Version number must be a constant number (P) The attempt to translate a C statement into diff --git a/pod/perlre.pod b/pod/perlre.pod index bce7291..45e41e5 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -933,14 +933,100 @@ the same name, then it recurses to the leftmost. It is an error to refer to a name that is not declared somewhere in the pattern. -=item C<(?FAIL)> C<(?F)> -X<(?FAIL)> X<(?F)> +=item C<(?(condition)yes-pattern|no-pattern)> +X<(?()> -This pattern matches nothing and always fails. It can be used to force the -engine to backtrack. It is equivalent to C<(?!)>, but easier to read. In -fact, C<(?!)> gets optimised into C<(?FAIL)> internally. +=item C<(?(condition)yes-pattern)> -It is probably useful only when combined with C<(?{})> or C<(??{})>. +Conditional expression. C<(condition)> should be either an integer in +parentheses (which is valid if the corresponding pair of parentheses +matched), a look-ahead/look-behind/evaluate zero-width assertion, a +name in angle brackets or single quotes (which is valid if a buffer +with the given name matched), or the special symbol (R) (true when +evaluated inside of recursion or eval). Additionally the R may be +followed by a number, (which will be true when evaluated when recursing +inside of the appropriate group), or by C<&NAME>, in which case it will +be true only when evaluated during recursion in the named group. + +Here's a summary of the possible predicates: + +=over 4 + +=item (1) (2) ... + +Checks if the numbered capturing buffer has matched something. + +=item () ('NAME') + +Checks if a buffer with the given name has matched something. + +=item (?{ CODE }) + +Treats the code block as the condition. + +=item (R) + +Checks if the expression has been evaluated inside of recursion. + +=item (R1) (R2) ... + +Checks if the expression has been evaluated while executing directly +inside of the n-th capture group. This check is the regex equivalent of + + if ((caller(0))[3] eq 'subname') { ... } + +In other words, it does not check the full recursion stack. + +=item (R&NAME) + +Similar to C<(R1)>, this predicate checks to see if we're executing +directly inside of the leftmost group with a given name (this is the same +logic used by C<(?&NAME)> to disambiguate). It does not check the full +stack, but only the name of the innermost active recursion. + +=item (DEFINE) + +In this case, the yes-pattern is never directly executed, and no +no-pattern is allowed. Similar in spirit to C<(?{0})> but more efficient. +See below for details. + +=back + +For example: + + m{ ( \( )? + [^()]+ + (?(1) \) ) + }x + +matches a chunk of non-parentheses, possibly included in parentheses +themselves. + +A special form is the C<(DEFINE)> predicate, which never executes directly +its yes-pattern, and does not allow a no-pattern. This allows to define +subpatterns which will be executed only by using the recursion mechanism. +This way, you can define a set of regular expression rules that can be +bundled into any pattern you choose. + +It is recommended that for this usage you put the DEFINE block at the +end of the pattern, and that you name any subpatterns defined within it. + +Also, it's worth noting that patterns defined this way probably will +not be as efficient, as the optimiser is not very clever about +handling them. + +An example of how this might be used is as follows: + + /(?(&NAME_PAT))(?(&ADDRESS_PAT)) + (?(DEFINE) + (....) + (....) + )/x + +Note that capture buffers matched inside of recursion are not accessible +after the recursion returns, so the extra layer of capturing buffers are +necessary. Thus C<$+{NAME_PAT}> would not be defined even though +C<$+{NAME}> would be. =item C<< (?>pattern) >> X X X X @@ -973,12 +1059,12 @@ in the rest of a regular expression.) Consider this pattern: m{ \( - ( - [^()]+ # x+ - | + ( + [^()]+ # x+ + | \( [^()]* \) )+ - \) + \) }x That will efficiently match a nonempty group with matching parentheses @@ -992,13 +1078,13 @@ seconds, but that each extra letter doubles this time. This exponential performance will make it appear that your program has hung. However, a tiny change to this pattern - m{ \( - ( - (?> [^()]+ ) # change x+ above to (?> x+ ) - | + m{ \( + ( + (?> [^()]+ ) # change x+ above to (?> x+ ) + | \( [^()]* \) )+ - \) + \) }x which uses C<< (?>...) >> matches exactly when the one above does (verifying @@ -1046,13 +1132,50 @@ to inside of one of these constructs. The following equivalences apply: PAT?+ (?>PAT?) PAT{min,max}+ (?>PAT{min,max}) -=item C<(?COMMIT)> -X<(?COMMIT)> +=back + +=head2 Special Backtracking Control Verbs + +B These patterns are experimental and subject to change or +removal in a future version of perl. Their usage in production code should +be noted to avoid problems during upgrades. + +These special patterns are generally of the form C<(*VERB:ARG)>. Unless +otherwise stated the ARG argument is optional; in some cases, it is +forbidden. + +Any pattern containing a special backtracking verb that allows an argument +has the special behaviour that when executed it sets the current packages' +C<$REGERROR> variable. In this case, the following rules apply: + +On failure, this variable will be set to the ARG value of the verb +pattern, if the verb was involved in the failure of the match. If the ARG +part of the pattern was omitted, then C<$REGERROR> will be set to TRUE. + +On a successful match this variable will be set to FALSE. + +B C<$REGERROR> is not a magic variable in the same sense than +C<$1> and most other regex related variables. It is not local to a +scope, nor readonly but instead a volatile package variable similar to +C<$AUTOLOAD>. Use C to localize changes to it to a specific scope +if necessary. + +If a pattern does not contain a special backtracking verb that allows an +argument, then C<$REGERROR> is not touched at all. + +=over 4 + +=item Verbs that take an argument + +=over 4 + +=item C<(*NOMATCH)> C<(*NOMATCH:NAME)> +X<(*NOMATCH)> X<(*NOMATCH:NAME)> This zero-width pattern commits the match at the current point, preventing -the engine from back-tracking on failure to the left of the commit point. -Consider the pattern C, where A and B are complex patterns. -Until the C<(?COMMIT)> is reached, A may backtrack as necessary to match. +the engine from backtracking on failure to the left of the this point. +Consider the pattern C, where A and B are complex patterns. +Until the C<(*NOMATCH)> is reached, A may backtrack as necessary to match. Once it is reached, matching continues in B, which may also backtrack as necessary; however, should B not match, then no further backtracking will take place, and the pattern will fail outright at that starting position. @@ -1060,7 +1183,7 @@ take place, and the pattern will fail outright at that starting position. The following example counts all the possible matching strings in a pattern (without actually matching any of them). - 'aaab'=~/a+b?(?{print "$&\n"; $count++})(?FAIL)/; + 'aaab' =~ /a+b?(?{print "$&\n"; $count++})(*FAIL)/; print "Count=$count\n"; which produces: @@ -1076,9 +1199,9 @@ which produces: a Count=9 -If we add a C<(?COMMIT)> before the count like the following +If we add a C<(*NOMATCH)> before the count like the following - 'aaab'=~/a+b?(?COMMIT)(?{print "$&\n"; $count++})(?FAIL)/; + 'aaab' =~ /a+b?(*NOMATCH)(?{print "$&\n"; $count++})(*FAIL)/; print "Count=$count\n"; we prevent backtracking and find the count of the longest matching @@ -1089,23 +1212,47 @@ at each matching startpoint like so: ab Count=3 -Any number of C<(?COMMIT)> assertions may be used in a pattern. +Any number of C<(*NOMATCH)> assertions may be used in a pattern. See also C<< (?>pattern) >> and possessive quantifiers for other ways to control backtracking. -=item C<(?CUT)> -X<(?CUT)> - -This zero-width pattern is similar to C<(?COMMIT)>, except that on -failure it also signifies that whatever text that was matched leading -up to the C<(?CUT)> pattern cannot match, I. - -Compare the following to the examples in C<(?COMMIT)>, note the string +=item C<(*MARK)> C<(*MARK:NAME)> +X<(*MARK)> + +This zero-width pattern can be used to mark the point in a string +reached when a certain part of the pattern has been successfully +matched. This mark may be given a name. A later C<(*CUT)> pattern +will then cut at that point if backtracked into on failure. Any +number of (*MARK) patterns are allowed, and the NAME portion is +optional and may be duplicated. + +See C<*CUT> for more detail. + +=item C<(*CUT)> C<(*CUT:NAME)> +X<(*CUT)> + +This zero-width pattern is similar to C<(*NOMATCH)>, except that on +failure it also signifies that whatever text that was matched leading up +to the C<(*CUT)> pattern being executed cannot be part of a match, I. This effectively means that the regex +engine moves forward to this position on failure and tries to match +again, (assuming that there is sufficient room to match). + +The name of the C<(*CUT:NAME)> pattern has special significance. If a +C<(*MARK:NAME)> was encountered while matching, then it is the position +where that pattern was executed that is used for the "cut point" in the +string. If no mark of that name was encountered, then the cut is done at +the point where the C<(*CUT)> was. Similarly if no NAME is specified in +the C<(*CUT)>, and if a C<(*MARK)> with any name (or none) is encountered, +then that C<(*MARK)>'s cursor point will be used. If the C<(*CUT)> is not +preceded by a C<(*MARK)>, then the cut point is where the string was when +the C<(*CUT)> was encountered. + +Compare the following to the examples in C<(*NOMATCH)>, note the string is twice as long: - 'aaabaaab'=~/a+b?(?CUT)(?{print "$&\n"; $count++})(?FAIL)/; + 'aaabaaab' =~ /a+b?(*CUT)(?{print "$&\n"; $count++})(*FAIL)/; print "Count=$count\n"; outputs @@ -1114,17 +1261,17 @@ outputs aaab Count=2 -Once the 'aaab' at the start of the string has matched and the C<(?CUT)> -executed the next startpoint will be where the cursor was when the -C<(?CUT)> was executed. +Once the 'aaab' at the start of the string has matched, and the C<(*CUT)> +executed, the next startpoint will be where the cursor was when the +C<(*CUT)> was executed. -=item C<(?ERROR)> -X<(?ERROR)> +=item C<(*COMMIT)> +X<(*COMMIT)> -This zero-width pattern is similar to C<(?CUT)> except that it causes +This zero-width pattern is similar to C<(*CUT)> except that it causes the match to fail outright. No attempts to match will occur again. - 'aaabaaab'=~/a+b?(?ERROR)(?{print "$&\n"; $count++})(?FAIL)/; + 'aaabaaab' =~ /a+b?(*COMMIT)(?{print "$&\n"; $count++})(*FAIL)/; print "Count=$count\n"; outputs @@ -1132,105 +1279,49 @@ outputs aaab Count=1 -In other words, once the C<(?ERROR)> has been entered and then pattern -does not match then the regex engine will not try any further matching at -all on the rest of the string. - -=item C<(?(condition)yes-pattern|no-pattern)> -X<(?()> - -=item C<(?(condition)yes-pattern)> +In other words, once the C<(*COMMIT)> has been entered, and if the pattern +does not match, the regex engine will not try any further matching on the +rest of the string. -Conditional expression. C<(condition)> should be either an integer in -parentheses (which is valid if the corresponding pair of parentheses -matched), a look-ahead/look-behind/evaluate zero-width assertion, a -name in angle brackets or single quotes (which is valid if a buffer -with the given name matched), the special symbol (R) (true when -evaluated inside of recursion or eval). Additionally the R may be -followed by a number, (which will be true when evaluated when recursing -inside of the appropriate group), or by C<&NAME> in which case it will -be true only when evaluated during recursion in the named group. +=back -Here's a summary of the possible predicates: +=item Verbs without an argument =over 4 -=item (1) (2) ... - -Checks if the numbered capturing buffer has matched something. - -=item () ('NAME') +=item C<(*FAIL)> C<(*F)> +X<(*FAIL)> X<(*F)> -Checks if a buffer with the given name has matched something. - -=item (?{ CODE }) - -Treats the code block as the condition - -=item (R) - -Checks if the expression has been evaluated inside of recursion. - -=item (R1) (R2) ... +This pattern matches nothing and always fails. It can be used to force the +engine to backtrack. It is equivalent to C<(?!)>, but easier to read. In +fact, C<(?!)> gets optimised into C<(*FAIL)> internally. -Checks if the expression has been evaluated while executing directly -inside of the n-th capture group. This check is the regex equivalent of +It is probably useful only when combined with C<(?{})> or C<(??{})>. - if ((caller(0))[3] eq 'subname') { .. } +=item C<(*ACCEPT)> +X<(*ACCEPT)> -In other words, it does not check the full recursion stack. +B This feature is highly experimental. It is not recommended +for production code. -=item (R&NAME) +This pattern matches nothing and causes the end of successful matching at +the point at which the C<(*ACCEPT)> pattern was encountered, regardless of +whether there is actually more to match in the string. When inside of a +nested pattern, such as recursion or a dynamically generated subbpattern +via C<(??{})>, only the innermost pattern is ended immediately. -Similar to C<(R1)>, this predicate checks to see if we're executing -directly inside of the leftmost group with a given name (this is the same -logic used by C<(?&NAME)> to disambiguate). It does not check the full -stack, but only the name of the innermost active recursion. +If the C<(*ACCEPT)> is inside of capturing buffers then the buffers are +marked as ended at the point at which the C<(*ACCEPT)> was encountered. +For instance: -=item (DEFINE) + 'AB' =~ /(A (A|B(*ACCEPT)|C) D)(E)/x; -In this case, the yes-pattern is never directly executed, and no -no-pattern is allowed. Similar in spirit to C<(?{0})> but more efficient. -See below for details. +will match, and C<$1> will be C and C<$2> will be C, C<$3> will not +be set. If another branch in the inner parens were matched, such as in the +string 'ACDE', then the C and C would have to be matched as well. =back -For example: - - m{ ( \( )? - [^()]+ - (?(1) \) ) - }x - -matches a chunk of non-parentheses, possibly included in parentheses -themselves. - -A special form is the C<(DEFINE)> predicate, which never executes directly -its yes-pattern, and does not allow a no-pattern. This allows to define -subpatterns which will be executed only by using the recursion mechanism. -This way, you can define a set of regular expression rules that can be -bundled into any pattern you choose. - -It is recommended that for this usage you put the DEFINE block at the -end of the pattern, and that you name any subpatterns defined within it. - -Also, it's worth noting that patterns defined this way probably will -not be as efficient, as the optimiser is not very clever about -handling them. YMMV. - -An example of how this might be used is as follows: - - /(?(&NAME_PAT))(?(&ADDRESS_PAT)) - (?(DEFINE) - (....) - (....) - )/x - -Note that capture buffers matched inside of recursion are not accessible -after the recursion returns, so the extra layer of capturing buffers are -necessary. Thus C<$+{NAME_PAT}> would not be defined even though -C<$+{NAME}> would be. - =back =head2 Backtracking diff --git a/proto.h b/proto.h index b141466..531d583 100644 --- a/proto.h +++ b/proto.h @@ -3697,7 +3697,7 @@ STATIC I32 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); -STATIC I32 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max) +STATIC I32 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); diff --git a/regcomp.c b/regcomp.c index 80d7eec..3ce84c1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -114,6 +114,7 @@ typedef struct RExC_state_t { U32 seen; I32 size; /* Code size. */ I32 npar; /* () count. */ + I32 nestroot; /* root parens we are in - used by accept */ I32 extralen; I32 seen_zerolen; I32 seen_evals; @@ -152,6 +153,7 @@ typedef struct RExC_state_t { #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) #define RExC_npar (pRExC_state->npar) +#define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_seen_evals (pRExC_state->seen_evals) @@ -335,7 +337,7 @@ static const scan_data_t zero_scan_data = #define SCF_WHILEM_VISITED_POS 0x2000 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ - +#define SCF_SEEN_ACCEPT 0x8000 #define UTF (RExC_utf8 != 0) #define LOC ((RExC_flags & PMf_LOCALE) != 0) @@ -2311,6 +2313,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan_data_t data_fake; SV *re_trie_maxbuff = NULL; regnode *first_non_open = scan; + I32 stopmin = I32_MAX; GET_RE_DEBUG_FLAGS_DECL; #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); @@ -2411,6 +2414,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan = next; if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > minnext) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } if (data) { if (data_fake.flags & SF_HAS_EVAL) data->flags |= SF_HAS_EVAL; @@ -3580,11 +3590,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data) data->flags |= SF_HAS_EVAL; } - else if ( OP(scan)==OPFAIL ) { + else if ( PL_regkind[OP(scan)] == ENDLIKE ) { if (flags & SCF_DO_SUBSTR) { scan_commit(pRExC_state,data,minlenp); flags &= ~SCF_DO_SUBSTR; } + if (data && OP(scan)==ACCEPT) { + data->flags |= SCF_SEEN_ACCEPT; + if (stopmin > min) + stopmin = min; + } } else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ { @@ -3666,7 +3681,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; - + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > min + min1) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } if (data) { if (data_fake.flags & SF_HAS_EVAL) data->flags |= SF_HAS_EVAL; @@ -3758,7 +3779,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_STUDYDATA(data,depth); - return min; + return min < stopmin ? min : stopmin; } STATIC I32 @@ -3915,6 +3936,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_end = xend; RExC_naughty = 0; RExC_npar = 1; + RExC_nestroot = 0; RExC_size = 0L; RExC_emit = &PL_regdummy; RExC_whilem_seen = 0; @@ -3952,6 +3974,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (RExC_whilem_seen > 15) RExC_whilem_seen = 15; +#ifdef DEBUGGING + /* Make room for a sentinel value at the end of the program */ + RExC_size++; +#endif + /* Allocate space and zero-initialize. Note, the two step process of zeroing when in debug mode, thus anything assigned has to happen after that */ @@ -4008,6 +4035,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_npar = 1; RExC_emit_start = r->program; RExC_emit = r->program; +#ifdef DEBUGGING + /* put a sentinal on the end of the program so we can check for + overwrites */ + r->program[RExC_size].type = 255; +#endif /* Store the count of eval-groups for security checks: */ RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); @@ -4415,6 +4447,8 @@ reStudy: r->reganch |= ROPT_EVAL_SEEN; if (RExC_seen & REG_SEEN_CANY) r->reganch |= ROPT_CANY_SEEN; + if (RExC_seen & REG_SEEN_VERBARG) + r->reganch |= ROPT_VERBARG_SEEN; if (RExC_paren_names) r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); else @@ -4605,6 +4639,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) #endif +/* this idea is borrowed from STR_WITH_LEN in handy.h */ +#define CHECK_WORD(s,v,l) \ + (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1)))) + STATIC regnode * S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ @@ -4641,6 +4679,98 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* Make an OPEN node, if parenthesized. */ if (paren) { + if ( *RExC_parse == '*') { /* (*VERB:ARG) */ + char *start_verb = RExC_parse; + STRLEN verb_len = 0; + char *start_arg = NULL; + unsigned char op = 0; + int argok = 1; + int internal_argval = 0; /* internal_argval is only useful if !argok */ + while ( *RExC_parse && *RExC_parse != ')' ) { + if ( *RExC_parse == ':' ) { + start_arg = RExC_parse + 1; + break; + } + RExC_parse++; + } + ++start_verb; + verb_len = RExC_parse - start_verb; + if ( start_arg ) { + RExC_parse++; + while ( *RExC_parse && *RExC_parse != ')' ) + RExC_parse++; + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern argument"); + if ( RExC_parse == start_arg ) + start_arg = NULL; + } else { + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern"); + } + switch ( *start_verb ) { + case 'A': /* (*ACCEPT) */ + if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) { + op = ACCEPT; + internal_argval = RExC_nestroot; + } + break; + case 'C': /* (*COMMIT) */ + if ( CHECK_WORD("COMMIT",start_verb,verb_len) ) + op = COMMIT; + else if ( CHECK_WORD("CUT",start_verb,verb_len) ) + op = CUT; + break; + case 'F': /* (*FAIL) */ + if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) { + op = OPFAIL; + argok = 0; + } + break; + case 'M': + if ( CHECK_WORD("MARK",start_verb,verb_len) ) + op = MARKPOINT; + break; + case 'N': /* (*NOMATCH) */ + if ( CHECK_WORD("NOMATCH",start_verb,verb_len) ) + op = NOMATCH; + break; + } + if ( ! op ) { + RExC_parse++; + vFAIL3("Unknown verb pattern '%.*s'", + verb_len, start_verb); + } + if ( argok ) { + if ( start_arg && internal_argval ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else if ( argok < 0 && !start_arg ) { + vFAIL3("Verb pattern '%.*s' has a mandatory argument", + verb_len, start_verb); + } else { + ret = reganode(pRExC_state, op, internal_argval); + if ( ! internal_argval && ! SIZE_ONLY ) { + if (start_arg) { + SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, 1, "S" ); + RExC_rx->data->data[ARG(ret)]=(void*)sv; + ret->flags = 0; + } else { + ret->flags = 1; + } + } + } + if (!internal_argval) + RExC_seen |= REG_SEEN_VERBARG; + } else if ( start_arg ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else { + ret = reg_node(pRExC_state, op); + } + nextchar(pRExC_state); + return ret; + } else if (*RExC_parse == '?') { /* (?...) */ U32 posflags = 0, negflags = 0; U32 *flagsp = &posflags; @@ -4711,62 +4841,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; case '=': /* (?=...) */ case '!': /* (?!...) */ - if (*RExC_parse == ')') - goto do_op_fail; RExC_seen_zerolen++; + if (*RExC_parse == ')') { + ret=reg_node(pRExC_state, OPFAIL); + nextchar(pRExC_state); + return ret; + } case ':': /* (?:...) */ case '>': /* (?>...) */ break; - case 'C': /* (?CUT) and (?COMMIT) */ - if (RExC_parse[0] == 'O' && - RExC_parse[1] == 'M' && - RExC_parse[2] == 'M' && - RExC_parse[3] == 'I' && - RExC_parse[4] == 'T' && - RExC_parse[5] == ')') - { - RExC_parse+=5; - ret = reg_node(pRExC_state, COMMIT); - } else if ( - RExC_parse[0] == 'U' && - RExC_parse[1] == 'T' && - RExC_parse[2] == ')') - { - RExC_parse+=2; - ret = reg_node(pRExC_state, CUT); - } else { - vFAIL("Sequence (?C... not terminated"); - } - nextchar(pRExC_state); - return ret; - break; - case 'E': /* (?ERROR) */ - if (RExC_parse[0] == 'R' && - RExC_parse[1] == 'R' && - RExC_parse[2] == 'O' && - RExC_parse[3] == 'R' && - RExC_parse[4] == ')') - { - RExC_parse+=4; - ret = reg_node(pRExC_state, OPERROR); - } else { - vFAIL("Sequence (?E... not terminated"); - } - nextchar(pRExC_state); - return ret; - break; - case 'F': - if (RExC_parse[0] == 'A' && - RExC_parse[1] == 'I' && - RExC_parse[2] == 'L') - RExC_parse+=3; - if (*RExC_parse != ')') - vFAIL("Sequence (?FAIL) or (?F) not terminated"); - do_op_fail: - ret = reg_node(pRExC_state, OPFAIL); - nextchar(pRExC_state); - return ret; - break; case '$': /* (?$...) */ case '@': /* (?@...) */ vFAIL2("Sequence (?%c...) not implemented", (int)paren); @@ -5098,12 +5181,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) capturing_parens: parno = RExC_npar; RExC_npar++; + ret = reganode(pRExC_state, OPEN, parno); - if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { - DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + if (!SIZE_ONLY ){ + if (!RExC_nestroot) + RExC_nestroot = parno; + if (RExC_seen & REG_SEEN_RECURSE) { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting open paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); - RExC_open_parens[parno-1]= ret; + RExC_open_parens[parno-1]= ret; + } } Set_Node_Length(ret, 1); /* MJD */ Set_Node_Offset(ret, RExC_parse); /* MJD */ @@ -5175,6 +5263,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) "Setting close paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ender))); RExC_close_parens[parno-1]= ender; + if (RExC_nestroot == parno) + RExC_nestroot = 0; } Set_Node_Offset(ender,RExC_parse+1); /* MJD */ Set_Node_Length(ender,1); /* MJD */ @@ -7505,6 +7595,11 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) RExC_size += 1; return(ret); } +#ifdef DEBUGGING + if (OP(RExC_emit) == 255) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ", + reg_name[op], OP(RExC_emit)); +#endif NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); @@ -7521,7 +7616,6 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) } RExC_emit = ptr; - return(ret); } @@ -7555,7 +7649,10 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) */ return(ret); } - +#ifdef DEBUGGING + if (OP(RExC_emit) == 255) + Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space"); +#endif NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); @@ -7573,7 +7670,6 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) } RExC_emit = ptr; - return(ret); } @@ -8006,11 +8102,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP) + else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ else if (k == GOSUB) Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ - else if (k == LOGICAL) + else if (k == VERB) { + if (!o->flags) + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + (SV*)prog->data->data[ ARG( o ) ]); + } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { int i, rangestart = -1; @@ -8401,7 +8501,7 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) for (i = 0; i < count; i++) { d->what[i] = r->data->what[i]; switch (d->what[i]) { - /* legal options are one of: sfpont + /* legal options are one of: sSfpont see also regcomp.h and pregfree() */ case 's': case 'S': diff --git a/regcomp.h b/regcomp.h index 360e2a9..2774a27 100644 --- a/regcomp.h +++ b/regcomp.h @@ -351,6 +351,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define REG_SEEN_SANY REG_SEEN_CANY /* src bckwrd cmpt */ #define REG_SEEN_RECURSE 0x00000020 #define REG_TOP_LEVEL_BRANCHES 0x00000040 +#define REG_SEEN_VERBARG 0x00000080 START_EXTERN_C diff --git a/regcomp.sym b/regcomp.sym index e673313..074af13 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -169,10 +169,16 @@ INSUBP INSUBP, num 1 Whether we are in a specific recurse. DEFINEP DEFINEP, none 1 Never execute directly. #*Bactracking -OPFAIL OPFAIL, none Same as (?!) -COMMIT COMMIT, none Pattern fails if backtracking through this -CUT COMMIT, none ... and restarts at the cursor point -OPERROR OPERROR,none Pattern fails outright if backtracking through this +ENDLIKE ENDLIKE, none Used only for the type field of verbs +OPFAIL ENDLIKE, none Same as (?!) +ACCEPT ENDLIKE, parno 1 Accepts the current matched string. +VERB VERB, no-sv 1 Used only for the type field of verbs +NOMATCH VERB, no-sv 1 Pattern fails at this startpoint if no-backtracking through this +MARKPOINT VERB, no-sv 1 Push the current location for rollback by cut. +CUT VERB, no-sv 1 On failure cut the string at the mark. +COMMIT VERB, no-sv 1 Pattern fails outright if backtracking through this + + # NEW STUFF ABOVE THIS LINE -- Please update counts below. @@ -210,4 +216,5 @@ CURLYM A,B:FAIL IFMATCH A:FAIL CURLY B_min_known,B_min,B_max:FAIL COMMIT next:FAIL - +MARKPOINT next:FAIL +CUT next:FAIL diff --git a/regexec.c b/regexec.c index f7fd347..8e0aabd 100644 --- a/regexec.c +++ b/regexec.c @@ -2571,11 +2571,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) int nochange_depth = 0; /* depth of GOSUB recursion with nochange*/ regmatch_state *yes_state = NULL; /* state to pop to on success of subpattern */ + /* mark_state piggy backs on the yes_state logic so that when we unwind + the stack on success we can update the mark_state as we go */ + regmatch_state *mark_state = NULL; /* last mark state we have seen */ regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ U32 state_num; bool no_final = 0; - + char *startpoint = PL_reginput; + SV *popmark = NULL; + SV *sv_commit = NULL; + int lastopen = 0; /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops @@ -3606,6 +3612,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PL_reg_start_tmp[n] = locinput; if (n > PL_regsize) PL_regsize = n; + lastopen = n; break; case CLOSE: n = ARG(scan); /* which paren pair */ @@ -3620,6 +3627,32 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) goto fake_end; } break; + case ACCEPT: + if (ARG(scan)){ + regnode *cursor; + for (cursor=scan; + cursor && OP(cursor)!=END; + cursor=regnext(cursor)) + { + if ( OP(cursor)==CLOSE ){ + n = ARG(cursor); + if ( n <= lastopen ) { + PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr; + PL_regendp[n] = locinput - PL_bostr; + /*if (n > PL_regsize) + PL_regsize = n;*/ + if (n > (I32)*PL_reglastparen) + *PL_reglastparen = n; + *PL_reglastcloseparen = n; + if ( n == ARG(scan) || (cur_eval && + cur_eval->u.eval.close_paren == (U32)n)) + break; + } + } + } + } + goto fake_end; + /*NOTREACHED*/ case GROUPP: n = ARG(scan); /* which paren pair */ sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1); @@ -4302,7 +4335,7 @@ NULL PL_reginput = locinput; if (minmod) { minmod = 0; - if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min) + if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min) sayNO; ST.count = ST.min; locinput = PL_reginput; @@ -4335,7 +4368,7 @@ NULL } else { - ST.count = regrepeat(rex, ST.A, ST.max); + ST.count = regrepeat(rex, ST.A, ST.max, depth); locinput = PL_reginput; if (ST.count < ST.min) sayNO; @@ -4421,7 +4454,7 @@ NULL /* PL_reginput == oldloc now */ if (n) { ST.count += n; - if (regrepeat(rex, ST.A, n) < n) + if (regrepeat(rex, ST.A, n, depth) < n) sayNO; } PL_reginput = locinput; @@ -4443,7 +4476,7 @@ NULL REGCP_UNWIND(ST.cp); /* failed -- move forward one */ PL_reginput = locinput; - if (regrepeat(rex, ST.A, 1)) { + if (regrepeat(rex, ST.A, 1, depth)) { ST.count++; locinput = PL_reginput; if (ST.count <= ST.max || (ST.max == REG_INFTY && @@ -4622,17 +4655,13 @@ NULL if (next == scan) next = NULL; break; - case OPERROR: - reginfo->cutpoint=PL_regeol; - goto do_commit; - /* NOTREACHED */ - case CUT: - if ( locinput > reginfo->bol ) - reginfo->cutpoint = HOPBACKc(locinput, 1); - /* FALLTHROUGH */ case COMMIT: - do_commit: + reginfo->cutpoint = PL_regeol; + /* FALLTHROUGH */ + case NOMATCH: PL_reginput = locinput; + if (!scan->flags) + sv_commit = (SV*)rex->data->data[ ARG( scan ) ]; PUSH_STATE_GOTO(COMMIT_next,next); /* NOTREACHED */ case COMMIT_next_fail: @@ -4640,6 +4669,71 @@ NULL /* FALLTHROUGH */ case OPFAIL: sayNO; + /* NOTREACHED */ + +#define ST st->u.mark + case MARKPOINT: + ST.prev_mark = mark_state; + ST.mark_name = scan->flags ? &PL_sv_yes : + (SV*)rex->data->data[ ARG( scan ) ]; + mark_state = st; + ST.mark_loc = PL_reginput = locinput; + PUSH_YES_STATE_GOTO(MARKPOINT_next,next); + /* NOTREACHED */ + case MARKPOINT_next: + mark_state = ST.prev_mark; + sayYES; + /* NOTREACHED */ + case MARKPOINT_next_fail: + if (popmark && ( popmark == &PL_sv_yes || + (ST.mark_name != &PL_sv_yes && + sv_eq(ST.mark_name,popmark)))) + { + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + popmark = NULL; /* we found our mark */ + sv_commit = ST.mark_name; + + DEBUG_EXECUTE_r({ + if (sv_commit != &PL_sv_yes) + PerlIO_printf(Perl_debug_log, + "%*s %ssetting cutpoint to mark:%"SVf"...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], sv_commit, PL_colors[5]); + else + PerlIO_printf(Perl_debug_log, + "%*s %ssetting cutpoint to mark...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], PL_colors[5]); + }); + } + mark_state = ST.prev_mark; + sayNO; + /* NOTREACHED */ + case CUT: + ST.mark_name = scan->flags ? &PL_sv_yes : + (SV*)rex->data->data[ ARG( scan ) ]; + if (mark_state) { + ST.mark_loc = NULL; + } else { + ST.mark_loc = locinput; + } + PL_reginput = locinput; + PUSH_STATE_GOTO(CUT_next,next); + /* NOTREACHED */ + case CUT_next_fail: + if (ST.mark_loc) { + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + sv_commit = ST.mark_name; + } else { + popmark = ST.mark_name; + } + no_final = 1; + sayNO; + /* NOTREACHED */ +#undef ST + default: PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", PTR2UV(scan), OP(scan)); @@ -4716,13 +4810,13 @@ yes: PL_regmatch_slab = PL_regmatch_slab->prev; st = SLAB_LAST(PL_regmatch_slab); } - DEBUG_STATE_r({ + DEBUG_STATE_r({ if (no_final) { DEBUG_STATE_pp("pop (no final)"); } else { DEBUG_STATE_pp("pop (yes)"); } - }); + }); depth--; } #else @@ -4789,7 +4883,14 @@ no_silent: result = 0; final_exit: - + if (rex->reganch & ROPT_VERBARG_SEEN) { + SV *sv = get_sv("REGERROR", 1); + if (result) + sv_commit = &PL_sv_no; + else if (!sv_commit) + sv_commit = &PL_sv_yes; + sv_setsv(sv, sv_commit); + } /* restore original high-water mark */ PL_regmatch_slab = orig_slab; PL_regmatch_state = orig_state; @@ -4817,7 +4918,7 @@ no_silent: * rather than incrementing count on every character. [Er, except utf8.]] */ STATIC I32 -S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max) +S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) { dVAR; register char *scan; @@ -5048,7 +5149,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max) regprop(prog, prop, p); PerlIO_printf(Perl_debug_log, "%*s %s can match %"IVdf" times out of %"IVdf"...\n", - REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max); + REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); }); }); diff --git a/regexp.h b/regexp.h index f13a5c5..9b3ce79 100644 --- a/regexp.h +++ b/regexp.h @@ -97,7 +97,6 @@ typedef struct regexp_engine { #define ROPT_CANY_SEEN 0x00000800 #define ROPT_SANY_SEEN ROPT_CANY_SEEN /* src bckwrd cmpt */ #define ROPT_GPOS_CHECK (ROPT_GPOS_SEEN|ROPT_ANCH_GPOS) -#define ROPT_RECURSE_SEEN 0x00001000 /* 0xf800 of reganch is used by PMf_COMPILETIME */ @@ -106,6 +105,8 @@ typedef struct regexp_engine { #define ROPT_COPY_DONE 0x00040000 /* subbeg is a copy of the string */ #define ROPT_TAINTED_SEEN 0x00080000 #define ROPT_MATCH_UTF8 0x10000000 /* subbeg is utf-8 */ +#define ROPT_RECURSE_SEEN 0x20000000 +#define ROPT_VERBARG_SEEN 0x40000000 #define RE_USE_INTUIT_NOML 0x00100000 /* Best to intuit before matching */ #define RE_USE_INTUIT_ML 0x00200000 @@ -311,6 +312,14 @@ typedef struct regmatch_state { I32 logical; /* saved copy of 'logical' var */ regnode *me; /* the IFMATCH/SUSPEND/UNLESSM node */ } ifmatch; /* and SUSPEND/UNLESSM */ + + struct { + /* this first element must match u.yes */ + struct regmatch_state *prev_yes_state; + struct regmatch_state *prev_mark; + SV* mark_name; + char *mark_loc; + } mark; } u; } regmatch_state; diff --git a/regnodes.h b/regnodes.h index 010b943..005e409 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 78 -#define REGMATCH_STATE_MAX 110 +#define REGNODE_MAX 82 +#define REGMATCH_STATE_MAX 118 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -82,12 +82,16 @@ #define NGROUPP 70 /* 0x46 Whether the group matched. */ #define INSUBP 71 /* 0x47 Whether we are in a specific recurse. */ #define DEFINEP 72 /* 0x48 Never execute directly. */ -#define OPFAIL 73 /* 0x49 Same as (?!) */ -#define COMMIT 74 /* 0x4a Pattern fails if backtracking through this */ -#define CUT 75 /* 0x4b ... and restarts at the cursor point */ -#define OPERROR 76 /* 0x4c Pattern fails outright if backtracking through this */ -#define OPTIMIZED 77 /* 0x4d Placeholder for dump. */ -#define PSEUDO 78 /* 0x4e Pseudo opcode for internal use. */ +#define ENDLIKE 73 /* 0x49 Used only for the type field of verbs */ +#define OPFAIL 74 /* 0x4a Same as (?!) */ +#define ACCEPT 75 /* 0x4b Accepts the current matched string. */ +#define VERB 76 /* 0x4c no-sv 1 Used only for the type field of verbs */ +#define NOMATCH 77 /* 0x4d Pattern fails at this startpoint if no-backtracking through this */ +#define MARKPOINT 78 /* 0x4e Push the current location for rollback by cut. */ +#define CUT 79 /* 0x4f On failure cut the string at the mark. */ +#define COMMIT 80 /* 0x50 Pattern fails outright if backtracking through this */ +#define OPTIMIZED 81 /* 0x51 Placeholder for dump. */ +#define PSEUDO 82 /* 0x52 Pseudo opcode for internal use. */ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -121,6 +125,10 @@ #define CURLY_B_max_fail (REGNODE_MAX + 30) /* state for CURLY */ #define COMMIT_next (REGNODE_MAX + 31) /* state for COMMIT */ #define COMMIT_next_fail (REGNODE_MAX + 32) /* state for COMMIT */ +#define MARKPOINT_next (REGNODE_MAX + 33) /* state for MARKPOINT */ +#define MARKPOINT_next_fail (REGNODE_MAX + 34) /* state for MARKPOINT */ +#define CUT_next (REGNODE_MAX + 35) /* state for CUT */ +#define CUT_next_fail (REGNODE_MAX + 36) /* state for CUT */ /* PL_regkind[] What type of regop or state is this. */ @@ -128,118 +136,126 @@ EXTCONST U8 PL_regkind[]; #else EXTCONST U8 PL_regkind[] = { - END, /* END */ - END, /* SUCCEED */ - BOL, /* BOL */ - BOL, /* MBOL */ - BOL, /* SBOL */ - EOL, /* EOS */ - EOL, /* EOL */ - EOL, /* MEOL */ - EOL, /* SEOL */ - BOUND, /* BOUND */ - BOUND, /* BOUNDL */ - NBOUND, /* NBOUND */ - NBOUND, /* NBOUNDL */ - GPOS, /* GPOS */ - REG_ANY, /* REG_ANY */ - REG_ANY, /* SANY */ - REG_ANY, /* CANY */ - ANYOF, /* ANYOF */ - ALNUM, /* ALNUM */ - ALNUM, /* ALNUML */ - NALNUM, /* NALNUM */ - NALNUM, /* NALNUML */ - SPACE, /* SPACE */ - SPACE, /* SPACEL */ - NSPACE, /* NSPACE */ - NSPACE, /* NSPACEL */ - DIGIT, /* DIGIT */ - DIGIT, /* DIGITL */ - NDIGIT, /* NDIGIT */ - NDIGIT, /* NDIGITL */ - CLUMP, /* CLUMP */ - BRANCH, /* BRANCH */ - BACK, /* BACK */ - EXACT, /* EXACT */ - EXACT, /* EXACTF */ - EXACT, /* EXACTFL */ - NOTHING, /* NOTHING */ - NOTHING, /* TAIL */ - STAR, /* STAR */ - PLUS, /* PLUS */ - CURLY, /* CURLY */ - CURLY, /* CURLYN */ - CURLY, /* CURLYM */ - CURLY, /* CURLYX */ - WHILEM, /* WHILEM */ - OPEN, /* OPEN */ - CLOSE, /* CLOSE */ - REF, /* REF */ - REF, /* REFF */ - REF, /* REFFL */ - BRANCHJ, /* IFMATCH */ - BRANCHJ, /* UNLESSM */ - BRANCHJ, /* SUSPEND */ - BRANCHJ, /* IFTHEN */ - GROUPP, /* GROUPP */ - LONGJMP, /* LONGJMP */ - BRANCHJ, /* BRANCHJ */ - EVAL, /* EVAL */ - MINMOD, /* MINMOD */ - LOGICAL, /* LOGICAL */ - BRANCHJ, /* RENUM */ - TRIE, /* TRIE */ - TRIE, /* TRIEC */ - TRIE, /* AHOCORASICK */ - TRIE, /* AHOCORASICKC */ - GOSUB, /* GOSUB */ - GOSTART, /* GOSTART */ - NREF, /* NREF */ - NREF, /* NREFF */ - NREF, /* NREFFL */ - NGROUPP, /* NGROUPP */ - INSUBP, /* INSUBP */ - DEFINEP, /* DEFINEP */ - OPFAIL, /* OPFAIL */ - COMMIT, /* COMMIT */ - COMMIT, /* CUT */ - OPERROR, /* OPERROR */ - NOTHING, /* OPTIMIZED */ - PSEUDO, /* PSEUDO */ + END, /* END */ + END, /* SUCCEED */ + BOL, /* BOL */ + BOL, /* MBOL */ + BOL, /* SBOL */ + EOL, /* EOS */ + EOL, /* EOL */ + EOL, /* MEOL */ + EOL, /* SEOL */ + BOUND, /* BOUND */ + BOUND, /* BOUNDL */ + NBOUND, /* NBOUND */ + NBOUND, /* NBOUNDL */ + GPOS, /* GPOS */ + REG_ANY, /* REG_ANY */ + REG_ANY, /* SANY */ + REG_ANY, /* CANY */ + ANYOF, /* ANYOF */ + ALNUM, /* ALNUM */ + ALNUM, /* ALNUML */ + NALNUM, /* NALNUM */ + NALNUM, /* NALNUML */ + SPACE, /* SPACE */ + SPACE, /* SPACEL */ + NSPACE, /* NSPACE */ + NSPACE, /* NSPACEL */ + DIGIT, /* DIGIT */ + DIGIT, /* DIGITL */ + NDIGIT, /* NDIGIT */ + NDIGIT, /* NDIGITL */ + CLUMP, /* CLUMP */ + BRANCH, /* BRANCH */ + BACK, /* BACK */ + EXACT, /* EXACT */ + EXACT, /* EXACTF */ + EXACT, /* EXACTFL */ + NOTHING, /* NOTHING */ + NOTHING, /* TAIL */ + STAR, /* STAR */ + PLUS, /* PLUS */ + CURLY, /* CURLY */ + CURLY, /* CURLYN */ + CURLY, /* CURLYM */ + CURLY, /* CURLYX */ + WHILEM, /* WHILEM */ + OPEN, /* OPEN */ + CLOSE, /* CLOSE */ + REF, /* REF */ + REF, /* REFF */ + REF, /* REFFL */ + BRANCHJ, /* IFMATCH */ + BRANCHJ, /* UNLESSM */ + BRANCHJ, /* SUSPEND */ + BRANCHJ, /* IFTHEN */ + GROUPP, /* GROUPP */ + LONGJMP, /* LONGJMP */ + BRANCHJ, /* BRANCHJ */ + EVAL, /* EVAL */ + MINMOD, /* MINMOD */ + LOGICAL, /* LOGICAL */ + BRANCHJ, /* RENUM */ + TRIE, /* TRIE */ + TRIE, /* TRIEC */ + TRIE, /* AHOCORASICK */ + TRIE, /* AHOCORASICKC */ + GOSUB, /* GOSUB */ + GOSTART, /* GOSTART */ + NREF, /* NREF */ + NREF, /* NREFF */ + NREF, /* NREFFL */ + NGROUPP, /* NGROUPP */ + INSUBP, /* INSUBP */ + DEFINEP, /* DEFINEP */ + ENDLIKE, /* ENDLIKE */ + ENDLIKE, /* OPFAIL */ + ENDLIKE, /* ACCEPT */ + VERB, /* VERB */ + VERB, /* NOMATCH */ + VERB, /* MARKPOINT */ + VERB, /* CUT */ + VERB, /* COMMIT */ + NOTHING, /* OPTIMIZED */ + PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ - TRIE, /* TRIE_next */ - TRIE, /* TRIE_next_fail */ - EVAL, /* EVAL_AB */ - EVAL, /* EVAL_AB_fail */ - CURLYX, /* CURLYX_end */ - CURLYX, /* CURLYX_end_fail */ - WHILEM, /* WHILEM_A_pre */ - WHILEM, /* WHILEM_A_pre_fail */ - WHILEM, /* WHILEM_A_min */ - WHILEM, /* WHILEM_A_min_fail */ - WHILEM, /* WHILEM_A_max */ - WHILEM, /* WHILEM_A_max_fail */ - WHILEM, /* WHILEM_B_min */ - WHILEM, /* WHILEM_B_min_fail */ - WHILEM, /* WHILEM_B_max */ - WHILEM, /* WHILEM_B_max_fail */ - BRANCH, /* BRANCH_next */ - BRANCH, /* BRANCH_next_fail */ - CURLYM, /* CURLYM_A */ - CURLYM, /* CURLYM_A_fail */ - CURLYM, /* CURLYM_B */ - CURLYM, /* CURLYM_B_fail */ - IFMATCH, /* IFMATCH_A */ - IFMATCH, /* IFMATCH_A_fail */ - CURLY, /* CURLY_B_min_known */ - CURLY, /* CURLY_B_min_known_fail */ - CURLY, /* CURLY_B_min */ - CURLY, /* CURLY_B_min_fail */ - CURLY, /* CURLY_B_max */ - CURLY, /* CURLY_B_max_fail */ - COMMIT, /* COMMIT_next */ - COMMIT, /* COMMIT_next_fail */ + TRIE, /* TRIE_next */ + TRIE, /* TRIE_next_fail */ + EVAL, /* EVAL_AB */ + EVAL, /* EVAL_AB_fail */ + CURLYX, /* CURLYX_end */ + CURLYX, /* CURLYX_end_fail */ + WHILEM, /* WHILEM_A_pre */ + WHILEM, /* WHILEM_A_pre_fail */ + WHILEM, /* WHILEM_A_min */ + WHILEM, /* WHILEM_A_min_fail */ + WHILEM, /* WHILEM_A_max */ + WHILEM, /* WHILEM_A_max_fail */ + WHILEM, /* WHILEM_B_min */ + WHILEM, /* WHILEM_B_min_fail */ + WHILEM, /* WHILEM_B_max */ + WHILEM, /* WHILEM_B_max_fail */ + BRANCH, /* BRANCH_next */ + BRANCH, /* BRANCH_next_fail */ + CURLYM, /* CURLYM_A */ + CURLYM, /* CURLYM_A_fail */ + CURLYM, /* CURLYM_B */ + CURLYM, /* CURLYM_B_fail */ + IFMATCH, /* IFMATCH_A */ + IFMATCH, /* IFMATCH_A_fail */ + CURLY, /* CURLY_B_min_known */ + CURLY, /* CURLY_B_min_known_fail */ + CURLY, /* CURLY_B_min */ + CURLY, /* CURLY_B_min_fail */ + CURLY, /* CURLY_B_max */ + CURLY, /* CURLY_B_max_fail */ + COMMIT, /* COMMIT_next */ + COMMIT, /* COMMIT_next_fail */ + MARKPOINT, /* MARKPOINT_next */ + MARKPOINT, /* MARKPOINT_next_fail */ + CUT, /* CUT_next */ + CUT, /* CUT_next_fail */ }; #endif @@ -320,10 +336,14 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* NGROUPP */ EXTRA_SIZE(struct regnode_1), /* INSUBP */ EXTRA_SIZE(struct regnode_1), /* DEFINEP */ + 0, /* ENDLIKE */ 0, /* OPFAIL */ - 0, /* COMMIT */ - 0, /* CUT */ - 0, /* OPERROR */ + EXTRA_SIZE(struct regnode_1), /* ACCEPT */ + 0, /* VERB */ + EXTRA_SIZE(struct regnode_1), /* NOMATCH */ + EXTRA_SIZE(struct regnode_1), /* MARKPOINT */ + EXTRA_SIZE(struct regnode_1), /* CUT */ + EXTRA_SIZE(struct regnode_1), /* COMMIT */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -404,10 +424,14 @@ static const char reg_off_by_arg[] = { 0, /* NGROUPP */ 0, /* INSUBP */ 0, /* DEFINEP */ + 0, /* ENDLIKE */ 0, /* OPFAIL */ - 0, /* COMMIT */ + 0, /* ACCEPT */ + 0, /* VERB */ + 0, /* NOMATCH */ + 0, /* MARKPOINT */ 0, /* CUT */ - 0, /* OPERROR */ + 0, /* COMMIT */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -489,12 +513,16 @@ const char * reg_name[] = { "NGROUPP", /* 0x46 */ "INSUBP", /* 0x47 */ "DEFINEP", /* 0x48 */ - "OPFAIL", /* 0x49 */ - "COMMIT", /* 0x4a */ - "CUT", /* 0x4b */ - "OPERROR", /* 0x4c */ - "OPTIMIZED", /* 0x4d */ - "PSEUDO", /* 0x4e */ + "ENDLIKE", /* 0x49 */ + "OPFAIL", /* 0x4a */ + "ACCEPT", /* 0x4b */ + "VERB", /* 0x4c */ + "NOMATCH", /* 0x4d */ + "MARKPOINT", /* 0x4e */ + "CUT", /* 0x4f */ + "COMMIT", /* 0x50 */ + "OPTIMIZED", /* 0x51 */ + "PSEUDO", /* 0x52 */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ @@ -528,6 +556,10 @@ const char * reg_name[] = { "CURLY_B_max_fail", /* REGNODE_MAX +0x1e */ "COMMIT_next", /* REGNODE_MAX +0x1f */ "COMMIT_next_fail", /* REGNODE_MAX +0x20 */ + "MARKPOINT_next", /* REGNODE_MAX +0x21 */ + "MARKPOINT_next_fail", /* REGNODE_MAX +0x22 */ + "CUT_next", /* REGNODE_MAX +0x23 */ + "CUT_next_fail", /* REGNODE_MAX +0x24 */ }; #endif /* DEBUGGING */ #else diff --git a/t/op/pat.t b/t/op/pat.t index 67be900..5405cf6 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3851,54 +3851,136 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { iseq($count,1,"should have matched once only [RT#36046]"); } -{ # Test the (?COMMIT) pattern +{ # Test the (*NOMATCH) pattern our $count = 0; - 'aaab'=~/a+b?(?{$count++})(?FAIL)/; - iseq($count,9,"expect 9 for no (?COMMIT)"); + 'aaab'=~/a+b?(?{$count++})(*FAIL)/; + iseq($count,9,"expect 9 for no (*NOMATCH)"); $count = 0; - 'aaab'=~/a+b?(?COMMIT)(?{$count++})(?FAIL)/; - iseq($count,3,"expect 3 with (?COMMIT)"); + 'aaab'=~/a+b?(*NOMATCH)(?{$count++})(*FAIL)/; + iseq($count,3,"expect 3 with (*NOMATCH)"); local $_='aaab'; $count=0; - 1 while /.(?COMMIT)(?{$count++})(?FAIL)/g; - iseq($count,4,"/.(?COMMIT)/"); + 1 while /.(*NOMATCH)(?{$count++})(*FAIL)/g; + iseq($count,4,"/.(*NOMATCH)/"); $count = 0; - 'aaab'=~/a+b?(??{'(?COMMIT)'})(?{$count++})(?FAIL)/; - iseq($count,3,"expect 3 with (?COMMIT)"); + 'aaab'=~/a+b?(??{'(*NOMATCH)'})(?{$count++})(*FAIL)/; + iseq($count,3,"expect 3 with (*NOMATCH)"); local $_='aaab'; $count=0; - 1 while /.(??{'(?COMMIT)'})(?{$count++})(?FAIL)/g; - iseq($count,4,"/.(?COMMIT)/"); + 1 while /.(??{'(*NOMATCH)'})(?{$count++})(*FAIL)/g; + iseq($count,4,"/.(*NOMATCH)/"); } -{ # Test the (?CUT) pattern +{ # Test the (*CUT) pattern our $count = 0; - 'aaab'=~/a+b?(?CUT)(?{$count++})(?FAIL)/; - iseq($count,1,"expect 1 with (?CUT)"); + 'aaab'=~/a+b?(*CUT)(?{$count++})(*FAIL)/; + iseq($count,1,"expect 1 with (*CUT)"); local $_='aaab'; $count=0; - 1 while /.(?CUT)(?{$count++})(?FAIL)/g; - iseq($count,4,"/.(?CUT)/"); + 1 while /.(*CUT)(?{$count++})(*FAIL)/g; + iseq($count,4,"/.(*CUT)/"); $_='aaabaaab'; $count=0; our @res=(); - 1 while /(a+b?)(?CUT)(?{$count++; push @res,$1})(?FAIL)/g; - iseq($count,2,"Expect 2 with (?CUT)" ); - iseq("@res","aaab aaab","adjacent (?CUT) works as expected" ); + 1 while /(a+b?)(*CUT)(?{$count++; push @res,$1})(*FAIL)/g; + iseq($count,2,"Expect 2 with (*CUT)" ); + iseq("@res","aaab aaab","adjacent (*CUT) works as expected" ); } -{ # Test the (?ERROR) pattern +{ # Test the (*CUT) pattern our $count = 0; - 'aaabaaab'=~/a+b?(?ERROR)(?{$count++})(?FAIL)/; - iseq($count,1,"expect 1 with (?ERROR)"); + 'aaab'=~/a+b?(*MARK)(*CUT)(?{$count++})(*FAIL)/; + iseq($count,1,"expect 1 with (*CUT)"); local $_='aaab'; $count=0; - 1 while /.(?ERROR)(?{$count++})(?FAIL)/g; - iseq($count,1,"/.(?ERROR)/"); + 1 while /.(*MARK)(*CUT)(?{$count++})(*FAIL)/g; + iseq($count,4,"/.(*CUT)/"); $_='aaabaaab'; $count=0; our @res=(); - 1 while /(a+b?)(?ERROR)(?{$count++; push @res,$1})(?FAIL)/g; - iseq($count,1,"Expect 1 with (?ERROR)" ); - iseq("@res","aaab","adjacent (?ERROR) works as expected" ); + 1 while /(a+b?)(*MARK)(*CUT)(?{$count++; push @res,$1})(*FAIL)/g; + iseq($count,2,"Expect 2 with (*CUT)" ); + iseq("@res","aaab aaab","adjacent (*CUT) works as expected" ); +} +{ # Test the (*CUT) pattern + our $count = 0; + 'aaab'=~/a*(*MARK:a)b?(*MARK:b)(*CUT:a)(?{$count++})(*FAIL)/; + iseq($count,3,"expect 3 with *MARK:a)b?(*MARK:b)(*CUT:a)"); + local $_='aaabaaab'; + $count=0; + our @res=(); + 1 while /(a*(*MARK:a)b?)(*MARK)(*CUT:a)(?{$count++; push @res,$1})(*FAIL)/g; + iseq($count,5,"Expect 5 with (*MARK:a)b?)(*MARK)(*CUT:a)" ); + iseq("@res","aaab b aaab b ","adjacent (*MARK:a)b?)(*MARK)(*CUT:a) works as expected" ); +} +{ # Test the (*COMMIT) pattern + our $count = 0; + 'aaabaaab'=~/a+b?(*COMMIT)(?{$count++})(*FAIL)/; + iseq($count,1,"expect 1 with (*COMMIT)"); + local $_='aaab'; + $count=0; + 1 while /.(*COMMIT)(?{$count++})(*FAIL)/g; + iseq($count,1,"/.(*COMMIT)/"); + $_='aaabaaab'; + $count=0; + our @res=(); + 1 while /(a+b?)(*COMMIT)(?{$count++; push @res,$1})(*FAIL)/g; + iseq($count,1,"Expect 1 with (*COMMIT)" ); + iseq("@res","aaab","adjacent (*COMMIT) works as expected" ); +} +{ + # Test named commits and the $REGERROR var + our $REGERROR; + for my $name ('',':foo') + { + for my $pat ("(*NOMATCH$name)","(*MARK$name)(*CUT)", + "(*CUT$name)","(*COMMIT$name)") + { + for my $suffix ('(*FAIL)','') + { + 'aaaab'=~/a+b$pat$suffix/; + iseq( + $REGERROR, + ($suffix ? ($name ? 'foo' : "1") : ""), + "Test $pat and \$REGERROR $suffix" + ); + } + } + } +} +{ + # Test named commits and the $REGERROR var + package Fnorble; + our $REGERROR; + for my $name ('',':foo') + { + for my $pat ("(*NOMATCH$name)","(*MARK$name)(*CUT)", + "(*CUT$name)","(*COMMIT$name)") + { + for my $suffix ('(*FAIL)','') + { + 'aaaab'=~/a+b$pat$suffix/; + ::iseq( + $REGERROR, + ($suffix ? ($name ? 'foo' : "1") : ""), + "Test $pat and \$REGERROR $suffix" + ); + } + } + } +} +{ + # Test named commits and the $REGERROR var + our $REGERROR; + for $word (qw(bar baz bop)) { + $REGERROR=""; + "aaaaa$word"=~/a+(?:bar(*COMMIT:bar)|baz(*COMMIT:baz)|bop(*COMMIT:bop))(*FAIL)/; + iseq($REGERROR,$word); + } +} +{ #Regression test for perlbug 40684 + my $s = "abc\ndef"; + my $rex = qr'^abc$'m; + ok($s =~ m/$rex/); + ok($s =~ m/^abc$/m); } #------------------------------------------------------------------- @@ -3914,5 +3996,5 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, # Put new tests above the line, not here. # Don't forget to update this! -BEGIN{print "1..1300\n"}; +BEGIN{print "1..1344\n"}; diff --git a/t/op/re_tests b/t/op/re_tests index 9b9e5f8..99c6824 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -138,7 +138,8 @@ ab|cd abcd y $& ab ()ef def y $-[1] 1 ()ef def y $+[1] 1 *a - c - Quantifier follows nothing -(*)b - c - Quantifier follows nothing +(|*)b - c - Quantifier follows nothing +(*)b - c - Unknown verb $b b n - - a\ - c - Search pattern not terminated a\(b a(b y $&-$1 a(b- @@ -325,7 +326,8 @@ a[-]?c ac y $& ac 'ab|cd'i ABCD y $& AB '()ef'i DEF y $&-$1 EF- '*a'i - c - Quantifier follows nothing -'(*)b'i - c - Quantifier follows nothing +'(|*)b'i - c - Quantifier follows nothing +'(*)b'i - c - Unknown verb '$b'i B n - - 'a\'i - c - Search pattern not terminated 'a\(b'i A(B y $&-$1 A(B- @@ -1178,5 +1180,9 @@ round\(([^()]++)\) _I(round(xs * sz),1) y $1 xs * sz a*(?!) aaaab n - - -a*(?FAIL) aaaab n - - -a*(?F) aaaab n - - +a*(*FAIL) aaaab n - - +a*(*F) aaaab n - - + +(A(A|B(*ACCEPT)|C)D)(E) AB y $1 AB +(A(A|B(*ACCEPT)|C)D)(E) ACDE y $1$2$3 ACDCE +