Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op
Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp|U32 depth
Es |regnode*|reg_namedseq |NN struct RExC_state_t *state|NULLOK UV *valuep
-Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd
+Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd|U32 depth
Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth
Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth
EsRn |char* |regwhite |NN char *p|NN const char *e
#define reg_node(a,b) S_reg_node(aTHX_ a,b)
#define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c)
#define reg_namedseq(a,b) S_reg_namedseq(aTHX_ a,b)
-#define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c)
+#define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d)
#define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d)
#define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f)
#define regwhite S_regwhite
value of C<$^R> is restored if the assertion is backtracked; compare
L<"Backtracking">.
+Due to an unfortunate implementation issue the perl code contained in these
+blocks is treated as a compile time closure, which can have seemingly bizarre
+consequences when used with lexically scoped variables inside of subroutines
+or loops. There are various workarounds for this, including simply using
+global variables instead. If you are using this construct and strange results
+occur then check for the use of lexically scoped variables.
+
For reasons of security, this construct is forbidden if the regular
expression involves run-time interpolation of variables, unless the
perilous C<use re 'eval'> pragma has been used (see L<re>), or the
=item C<(??{ code })>
X<(??{})>
X<regex, postponed> X<regexp, postponed> X<regular expression, postponed>
-X<regex, recursive> X<regexp, recursive> X<regular expression, recursive>
B<WARNING>: This extended regular expression feature is considered
highly experimental, and may be changed or deleted without notice.
This is a "postponed" regular subexpression. The C<code> is evaluated
at run time, at the moment this subexpression may match. The result
of evaluation is considered as a regular expression and matched as
-if it were inserted instead of this construct.
+if it were inserted instead of this construct. Note that this means
+that the contents of capture buffers defined inside an eval'ed pattern
+are not available outside of the pattern, and vice versa, there is no
+way for the inner pattern to refer to a capture buffer defined outside.
+Thus,
+
+ ('a' x 100)=~/(??{'(.)' x 100})/
+
+B<will> match, it will B<not> set $1.
The C<code> is not interpolated. As before, the rules to determine
where the C<code> ends are currently somewhat convoluted.
\)
}x;
+See also C<(?PARNO)> for a different, more efficient way to accomplish
+the same task.
+
Because perl's regex engine is not currently re-entrant, delayed
code may not invoke the regex engine either directly with C<m//> or C<s///>),
or indirectly with functions such as C<split>.
+Recursing deeper than 50 times without consuming any input string will
+result in a fatal error. The maximum depth is compiled into perl, so
+changing it requires a custom build.
+
+=item C<(?PARNO)> C<(?R)>
+
+X<(?PARNO)> X<(?1)>
+X<regex, recursive> X<regexp, recursive> X<regular expression, recursive>
+
+B<WARNING>: This extended regular expression feature is considered
+highly experimental, and may be changed or deleted without notice.
+
+Similar to C<(??{ code })> except it does not involve compiling any code,
+instead it treats the contents of a capture buffer as an independent
+pattern that must match at the current position. Capture buffers
+contained by the pattern will have the value as determined by the
+outermost recursion.
+
+PARNO is a sequence of digits not starting with 0 whose value
+reflects the paren-number of the capture buffer to recurse to.
+C<(?R)> curses to the beginning of the pattern.
+
+The following pattern matches a function foo() which may contain
+balanced parenthesis as the argument.
+
+ $re = qr{ ( # paren group 1 (full function)
+ foo
+ ( # paren group 2 (parens)
+ \(
+ ( # paren group 3 (contents of parens)
+ (?:
+ (?> [^()]+ ) # Non-parens without backtracking
+ |
+ (?2) # Recurse to start of paren group 2
+ )*
+ )
+ \)
+ )
+ )
+ }x;
+
+If the pattern was used as follows
+
+ 'foo(bar(baz)+baz(bop))'=~/$re/
+ and print "\$1 = $1\n",
+ "\$2 = $2\n",
+ "\$3 = $3\n";
+
+the output produced should be the following:
+
+ $1 = foo(bar(baz)+baz(bop))
+ $2 = (bar(baz)+baz(bop))
+ $3 = bar(baz)+baz(bop)
+
+If there is no corresponding capture buffer defined, then it is a
+fatal error. Recursing deeper than 50 times without consuming any input
+string will also result in a fatal error. The maximum depth is compiled
+into perl, so changing it requires a custom build.
+
+B<Note> that this pattern does not behave the same way as the equivalent
+PCRE or Python construct of the same form. In perl you can backtrack into
+a recursed group, in PCRE and Python the recursed into group is treated
+as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect
+the pattern being recursed into.
+
=item C<< (?>pattern) >>
-X<backtrack> X<backtracking>
+X<backtrack> X<backtracking> X<atomic> X<possessive>
B<WARNING>: This extended regular expression feature is considered
highly experimental, and may be changed or deleted without notice.
Which one you pick depends on which of these expressions better reflects
the above specification of comments.
+In some literature this construct is called "atomic matching" or
+"possessive matching".
+
=item C<(?(condition)yes-pattern|no-pattern)>
X<(?()>
For this grouping operator there is no need to describe the ordering, since
only whether or not C<S> can match is important.
-=item C<(??{ EXPR })>
+=item C<(??{ EXPR })>, C<(?PARNO)>
The ordering is the same as for the regular expression which is
-the result of EXPR.
+the result of EXPR, or the pattern contained by capture buffer PARNO.
=item C<(?(condition)yes-pattern|no-pattern)>
This will allow the use of a regex from inside (?{ }), (??{ }) and
(?(?{ })|) constructs.
+
+=head2 Add named capture to regexp engine
+
+Named capture is supported by .NET, PCRE and Python. Its embarrassing
+Perl doesn't support it yet.
+
+Jeffrey Friedl notes that "the most glaring omission [in perl's regexp
+engine] offered by other implementations is named capture".
+
+demerphq is working on this.
+
+=head2 Add possessive quantifiers to regexp engine
+
+Possessive quantifiers are a syntactic sugar that affords a more
+elegant way to express (?>A+). They are also provided by many other
+regex engines. Most importantly they allow various patterns to be
+optimised more efficiently than (?>...) allows, and allow various data
+driven optimisations to be implemented (such as auto-possesification of
+quantifiers followed by contrary suffixes). Common syntax for them is
+
+ ++ possessive 1 or more
+ *+ possessive 0 or more
+ {n,m}+ possessive n..m
+
+A possessive quantifier basically absorbs as much as it can and doesn't
+give any back.
+
+Jeffrey Friedl documents possessive quantifiers in Mastering Regular
+Expressions 2nd edition and explicitly pleads for them to be added to
+perl. We should oblige him, lest he leaves us out of a future edition.
+;-)
+
+demerphq has this on his todo list
+
+=head2 Add (?YES) (?NO) to regexp enigne
+
+YES/NO would allow a subpattern to be passed/failed but allow backtracking.
+Basically a more efficient (?=), (?!).
+
+demerphq has this on his todo list
+
+=head2 Add (?SUCCEED) (?FAIL) to regexp engine
+
+SUCCEED/FAIL would allow a pattern to be passed/failed but without backtracking.
+Thus you could signal that a pattern has matched or not, and return (regardless
+that there is more pattern following).
+
+demerphq has this on his todo list
+
+=head2 Add (?CUT) (?COMMIT) to regexp engine
+
+CUT would allow a pattern to say "do not backtrack beyond here".
+COMMIT would say match from here or don't, but don't try the pattern from
+another starting pattern.
+
+These correspond to the \v and \V that Jeffrey Friedl mentions in
+Mastering Regular Expressions 2nd edition.
+
+demerphq has this on his todo list
+
+=head2 Add class set operations to regexp engine
+
+Apparently these are quite useful. Anyway, Jeffery Friedl wants them.
+
+demerphq has this on his todo list, but right at the bottom.
+
+
STATIC regnode* S_reg_namedseq(pTHX_ struct RExC_state_t *state, UV *valuep)
__attribute__nonnull__(pTHX_1);
-STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd)
+STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_3);
I32 extralen;
I32 seen_zerolen;
I32 seen_evals;
+ regnode **parens; /* offsets of each paren */
I32 utf8;
- HV *charnames; /* cache of named sequences */
+ HV *charnames; /* cache of named sequences */
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_charnames (pRExC_state->charnames)
+#define RExC_parens (pRExC_state->parens)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
}
flags &= ~SCF_DO_STCLASS;
}
+ else if (OP(scan)==RECURSE) {
+ ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan );
+ }
else if (strchr((const char*)PL_varies,OP(scan))) {
I32 mincount, maxcount, minnext, deltanext, fl = 0;
I32 f = flags, pos_before = 0;
RExC_emit = &PL_regdummy;
RExC_whilem_seen = 0;
RExC_charnames = NULL;
+ RExC_parens= NULL;
#if 0 /* REGC() is (currently) a NOP at the first pass.
* Clever compilers notice this and complain. --jhi */
r->substrs = 0; /* Useful during FAIL. */
r->startp = 0; /* Useful during FAIL. */
- r->endp = 0; /* Useful during FAIL. */
+ r->endp = 0;
+ if (RExC_seen & REG_SEEN_RECURSE) {
+ Newx(RExC_parens, RExC_npar,regnode *);
+ SAVEFREEPV(RExC_parens);
+ }
+
+ /* Useful during FAIL. */
Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
if (r->offsets) {
r->offsets[0] = RExC_size;
r->data = 0;
if (reg(pRExC_state, 0, &flags,1) == NULL)
return(NULL);
+
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
Newx(r->substrs, 1, struct reg_substr_data);
Newxz(r->startp, RExC_npar, I32);
Newxz(r->endp, RExC_npar, I32);
-
- if (RExC_charnames)
- SvREFCNT_dec((SV*)(RExC_charnames));
-
DEBUG_r( RX_DEBUG_on(r) );
DEBUG_DUMP_r({
PerlIO_printf(Perl_debug_log,"Final program:\n");
DEBUG_PARSE_MSG((funcname)); \
PerlIO_printf(Perl_debug_log,"%4s","\n"); \
})
+#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
+ DEBUG_PARSE_MSG((funcname)); \
+ PerlIO_printf(Perl_debug_log,fmt "\n",args); \
+})
/*
- reg - regular expression, i.e. main body or parenthesized thing
*
nextchar(pRExC_state);
*flagp = TRYAGAIN;
return NULL;
+ case 'R' :
+ if (*RExC_parse != ')')
+ FAIL("Sequence (?R) not terminated");
+ reg_node(pRExC_state, SRECURSE);
+ break;
+ case '1': case '2': case '3': case '4': /* (?1) */
+ case '5': case '6': case '7': case '8': case '9':
+ RExC_parse--;
+ {
+ const I32 num = atoi(RExC_parse);
+ char * const parse_start = RExC_parse - 1; /* MJD */
+ while (isDIGIT(*RExC_parse))
+ RExC_parse++;
+ if (*RExC_parse!=')')
+ vFAIL("Expecting close bracket");
+ ret = reganode(pRExC_state, RECURSE, num);
+ if (!SIZE_ONLY) {
+ if (num > (I32)RExC_rx->nparens) {
+ RExC_parse++;
+ vFAIL("Reference to nonexistent group");
+ }
+ ARG2L_SET( ret, 0);
+ RExC_emit++;
+ DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
+ "Recurse #%d to %d\n", ARG(ret), ARG2L(ret)));
+ } else{
+ RExC_size++;
+ RExC_seen|=REG_SEEN_RECURSE;
+ }
+ Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
+ Set_Node_Offset(ret, RExC_parse); /* MJD */
+
+ nextchar(pRExC_state);
+ return ret;
+ }
case 'p': /* (?p...) */
if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
parno = RExC_npar;
RExC_npar++;
ret = reganode(pRExC_state, OPEN, parno);
+ if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
+ DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting paren #%d to %d\n",
+ parno,REG_NODE_NUM(ret)));
+ RExC_parens[parno-1]= ret;
+
+ }
Set_Node_Length(ret, 1); /* MJD */
Set_Node_Offset(ret, RExC_parse); /* MJD */
is_open = 1;
return(NULL);
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
- reginsert(pRExC_state, BRANCHJ, br);
+ reginsert(pRExC_state, BRANCHJ, br, depth+1);
}
else { /* MJD */
- reginsert(pRExC_state, BRANCH, br);
+ reginsert(pRExC_state, BRANCH, br, depth+1);
Set_Node_Length(br, paren != 0);
Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
}
if (paren == '>')
node = SUSPEND, flag = 0;
- reginsert(pRExC_state, node,ret);
+ reginsert(pRExC_state, node,ret, depth+1);
Set_Node_Cur_Length(ret);
Set_Node_Offset(ret, parse_start + 1);
ret->flags = flag;
do_curly:
if ((flags&SIMPLE)) {
RExC_naughty += 2 + RExC_naughty / 2;
- reginsert(pRExC_state, CURLY, ret);
+ reginsert(pRExC_state, CURLY, ret, depth+1);
Set_Node_Offset(ret, parse_start+1); /* MJD */
Set_Node_Cur_Length(ret);
}
w->flags = 0;
REGTAIL(pRExC_state, ret, w);
if (!SIZE_ONLY && RExC_extralen) {
- reginsert(pRExC_state, LONGJMP,ret);
- reginsert(pRExC_state, NOTHING,ret);
+ reginsert(pRExC_state, LONGJMP,ret, depth+1);
+ reginsert(pRExC_state, NOTHING,ret, depth+1);
NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
}
- reginsert(pRExC_state, CURLYX,ret);
+ reginsert(pRExC_state, CURLYX,ret, depth+1);
/* MJD hk */
Set_Node_Offset(ret, parse_start+1);
Set_Node_Length(ret,
*flagp = flags;
return(ret);
}
+ /* else if (OP(ret)==RECURSE) {
+ RExC_parse++;
+ vFAIL("Illegal quantifier on recursion group");
+ } */
#if 0 /* Now runtime fix should be reliable. */
*flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
if (op == '*' && (flags&SIMPLE)) {
- reginsert(pRExC_state, STAR, ret);
+ reginsert(pRExC_state, STAR, ret, depth+1);
ret->flags = 0;
RExC_naughty += 4;
}
goto do_curly;
}
else if (op == '+' && (flags&SIMPLE)) {
- reginsert(pRExC_state, PLUS, ret);
+ reginsert(pRExC_state, PLUS, ret, depth+1);
ret->flags = 0;
RExC_naughty += 3;
}
if (*RExC_parse == '?') {
nextchar(pRExC_state);
- reginsert(pRExC_state, MINMOD, ret);
+ reginsert(pRExC_state, MINMOD, ret, depth+1);
REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
}
if (ISMULT2(RExC_parse)) {
if (!RExC_charnames) {
/* make sure our cache is allocated */
RExC_charnames = newHV();
+ sv_2mortal((SV*)RExC_charnames);
}
/* see if we have looked this one up before */
he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
if (SIZE_ONLY) {
SIZE_ALIGN(RExC_size);
RExC_size += 2;
+ /*
+ We can't do this:
+
+ assert(2==regarglen[op]+1);
+
+ Anything larger than this has to allocate the extra amount.
+ If we changed this to be:
+
+ RExC_size += (1 + regarglen[op]);
+
+ then it wouldn't matter. Its not clear what side effect
+ might come from that so its not done so far.
+ -- dmq
+ */
return(ret);
}
* Means relocating the operand.
*/
STATIC void
-S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
+S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
{
dVAR;
register regnode *src;
register regnode *dst;
register regnode *place;
const int offset = regarglen[(U8)op];
+ const int size = NODE_STEP_REGNODE + offset;
GET_RE_DEBUG_FLAGS_DECL;
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
-
+ DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
if (SIZE_ONLY) {
- RExC_size += NODE_STEP_REGNODE + offset;
+ RExC_size += size;
return;
}
src = RExC_emit;
- RExC_emit += NODE_STEP_REGNODE + offset;
+ RExC_emit += size;
dst = RExC_emit;
+ if (RExC_parens) {
+ int paren;
+ for ( paren=0 ; paren < RExC_npar ; paren++ ) {
+ if ( RExC_parens[paren] >= src )
+ RExC_parens[paren] += size;
+ }
+ }
+
while (src > opnd) {
StructCopy(--src, --dst, regnode);
if (RExC_offsets) { /* MJD 20010112 */
}
else if (k == WHILEM && o->flags) /* Ordinal/of */
Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
- else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
+ else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP)
Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
+ else if (k == RECURSE)
+ Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
char string[1];
};
+/* Argument bearing node - workhorse,
+ arg1 is often for the data field */
struct regnode_1 {
U8 flags;
U8 type;
U32 arg1;
};
+/* Similar to a regnode_1 but with an extra signed argument */
+struct regnode_2L {
+ U8 flags;
+ U8 type;
+ U16 next_off;
+ U32 arg1;
+ I32 arg2;
+};
+
+/* 'Two field' -- Two 16 bit unsigned args */
struct regnode_2 {
U8 flags;
U8 type;
U16 arg2;
};
+
#define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */
#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */
#define ARG(p) ARG_VALUE(ARG_LOC(p))
#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
+#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p))
#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
+#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val))
#undef NEXT_OFF
#undef NODE_ALIGN
#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
-
+#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2)
#define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */
#define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2)
#define REG_SEEN_EVAL 0x00000008
#define REG_SEEN_CANY 0x00000010
#define REG_SEEN_SANY REG_SEEN_CANY /* src bckwrd cmpt */
+#define REG_SEEN_RECURSE 0x00000020
START_EXTERN_C
Any changes made here will be lost!
*/
+/* Regops and State definitions */
+
#define %*s\t%d
#define %*s\t%d
print OUT <<EOP;
+/* PL_regkind[] What type of regop or state is this. */
#ifndef DOINIT
EXTCONST U8 PL_regkind[];
};
#endif
+/* regarglen[] - How large is the argument part of the node (in regnodes) */
#ifdef REG_COMP_C
static const U8 regarglen[] = {
print OUT <<EOP;
};
+/* reg_off_by_arg[] - Which argument holds the offset to the next node */
+
static const char reg_off_by_arg[] = {
EOP
print OUT <<EOP;
};
+/* reg_name[] - Opcode/state names in string form, for debugging */
+
#ifdef DEBUGGING
const char * reg_name[] = {
EOP
AHOCORASICK TRIE, trie 1 Aho Corasick stclass. flags==type
AHOCORASICKC TRIE, trie charclass Same as AHOCORASICK, but with embedded charclass data
-
+#*Recursion (65)
+RECURSE RECURSE, num/ofs 2L recurse to paren arg1 at (signed) ofs arg2
+SRECURSE RECURSE, no recurse to start of pattern
# NEW STUFF ABOVE THIS LINE -- Please update counts below.
if (paren_elems_to_push < 0)
Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
-#define REGCP_OTHER_ELEMS 6
+#define REGCP_OTHER_ELEMS 8
SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
for (p = PL_regsize; p > parenfloor; p--) {
/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
));
}
/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
+ SSPUSHPTR(PL_regstartp);
+ SSPUSHPTR(PL_regendp);
SSPUSHINT(PL_regsize);
SSPUSHINT(*PL_reglastparen);
SSPUSHINT(*PL_reglastcloseparen);
*PL_reglastcloseparen = SSPOPINT;
*PL_reglastparen = SSPOPINT;
PL_regsize = SSPOPINT;
+ PL_regendp=(I32 *) SSPOPPTR;
+ PL_regstartp=(I32 *) SSPOPPTR;
+
/* Now restore the parentheses context. */
for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
i > 0; i -= REGCP_PAREN_ELEMS) {
srch_end_shift -= ((strbeg - s) - srch_start_shift);
srch_start_shift = strbeg - s;
}
- DEBUG_OPTIMISE_r({
+ DEBUG_OPTIMISE_MORE_r({
PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
(IV)prog->check_offset_min,
(IV)srch_start_shift,
start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
end_point= HOP3(strend, -srch_end_shift, strbeg);
}
- DEBUG_OPTIMISE_r({
+ DEBUG_OPTIMISE_MORE_r({
PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
(int)(end_point - start_point),
(int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
- DEBUG_OPTIMISE_r(
+ DEBUG_OPTIMISE_MORE_r(
PerlIO_printf(Perl_debug_log,
"Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
(IV)prog->check_offset_min,
}
}
if (last == NULL) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "%sCan't trim the tail, match fails (should not happen)%s\n",
- PL_colors[4], PL_colors[5]));
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%sCan't trim the tail, match fails (should not happen)%s\n",
+ PL_colors[4], PL_colors[5]));
goto phooey; /* Should not happen! */
}
dontbother = strend - last + prog->float_min_offset;
return 0;
}
+
/*
- regtry - try match at specific point
*/
prog->subbeg = PL_bostr;
prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
}
+ DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
prog->startp[0] = startpos - PL_bostr;
PL_reginput = startpos;
- PL_regstartp = prog->startp;
- PL_regendp = prog->endp;
PL_reglastparen = &prog->lastparen;
PL_reglastcloseparen = &prog->lastcloseparen;
prog->lastparen = 0;
prog->lastcloseparen = 0;
PL_regsize = 0;
- DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
+ PL_regstartp = prog->startp;
+ PL_regendp = prog->endp;
if (PL_reg_start_tmpl <= prog->nparens) {
PL_reg_start_tmpl = prog->nparens*3/2 + 3;
if(PL_reg_start_tmp)
register I32 nextchr; /* is always set to UCHARAT(locinput) */
bool result = 0; /* return value of S_regmatch */
int depth = 0; /* depth of recursion */
+ int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
regmatch_state *yes_state = NULL; /* state to pop to on success of
subpattern */
regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
#undef ST
#define ST st->u.eval
-
- case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
{
SV *ret;
+ regexp *re;
+ regnode *startpoint;
+
+ case SRECURSE:
+ case RECURSE: /* /(...(?1))/ */
+ if (cur_eval && cur_eval->locinput==locinput) {
+ if (cur_eval->u.eval.close_paren == ARG(scan))
+ Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
+ if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
+ Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
+ } else {
+ nochange_depth = 0;
+ }
+ re = rex;
+ (void)ReREFCNT_inc(rex);
+ if (OP(scan)==RECURSE) {
+ startpoint = scan + ARG2L(scan);
+ ST.close_paren = ARG(scan);
+ } else {
+ startpoint = re->program+1;
+ ST.close_paren = 0;
+ }
+ goto eval_recurse_doit;
+ /* NOTREACHED */
+ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
+ if (cur_eval && cur_eval->locinput==locinput) {
+ if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
+ Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
+ } else {
+ nochange_depth = 0;
+ }
{
/* execute the code in the {...} */
dSP;
}
}
if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
- regexp *re;
+
{
/* extract RE object from returned value; compiling if
* necessary */
PL_regsize = osize;
}
}
+ DEBUG_EXECUTE_r(
+ debug_start_match(re, do_utf8, locinput, PL_regeol,
+ "Matching embedded");
+ );
+ startpoint = re->program + 1;
+ ST.close_paren = 0; /* only used for RECURSE */
+ /* borrowed from regtry */
+ if (PL_reg_start_tmpl <= re->nparens) {
+ PL_reg_start_tmpl = re->nparens*3/2 + 3;
+ if(PL_reg_start_tmp)
+ Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
+ else
+ Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
+ }
+ eval_recurse_doit: /* Share code with RECURSE below this line */
/* run the pattern returned from (??{...}) */
ST.cp = regcppush(0); /* Save *all* the positions. */
REGCP_SET(ST.lastcp);
+
+ PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
+ PL_regendp = re->endp; /* essentially NOOP on RECURSE */
+
*PL_reglastparen = 0;
*PL_reglastcloseparen = 0;
PL_reginput = locinput;
ST.B = next;
ST.prev_eval = cur_eval;
cur_eval = st;
-
- DEBUG_EXECUTE_r(
- debug_start_match(re, do_utf8, locinput, PL_regeol,
- "Matching embedded");
- );
/* now continue from first node in postoned RE */
- PUSH_YES_STATE_GOTO(EVAL_AB, re->program + 1);
+ PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
/* NOTREACHED */
}
/* /(?(?{...})X|Y)/ */
/* XXXX This is too dramatic a measure... */
PL_reg_maxiter = 0;
sayNO_SILENT;
-
#undef ST
case OPEN:
if (n > (I32)*PL_reglastparen)
*PL_reglastparen = n;
*PL_reglastcloseparen = n;
+ if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
+ goto fake_end;
+ }
break;
case GROUPP:
n = ARG(scan); /* which paren pair */
case END:
+ fake_end:
if (cur_eval) {
/* we've just finished A in /(??{A})B/; now continue with B */
I32 tmpix;
st->u.eval.prev_eval = cur_eval;
cur_eval = cur_eval->u.eval.prev_eval;
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ...\n",
- REPORT_CODE_OFF+depth*2, ""););
+ PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
+ REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
PUSH_YES_STATE_GOTO(EVAL_AB,
st->u.eval.prev_eval->u.eval.B); /* match B */
}
#define ROPT_CANY_SEEN 0x00000800
#define ROPT_SANY_SEEN ROPT_CANY_SEEN /* src bckwrd cmpt */
#define ROPT_GPOS_CHECK (ROPT_GPOS_SEEN|ROPT_ANCH_GPOS)
+#define ROPT_RECURSE_SEEN 0x00001000
/* 0xf800 of reganch is used by PMf_COMPILETIME */
/* structures for holding and saving the state maintained by regmatch() */
+#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 50
+
typedef I32 CHECKPOINT;
typedef struct regmatch_state {
CHECKPOINT cp; /* remember current savestack indexes */
CHECKPOINT lastcp;
regnode *B; /* the node following us */
+ U32 close_paren; /* which close bracket is our end */
} eval;
struct {
Any changes made here will be lost!
*/
-#define REGNODE_MAX 66
-#define REGMATCH_STATE_MAX 91
+/* Regops and State definitions */
+
+#define REGNODE_MAX 68
+#define REGMATCH_STATE_MAX 93
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
#define TRIEC 62 /* 0x3e Same as TRIE, but with embedded charclass data */
#define AHOCORASICK 63 /* 0x3f Aho Corasick stclass. flags==type */
#define AHOCORASICKC 64 /* 0x40 Same as AHOCORASICK, but with embedded charclass data */
-#define OPTIMIZED 65 /* 0x41 Placeholder for dump. */
-#define PSEUDO 66 /* 0x42 Pseudo opcode for internal use. */
+#define RECURSE 65 /* 0x41 recurse to paren arg1 at (signed) ofs arg2 */
+#define SRECURSE 66 /* 0x42 recurse to start of pattern */
+#define OPTIMIZED 67 /* 0x43 Placeholder for dump. */
+#define PSEUDO 68 /* 0x44 Pseudo opcode for internal use. */
/* ------------ States ------------- */
-#define TRIE_next 67 /* 0x43 Regmatch state for TRIE */
-#define TRIE_next_fail 68 /* 0x44 Regmatch state for TRIE */
-#define EVAL_AB 69 /* 0x45 Regmatch state for EVAL */
-#define EVAL_AB_fail 70 /* 0x46 Regmatch state for EVAL */
-#define resume_CURLYX 71 /* 0x47 Regmatch state for CURLYX */
-#define resume_WHILEM1 72 /* 0x48 Regmatch state for WHILEM */
-#define resume_WHILEM2 73 /* 0x49 Regmatch state for WHILEM */
-#define resume_WHILEM3 74 /* 0x4a Regmatch state for WHILEM */
-#define resume_WHILEM4 75 /* 0x4b Regmatch state for WHILEM */
-#define resume_WHILEM5 76 /* 0x4c Regmatch state for WHILEM */
-#define resume_WHILEM6 77 /* 0x4d Regmatch state for WHILEM */
-#define BRANCH_next 78 /* 0x4e Regmatch state for BRANCH */
-#define BRANCH_next_fail 79 /* 0x4f Regmatch state for BRANCH */
-#define CURLYM_A 80 /* 0x50 Regmatch state for CURLYM */
-#define CURLYM_A_fail 81 /* 0x51 Regmatch state for CURLYM */
-#define CURLYM_B 82 /* 0x52 Regmatch state for CURLYM */
-#define CURLYM_B_fail 83 /* 0x53 Regmatch state for CURLYM */
-#define IFMATCH_A 84 /* 0x54 Regmatch state for IFMATCH */
-#define IFMATCH_A_fail 85 /* 0x55 Regmatch state for IFMATCH */
-#define CURLY_B_min_known 86 /* 0x56 Regmatch state for CURLY */
-#define CURLY_B_min_known_fail 87 /* 0x57 Regmatch state for CURLY */
-#define CURLY_B_min 88 /* 0x58 Regmatch state for CURLY */
-#define CURLY_B_min_fail 89 /* 0x59 Regmatch state for CURLY */
-#define CURLY_B_max 90 /* 0x5a Regmatch state for CURLY */
-#define CURLY_B_max_fail 91 /* 0x5b Regmatch state for CURLY */
+#define TRIE_next 69 /* 0x45 Regmatch state for TRIE */
+#define TRIE_next_fail 70 /* 0x46 Regmatch state for TRIE */
+#define EVAL_AB 71 /* 0x47 Regmatch state for EVAL */
+#define EVAL_AB_fail 72 /* 0x48 Regmatch state for EVAL */
+#define resume_CURLYX 73 /* 0x49 Regmatch state for CURLYX */
+#define resume_WHILEM1 74 /* 0x4a Regmatch state for WHILEM */
+#define resume_WHILEM2 75 /* 0x4b Regmatch state for WHILEM */
+#define resume_WHILEM3 76 /* 0x4c Regmatch state for WHILEM */
+#define resume_WHILEM4 77 /* 0x4d Regmatch state for WHILEM */
+#define resume_WHILEM5 78 /* 0x4e Regmatch state for WHILEM */
+#define resume_WHILEM6 79 /* 0x4f Regmatch state for WHILEM */
+#define BRANCH_next 80 /* 0x50 Regmatch state for BRANCH */
+#define BRANCH_next_fail 81 /* 0x51 Regmatch state for BRANCH */
+#define CURLYM_A 82 /* 0x52 Regmatch state for CURLYM */
+#define CURLYM_A_fail 83 /* 0x53 Regmatch state for CURLYM */
+#define CURLYM_B 84 /* 0x54 Regmatch state for CURLYM */
+#define CURLYM_B_fail 85 /* 0x55 Regmatch state for CURLYM */
+#define IFMATCH_A 86 /* 0x56 Regmatch state for IFMATCH */
+#define IFMATCH_A_fail 87 /* 0x57 Regmatch state for IFMATCH */
+#define CURLY_B_min_known 88 /* 0x58 Regmatch state for CURLY */
+#define CURLY_B_min_known_fail 89 /* 0x59 Regmatch state for CURLY */
+#define CURLY_B_min 90 /* 0x5a Regmatch state for CURLY */
+#define CURLY_B_min_fail 91 /* 0x5b Regmatch state for CURLY */
+#define CURLY_B_max 92 /* 0x5c Regmatch state for CURLY */
+#define CURLY_B_max_fail 93 /* 0x5d Regmatch state for CURLY */
+/* PL_regkind[] What type of regop or state is this. */
#ifndef DOINIT
EXTCONST U8 PL_regkind[];
TRIE, /* TRIEC */
TRIE, /* AHOCORASICK */
TRIE, /* AHOCORASICKC */
+ RECURSE, /* RECURSE */
+ RECURSE, /* SRECURSE */
NOTHING, /* OPTIMIZED */
PSEUDO, /* PSEUDO */
/* ------------ States ------------- */
};
#endif
+/* regarglen[] - How large is the argument part of the node (in regnodes) */
#ifdef REG_COMP_C
static const U8 regarglen[] = {
EXTRA_SIZE(struct regnode_charclass), /* TRIEC */
EXTRA_SIZE(struct regnode_1), /* AHOCORASICK */
EXTRA_SIZE(struct regnode_charclass), /* AHOCORASICKC */
+ EXTRA_SIZE(struct regnode_2L), /* RECURSE */
+ 0, /* SRECURSE */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
+/* reg_off_by_arg[] - Which argument holds the offset to the next node */
+
static const char reg_off_by_arg[] = {
0, /* END */
0, /* SUCCEED */
0, /* TRIEC */
0, /* AHOCORASICK */
0, /* AHOCORASICKC */
+ 0, /* RECURSE */
+ 0, /* SRECURSE */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
+/* reg_name[] - Opcode/state names in string form, for debugging */
+
#ifdef DEBUGGING
const char * reg_name[] = {
"END", /* 0000 */
"TRIEC", /* 0x3e */
"AHOCORASICK", /* 0x3f */
"AHOCORASICKC", /* 0x40 */
- "OPTIMIZED", /* 0x41 */
- "PSEUDO", /* 0x42 */
+ "RECURSE", /* 0x41 */
+ "SRECURSE", /* 0x42 */
+ "OPTIMIZED", /* 0x43 */
+ "PSEUDO", /* 0x44 */
/* ------------ States ------------- */
- "TRIE_next", /* 0x43 */
- "TRIE_next_fail", /* 0x44 */
- "EVAL_AB", /* 0x45 */
- "EVAL_AB_fail", /* 0x46 */
- "resume_CURLYX", /* 0x47 */
- "resume_WHILEM1", /* 0x48 */
- "resume_WHILEM2", /* 0x49 */
- "resume_WHILEM3", /* 0x4a */
- "resume_WHILEM4", /* 0x4b */
- "resume_WHILEM5", /* 0x4c */
- "resume_WHILEM6", /* 0x4d */
- "BRANCH_next", /* 0x4e */
- "BRANCH_next_fail", /* 0x4f */
- "CURLYM_A", /* 0x50 */
- "CURLYM_A_fail", /* 0x51 */
- "CURLYM_B", /* 0x52 */
- "CURLYM_B_fail", /* 0x53 */
- "IFMATCH_A", /* 0x54 */
- "IFMATCH_A_fail", /* 0x55 */
- "CURLY_B_min_known", /* 0x56 */
- "CURLY_B_min_known_fail", /* 0x57 */
- "CURLY_B_min", /* 0x58 */
- "CURLY_B_min_fail", /* 0x59 */
- "CURLY_B_max", /* 0x5a */
- "CURLY_B_max_fail", /* 0x5b */
+ "TRIE_next", /* 0x45 */
+ "TRIE_next_fail", /* 0x46 */
+ "EVAL_AB", /* 0x47 */
+ "EVAL_AB_fail", /* 0x48 */
+ "resume_CURLYX", /* 0x49 */
+ "resume_WHILEM1", /* 0x4a */
+ "resume_WHILEM2", /* 0x4b */
+ "resume_WHILEM3", /* 0x4c */
+ "resume_WHILEM4", /* 0x4d */
+ "resume_WHILEM5", /* 0x4e */
+ "resume_WHILEM6", /* 0x4f */
+ "BRANCH_next", /* 0x50 */
+ "BRANCH_next_fail", /* 0x51 */
+ "CURLYM_A", /* 0x52 */
+ "CURLYM_A_fail", /* 0x53 */
+ "CURLYM_B", /* 0x54 */
+ "CURLYM_B_fail", /* 0x55 */
+ "IFMATCH_A", /* 0x56 */
+ "IFMATCH_A_fail", /* 0x57 */
+ "CURLY_B_min_known", /* 0x58 */
+ "CURLY_B_min_known_fail", /* 0x59 */
+ "CURLY_B_min", /* 0x5a */
+ "CURLY_B_min_fail", /* 0x5b */
+ "CURLY_B_max", /* 0x5c */
+ "CURLY_B_max_fail", /* 0x5d */
};
#endif /* DEBUGGING */
#else
}x;
ok("{b{c}d" !~ m/^((??{ $brackets }))/, "bracket mismatch");
-
+SKIP:{
+ our @stack=();
+ my @expect=qw(
+ stuff1
+ stuff2
+ <stuff1>and<stuff2>
+ right
+ <right>
+ <<right>>
+ <<<right>>>
+ <<stuff1>and<stuff2>><<<<right>>>>
+ );
+
+ local $_='<<<stuff1>and<stuff2>><<<<right>>>>>';
+ ok(/^(<((?:(?>[^<>]+)|(?1))*)>(?{push @stack, $2 }))$/,
+ "Recursion should match");
+ ok(@stack==@expect)
+ or skip("Won't test individual results as count isn't equal",
+ 0+@expect);
+ foreach my $idx (@expect) {
+ ok($expect[$idx] eq $stack[$idx],
+ "Expecting '$expect' at stack pos #$idx");
+ }
+
+}
# stress test CURLYX/WHILEM.
#
# This test includes varying levels of nesting, and according to
}
-# Keep the following test last -- it may crash perl
+# Keep the following tests last -- they may crash perl
ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274")
or print "# Unexpected outcome: should pass or crash perl\n";
+ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
+ "Regexp /^(??{'(.)'x 100})/ crashes older perls")
+ or print "# Unexpected outcome: should pass or crash perl\n";
+
# Don't forget to update this!
-BEGIN{print "1..1253\n"};
+BEGIN{print "1..1264\n"};
^(.)((??{"(.)(cz+)"})|.) abcd y $1-$2 a-b
^a(?>(??{q(b)}))(??{q(c)})d abcd y - -
^x(??{""})+$ x y $& x
+^(<(?:[^<>]+|(?3)|(?1))*>)()(!>!>!>)$ <<!>!>!>><>>!>!>!> y $1 <<!>!>!>><>>
+^(<(?:[^<>]+|(?1))*>)$ <<><<<><>>>> y $1 <<><<<><>>>>
+((?2)*)([fF]o+) fooFoFoo y $1-$2 fooFo-Foo
+(<(?:[^<>]+|(?R))*>) <<><<<><>>>> y $1 <<><<<><>>>>