char *strend, U32 flags,
struct re_scream_pos_data_s *data);
extern SV* my_re_intuit_string (pTHX_ regexp *prog);
-extern char* my_reg_stringify (pTHX_ MAGIC *mg, U32 *flags, STRLEN *lp, I32 *haseval);
+extern char* my_reg_stringify (pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval);
#if defined(USE_ITHREADS)
extern regexp* my_regdupe (pTHX_ const regexp *r, CLONE_PARAMS *param);
The regex engine now supports a number of special purpose backtrack
control verbs: (*THEN), (*PRUNE), (*MARK), (*SKIP), (*COMMIT), (*FAIL)
-and (*ACCEPT). See L<perlre> for their descriptions.
+and (*ACCEPT). See L<perlre> for their descriptions. (Yves Orton)
+
+=item Relative backreferences
+
+A new syntax C<\R1> ("1" being any positive decimal integer) allows
+relative backreferencing. This should make it easier to embed patterns
+that contain backreferences. (Yves Orton)
=back
The <-- HERE shows in the regular expression about where the problem was
discovered.
+=item Reference to nonexistent or unclosed group in regex; marked by <-- HERE in m/%s/
+
+(F) You used something like C<\R7> in your regular expression, but there are
+not at least seven sets of closed capturing parentheses in the expression before
+where the C<\R7> was located. It's also possible you forgot to escape the
+backslash.
+
+The <-- HERE shows in the regular expression about where the problem was
+discovered.
+
=item Reference to nonexistent named group in regex; marked by <-- HERE in m/%s/
(F) You used something like C<\k'NAME'> or C<< \k<NAME> >> in your regular
so you may end up with malformed pieces of UTF-8.
Unsupported in lookbehind.
\1 Backreference to a specific group.
- '1' may actually be any positive integer.
+ '1' may actually be any positive integer.
+ \R1 Relative backreference to a preceding closed group.
+ '1' may actually be any positive integer.
\k<name> Named backreference
\N{name} Named unicode character, or unicode escape
\x12 Hexadecimal escape sequence
left parentheses have opened before it. Likewise \11 is a
backreference only if at least 11 left parentheses have opened
before it. And so on. \1 through \9 are always interpreted as
-backreferences.
+backreferences.
+
+X<relative backreference>
+In Perl 5.10 it is possible to relatively address a capture buffer by
+using the C<\RNNN> notation, where C<NNN> is negative offset to a
+preceding completed capture buffer. Thus C<\R1> refers to the last
+buffer closed, C<\R2> refers to the buffer before that, and so on. Note
+especially that C</(foo)(\R1)/> refers to the capture buffer containing
+C<foo>, not to the buffer containing C<\R1>.
Additionally, as of Perl 5.10 you may use named capture buffers and named
backreferences. The notation is C<< (?<name>...) >> and C<< \k<name> >>
to be relative, with negative numbers indicating preceding capture buffers
and positive ones following. Thus C<(?-1)> refers to the most recently
declared buffer, and C<(?+1)> indicates the next buffer to be declared.
+Note that the counting for relative recursion differs from that of
+relative backreferences, in that with recursion unclosed buffers B<are>
+included.
The following pattern matches a function foo() which may contain
balanced parentheses as the argument.
typedef struct regexp {
I32 *startp;
I32 *endp;
+ regexp_paren_ofs *swap;
regnode *regstclass;
struct reg_substr_data *substrs;
char *precomp; /* pre-compilation regular expression */
in the pattern, which was the last open paren to be entered, and which was
the last close paren to be entered.
-=item C<startp>, C<endp>
+=item C<startp>, C<endp>, C<swap>
These fields store arrays that are used to hold the offsets of the begining
and end of each capture group that has matched. -1 is used to indicate no match.
+C<swap> is an extra set of startp/endp stored in a C<regexp_paren_ofs>
+struct. This is used when the last successful match was from same pattern
+as the current pattern, so that a partial match doesn't overwrite the
+previous match's results. When this field is data filled the matching
+engine will swap buffers before every match attempt. If the match fails,
+then it swaps them back. If it's successful it leaves them. This field
+is populated on demand and is by default null.
+
These are the source for @- and @+.
=item C<subbeg> C<sublen> C<saved_copy>
I32 sawback; /* Did we see \1, ...? */
U32 seen;
I32 size; /* Code size. */
- I32 npar; /* () count. */
+ I32 npar; /* Capture buffer count, (OPEN). */
+ I32 cpar; /* Capture buffer count, (CLOSE). */
I32 nestroot; /* root parens we are in - used by accept */
I32 extralen;
I32 seen_zerolen;
#define RExC_seen (pRExC_state->seen)
#define RExC_size (pRExC_state->size)
#define RExC_npar (pRExC_state->npar)
+#define RExC_cpar (pRExC_state->cpar)
#define RExC_nestroot (pRExC_state->nestroot)
#define RExC_extralen (pRExC_state->extralen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
RExC_end = xend;
RExC_naughty = 0;
RExC_npar = 1;
+ RExC_cpar = 1;
RExC_nestroot = 0;
RExC_size = 0L;
RExC_emit = &PL_regdummy;
r->substrs = 0; /* Useful during FAIL. */
r->startp = 0; /* Useful during FAIL. */
r->endp = 0;
+ r->swap = NULL;
r->paren_names = 0;
if (RExC_seen & REG_SEEN_RECURSE) {
RExC_end = xend;
RExC_naughty = 0;
RExC_npar = 1;
+ RExC_cpar = 1;
RExC_emit_start = r->program;
RExC_emit = r->program;
#ifdef DEBUGGING
}
Newxz(r->startp, RExC_npar, I32);
Newxz(r->endp, RExC_npar, I32);
-
+ /* assume we don't need to swap parens around before we match */
+
DEBUG_DUMP_r({
PerlIO_printf(Perl_debug_log,"Final program:\n");
regdump(r);
ender = reg_node(pRExC_state, TAIL);
break;
case 1:
+ RExC_cpar++;
ender = reganode(pRExC_state, CLOSE, parno);
if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
case 'c':
case '0':
goto defchar;
+ case 'R':
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
{
- const I32 num = atoi(RExC_parse);
-
+ I32 num;
+ bool isrel=(*RExC_parse=='R');
+ if (isrel)
+ RExC_parse++;
+ num = atoi(RExC_parse);
+ if (isrel) {
+ num = RExC_cpar - num;
+ if (num < 1)
+ vFAIL("Reference to nonexistent or unclosed group");
+ }
if (num > 9 && num >= RExC_npar)
goto defchar;
else {
while (isDIGIT(*RExC_parse))
RExC_parse++;
- if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
- vFAIL("Reference to nonexistent group");
+ if (!SIZE_ONLY) {
+ if (num > (I32)RExC_rx->nparens)
+ vFAIL("Reference to nonexistent group");
+ /* People make this error all the time apparently.
+ So we cant fail on it, even though we should
+
+ else if (num >= RExC_cpar)
+ vFAIL("Reference to unclosed group will always match");
+ */
+ }
RExC_sawback = 1;
ret = reganode(pRExC_state,
(U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
case 'p':
case 'P':
case 'N':
+ case 'R':
--p;
goto loopdone;
case 'n':
}
Safefree(r->startp);
Safefree(r->endp);
+ if (r->swap) {
+ Safefree(r->swap->startp);
+ Safefree(r->swap->endp);
+ Safefree(r->swap);
+ }
Safefree(r);
}
Copy(r->startp, ret->startp, npar, I32);
Newx(ret->endp, npar, I32);
Copy(r->startp, ret->startp, npar, I32);
+ if(r->swap) {
+ Newx(ret->swap, 1, regexp_paren_ofs);
+ /* no need to copy these */
+ Newx(ret->swap->startp, npar, I32);
+ Newx(ret->swap->endp, npar, I32);
+ } else {
+ ret->swap = NULL;
+ }
Newx(ret->substrs, 1, struct reg_substr_data);
for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
else /* pos() not defined */
reginfo.ganch = strbeg;
}
-
+ if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
+ I32 *t;
+ if (!prog->swap) {
+ /* We have to be careful. If the previous successful match
+ was from this regex we don't want a subsequent paritally
+ successful match to clobber the old results.
+ So when we detect this possibility we add a swap buffer
+ to the re, and switch the buffer each match. If we fail
+ we switch it back, otherwise we leave it swapped.
+ */
+ Newxz(prog->swap, 1, regexp_paren_ofs);
+ /* no need to copy these */
+ Newxz(prog->swap->startp, prog->nparens + 1, I32);
+ Newxz(prog->swap->endp, prog->nparens + 1, I32);
+ }
+ t = prog->swap->startp;
+ prog->swap->startp = prog->startp;
+ prog->startp = t;
+ t = prog->swap->endp;
+ prog->swap->endp = prog->endp;
+ prog->endp = t;
+ }
if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
re_scream_pos_data d;
PL_colors[4], PL_colors[5]));
if (PL_reg_eval_set)
restore_pos(aTHX_ prog);
+ if (prog->swap) {
+ /* we failed :-( roll it back */
+ I32 *t;
+ t = prog->swap->startp;
+ prog->swap->startp = prog->startp;
+ prog->startp = t;
+ t = prog->swap->endp;
+ prog->swap->endp = prog->endp;
+ prog->endp = t;
+ }
return 0;
}
* on those tests seems to be returning null fields from matches.
* --jhi */
#if 1
- sp = prog->startp;
- ep = prog->endp;
+ sp = PL_regstartp;
+ ep = PL_regendp;
if (prog->nparens) {
register I32 i;
for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
#endif
REGCP_SET(lastcp);
if (regmatch(reginfo, prog->program + 1)) {
- prog->endp[0] = PL_reginput - PL_bostr;
+ PL_regendp[0] = PL_reginput - PL_bostr;
return 1;
}
if (reginfo->cutpoint)
struct reg_data;
struct regexp_engine;
+typedef struct regexp_paren_ofs {
+ I32 *startp;
+ I32 *endp;
+} regexp_paren_ofs;
typedef struct regexp {
- I32 *startp;
+ I32 *startp;
I32 *endp;
+ regexp_paren_ofs *swap;
regnode *regstclass;
struct reg_substr_data *substrs;
char *precomp; /* pre-compilation regular expression */
#$_ = "x"; /x(?{func "in regexp"})/;
#$_ = "x"; /x(?{func "in multiline regexp"})/m;
-# bug #19049
+# bug RT#19049
$_="abcdef\n";
@x = m/./g;
-ok("abcde" eq "$`", '# TODO #19049 - global match not setting $`');
+ok("abcde" eq "$`", 'RT#19049 - global match not setting $`');
ok("123\x{100}" =~ /^.*1.*23\x{100}$/, 'uft8 + multiple floating substr');
#-------------------------------------------------------------------
# Keep the following tests last -- they may crash perl
+{
+ # RT#19049 / RT#38869
+ my @list = (
+ 'ab cdef', # matches regex
+ ( 'e' x 40000 ) .'ab c' # matches not, but 'ab c' matches part of it
+ );
+ my $y;
+ my $x;
+ foreach (@list) {
+ m/ab(.+)cd/i; # the ignore-case seems to be important
+ $y = $1; # use $1, which might not be from the last match!
+ $x = substr($list[0],$-[0],$+[0]-$-[0]);
+ }
+ iseq($y,' ',
+ 'pattern in a loop, failure should not affect previous success');
+ iseq($x,'ab cd',
+ 'pattern in a loop, failure should not affect previous success');
+}
ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274")
or print "# Unexpected outcome: should pass or crash perl\n";
# Put new tests above the line, not here.
# Don't forget to update this!
-BEGIN { print "1..1345\n" };
+BEGIN { print "1..1347\n" };
(A(A|B(*ACCEPT)|C)D)(E) ACDE y $1$2$3 ACDCE
(a)(?:(?-1)|(?+1))(b) aab y $&-$1-$2 aab-a-b
-(a)(?:(?-1)|(?+1))(b) abb y $&-$1-$2 abb-a-b
+(a)(?:(?-1)|(?+1))(b) abb y $1-$2 a-b
(a)(?:(?-1)|(?+1))(b) acb n - -
+
+(foo)(\R1) foofoo y $1-$2 foo-foo
+(foo)(\R1)(foo)(\R1) foofoofoofoo y $1-$2-$3-$4 foo-foo-foo-foo