From: Yves Orton Date: Mon, 13 Nov 2006 18:59:32 +0000 (+0100) Subject: Allow negative indexing in recursive patterns X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=542fa716385f08c64e1dc5ef1d9ceacf2ee69d29;p=p5sagit%2Fp5-mst-13.2.git Allow negative indexing in recursive patterns Message-ID: <9b18b3110611130959k1fdd2485yd8eb1cd428de570a@mail.gmail.com> p4raw-id: //depot/perl@29267 --- diff --git a/pod/perlre.pod b/pod/perlre.pod index 0323a97..c2b9680 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -866,9 +866,10 @@ Recursing deeper than 50 times without consuming any input string will result in a fatal error. The maximum depth is compiled into perl, so changing it requires a custom build. -=item C<(?PARNO)> C<(?R)> C<(?0)> -X<(?PARNO)> X<(?1)> X<(?R)> X<(?0)> +=item C<(?PARNO)> C<(?-PARNO)> C<(?+PARNO)> C<(?R)> C<(?0)> +X<(?PARNO)> X<(?1)> X<(?R)> X<(?0)> X<(?-1)> X<(?+1)> X<(?-PARNO)> X<(?+PARNO)> X X X +X Similar to C<(??{ code })> except it does not involve compiling any code, instead it treats the contents of a capture buffer as an independent @@ -879,7 +880,10 @@ outermost recursion. PARNO is a sequence of digits (not starting with 0) whose value reflects the paren-number of the capture buffer to recurse to. C<(?R)> recurses to the beginning of the whole pattern. C<(?0)> is an alternate syntax for -C<(?R)>. +C<(?R)>. If PARNO is preceded by a plus or minus sign then it is assumed +to be relative, with negative numbers indicating preceding capture buffers +and positive ones following. Thus C<(?-1)> refers to the most recently +declared buffer, and C<(?+1)> indicates the next buffer to be declared. The following pattern matches a function foo() which may contain balanced parentheses as the argument. @@ -918,11 +922,21 @@ fatal error. Recursing deeper than 50 times without consuming any input string will also result in a fatal error. The maximum depth is compiled into perl, so changing it requires a custom build. +The following shows how using negative indexing can make it +easier to embed recursive patterns inside of a C construct +for later use: + + my $parens = qr/(\((?:[^()]++|(?-1))*+\))/; + if (/foo $parens \s+ + \s+ bar $parens/x) { + # do something here... + } + B that this pattern does not behave the same way as the equivalent PCRE or Python construct of the same form. In perl you can backtrack into a recursed group, in PCRE and Python the recursed into group is treated -as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect -the pattern being recursed into. +as atomic. Also, modifiers are resolved at compile time, so constructs +like (?i:(?1)) or (?:(?i)(?1)) do not affect how the sub-pattern will +be processed. =item C<(?&NAME)> X<(?&NAME)> diff --git a/regcomp.c b/regcomp.c index 3cc1295..b077ddb 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4915,17 +4915,54 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } goto gen_recurse_regop; /* NOT REACHED */ + case '+': + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse++; + vFAIL("Illegal pattern"); + } + goto parse_recursion; + /* NOT REACHED*/ + case '-': /* (?-1) */ + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse--; /* rewind to let it be handled later */ + goto parse_flags; + } + /*FALLTHROUGH */ case '1': case '2': case '3': case '4': /* (?1) */ case '5': case '6': case '7': case '8': case '9': RExC_parse--; + parse_recursion: num = atoi(RExC_parse); parse_start = RExC_parse - 1; /* MJD */ + if (*RExC_parse == '-') + RExC_parse++; while (isDIGIT(*RExC_parse)) RExC_parse++; if (*RExC_parse!=')') vFAIL("Expecting close bracket"); gen_recurse_regop: + if ( paren == '-' ) { + /* + Diagram of capture buffer numbering. + Top line is the normal capture buffer numbers + Botton line is the negative indexing as from + the X (the (?-2)) + + + 1 2 3 4 5 X 6 7 + /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ + - 5 4 3 2 1 X x x + + */ + num = RExC_npar + num; + if (num < 1) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + } else if ( paren == '+' ) { + num = RExC_npar + num - 1; + } + ret = reganode(pRExC_state, GOSUB, num); if (!SIZE_ONLY) { if (num > (I32)RExC_rx->nparens) { diff --git a/t/op/pat.t b/t/op/pat.t index 0bc0eb6..333165d 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3991,8 +3991,23 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { for ("ABC","BAX") { ok(/A (*THEN) X | B (*THEN) C/x,"Simple (*THEN) test"); } -} - +} + +{ + my $parens=qr/(\((?:[^()]++|(?-1))*+\))/; + local $_='foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))'; + my ($all,$one,$two)=('','',''); + if (/foo $parens \s* \+ \s* bar $parens/x) { + $all=$&; + $one=$1; + $two=$2; + } + iseq($one, '((2*3)+4-3)'); + iseq($two, '(2*(3+4)-1*(2-3))'); + iseq($all, 'foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))'); + iseq($all, $_); +} + #------------------------------------------------------------------- # Keep the following tests last -- they may crash perl @@ -4019,4 +4034,4 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, # Put new tests above the line, not here. # Don't forget to update this! -BEGIN { print "1..1341\n" }; +BEGIN { print "1..1345\n" }; diff --git a/t/op/re_tests b/t/op/re_tests index 99c6824..078caa9 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -1186,3 +1186,6 @@ a*(*F) aaaab n - - (A(A|B(*ACCEPT)|C)D)(E) AB y $1 AB (A(A|B(*ACCEPT)|C)D)(E) ACDE y $1$2$3 ACDCE +(a)(?:(?-1)|(?+1))(b) aab y $&-$1-$2 aab-a-b +(a)(?:(?-1)|(?+1))(b) abb y $&-$1-$2 abb-a-b +(a)(?:(?-1)|(?+1))(b) acb n - -