From: Yves Orton Date: Sun, 3 Dec 2006 16:55:55 +0000 (+0100) Subject: \R is supposed to mean something else so switch to \g and make it more useful in... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2bf803e2214c46ec8286bc855080cf05bb5cf7a4;p=p5sagit%2Fp5-mst-13.2.git \R is supposed to mean something else so switch to \g and make it more useful in the process Message-ID: <9b18b3110612030755o241e6372o9870ecce9c42e3d5@mail.gmail.com> p4raw-id: //depot/perl@29445 --- diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod index c3d59ec..0497d55 100644 --- a/pod/perl595delta.pod +++ b/pod/perl595delta.pod @@ -122,8 +122,9 @@ and (*ACCEPT). See L for their descriptions. (Yves Orton) =item Relative backreferences -A new syntax C<\R1> ("1" being any positive decimal integer) allows -relative backreferencing. This should make it easier to embed patterns +A new syntax C<\g{N}> or C<\gN> where "N" is a decimal integer allows a +safer form of back-reference notation as well as allowing relative +backreferences. This should make it easier to generate and embed patterns that contain backreferences. (Yves Orton) =back diff --git a/pod/perldiag.pod b/pod/perldiag.pod index cec3945..c8c90ef 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -3496,10 +3496,9 @@ discovered. =item Reference to nonexistent or unclosed group in regex; marked by <-- HERE in m/%s/ -(F) You used something like C<\R7> in your regular expression, but there are +(F) You used something like C<\g{-7}> in your regular expression, but there are not at least seven sets of closed capturing parentheses in the expression before -where the C<\R7> was located. It's also possible you forgot to escape the -backslash. +where the C<\g{-7}> was located. The <-- HERE shows in the regular expression about where the problem was discovered. @@ -4438,6 +4437,10 @@ the pattern with a C<)>. Fix the pattern and retry. (F) You used a pattern of the form C<(*VERB:ARG)> but did not terminate the pattern with a C<)>. Fix the pattern and retry. +=item Unterminated \g{...} pattern in regex; marked by <-- HERE in m/%s/ + +(F) You missed a close brace on a \g{..} pattern (group reference) in +a regular expression. Fix the pattern and retry. =item Unterminated <> operator diff --git a/pod/perlre.pod b/pod/perlre.pod index bff63a6..556909f 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -247,8 +247,9 @@ X X Unsupported in lookbehind. \1 Backreference to a specific group. '1' may actually be any positive integer. - \R1 Relative backreference to a preceding closed group. - '1' may actually be any positive integer. + \g1 Backreference to a specific or previous group, + \g{-1} number may be negative indicating a previous buffer and may + optionally be wrapped in curly brackets for safer parsing. \k Named backreference \N{name} Named unicode character, or unicode escape \x12 Hexadecimal escape sequence @@ -485,22 +486,28 @@ backreference only if at least 11 left parentheses have opened before it. And so on. \1 through \9 are always interpreted as backreferences. -X -In Perl 5.10 it is possible to relatively address a capture buffer by -using the C<\RNNN> notation, where C is negative offset to a -preceding capture buffer. Thus C<\R1> refers to the last buffer, -C<\R2> refers to the buffer before that. For example: +X<\g{1}> X<\g{-1}> X +In order to provide a safer and easier way to construct patterns using +backrefs, in Perl 5.10 the C<\g{N}> notation is provided. The curly +brackets are optional, however omitting them is less safe as the meaning +of the pattern can be changed by text (such as digits) following it. +When N is a positive integer the C<\g{N}> notation is exactly equivalent +to using normal backreferences. When N is a negative integer then it is +a relative backreference referring to the previous N'th capturing group. + +Thus C<\g{-1}> refers to the last buffer, C<\g{-2}> refers to the +buffer before that. For example: / (Y) # buffer 1 ( # buffer 2 (X) # buffer 3 - \R1 # backref to buffer 3 - \R3 # backref to buffer 1 + \g{-1} # backref to buffer 3 + \g{-3} # backref to buffer 1 ) /x -and would match the same as C. +and would match the same as C. Additionally, as of Perl 5.10 you may use named capture buffers and named backreferences. The notation is C<< (?...) >> and C<< \k >> @@ -1066,10 +1073,10 @@ handling them. An example of how this might be used is as follows: - /(?(&NAME_PAT))(?(&ADDRESS_PAT)) + /(?(?&NAME_PAT))(?(?&ADDRESS_PAT)) (?(DEFINE) - (....) - (....) + (?....) + (?....) )/x Note that capture buffers matched inside of recursion are not accessible diff --git a/regcomp.c b/regcomp.c index 359e4f6..4ea9a5a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4345,6 +4345,7 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen + && !(RExC_seen & REG_SEEN_VERBARG) && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) r->extflags |= RXf_CHECK_ALL; scan_commit(pRExC_state, &data,&minlen,0); @@ -6364,27 +6365,42 @@ tryagain: case 'c': case '0': goto defchar; - case 'R': + case 'g': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { I32 num; - bool isrel=(*RExC_parse=='R'); - if (isrel) + bool isg = *RExC_parse == 'g'; + bool isrel = 0; + bool hasbrace = 0; + if (isg) { RExC_parse++; + if (*RExC_parse == '{') { + RExC_parse++; + hasbrace = 1; + } + if (*RExC_parse == '-') { + RExC_parse++; + isrel = 1; + } + } num = atoi(RExC_parse); if (isrel) { num = RExC_npar - num; if (num < 1) vFAIL("Reference to nonexistent or unclosed group"); } - if (num > 9 && num >= RExC_npar) + if (!isg && num > 9 && num >= RExC_npar) goto defchar; else { char * const parse_start = RExC_parse - 1; /* MJD */ while (isDIGIT(*RExC_parse)) RExC_parse++; - + if (hasbrace) { + if (*RExC_parse != '}') + vFAIL("Unterminated \\g{...} pattern"); + RExC_parse++; + } if (!SIZE_ONLY) { if (num > (I32)RExC_rx->nparens) vFAIL("Reference to nonexistent group"); @@ -6464,6 +6480,7 @@ tryagain: case 'C': case 'X': case 'G': + case 'g': case 'Z': case 'z': case 'w': diff --git a/regexec.c b/regexec.c index b54a4cb..2da8bfd 100644 --- a/regexec.c +++ b/regexec.c @@ -3561,6 +3561,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]); PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr; + if (sv_yes_mark) { + SV *sv_mrk = get_sv("REGMARK", 1); + sv_setsv(sv_mrk, sv_yes_mark); + } + CALLRUNOPS(aTHX); /* Scalar context. */ SPAGAIN; if (SP == before) @@ -4848,12 +4853,12 @@ NULL case SKIP: PL_reginput = locinput; if (scan->flags) { - /* (*CUT) : if we fail we cut here*/ + /* (*SKIP) : if we fail we cut here*/ ST.mark_name = NULL; ST.mark_loc = locinput; PUSH_STATE_GOTO(SKIP_next,next); } else { - /* (*CUT:NAME) : if there is a (*MARK:NAME) fail where it was, + /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, otherwise do nothing. Meaning we need to scan */ regmatch_state *cur = mark_state; @@ -4869,7 +4874,7 @@ NULL cur = cur->u.mark.prev_mark; } } - /* Didn't find our (*MARK:NAME) so ignore this (*CUT:NAME) */ + /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ break; case SKIP_next_fail: if (ST.mark_name) { diff --git a/t/op/pat.t b/t/op/pat.t index 245f1b5..31922e9 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4113,13 +4113,13 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { } { local $Message = "http://nntp.perl.org/group/perl.perl5.porters/118663"; - my $qr_barR1 = qr/(bar)\R1/; + my $qr_barR1 = qr/(bar)\g-1/; ok("foobarbarxyz" =~ $qr_barR1); ok("foobarbarxyz" =~ qr/foo${qr_barR1}xyz/); ok("foobarbarxyz" =~ qr/(foo)${qr_barR1}xyz/); - ok("foobarbarxyz" =~ qr/(foo)(bar)\R1xyz/); + ok("foobarbarxyz" =~ qr/(foo)(bar)\g{-1}xyz/); ok("foobarbarxyz" =~ qr/(foo${qr_barR1})xyz/); - ok("foobarbarxyz" =~ qr/(foo(bar)\R1)xyz/); + ok("foobarbarxyz" =~ qr/(foo(bar)\g{-1})xyz/); } { local $Message = "RT#41010"; @@ -4154,7 +4154,16 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { $doit->(\@spats,@sstrs); $doit->(\@dpats,@dstrs); } - +{ + local $Message = "\$REGMARK"; + our @r=(); + ok('foofoo' =~ /foo (*MARK:foo) (?{push @r,$REGMARK}) /x); + iseq("@r","foo"); + iseq($REGMARK,"foo"); + ok('foofoo' !~ /foo (*MARK:foo) (*FAIL) /x); + ok(!$REGMARK); + iseq($REGERROR,'foo'); +} # Test counter is at bottom of file. Put new tests above here. #------------------------------------------------------------------- # Keep the following tests last -- they may crash perl @@ -4201,7 +4210,7 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, iseq(0+$::test,$::TestCount,"Got the right number of tests!"); # Don't forget to update this! BEGIN { - $::TestCount = 1567; + $::TestCount = 1573; print "1..$::TestCount\n"; } diff --git a/t/op/re_tests b/t/op/re_tests index 925bb36..d0f6ae3 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -1190,9 +1190,11 @@ a*(*F) aaaab n - - (a)(?:(?-1)|(?+1))(b) abb y $1-$2 a-b (a)(?:(?-1)|(?+1))(b) acb n - - -(foo)(\R2) foofoo y $1-$2 foo-foo -(foo)(\R2)(foo)(\R2) foofoofoofoo y $1-$2-$3-$4 foo-foo-foo-foo -(([abc]+) \R1)(([abc]+) \R1) abc abccba cba y $2-$4 abc-cba +(foo)(\g-2) foofoo y $1-$2 foo-foo +(foo)(\g-2)(foo)(\g-2) foofoofoofoo y $1-$2-$3-$4 foo-foo-foo-foo +(([abc]+) \g-1)(([abc]+) \g{-1}) abc abccba cba y $2-$4 abc-cba +(a)(b)(c)\g1\g2\g3 abcabc y $1$2$3 abc + /(?'n'foo) \k/ ..foo foo.. y $1 foo /(?'n'foo) \k/ ..foo foo.. y $+{n} foo