From: Ilya Zakharevich Date: Sat, 27 Jun 1998 02:55:26 +0000 (-0400) Subject: applied patch, tweaked doc, and regen regnodes.h X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b85d18e97b6ae9e0cc168f99b999fd3fd33104bd;p=p5sagit%2Fp5-mst-13.2.git applied patch, tweaked doc, and regen regnodes.h Message-Id: <199806270655.CAA29144@monk.mps.ohio-state.edu> Subject: [PATCH 5.004_68] \z in RE p4raw-id: //depot/perl@1250 --- diff --git a/pod/perlre.pod b/pod/perlre.pod index ebd5858..30608ce 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -177,8 +177,9 @@ Perl defines the following zero-width assertions: \b Match a word boundary \B Match a non-(word boundary) - \A Match at only beginning of string - \Z Match at only end of string (or before newline at the end) + \A Match only at beginning of string + \Z Match only at end of string, or before newline at the end + \z Match only at end of string \G Match only where previous m//g left off (works only with /g) A word boundary (C<\b>) is defined as a spot between two characters that @@ -189,7 +190,7 @@ represents backspace rather than a word boundary.) The C<\A> and C<\Z> are just like "^" and "$", except that they won't match multiple times when the C</m> modifier is used, while "^" and "$" will match at every internal line boundary. To match the actual end of the string, not ignoring newline, -you can use C<\Z(?!\n)>. The C<\G> assertion can be used to chain global +you can use C<\z>. The C<\G> assertion can be used to chain global matches (using C<m//g>), as described in L<perlop/"Regexp Quote-Like Operators">. diff --git a/regcomp.c b/regcomp.c index 2b71d99..5475d78 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1539,6 +1539,12 @@ tryagain: *flagp |= SIMPLE; nextchar(); break; + case 'z': + ret = reg_node(EOS); + *flagp |= SIMPLE; + seen_zerolen++; /* Do not optimize RE away */ + nextchar(); + break; case 'w': ret = reg_node((regflags & PMf_LOCALE) ? 
ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; @@ -1665,6 +1671,7 @@ tryagain: case 'A': case 'G': case 'Z': + case 'z': case 'w': case 'W': case 'b': @@ -2329,6 +2336,9 @@ regprop(SV *sv, regnode *o) case EOL: p = "EOL"; break; + case EOS: + p = "EOS"; + break; case MEOL: p = "MEOL"; break; diff --git a/regcomp.sym b/regcomp.sym index aa18d11..9775b93 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -11,6 +11,7 @@ SUCCEED END, no Return from a subroutine, basically. BOL BOL, no Match "" at beginning of line. MBOL BOL, no Same, assuming multiline. SBOL BOL, no Same, assuming singleline. +EOS EOL, no Match "" at end of string. EOL EOL, no Match "" at end of line. MEOL EOL, no Same, assuming multiline. SEOL EOL, no Same, assuming singleline. diff --git a/regexec.c b/regexec.c index dd51bc1..107d68b 100644 --- a/regexec.c +++ b/regexec.c @@ -824,6 +824,10 @@ regmatch(regnode *prog) if (regeol - locinput > 1) sayNO; break; + case EOS: + if (regeol != locinput) + sayNO; + break; case SANY: if (!nextchr && locinput >= regeol) sayNO; diff --git a/regnodes.h b/regnodes.h index 445d0b2..c494dae 100644 --- a/regnodes.h +++ b/regnodes.h @@ -8,58 +8,59 @@ #define BOL 2 /* 0x2 Match "" at beginning of line. */ #define MBOL 3 /* 0x3 Same, assuming multiline. */ #define SBOL 4 /* 0x4 Same, assuming singleline. */ -#define EOL 5 /* 0x5 Match "" at end of line. */ -#define MEOL 6 /* 0x6 Same, assuming multiline. */ -#define SEOL 7 /* 0x7 Same, assuming singleline. */ -#define BOUND 8 /* 0x8 Match "" at any word boundary */ -#define BOUNDL 9 /* 0x9 Match "" at any word boundary */ -#define NBOUND 10 /* 0xa Match "" at any word non-boundary */ -#define NBOUNDL 11 /* 0xb Match "" at any word non-boundary */ -#define GPOS 12 /* 0xc Matches where last m//g left off. */ -#define ANY 13 /* 0xd Match any one character (except newline). */ -#define SANY 14 /* 0xe Match any one character. */ -#define ANYOF 15 /* 0xf Match character in (or not in) this class. 
*/ -#define ALNUM 16 /* 0x10 Match any alphanumeric character */ -#define ALNUML 17 /* 0x11 Match any alphanumeric char in locale */ -#define NALNUM 18 /* 0x12 Match any non-alphanumeric character */ -#define NALNUML 19 /* 0x13 Match any non-alphanumeric char in locale */ -#define SPACE 20 /* 0x14 Match any whitespace character */ -#define SPACEL 21 /* 0x15 Match any whitespace char in locale */ -#define NSPACE 22 /* 0x16 Match any non-whitespace character */ -#define NSPACEL 23 /* 0x17 Match any non-whitespace char in locale */ -#define DIGIT 24 /* 0x18 Match any numeric character */ -#define NDIGIT 25 /* 0x19 Match any non-numeric character */ -#define BRANCH 26 /* 0x1a Match this alternative, or the next... */ -#define BACK 27 /* 0x1b Match "", "next" ptr points backward. */ -#define EXACT 28 /* 0x1c Match this string (preceded by length). */ -#define EXACTF 29 /* 0x1d Match this string, folded (prec. by length). */ -#define EXACTFL 30 /* 0x1e Match this string, folded in locale (w/len). */ -#define NOTHING 31 /* 0x1f Match empty string. */ -#define TAIL 32 /* 0x20 Match empty string. Can jump here from outside. */ -#define STAR 33 /* 0x21 Match this (simple) thing 0 or more times. */ -#define PLUS 34 /* 0x22 Match this (simple) thing 1 or more times. */ -#define CURLY 35 /* 0x23 Match this simple thing {n,m} times. */ -#define CURLYN 36 /* 0x24 Match next-after-this simple thing */ -#define CURLYM 37 /* 0x25 Match this medium-complex thing {n,m} times. */ -#define CURLYX 38 /* 0x26 Match this complex thing {n,m} times. */ -#define WHILEM 39 /* 0x27 Do curly processing and see if rest matches. */ -#define OPEN 40 /* 0x28 Mark this point in input as start of #n. */ -#define CLOSE 41 /* 0x29 Analogous to OPEN. */ -#define REF 42 /* 0x2a Match some already matched string */ -#define REFF 43 /* 0x2b Match already matched string, folded */ -#define REFFL 44 /* 0x2c Match already matched string, folded in loc. 
*/ -#define IFMATCH 45 /* 0x2d Succeeds if the following matches. */ -#define UNLESSM 46 /* 0x2e Fails if the following matches. */ -#define SUSPEND 47 /* 0x2f "Independent" sub-RE. */ -#define IFTHEN 48 /* 0x30 Switch, should be preceeded by switcher . */ -#define GROUPP 49 /* 0x31 Whether the group matched. */ -#define LONGJMP 50 /* 0x32 Jump far away. */ -#define BRANCHJ 51 /* 0x33 BRANCH with long offset. */ -#define EVAL 52 /* 0x34 Execute some Perl code. */ -#define MINMOD 53 /* 0x35 Next operator is not greedy. */ -#define LOGICAL 54 /* 0x36 Next opcode should set the flag only. */ -#define RENUM 55 /* 0x37 Group with independently numbered parens. */ -#define OPTIMIZED 56 /* 0x38 Placeholder for dump. */ +#define EOS 5 /* 0x5 Match "" at end of string. */ +#define EOL 6 /* 0x6 Match "" at end of line. */ +#define MEOL 7 /* 0x7 Same, assuming multiline. */ +#define SEOL 8 /* 0x8 Same, assuming singleline. */ +#define BOUND 9 /* 0x9 Match "" at any word boundary */ +#define BOUNDL 10 /* 0xa Match "" at any word boundary */ +#define NBOUND 11 /* 0xb Match "" at any word non-boundary */ +#define NBOUNDL 12 /* 0xc Match "" at any word non-boundary */ +#define GPOS 13 /* 0xd Matches where last m//g left off. */ +#define ANY 14 /* 0xe Match any one character (except newline). */ +#define SANY 15 /* 0xf Match any one character. */ +#define ANYOF 16 /* 0x10 Match character in (or not in) this class. 
*/ +#define ALNUM 17 /* 0x11 Match any alphanumeric character */ +#define ALNUML 18 /* 0x12 Match any alphanumeric char in locale */ +#define NALNUM 19 /* 0x13 Match any non-alphanumeric character */ +#define NALNUML 20 /* 0x14 Match any non-alphanumeric char in locale */ +#define SPACE 21 /* 0x15 Match any whitespace character */ +#define SPACEL 22 /* 0x16 Match any whitespace char in locale */ +#define NSPACE 23 /* 0x17 Match any non-whitespace character */ +#define NSPACEL 24 /* 0x18 Match any non-whitespace char in locale */ +#define DIGIT 25 /* 0x19 Match any numeric character */ +#define NDIGIT 26 /* 0x1a Match any non-numeric character */ +#define BRANCH 27 /* 0x1b Match this alternative, or the next... */ +#define BACK 28 /* 0x1c Match "", "next" ptr points backward. */ +#define EXACT 29 /* 0x1d Match this string (preceded by length). */ +#define EXACTF 30 /* 0x1e Match this string, folded (prec. by length). */ +#define EXACTFL 31 /* 0x1f Match this string, folded in locale (w/len). */ +#define NOTHING 32 /* 0x20 Match empty string. */ +#define TAIL 33 /* 0x21 Match empty string. Can jump here from outside. */ +#define STAR 34 /* 0x22 Match this (simple) thing 0 or more times. */ +#define PLUS 35 /* 0x23 Match this (simple) thing 1 or more times. */ +#define CURLY 36 /* 0x24 Match this simple thing {n,m} times. */ +#define CURLYN 37 /* 0x25 Match next-after-this simple thing */ +#define CURLYM 38 /* 0x26 Match this medium-complex thing {n,m} times. */ +#define CURLYX 39 /* 0x27 Match this complex thing {n,m} times. */ +#define WHILEM 40 /* 0x28 Do curly processing and see if rest matches. */ +#define OPEN 41 /* 0x29 Mark this point in input as start of #n. */ +#define CLOSE 42 /* 0x2a Analogous to OPEN. */ +#define REF 43 /* 0x2b Match some already matched string */ +#define REFF 44 /* 0x2c Match already matched string, folded */ +#define REFFL 45 /* 0x2d Match already matched string, folded in loc. 
*/ +#define IFMATCH 46 /* 0x2e Succeeds if the following matches. */ +#define UNLESSM 47 /* 0x2f Fails if the following matches. */ +#define SUSPEND 48 /* 0x30 "Independent" sub-RE. */ +#define IFTHEN 49 /* 0x31 Switch, should be preceeded by switcher . */ +#define GROUPP 50 /* 0x32 Whether the group matched. */ +#define LONGJMP 51 /* 0x33 Jump far away. */ +#define BRANCHJ 52 /* 0x34 BRANCH with long offset. */ +#define EVAL 53 /* 0x35 Execute some Perl code. */ +#define MINMOD 54 /* 0x36 Next operator is not greedy. */ +#define LOGICAL 55 /* 0x37 Next opcode should set the flag only. */ +#define RENUM 56 /* 0x38 Group with independently numbered parens. */ +#define OPTIMIZED 57 /* 0x39 Placeholder for dump. */ #ifndef DOINIT EXTCONST U8 regkind[]; @@ -70,6 +71,7 @@ EXTCONST U8 regkind[] = { BOL, /* BOL */ BOL, /* MBOL */ BOL, /* SBOL */ + EOL, /* EOS */ EOL, /* EOL */ EOL, /* MEOL */ EOL, /* SEOL */ @@ -133,6 +135,7 @@ const static U8 regarglen[] = { 0, /* BOL */ 0, /* MBOL */ 0, /* SBOL */ + 0, /* EOS */ 0, /* EOL */ 0, /* MEOL */ 0, /* SEOL */ @@ -193,6 +196,7 @@ const static char reg_off_by_arg[] = { 0, /* BOL */ 0, /* MBOL */ 0, /* SBOL */ + 0, /* EOS */ 0, /* EOL */ 0, /* MEOL */ 0, /* SEOL */ diff --git a/t/op/re_tests b/t/op/re_tests index b506306..dd54a2a 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -439,3 +439,8 @@ $(?<=^(a)) a y $1 a ((?>[^()]+)|\([^()]*\))+ ((abc(ade)ufh()()x y $& abc(ade)ufh()()x (?<=x+)y - c - /(?<=x+)y/: variable length lookbehind not implemented a{37,17} - c - /a{37,17}/: Can't do {n,m} with n > m +a\Z a\nb\n n - - +b\Z a\nb\n y - - +b\z a\nb\n n - - +b\Z a\nb y - - +b\z a\nb y - - diff --git a/t/op/regexp.t b/t/op/regexp.t index e3eb336..7e43526 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -21,7 +21,7 @@ # Column 5 contains the expected result of double-quote # interpolating that string after the match, or start of error message. # -# Columns 1, 2 and 5 are \n-interpolated. +# \n in the tests are interpolated. 
# # If you want to add a regular expression test that can't be expressed # in this format, don't add it here: put it in op/pat.t instead. @@ -40,7 +40,9 @@ $| = 1; print "1..$numtests\n# $iters iterations\n"; TEST: while (<TESTS>) { - ($pat, $subject, $result, $repl, $expect) = split(/[\t\n]/,$_); + chomp; + s/\\n/\n/g; + ($pat, $subject, $result, $repl, $expect) = split(/\t/,$_); $input = join(':',$pat,$subject,$result,$repl,$expect); infty_subst(\$pat); infty_subst(\$expect); diff --git a/toke.c b/toke.c index 4aa96d3..6738dc1 100644 --- a/toke.c +++ b/toke.c @@ -832,7 +832,7 @@ scan_const(char *start) /* leaveit is the set of acceptably-backslashed characters */ char *leaveit = lex_inpat - ? "\\.^$@AGZdDwWsSbB+*?|()-nrtfeaxc0123456789[{]} \t\n\r\f\v#" + ? "\\.^$@AGZdDwWsSbB+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" : ""; while (s < send || dorange) {