Upgrade to podlators-2.2.0

[p5sagit/p5-mst-13.2.git] / pod / perlre.pod
diff --git a/pod/perlre.pod b/pod/perlre.pod

index 0f9ded3..a076d3a 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -65,6 +65,15 @@ X</p> X<regex, preserve> X<regexp, preserve>
 Preserve the string matched such that ${^PREMATCH}, ${^MATCH}, and
 ${^POSTMATCH} are available for use after matching.
 
+=item g and c
+X</g> X</c>
+
+Global matching, and keep the Current position after failed matching.
+Unlike i, m, s and x, these two flags affect the way the regex is used
+rather than the regex itself. See
+L<perlretut/"Using regular expressions in Perl"> for further explanation
+of the g and c modifiers.
+
 =back
 
 These are usually written as "the C</x> modifier", even though the delimiter
@@ -93,7 +102,7 @@ X</x>
 
 =head3 Metacharacters
 
-The patterns used in Perl pattern matching evolved from the ones supplied in
+The patterns used in Perl pattern matching evolved from those supplied in
 the Version 8 regex routines.  (The routines are derived
 (distantly) from Henry Spencer's freely redistributable reimplementation
 of the V8 routines.)  See L<Version 8 Regular Expressions> for
@@ -214,9 +223,9 @@ X<\0> X<\c> X<\N> X<\x>
     \e         escape (think troff)  (ESC)
     \033       octal char            (example: ESC)
     \x1B       hex char              (example: ESC)
-    \x{263a}   wide hex char         (example: Unicode SMILEY)
+    \x{263a}   long hex char         (example: Unicode SMILEY)
     \cK                control char          (example: VT)
-    \N{name}   named char
+    \N{name}   named Unicode character
     \l         lowercase next char (think vi)
     \u         uppercase next char (think vi)
     \L         lowercase till \E (think vi)
@@ -237,7 +246,7 @@ You'll need to write something like C<m/\Quser\E\@\Qhost/>.
 
 In addition, Perl defines the following:
 X<\w> X<\W> X<\s> X<\S> X<\d> X<\D> X<\X> X<\p> X<\P> X<\C>
-X<\g> X<\k> X<\N> X<\K> X<\v> X<\V>
+X<\g> X<\k> X<\N> X<\K> X<\v> X<\V> X<\h> X<\H>
 X<word> X<whitespace> X<character class> X<backreference>
 
     \w      Match a "word" character (alphanumeric plus "_")
@@ -249,7 +258,7 @@ X<word> X<whitespace> X<character class> X<backreference>
     \pP             Match P, named property.  Use \p{Prop} for longer names.
     \PP             Match non-P
     \X      Match eXtended Unicode "combining character sequence",
-             equivalent to (?:\PM\pM*)
+             equivalent to (?>\PM\pM*)
     \C      Match a single C char (octet) even under Unicode.
             NOTE: breaks up characters into their UTF-8 bytes,
             so you may end up with malformed pieces of UTF-8.
@@ -261,9 +270,6 @@ X<word> X<whitespace> X<character class> X<backreference>
              optionally be wrapped in curly brackets for safer parsing.
     \g{name} Named backreference
     \k<name> Named backreference
-    \N{name} Named Unicode character, or Unicode escape
-    \x12     Hexadecimal escape sequence
-    \x{1234} Long hexadecimal escape sequence
     \K       Keep the stuff left of the \K, don't include it in $&
     \v       Vertical whitespace
     \V       Not vertical whitespace
@@ -287,7 +293,7 @@ in general.
 X<\w> X<\W> X<word>
 
 C<\R> will atomically match a linebreak, including the network line-ending
-"\x0D\x0A".  Specifically, X<\R> is exactly equivelent to
+"\x0D\x0A".  Specifically, C<\R> is exactly equivalent to
 
   (?>\x0D\x0A?|[\x0A-\x0C\x85\x{2028}\x{2029}])
 
@@ -369,21 +375,61 @@ X<character class> X<\p> X<\p{}>
     digit       IsDigit        \d
     graph       IsGraph
     lower       IsLower
-    print       IsPrint
-    punct       IsPunct
+    print       IsPrint                (but see [2] below)
+    punct       IsPunct                (but see [3] below)
     space       IsSpace
                 IsSpacePerl    \s
     upper       IsUpper
-    word        IsWord
+    word        IsWord         \w
     xdigit      IsXDigit
 
 For example C<[[:lower:]]> and C<\p{IsLower}> are equivalent.
 
+However, the equivalence between C<[[:xxxxx:]]> and C<\p{IsXxxxx}>
+is not exact.
+
+=over 4
+
+=item [1]
+
 If the C<utf8> pragma is not used but the C<locale> pragma is, the
 classes correlate with the usual isalpha(3) interface (except for
 "word" and "blank").
 
-The assumedly non-obviously named classes are:
+But if the C<locale> or C<encoding> pragmas are not used and
+the string is not C<utf8>, then C<[[:xxxxx:]]> (and C<\w>, etc.)
+will not match characters 0x80-0xff; whereas C<\p{IsXxxxx}> will
+force the string to C<utf8> and can match these characters
+(as Unicode).
+
+=item [2]
+
+C<\p{IsPrint}> matches characters 0x09-0x0d but C<[[:print:]]> does not.
+
+=item [3]
+
+C<[[:punct:]]> matches the following but C<\p{IsPunct}> does not,
+because they are classed as symbols (not punctuation) in Unicode.
+
+=over 4
+
+=item C<$>
+
+Currency symbol
+
+=item C<+> C<< < >> C<=> C<< > >> C<|> C<~>
+
+Mathematical symbols
+
+=item C<^> C<`>
+
+Modifier symbols (accents)
+
+=back
+
+=back
+
+The other named classes are:
 
 =over 4
 
@@ -515,14 +561,14 @@ backreferences.
 
 X<\g{1}> X<\g{-1}> X<\g{name}> X<relative backreference> X<named backreference>
 In order to provide a safer and easier way to construct patterns using
-backreferences, Perl 5.10 provides the C<\g{N}> notation. The curly
-brackets are optional, however omitting them is less safe as the meaning
-of the pattern can be changed by text (such as digits) following it.
-When N is a positive integer the C<\g{N}> notation is exactly equivalent
-to using normal backreferences. When N is a negative integer then it is
-a relative backreference referring to the previous N'th capturing group.
-When the bracket form is used and N is not an integer, it is treated as a
-reference to a named buffer.
+backreferences, Perl provides the C<\g{N}> notation (starting with perl
+5.10.0). The curly brackets are optional; however, omitting them is less
+safe as the meaning of the pattern can be changed by text (such as digits)
+following it. When N is a positive integer the C<\g{N}> notation is
+exactly equivalent to using normal backreferences. When N is a negative
+integer then it is a relative backreference referring to the previous N'th
+capturing group. When the bracket form is used and N is not an integer, it
+is treated as a reference to a named buffer.
 
 Thus C<\g{-1}> refers to the last buffer, C<\g{-2}> refers to the
 buffer before that. For example:
@@ -538,7 +584,7 @@ buffer before that. For example:
 
 and would match the same as C</(Y) ( (X) \3 \1 )/x>.
 
-Additionally, as of Perl 5.10 you may use named capture buffers and named
+Additionally, as of Perl 5.10.0 you may use named capture buffers and named
 backreferences. The notation is C<< (?<name>...) >> to declare and C<< \k<name> >>
 to reference. You may also use apostrophes instead of angle brackets to delimit the
 name; and you may use the bracketed C<< \g{name} >> backreference syntax.
@@ -549,7 +595,7 @@ and C<< \k<name> >> refer to the leftmost defined group. (Thus it's possible
 to do things with named capture buffers that would otherwise require C<(??{})>
 code to accomplish.)
 X<named capture buffer> X<regular expression, named capture buffer>
-X<%+> X<$+{name}> X<\k{name}>
+X<%+> X<$+{name}> X<< \k<name> >>
 
 Examples:
 
@@ -608,7 +654,7 @@ already paid the price.  As of 5.005, C<$&> is not so costly as the
 other two.
 X<$&> X<$`> X<$'>
 
-As a workaround for this problem, Perl 5.10 introduces C<${^PREMATCH}>,
+As a workaround for this problem, Perl 5.10.0 introduces C<${^PREMATCH}>,
 C<${^MATCH}> and C<${^POSTMATCH}>, which are equivalent to C<$`>, C<$&>
 and C<$'>, B<except> that they are only guaranteed to be defined after a
 successful match that was executed with the C</p> (preserve) modifier.
@@ -670,7 +716,7 @@ whitespace formatting, a simple C<#> will suffice.  Note that Perl closes
 the comment as soon as it sees a C<)>, so there is no way to put a literal
 C<)> in the comment.
 
-=item C<(?kimsx-imsx)>
+=item C<(?pimsx-imsx)>
 X<(?)>
 
 One or more embedded pattern-match modifiers, to be turned on (or
@@ -698,9 +744,9 @@ will match C<blah> in any case, some spaces, and an exact (I<including the case>
 repetition of the previous word, assuming the C</x> modifier, and no C</i>
 modifier outside this group.
 
-Note that the C<k> modifier is special in that it can only be enabled,
+Note that the C<p> modifier is special in that it can only be enabled,
 not disabled, and that its presence anywhere in a pattern has a global
-effect. Thus C<(?-k)> and C<(?-k:...)> are meaningless and will warn
+effect. Thus C<(?-p)> and C<(?-p:...)> are meaningless and will warn
 when executed under C<use warnings>.
 
 =item C<(?:pattern)>
@@ -734,7 +780,7 @@ X<(?|)> X<Branch reset>
 
 This is the "branch reset" pattern, which has the special property
 that the capture buffers are numbered from the same starting point
-in each alternation branch. It is available starting from perl 5.10.
+in each alternation branch. It is available starting from perl 5.10.0.
 
 Capture buffers are numbered from left to right, but inside this
 construct the numbering is restarted for each branch.
@@ -755,6 +801,9 @@ which buffer the captured content will be stored.
     / ( a )  (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
     # 1            2         2  3        2     3     4  
 
+Note: as of Perl 5.10.0, branch resets interfere with the contents of
+the C<%+> hash, which holds named captures. Consider using C<%-> instead.
+
 =item Look-Around Assertions
 X<look-around assertion> X<lookaround assertion> X<look-around> X<lookaround>
 
@@ -831,9 +880,9 @@ only for fixed-width look-behind.
 X<< (?<NAME>) >> X<(?'NAME')> X<named capture> X<capture>
 
 A named capture buffer. Identical in every respect to normal capturing
-parentheses C<()> but for the additional fact that C<%+> may be used after
-a successful match to refer to a named buffer. See C<perlvar> for more
-details on the C<%+> hash.
+parentheses C<()> but for the additional fact that C<%+> or C<%-> may be
+used after a successful match to refer to a named buffer. See C<perlvar>
+for more details on the C<%+> and C<%-> hashes.
 
 If multiple distinct capture buffers have the same name then the
 $+{NAME} will refer to the leftmost defined buffer in the match.
@@ -858,8 +907,7 @@ though it isn't extended by the locale (see L<perllocale>).
 B<NOTE:> In order to make things easier for programmers with experience
 with the Python or PCRE regex engines, the pattern C<< (?PE<lt>NAMEE<gt>pattern) >>
 may be used instead of C<< (?<NAME>pattern) >>; however this form does not
-support the use of single quotes as a delimiter for the name. This is
-only available in Perl 5.10 or later.
+support the use of single quotes as a delimiter for the name.
 
 =item C<< \k<NAME> >>
 
@@ -877,7 +925,7 @@ Both forms are equivalent.
 
 B<NOTE:> In order to make things easier for programmers with experience
 with the Python or PCRE regex engines, the pattern C<< (?P=NAME) >>
-may be used instead of C<< \k<NAME> >> in Perl 5.10 or later.
+may be used instead of C<< \k<NAME> >>.
 
 =item C<(?{ code })>
 X<(?{})> X<regex, code in> X<regexp, code in> X<regular expression, code in>
@@ -1102,7 +1150,7 @@ pattern.
 
 B<NOTE:> In order to make things easier for programmers with experience
 with the Python or PCRE regex engines the pattern C<< (?P>NAME) >>
-may be used instead of C<< (?&NAME) >> in Perl 5.10 or later.
+may be used instead of C<< (?&NAME) >>.
 
 =item C<(?(condition)yes-pattern|no-pattern)>
 X<(?()>
@@ -1346,7 +1394,7 @@ argument, then C<$REGERROR> and C<$REGMARK> are not touched at all.
 =over 4
 
 =item C<(*PRUNE)> C<(*PRUNE:NAME)>
-X<(*PRUNE)> X<(*PRUNE:NAME)> X<\v>
+X<(*PRUNE)> X<(*PRUNE:NAME)>
 
 This zero-width pattern prunes the backtracking tree at the current point
 when backtracked into on failure. Consider the pattern C<A (*PRUNE) B>,
@@ -1356,8 +1404,6 @@ continues in B, which may also backtrack as necessary; however, should B
 not match, then no further backtracking will take place, and the pattern
 will fail outright at the current starting position.
 
-As a shortcut, C<\v> is exactly equivalent to C<(*PRUNE)>.
-
 The following example counts all the possible matching strings in a
 pattern (without actually matching any of them).
 
@@ -1383,7 +1429,7 @@ If we add a C<(*PRUNE)> before the count like the following
     print "Count=$count\n";
 
 we prevent backtracking and find the count of the longest matching
-at each matching startpoint like so:
+at each matching starting point like so:
 
     aaab
     aab
@@ -1409,8 +1455,6 @@ of this pattern. This effectively means that the regex engine "skips" forward
 to this position on failure and tries to match again, (assuming that
 there is sufficient room to match).
 
-As a shortcut C<\V> is exactly equivalent to C<(*SKIP)>.
-
 The name of the C<(*SKIP:NAME)> pattern has special significance. If a
 C<(*MARK:NAME)> was encountered while matching, then it is that position
 which is used as the "skip point". If no C<(*MARK)> of that name was
@@ -1431,7 +1475,7 @@ outputs
     Count=2
 
 Once the 'aaab' at the start of the string has matched, and the C<(*SKIP)>
-executed, the next startpoint will be where the cursor was when the
+executed, the next starting point will be where the cursor was when the
 C<(*SKIP)> was executed.
 
 =item C<(*MARK:NAME)> C<(*:NAME)>
@@ -1470,7 +1514,7 @@ As a shortcut C<(*MARK:NAME)> can be written C<(*:NAME)>.
 
 =item C<(*THEN)> C<(*THEN:NAME)>
 
-This is similar to the "cut group" operator C<::> from Perl6. Like
+This is similar to the "cut group" operator C<::> from Perl 6. Like
 C<(*PRUNE)>, this verb always matches, and when backtracked into on
 failure, it causes the regex engine to try the next alternation in the
 innermost enclosing group (capturing or otherwise).
@@ -1504,7 +1548,7 @@ backtrack and try C; but the C<(*PRUNE)> verb will simply fail.
 =item C<(*COMMIT)>
 X<(*COMMIT)>
 
-This is the Perl6 "commit pattern" C<< <commit> >> or C<:::>. It's a
+This is the Perl 6 "commit pattern" C<< <commit> >> or C<:::>. It's a
 zero-width pattern similar to C<(*SKIP)>, except that when backtracked
 into on failure it causes the match to fail outright. No further attempts
 to find a valid match by advancing the start pointer will occur again.
@@ -2104,9 +2148,9 @@ part of this regular expression needs to be converted explicitly
 
 =head1 PCRE/Python Support
 
-As of Perl 5.10 Perl supports several Python/PCRE specific extensions
+As of Perl 5.10.0, Perl supports several Python/PCRE specific extensions
 to the regex syntax. While Perl programmers are encouraged to use the
-Perl specific syntax, the following are legal in Perl 5.10:
+Perl specific syntax, the following are also accepted:
 
 =over 4