Tick off Unicode collation and the normalization from

[p5sagit/p5-mst-13.2.git] / pod / perlretut.pod
diff --git a/pod/perlretut.pod b/pod/perlretut.pod

index 66f8179..95e3f03 100644 (file)
--- a/pod/perlretut.pod
+++ b/pod/perlretut.pod
@@ -368,24 +368,31 @@ has several abbreviations for common character classes:
 =over 4
 
 =item *
+
 \d is a digit and represents [0-9]
 
 =item *
+
 \s is a whitespace character and represents [\ \t\r\n\f]
 
 =item *
+
 \w is a word character (alphanumeric or _) and represents [0-9a-zA-Z_]
 
 =item *
+
 \D is a negated \d; it represents any character but a digit [^0-9]
 
 =item *
+
 \S is a negated \s; it represents any non-whitespace character [^\s]
 
 =item *
+
 \W is a negated \w; it represents any non-word character [^\w]
 
 =item *
+
 The period '.' matches any character but "\n"
 
 =back
@@ -451,22 +458,26 @@ and C<$> are able to match.  Here are the four possible combinations:
 =over 4
 
 =item *
+
 no modifiers (//): Default behavior.  C<'.'> matches any character
 except C<"\n">.  C<^> matches only at the beginning of the string and
 C<$> matches only at the end or before a newline at the end.
 
 =item *
+
 s modifier (//s): Treat string as a single long line.  C<'.'> matches
 any character, even C<"\n">.  C<^> matches only at the beginning of
 the string and C<$> matches only at the end or before a newline at the
 end.
 
 =item *
+
 m modifier (//m): Treat string as a set of multiple lines.  C<'.'>
 matches any character except C<"\n">.  C<^> and C<$> are able to match
 at the start or end of I<any> line within the string.
 
 =item *
+
 both s and m modifiers (//sm): Treat string as a single long line, but
 detect multiple lines.  C<'.'> matches any character, even
 C<"\n">.  C<^> and C<$>, however, are able to match at the start or end
@@ -602,32 +613,52 @@ of what perl does when it tries to match the regexp
 
 =over 4
 
-=item 0 Start with the first letter in the string 'a'.
+=item 0
+
+Start with the first letter in the string 'a'.
+
+=item 1
+
+Try the first alternative in the first group 'abd'.
+
+=item 2
 
-=item 1 Try the first alternative in the first group 'abd'.
+Match 'a' followed by 'b'. So far so good.
 
-=item 2 Match 'a' followed by 'b'. So far so good.
+=item 3
 
-=item 3 'd' in the regexp doesn't match 'c' in the string - a dead
+'d' in the regexp doesn't match 'c' in the string - a dead
 end.  So backtrack two characters and pick the second alternative in
 the first group 'abc'.
 
-=item 4 Match 'a' followed by 'b' followed by 'c'.  We are on a roll
+=item 4
+
+Match 'a' followed by 'b' followed by 'c'.  We are on a roll
 and have satisfied the first group. Set $1 to 'abc'.
 
-=item 5 Move on to the second group and pick the first alternative
+=item 5
+
+Move on to the second group and pick the first alternative
 'df'.
 
-=item 6 Match the 'd'.
+=item 6
+
+Match the 'd'.
 
-=item 7 'f' in the regexp doesn't match 'e' in the string, so a dead
+=item 7
+
+'f' in the regexp doesn't match 'e' in the string, so a dead
 end.  Backtrack one character and pick the second alternative in the
 second group 'd'.
 
-=item 8 'd' matches. The second grouping is satisfied, so set $2 to
+=item 8
+
+'d' matches. The second grouping is satisfied, so set $2 to
 'd'.
 
-=item 9 We are at the end of the regexp, so we are done! We have
+=item 9
+
+We are at the end of the regexp, so we are done! We have
 matched 'abcd' out of the string "abcde".
 
 =back
@@ -679,9 +710,12 @@ indicated below it:
     /(ab(cd|ef)((gi)|j))/;
      1  2      34
 
-so that if the regexp matched, e.g., C<$2> would contain 'cd' or 'ef'.
-For convenience, perl sets C<$+> to the highest numbered C<$1>, C<$2>,
-... that got assigned.
+so that if the regexp matched, e.g., C<$2> would contain 'cd' or 'ef'. For
+convenience, perl sets C<$+> to the string held by the highest numbered
+C<$1>, C<$2>, ... that got assigned (and, somewhat related, C<$^N> to the
+value of the C<$1>, C<$2>, ... most-recently assigned; i.e. the C<$1>,
+C<$2>, ... associated with the rightmost closing parenthesis used in the
+match).
 
 Closely associated with the matching variables C<$1>, C<$2>, ... are
 the B<backreferences> C<\1>, C<\2>, ... .  Backreferences are simply
@@ -770,18 +804,30 @@ meanings:
 
 =over 4
 
-=item * C<a?> = match 'a' 1 or 0 times
+=item *
+
+C<a?> = match 'a' 1 or 0 times
+
+=item *
+
+C<a*> = match 'a' 0 or more times, i.e., any number of times
 
-=item * C<a*> = match 'a' 0 or more times, i.e., any number of times
+=item *
 
-=item * C<a+> = match 'a' 1 or more times, i.e., at least once
+C<a+> = match 'a' 1 or more times, i.e., at least once
 
-=item * C<a{n,m}> = match at least C<n> times, but not more than C<m>
+=item *
+
+C<a{n,m}> = match at least C<n> times, but not more than C<m>
 times.
 
-=item * C<a{n,}> = match at least C<n> or more times
+=item *
+
+C<a{n,}> = match at least C<n> or more times
+
+=item *
 
-=item * C<a{n}> = match exactly C<n> times
+C<a{n}> = match exactly C<n> times
 
 =back
 
@@ -845,19 +891,23 @@ the principles above to predict which way the regexp will match:
 =over 4
 
 =item *
+
 Principle 0: Taken as a whole, any regexp will be matched at the
 earliest possible position in the string.
 
 =item *
+
 Principle 1: In an alternation C<a|b|c...>, the leftmost alternative
 that allows a match for the whole regexp will be the one used.
 
 =item *
+
 Principle 2: The maximal matching quantifiers C<?>, C<*>, C<+> and
 C<{n,m}> will in general match as much of the string as possible while
 still allowing the whole regexp to match.
 
 =item *
+
 Principle 3: If there are two or more elements in a regexp, the
 leftmost greedy quantifier, if any, will match as much of the string
 as possible while still allowing the whole regexp to match.  The next
@@ -925,21 +975,33 @@ following meanings:
 
 =over 4
 
-=item * C<a??> = match 'a' 0 or 1 times. Try 0 first, then 1.
+=item *
+
+C<a??> = match 'a' 0 or 1 times. Try 0 first, then 1.
+
+=item *
 
-=item * C<a*?> = match 'a' 0 or more times, i.e., any number of times,
+C<a*?> = match 'a' 0 or more times, i.e., any number of times,
 but as few times as possible
 
-=item * C<a+?> = match 'a' 1 or more times, i.e., at least once, but
+=item *
+
+C<a+?> = match 'a' 1 or more times, i.e., at least once, but
 as few times as possible
 
-=item * C<a{n,m}?> = match at least C<n> times, not more than C<m>
+=item *
+
+C<a{n,m}?> = match at least C<n> times, not more than C<m>
 times, as few times as possible
 
-=item * C<a{n,}?> = match at least C<n> times, but as few times as
+=item *
+
+C<a{n,}?> = match at least C<n> times, but as few times as
 possible
 
-=item * C<a{n}?> = match exactly C<n> times.  Because we match exactly
+=item *
+
+C<a{n}?> = match exactly C<n> times.  Because we match exactly
 C<n> times, C<a{n}?> is equivalent to C<a{n}> and is just there for
 notational consistency.
 
@@ -998,6 +1060,7 @@ quantifiers:
 =over 4
 
 =item *
+
 Principle 3: If there are two or more elements in a regexp, the
 leftmost greedy (non-greedy) quantifier, if any, will match as much
 (little) of the string as possible while still allowing the whole
@@ -1019,23 +1082,37 @@ backtracking.  Here is a step-by-step analysis of the example
 
 =over 4
 
-=item 0 Start with the first letter in the string 't'.
+=item 0
 
-=item 1 The first quantifier '.*' starts out by matching the whole
+Start with the first letter in the string 't'.
+
+=item 1
+
+The first quantifier '.*' starts out by matching the whole
 string 'the cat in the hat'.
 
-=item 2 'a' in the regexp element 'at' doesn't match the end of the
+=item 2
+
+'a' in the regexp element 'at' doesn't match the end of the
 string.  Backtrack one character.
 
-=item 3 'a' in the regexp element 'at' still doesn't match the last
+=item 3
+
+'a' in the regexp element 'at' still doesn't match the last
 letter of the string 't', so backtrack one more character.
 
-=item 4 Now we can match the 'a' and the 't'.
+=item 4
+
+Now we can match the 'a' and the 't'.
 
-=item 5 Move on to the third element '.*'.  Since we are at the end of
+=item 5
+
+Move on to the third element '.*'.  Since we are at the end of
 the string and '.*' can match 0 times, assign it the empty string.
 
-=item 6 We are done!
+=item 6
+
+We are done!
 
 =back
 
@@ -1180,15 +1257,25 @@ This is our final regexp.  To recap, we built a regexp by
 
 =over 4
 
-=item * specifying the task in detail,
+=item *
+
+specifying the task in detail,
+
+=item *
+
+breaking down the problem into smaller parts,
+
+=item *
+
+translating the small parts into regexps,
 
-=item * breaking down the problem into smaller parts,
+=item *
 
-=item * translating the small parts into regexps,
+combining the regexps,
 
-=item * combining the regexps,
+=item *
 
-=item * and optimizing the final combined regexp.
+and optimizing the final combined regexp.
 
 =back
 
@@ -1560,13 +1647,18 @@ sequence of bytes (the old way) or as a sequence of Unicode characters
 than C<chr(127)> may be represented using the C<\x{hex}> notation,
 with C<hex> a hexadecimal integer:
 
-    use utf8;    # We will be doing Unicode processing
     /\x{263a}/;  # match a Unicode smiley face :)
 
 Unicode characters in the range of 128-255 use two hexadecimal digits
 with braces: C<\x{ab}>.  Note that this is different than C<\xab>,
-which is just a hexadecimal byte with no Unicode
-significance.
+which is just a hexadecimal byte with no Unicode significance.
+
+B<NOTE>: in perl 5.6.0 it used to be that one needed to say C<use utf8>
+to use any Unicode features.  This is no more the case: for almost all
+Unicode processing, the explicit C<utf8> pragma is not needed.
+(The only case where it matters is if your Perl script is in Unicode,
+that is, encoded in UTF-8/UTF-16/UTF-EBCDIC: then an explicit C<use utf8>
+is needed.)
 
 Figuring out the hexadecimal sequence of a Unicode character you want
 or deciphering someone else's hexadecimal Unicode regexp is about as
@@ -1577,15 +1669,12 @@ specified in the Unicode standard.  For instance, if we wanted to
 represent or match the astrological sign for the planet Mercury, we
 could use
 
-    use utf8;              # We will be doing Unicode processing
     use charnames ":full"; # use named chars with Unicode full names
     $x = "abc\N{MERCURY}def";
     $x =~ /\N{MERCURY}/;   # matches
 
 One can also use short names or restrict names to a certain alphabet:
 
-    use utf8;              # We will be doing Unicode processing
-
     use charnames ':full';
     print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
 
@@ -1596,7 +1685,7 @@ One can also use short names or restrict names to a certain alphabet:
     print "\N{sigma} is Greek sigma\n";
 
 A list of full names is found in the file Names.txt in the
-lib/perl5/5.6.0/unicode directory.
+lib/perl5/5.X.X/unicore directory.
 
 The answer to requirement 2), as of 5.6.0, is that if a regexp
 contains Unicode characters, the string is searched as a sequence of
@@ -1606,7 +1695,6 @@ characters, but matching a single byte is required, we can use the C<\C>
 escape sequence.  C<\C> is a character class akin to C<.> except that
 it matches I<any> byte 0-255.  So
 
-    use utf8;              # We will be doing Unicode processing
     use charnames ":full"; # use named chars with Unicode full names
     $x = "a";
     $x =~ /\C/;  # matches 'a', eats one byte
@@ -1628,7 +1716,6 @@ the C<\P{name}> character class, which is the negation of the
 C<\p{name}> class.  For example, to match lower and uppercase
 characters,
 
-    use utf8;              # We will be doing Unicode processing
     use charnames ":full"; # use named chars with Unicode full names
     $x = "BOB";
     $x =~ /^\p{IsUpper}/;   # matches, uppercase char class
@@ -1636,29 +1723,38 @@ characters,
     $x =~ /^\p{IsLower}/;   # doesn't match, lowercase char class
     $x =~ /^\P{IsLower}/;   # matches, char class sans lowercase
 
-If a C<name> is just one letter, the braces can be dropped.  For
-instance, C<\pM> is the character class of Unicode 'marks'.  Here is
-the association between some Perl named classes and the traditional
-Unicode classes:
+Here is the association between some Perl named classes and the
+traditional Unicode classes:
 
-    Perl class name  Unicode class name
+    Perl class name  Unicode class name or regular expression
 
-    IsAlpha          Lu, Ll, or Lo
-    IsAlnum          Lu, Ll, Lo, or Nd
-    IsASCII          $code le 127
-    IsCntrl          C
+    IsAlpha          /^[LM]/
+    IsAlnum          /^[LMN]/
+    IsASCII          $code <= 127
+    IsCntrl          /^C/
+    IsBlank          $code =~ /^(0020|0009)$/ || /^Z[^lp]/
     IsDigit          Nd
-    IsGraph          [^C] and $code ne "0020"
+    IsGraph          /^([LMNPS]|Co)/
     IsLower          Ll
-    IsPrint          [^C]
-    IsPunct          P
-    IsSpace          Z, or ($code lt "0020" and chr(hex $code) is a \s)
-    IsUpper          Lu
-    IsWord           Lu, Ll, Lo, Nd or $code eq "005F"
+    IsPrint          /^([LMNPS]|Co|Zs)/
+    IsPunct          /^P/
+    IsSpace          /^Z/ || ($code =~ /^(0009|000A|000B|000C|000D)$/
+    IsSpacePerl      /^Z/ || ($code =~ /^(0009|000A|000C|000D)$/
+    IsUpper          /^L[ut]/
+    IsWord           /^[LMN]/ || $code eq "005F"
     IsXDigit         $code =~ /^00(3[0-9]|[46][1-6])$/
 
-For a full list of Perl class names, consult the mktables.PL program
-in the lib/perl5/5.6.0/unicode directory.
+You can also use the official Unicode class names with the C<\p> and
+C<\P>, like C<\p{L}> for Unicode 'letters', or C<\p{Lu}> for uppercase
+letters, or C<\P{Nd}> for non-digits.  If a C<name> is just one
+letter, the braces can be dropped.  For instance, C<\pM> is the
+character class of Unicode 'marks', for example accent marks.
+For the full list see L<perlunicode>.
+
+The Unicode has also been separated into various sets of charaters
+which you can test with C<\p{In...}> (in) and C<\P{In...}> (not in),
+for example C<\p{InLatin}>, C<\p{InGreek}>, or C<\P{InKatakana}>.
+For the full list see L<perlunicode>.
 
 C<\X> is an abbreviation for a character class sequence that includes
 the Unicode 'combining character sequences'.  A 'combining character
@@ -1670,24 +1766,28 @@ S<C<COMBINING RING> >, which translates in Danish to A with the circle
 atop it, as in the word Angstrom.  C<\X> is equivalent to C<\PM\pM*}>,
 i.e., a non-mark followed by one or more marks.
 
+For the the full and latest information about Unicode see the latest
+Unicode standard, or the Unicode Consortium's website http://www.unicode.org/
+
 As if all those classes weren't enough, Perl also defines POSIX style
 character classes.  These have the form C<[:name:]>, with C<name> the
-name of the POSIX class.  The POSIX classes are alpha, alnum, ascii,
-cntrl, digit, graph, lower, print, punct, space, upper, word, and
-xdigit.  If C<utf8> is being used, then these classes are defined the
-same as their corresponding perl Unicode classes: C<[:upper:]> is the
-same as C<\p{IsUpper}>, etc.  The POSIX character classes, however,
-don't require using C<utf8>.  The C<[:digit:]>, C<[:word:]>, and
+name of the POSIX class.  The POSIX classes are C<alpha>, C<alnum>,
+C<ascii>, C<cntrl>, C<digit>, C<graph>, C<lower>, C<print>, C<punct>,
+C<space>, C<upper>, and C<xdigit>, and two extensions, C<word> (a Perl
+extension to match C<\w>), and C<blank> (a GNU extension).  If C<utf8>
+is being used, then these classes are defined the same as their
+corresponding perl Unicode classes: C<[:upper:]> is the same as
+C<\p{IsUpper}>, etc.  The POSIX character classes, however, don't
+require using C<utf8>.  The C<[:digit:]>, C<[:word:]>, and
 C<[:space:]> correspond to the familiar C<\d>, C<\w>, and C<\s>
-character classes.  To negate a POSIX class, put a C<^> in front of the
-name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under
+character classes.  To negate a POSIX class, put a C<^> in front of
+the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and under
 C<utf8>, C<\P{IsDigit}>.  The Unicode and POSIX character classes can
 be used just like C<\d>, both inside and outside of character classes:
 
     /\s+[abc[:digit:]xyz]\s*/;  # match a,b,c,x,y,z, or a digit
     /^=item\s[:digit:]/;        # match '=item',
                                 # followed by a space and a digit
-    use utf8;
     use charnames ":full";
     /\s+[abc\p{IsDigit}xyz]\s+/;  # match a,b,c,x,y,z, or a digit
     /^=item\s\p{IsDigit}/;        # match '=item',
@@ -2044,8 +2144,41 @@ in the regexp.  Here are some silly examples:
                                          # prints 'Hi Mom!'
     $x =~ /aaa(?{print "Hi Mom!";})def/; # doesn't match,
                                          # no 'Hi Mom!'
+
+Pay careful attention to the next example:
+
     $x =~ /abc(?{print "Hi Mom!";})ddd/; # doesn't match,
                                          # no 'Hi Mom!'
+                                         # but why not?
+
+At first glance, you'd think that it shouldn't print, because obviously
+the C<ddd> isn't going to match the target string. But look at this
+example:
+
+    $x =~ /abc(?{print "Hi Mom!";})[d]dd/; # doesn't match,
+                                           # but _does_ print
+
+Hmm. What happened here? If you've been following along, you know that
+the above pattern should be effectively the same as the last one --
+enclosing the d in a character class isn't going to change what it
+matches. So why does the first not print while the second one does?
+
+The answer lies in the optimizations the REx engine makes. In the first
+case, all the engine sees are plain old characters (aside from the
+C<?{}> construct). It's smart enough to realize that the string 'ddd'
+doesn't occur in our target string before actually running the pattern
+through. But in the second case, we've tricked it into thinking that our
+pattern is more complicated than it is. It takes a look, sees our
+character class, and decides that it will have to actually run the
+pattern to determine whether or not it matches, and in the process of
+running it hits the print statement before it discovers that we don't
+have a match.
+
+To take a closer look at how the engine does optimizations, see the
+section L<"Pragmas and debugging"> below.
+
+More fun with C<?{}>:
+
     $x =~ /(?{print "Hi Mom!";})/;       # matches,
                                          # prints 'Hi Mom!'
     $x =~ /(?{$c = 1;})(?{print "$c";})/;  # matches,