From: Juerd Waalboer <#####@juerd.nl>
Date: Sat, 17 Nov 2007 20:03:00 +0000 (+0100)
Subject: [patch] :utf8 updates
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=740d4bb23b722729f87a23733be98429529fd900;p=p5sagit%2Fp5-mst-13.2.git

[patch] :utf8 updates
Message-ID: <20071117190300.GY10696@c4.convolution.nl>

p4raw-id: //depot/perl@32461
---

diff --git a/AUTHORS b/AUTHORS
index 93a0bcb..b784990 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -476,7 +476,7 @@ Joshua Pritikin			<joshua.pritikin@db.com>
 Joost van Baal			<J.E.vanBaal@uvt.nl>
 JT McDuffie			<jt@kpc.com>
 Juan Gallego			<Little.Boss@physics.mcgill.ca>
-Juerd Waalboer			<juerd@cpan.org>
+Juerd Waalboer			<#####@juerd.nl>
 Juha Laiho			<juha.laiho@Elma.Net>
 Julian Yip			<julian@imoney.com>
 juna                            <ggl.20.jj...@spamgourmet.com>
diff --git a/lib/PerlIO.pm b/lib/PerlIO.pm
index c0acdec..42c56e8 100644
--- a/lib/PerlIO.pm
+++ b/lib/PerlIO.pm
@@ -139,6 +139,10 @@ and then read it back in.
 	$in = <F>;
 	close(F);
 
+Note that this layer does not validate byte sequences. For reading
+input, using C<:encoding(utf8)> instead of bare C<:utf8> is strongly
+recommended.
+
 =item :bytes
 
 This is the inverse of C<:utf8> layer. It turns off the flag
diff --git a/lib/open.pm b/lib/open.pm
index d384b41..6415a08 100644
--- a/lib/open.pm
+++ b/lib/open.pm
@@ -79,11 +79,7 @@ sub import {
 		    unless defined $locale_encoding;
 		(warnings::warnif("layer", "Cannot figure out an encoding to use"), last)
 		    unless defined $locale_encoding;
-		if ($locale_encoding =~ /^utf-?8$/i) {
-		    $layer = "utf8";
-		} else {
-		    $layer = "encoding($locale_encoding)";
-		}
+                $layer = "encoding($locale_encoding)";
 		$std = 1;
 	    } else {
 		my $target = $layer;		# the layer name itself
@@ -151,7 +147,7 @@ open - perl pragma to set default PerlIO layers for input and output
 
     use open IO  => ':locale';
 
-    use open ':utf8';
+    use open ':encoding(utf8)';
     use open ':locale';
     use open ':encoding(iso-8859-7)';
 
@@ -193,8 +189,8 @@ For example:
 
 These are equivalent
 
-    use open ':utf8';
-    use open IO => ':utf8';
+    use open ':encoding(utf8)';
+    use open IO => ':encoding(utf8)';
 
 as are these
 
@@ -210,9 +206,6 @@ The matching of encoding names is loose: case does not matter, and
 many encodings have several aliases.  See L<Encode::Supported> for
 details and the list of supported locales.
 
-Note that C<:utf8> PerlIO layer must always be specified exactly like
-that, it is not subject to the loose matching of encoding names.
-
 When open() is given an explicit list of layers (with the three-arg
 syntax), they override the list declared using this pragma.
 
@@ -220,10 +213,10 @@ The C<:std> subpragma on its own has no effect, but if combined with
 the C<:utf8> or C<:encoding> subpragmas, it converts the standard
 filehandles (STDIN, STDOUT, STDERR) to comply with encoding selected
 for input/output handles.  For example, if both input and out are
-chosen to be C<:utf8>, a C<:std> will mean that STDIN, STDOUT, and
-STDERR are also in C<:utf8>.  On the other hand, if only output is
-chosen to be in C<< :encoding(koi8r) >>, a C<:std> will cause only the
-STDOUT and STDERR to be in C<koi8r>.  The C<:locale> subpragma
+chosen to be C<:encoding(utf8)>, a C<:std> will mean that STDIN, STDOUT,
+and STDERR are also in C<:encoding(utf8)>.  On the other hand, if only
+output is chosen to be in C<< :encoding(koi8r) >>, a C<:std> will cause
+only the STDOUT and STDERR to be in C<koi8r>.  The C<:locale> subpragma
 implicitly turns on C<:std>.
 
 The logic of C<:locale> is described in full in L<encoding>,
diff --git a/pod/perlcheat.pod b/pod/perlcheat.pod
index 440b359..07853a8 100644
--- a/pod/perlcheat.pod
+++ b/pod/perlcheat.pod
@@ -84,7 +84,7 @@ Perl 6 version to show that Perl will stay Perl.
 
 =head1 AUTHOR
 
-Juerd Waalboer <juerd@cpan.org>, with the help of many Perl Monks.
+Juerd Waalboer <#####@juerd.nl>, with the help of many Perl Monks.
 
 =head1 SEE ALSO
 
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
index f954aa2..8a84557 100644
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -4345,10 +4345,10 @@ See L<perlipc/"UDP: Message Passing"> for examples.
 Note the I<characters>: depending on the status of the socket, either
 (8-bit) bytes or characters are received.  By default all sockets
 operate on bytes, but for example if the socket has been changed using
-binmode() to operate with the C<:utf8> I/O layer (see the C<open>
-pragma, L<open>), the I/O will operate on UTF-8 encoded Unicode
-characters, not bytes.  Similarly for the C<:encoding> pragma:
-in that case pretty much any characters can be read.
+binmode() to operate with the C<:encoding(utf8)> I/O layer (see the
+C<open> pragma, L<open>), the I/O will operate on UTF-8 encoded Unicode
+characters, not bytes.  Similarly for the C<:encoding> pragma: in that
+case pretty much any characters can be read.
 
 =item redo LABEL
 X<redo>
@@ -4784,7 +4784,7 @@ of the file) from the Fcntl module.  Returns C<1> upon success, C<0>
 otherwise.
 
 Note the I<in bytes>: even if the filehandle has been set to
-operate on characters (for example by using the C<:utf8> open
+operate on characters (for example by using the C<:encoding(utf8)> open
 layer), tell() will return byte offsets, not character offsets
 (because implementing that would render seek() and tell() rather slow).
 
@@ -4974,10 +4974,10 @@ L<perlipc/"UDP: Message Passing"> for examples.
 Note the I<characters>: depending on the status of the socket, either
 (8-bit) bytes or characters are sent.  By default all sockets operate
 on bytes, but for example if the socket has been changed using
-binmode() to operate with the C<:utf8> I/O layer (see L</open>, or the
-C<open> pragma, L<open>), the I/O will operate on UTF-8 encoded
-Unicode characters, not bytes.  Similarly for the C<:encoding> pragma:
-in that case pretty much any characters can be sent.
+binmode() to operate with the C<:encoding(utf8)> I/O layer (see
+L</open>, or the C<open> pragma, L<open>), the I/O will operate on UTF-8
+encoded Unicode characters, not bytes.  Similarly for the C<:encoding>
+pragma: in that case pretty much any characters can be sent.
 
 =item setpgrp PID,PGRP
 X<setpgrp> X<group>
@@ -6328,9 +6328,9 @@ POSITION, and C<2> to set it to EOF plus POSITION (typically
 negative).
 
 Note the I<in bytes>: even if the filehandle has been set to operate
-on characters (for example by using the C<:utf8> I/O layer), tell()
-will return byte offsets, not character offsets (because implementing
-that would render sysseek() very slow).
+on characters (for example by using the C<:encoding(utf8)> I/O layer),
+tell() will return byte offsets, not character offsets (because
+implementing that would render sysseek() very slow).
 
 sysseek() bypasses normal buffered IO, so mixing this with reads (other
 than C<sysread>, for example C<< <> >> or read()) C<print>, C<write>,
@@ -6455,9 +6455,9 @@ the actual filehandle.  If FILEHANDLE is omitted, assumes the file
 last read.
 
 Note the I<in bytes>: even if the filehandle has been set to
-operate on characters (for example by using the C<:utf8> open
-layer), tell() will return byte offsets, not character offsets
-(because that would render seek() and tell() rather slow).
+operate on characters (for example by using the C<:encoding(utf8)> open
+layer), tell() will return byte offsets, not character offsets (because
+that would render seek() and tell() rather slow).
 
 The return value of tell() for the standard streams like the STDIN
 depends on the operating system: it may return -1 or something else.
diff --git a/pod/perlopentut.pod b/pod/perlopentut.pod
index cd97fdc..18bc369 100644
--- a/pod/perlopentut.pod
+++ b/pod/perlopentut.pod
@@ -917,7 +917,7 @@ second argument contains something else in addition to the usual
 C<< '<' >>, C<< '>' >>, C<< '>>' >>, C<< '|' >> and their variants,
 for example:
 
-    open(my $fh, "<:utf8", $fn);
+    open(my $fh, "<:crlf", $fn);
 
 =item *
 
diff --git a/pod/perlrun.pod b/pod/perlrun.pod
index 33d4f55..f345a78 100644
--- a/pod/perlrun.pod
+++ b/pod/perlrun.pod
@@ -1124,9 +1124,9 @@ X<:utf8>
 
 A pseudolayer that turns on a flag on the layer below to tell perl
 that output should be in utf8 and that input should be regarded as
-already in utf8 form.  May be useful in PERLIO environment
-variable to make UTF-8 the default. (To turn off that behaviour
-use C<:bytes> layer.)
+already in valid utf8 form. It does not check for validity and as such
+should be handled with caution for input. Generally C<:encoding(utf8)> is
+the best option when reading UTF-8 encoded data.
 
 =item :win32
 X<:win32>
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index bd279f9..61d62d2 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1523,7 +1523,7 @@ to work under 5.6, so you should be safe to try them out.
 A filehandle that should read or write UTF-8
 
   if ($] > 5.007) {
-    binmode $fh, ":utf8";
+    binmode $fh, ":encoding(utf8)";
   }
 
 =item *
diff --git a/pod/perlunifaq.pod b/pod/perlunifaq.pod
index 4b2290a..b291334 100644
--- a/pod/perlunifaq.pod
+++ b/pod/perlunifaq.pod
@@ -2,7 +2,7 @@
 
 perlunifaq - Perl Unicode FAQ
 
-=head1 DESCRIPTION
+=head1 Q and A
 
 This is a list of questions and answers about Unicode in Perl, intended to be
 read after L<perlunitut>.
@@ -16,6 +16,21 @@ is actually a generic C<Encode> tutorial and C<Encode> FAQ. But many people
 think that Unicode is special and magical, and I didn't want to disappoint
 them, so I decided to call the document a Unicode tutorial.
 
+=head2 What character encodings does Perl support?
+
+To find out which character encodings your Perl supports, run:
+
+    perl -MEncode -le "print for Encode->encodings(':all')"
+
+=head2 Which version of perl should I use?
+
+Well, if you can, upgrade to the most recent, but certainly C<5.8.1> or newer.
+The tutorial and FAQ are based on the status quo as of C<5.8.8>.
+
+You should also check your modules, and upgrade them if necessary. For example,
+HTML::Entities requires version >= 1.32 to function correctly, even though the
+changelog is silent about this.
+
 =head2 What about binary data, like images?
 
 Well, apart from a bare C<binmode $fh>, you shouldn't treat them specially.
@@ -27,20 +42,9 @@ need text in a binary stream, encode your text strings first using the
 appropriate encoding, then join them with binary strings. See also: "What if I
 don't encode?".
 
-=head2 What about the UTF8 flag?
-
-Please, unless you're hacking the internals, or debugging weirdness, don't
-think about the UTF8 flag at all. That means that you very probably shouldn't
-use C<is_utf8>, C<_utf8_on> or C<_utf8_off> at all.
-
-Perl's internal format happens to be UTF-8. Unfortunately, Perl can't keep a
-secret, so everyone knows about this.  That is the source of much confusion.
-It's better to pretend that the internal format is some unknown encoding,
-and that you always have to encode and decode explicitly.
-
 =head2 When should I decode or encode?
 
-Whenever you're communicating with anything that is external to your perl
+Whenever you're communicating text with anything that is external to your perl
 process, like a database, a text file, a socket, or another program. Even if
 the thing you're communicating with is also written in Perl.
 
@@ -88,23 +92,7 @@ Or if you already have an open filehandle:
     binmode $fh, ':encoding(UTF-8)';
 
 Some database drivers for DBI can also automatically encode and decode, but
-that is typically limited to the UTF-8 encoding, because they cheat.
-
-=head2 Cheat?! Tell me, how can I cheat?
-
-Well, because Perl's internal format is UTF-8, you can just skip the encoding
-or decoding step, and manipulate the UTF8 flag directly.
-
-Instead of C<:encoding(UTF-8)>, you can simply use C<:utf8>. This is widely
-accepted as good behavior when you're writing, but it can be dangerous when
-reading, because it causes internal inconsistency when you have invalid byte
-sequences.
-
-Instead of C<decode> and C<encode>, you could use C<_utf8_on> and C<_utf8_off>,
-but this is considered bad style. Especially C<_utf8_on> can be dangerous, for
-the same reason that C<:utf8> can.
-
-There are some shortcuts for oneliners; see C<-C> in L<perlrun>.
+that is sometimes limited to the UTF-8 encoding.
 
 =head2 What if I don't know which encoding was used?
 
@@ -146,6 +134,25 @@ UTF-8.
 If you properly encode your strings for output, none of this is of your
 concern, and you can just C<eval> dumped data as always.
 
+=head2 Why do regex character classes sometimes match only in the ASCII range?
+
+=head2 Why do some characters not uppercase or lowercase correctly?
+
+It seemed like a good idea at the time, to keep the semantics the same for
+standard strings, when Perl got Unicode support. While it might be repaired
+in the future, we now have to deal with the fact that Perl treats equal
+strings differently, depending on the internal state.
+
+Affected are C<uc>, C<lc>, C<ucfirst>, C<lcfirst>, C<\U>, C<\L>, C<\u>, C<\l>,
+C<\d>, C<\s>, C<\w>, C<\D>, C<\S>, C<\W>, C</.../i>, C<(?i:...)>,
+C</[[:posix:]]/>.
+
+To force Unicode semantics, you can upgrade the internal representation to
+UTF-8 by doing C<utf8::upgrade($string)>. This does not change strings that
+were already upgraded.
+
+For a more detailed discussion, see L<Unicode::Semantics> on CPAN.
+
 =head2 How can I determine if a string is a text string or a binary string?
 
 You can't. Some use the UTF8 flag for this, but that's misuse, and makes well
@@ -176,6 +183,45 @@ or by letting automatic decoding and encoding do all the work:
     open my $barfh, '>:encoding(BAR)', 'example.bar.txt';
     print { $barfh } $_ while <$foofh>;
 
+=head2 What are C<decode_utf8> and C<encode_utf8>?
+
+These are alternate syntaxes for C<decode('utf8', ...)> and C<encode('utf8',
+...)>.
+
+=head2 What is a "wide character"?
+
+This is a term used for characters with an ordinal value greater than 127,
+characters with an ordinal value greater than 255, or any character occupying
+more than one byte, depending on the context.
+
+The Perl warning "Wide character in ..." is caused by a character with an
+ordinal value greater than 255. With no specified encoding layer, Perl tries to
+fit things in ISO-8859-1 for backward compatibility reasons. When it can't, it
+emits this warning (if warnings are enabled), and outputs UTF-8 encoded data
+instead.
+
+To avoid this warning and to avoid having different output encodings in a single
+stream, always specify an encoding explicitly, for example with a PerlIO layer:
+
+    binmode STDOUT, ":encoding(UTF-8)";
+
+=head1 INTERNALS
+
+=head2 What is "the UTF8 flag"?
+
+Please, unless you're hacking the internals, or debugging weirdness, don't
+think about the UTF8 flag at all. That means that you very probably shouldn't
+use C<is_utf8>, C<_utf8_on> or C<_utf8_off> at all.
+
+The UTF8 flag, also called SvUTF8, is an internal flag that indicates that the
+current internal representation is UTF-8. Without the flag, it is assumed to be
+ISO-8859-1. Perl converts between these automatically.
+
+One of Perl's internal formats happens to be UTF-8. Unfortunately, Perl can't
+keep a secret, so everyone knows about this. That is the source of much
+confusion. It's better to pretend that the internal format is some unknown
+encoding, and that you always have to encode and decode explicitly.
+
 =head2 What about the C<use bytes> pragma?
 
 Don't use it. It makes no sense to deal with bytes in a text string, and it
@@ -186,10 +232,36 @@ character counts for decoded data, and byte counts for encoded data.
 C<use bytes> is usually a failed attempt to do something useful. Just forget
 about it.
 
-=head2 What are C<decode_utf8> and C<encode_utf8>?
+=head2 What about the C<use encoding> pragma?
 
-These are alternate syntaxes for C<decode('utf8', ...)> and C<encode('utf8',
-...)>.
+Don't use it. Unfortunately, it assumes that the programmer's environment and
+that of the user will use the same encoding. It will use the same encoding for
+the source code and for STDIN and STDOUT. When a program is copied to another
+machine, the source code does not change, but the STDIO environment might.
+
+If you need non-ASCII characters in your source code, make it a UTF-8 encoded
+file and C<use utf8>.
+
+If you need to set the encoding for STDIN, STDOUT, and STDERR, for example
+based on the user's locale, C<use open>.
+
+=head2 What is the difference between C<:encoding> and C<:utf8>?
+
+Because UTF-8 is one of Perl's internal formats, you can often just skip the
+encoding or decoding step, and manipulate the UTF8 flag directly.
+
+Instead of C<:encoding(UTF-8)>, you can simply use C<:utf8>, which skips the
+encoding step if the data was already represented as UTF8 internally. This is
+widely accepted as good behavior when you're writing, but it can be dangerous
+when reading, because it causes internal inconsistency when you have invalid
+byte sequences. Using C<:utf8> for input can sometimes result in security
+breaches, so please use C<:encoding(UTF-8)> instead.
+
+Instead of C<decode> and C<encode>, you could use C<_utf8_on> and C<_utf8_off>,
+but this is considered bad style. Especially C<_utf8_on> can be dangerous, for
+the same reason that C<:utf8> can.
+
+There are some shortcuts for oneliners; see C<-C> in L<perlrun>.
 
 =head2 What's the difference between C<UTF-8> and C<utf8>?
 
@@ -223,24 +295,9 @@ when you C<encode>. In other words: don't try to find out what the internal
 encoding for a certain string is, but instead just encode it into the encoding
 that you want.
 
-=head2 What character encodings does Perl support?
-
-To find out which character encodings your Perl supports, run:
-
-    perl -MEncode -le "print for Encode->encodings(':all')"
-
-=head2 Which version of perl should I use?
-
-Well, if you can, upgrade to the most recent, but certainly C<5.8.1> or newer.
-The tutorial and FAQ are based on the status quo as of C<5.8.8>.
-
-You should also check your modules, and upgrade them if necessary. For example,
-HTML::Entities requires version >= 1.32 to function correctly, even though the
-changelog is silent about this.
-
 =head1 AUTHOR
 
-Juerd Waalboer <juerd@cpan.org>
+Juerd Waalboer <#####@juerd.nl>
 
 =head1 SEE ALSO
 
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
index ec5f6a4..ee61acf 100644
--- a/pod/perluniintro.pod
+++ b/pod/perluniintro.pod
@@ -167,7 +167,7 @@ as a warning:
 
      Wide character in print at ...
 
-To output UTF-8, use the C<:utf8> output layer.  Prepending
+To output UTF-8, use the C<:encoding> or C<:utf8> output layer.  Prepending
 
       binmode(STDOUT, ":utf8");
 
@@ -317,7 +317,9 @@ and on already open streams, use C<binmode()>:
 The matching of encoding names is loose: case does not matter, and
 many encodings have several aliases.  Note that the C<:utf8> layer
 must always be specified exactly like that; it is I<not> subject to
-the loose matching of encoding names.
+the loose matching of encoding names. Also note that C<:utf8> is unsafe for
+input, because it accepts the data without validating that it is indeed valid
+UTF8.
 
 See L<PerlIO> for the C<:utf8> layer, L<PerlIO::encoding> and
 L<Encode::PerlIO> for the C<:encoding()> layer, and
@@ -329,7 +331,7 @@ Unicode or legacy encodings does not magically turn the data into
 Unicode in Perl's eyes.  To do that, specify the appropriate
 layer when opening files
 
-    open(my $fh,'<:utf8', 'anything');
+    open(my $fh,'<:encoding(utf8)', 'anything');
     my $line_of_unicode = <$fh>;
 
     open(my $fh,'<:encoding(Big5)', 'anything');
@@ -338,7 +340,7 @@ layer when opening files
 The I/O layers can also be specified more flexibly with
 the C<open> pragma.  See L<open>, or look at the following example.
 
-    use open ':utf8'; # input and output default layer will be UTF-8
+    use open ':encoding(utf8)'; # input/output default encoding will be UTF-8
     open X, ">file";
     print X chr(0x100), "\n";
     close X;
@@ -358,11 +360,6 @@ With the C<open> pragma you can use the C<:locale> layer
     printf "%#x\n", ord(<I>), "\n"; # this should print 0xc1
     close I;
 
-or you can also use the C<':encoding(...)'> layer
-
-    open(my $epic,'<:encoding(iso-8859-7)','iliad.greek');
-    my $line_of_unicode = <$epic>;
-
 These methods install a transparent filter on the I/O stream that
 converts data from the specified encoding when it is read in from the
 stream.  The result is always Unicode.
@@ -411,13 +408,13 @@ by repeatedly encoding the data:
     local $/; ## read in the whole file of 8-bit characters
     $t = <F>;
     close F;
-    open F, ">:utf8", "file";
+    open F, ">:encoding(utf8)", "file";
     print F $t; ## convert to UTF-8 on output
     close F;
 
 If you run this code twice, the contents of the F<file> will be twice
-UTF-8 encoded.  A C<use open ':utf8'> would have avoided the bug, or
-explicitly opening also the F<file> for input as UTF-8.
+UTF-8 encoded.  A C<use open ':encoding(utf8)'> would have avoided the
+bug, or explicitly opening also the F<file> for input as UTF-8.
 
 B<NOTE>: the C<:utf8> and C<:encoding> features work only if your
 Perl has been built with the new PerlIO feature (which is the default
diff --git a/pod/perlunitut.pod b/pod/perlunitut.pod
index 5328049..6c7dfb0 100644
--- a/pod/perlunitut.pod
+++ b/pod/perlunitut.pod
@@ -201,7 +201,7 @@ Gray.
 
 =head1 AUTHOR
 
-Juerd Waalboer <juerd@cpan.org>
+Juerd Waalboer <#####@juerd.nl>
 
 =head1 SEE ALSO