Make the point a bit clearer after suggestion from Merijin

[p5sagit/p5-mst-13.2.git] / pod / perluniintro.pod
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod

index 736a0e2..d6eae60 100644 (file)
--- a/pod/perluniintro.pod
+++ b/pod/perluniintro.pod
@@ -128,10 +128,9 @@ This model was found to be wrong, or at least clumsy: the Unicodeness
 is now carried with the data, not attached to the operations.  (There
 is one remaining case where an explicit C<use utf8> is needed: if your
 Perl script itself is encoded in UTF-8, you can use UTF-8 in your
-variable and subroutine names, and in your string and regular
-expression literals, by saying C<use utf8>.  This is not the default
-because that would break existing scripts having legacy 8-bit data in
-them.)
+identifier names, and in your string and regular expression literals,
+by saying C<use utf8>.  This is not the default because that would
+break existing scripts having legacy 8-bit data in them.)
 
 =head2 Perl's Unicode Model
 
@@ -172,7 +171,10 @@ of course, removes the warning.
 If your locale environment variables (LANGUAGE, LC_ALL, LC_CTYPE, LANG)
 contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
 the default encoding of your STDIN, STDOUT, and STDERR, and of
-B<any subsequent file open>, is UTF-8.
+B<any subsequent file open>, is UTF-8.  Note that this means
+that Perl expects other software to work, too: if STDIN coming
+in from another command is not UTF-8, Perl will complain about
+malformed UTF-8.
 
 =head2 Unicode and EBCDIC
 
@@ -277,8 +279,8 @@ C<Latin 2>, or C<iso8859-2>, and so forth.  With just
 
     use encoding;
 
-first the environment variable C<PERL_ENCODING> will be consulted,
-and if that doesn't exist, ISO 8859-1 (Latin 1) will be assumed.
+the environment variable C<PERL_ENCODING> will be consulted,
+but if that doesn't exist, the encoding pragma fails.
 
 The C<Encode> module knows about many encodings and it has interfaces
 for doing conversions between those encodings:
@@ -407,11 +409,6 @@ If you run this code twice, the contents of the F<file> will be twice
 UTF-8 encoded.  A C<use open ':utf8'> would have avoided the bug, or
 explicitly opening also the F<file> for input as UTF-8.
 
-In some filesystems (for example Microsoft NTFS and Apple HFS+) the
-filenames are in UTF-8 .  By using opendir() and File::Glob you can
-make readdir() and glob() to return the filenames as Unicode, see
-L<perlfunc/opendir> and L<File::Glob> for details.
-
 B<NOTE>: the C<:utf8> and C<:encoding> features work only if your
 Perl has been built with the new "perlio" feature.  Almost all 
 Perl 5.8 platforms do use "perlio", though: you can see whether
@@ -627,13 +624,16 @@ the output string will be UTF-8-encoded "ab\x80c\x{100}\n", but note
 that C<$a> will stay single byte encoded.
 
 Sometimes you might really need to know the byte length of a string
-instead of the character length.  For that use the C<bytes> pragma
-and its only defined function C<length()>:
+instead of the character length. For that use either the
+C<Encode::encode_utf8()> function or the C<bytes> pragma and its only
+defined function C<length()>:
 
     my $unicode = chr(0x100);
     print length($unicode), "\n"; # will print 1
+    require Encode;
+    print length(Encode::encode_utf8($unicode)), "\n"; # will print 2
     use bytes;
-    print length($unicode), "\n"; # will print 2 (the 0xC4 0x80 of the UTF-8)
+    print length($unicode), "\n"; # will also print 2 (the 0xC4 0x80 of the UTF-8)
 
 =item