It seems the binmode() is needed with UTF-8 locales enabled.

[p5sagit/p5-mst-13.2.git] / ext / Encode / encoding.pm
diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm

index 420defe..e8aa737 100644 (file)
--- a/ext/Encode/encoding.pm
+++ b/ext/Encode/encoding.pm
@@ -1,5 +1,5 @@
 package encoding;
-our $VERSION = do { my @r = (q$Revision: 1.33 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.38 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
 
 use Encode;
 use strict;
@@ -7,7 +7,7 @@ use strict;
 BEGIN {
     if (ord("A") == 193) {
        require Carp;
-       Carp::croak "encoding pragma does not support EBCDIC platforms";
+       Carp::croak("encoding pragma does not support EBCDIC platforms");
     }
 }
 
@@ -26,16 +26,17 @@ sub import {
     my $enc = find_encoding($name);
     unless (defined $enc) {
        require Carp;
-       Carp::croak "Unknown encoding '$name'";
+       Carp::croak("Unknown encoding '$name'");
     }
-    unless ($arg{Filter}){
-       ${^ENCODING} = $enc; # this is all you need, actually.
+    unless ($arg{Filter}) {
+       ${^ENCODING} = $enc # this is all you need, actually.
+           unless $name =~ /^(?:utf-?(?:8|16|32)|ucs-?(?:2|4))(?:[bl]e)?$/i;
        $HAS_PERLIO or return 1;
        for my $h (qw(STDIN STDOUT)){
            if ($arg{$h}){
                unless (defined find_encoding($arg{$h})) {
                    require Carp;
-                   Carp::croak "Unknown encoding for $h, '$arg{$h}'";
+                   Carp::croak("Unknown encoding for $h, '$arg{$h}'");
                }
                eval { binmode($h, ":encoding($arg{$h})") };
            }else{
@@ -75,8 +76,13 @@ sub import {
 sub unimport{
     no warnings;
     undef ${^ENCODING};
+    if ($HAS_PERLIO){
+       binmode(STDIN,  ":raw");
+       binmode(STDOUT, ":raw");
+    }else{
     binmode(STDIN);
     binmode(STDOUT);
+    }
     if ($INC{"Filter/Util/Call.pm"}){
        eval { filter_del() };
     }
@@ -101,11 +107,6 @@ encoding - allows you to write your script in non-ascii or non-utf8
   perl -Mencoding=latin2 -e '...' # Feeling centrally European?
   perl -Mencoding=euc-kr -e '...' # Or Korean?
 
-  # or from the shebang line
-
-  #!/your/path/to/perl -Mencoding="8859-6" # Arabian Nights
-  #!/your/path/to/perl -Mencoding=big5     # Taiwanese
-
   # more control
 
   # A simple euc-cn => utf-8 converter
@@ -144,7 +145,7 @@ the code in UTF-8:
   s/\bCamel\b/$Rakuda/;
 
 The B<encoding> pragma also modifies the filehandle disciplines of
-STDIN, STDOUT, and STDERR to the specified encoding.  Therefore,
+STDIN and STDOUT to the specified encoding.  Therefore,
 
   use encoding "euc-jp";
   my $message = "Camel is the symbol of perl.\n";
@@ -193,10 +194,16 @@ reset to ":raw" (the default unprocessed raw stream of bytes).
 =head2 NOT SCOPED
 
 The pragma is a per script, not a per block lexical.  Only the last
-C<use encoding> or C<no encoding> matters, and it affects B<the whole script>.
-However, the <no encoding> pragma is supported and C<use encoding> can
-appear as many times as you want in a given script.  The multiple use
-of this pragma is discouraged.
+C<use encoding> or C<no encoding> matters, and it affects
+B<the whole script>.  However, the C<no encoding> pragma is supported and
+C<use encoding> can appear as many times as you want in a given script.
+Multiple use of this pragma is discouraged.
+
+Because of this nature, the use of this pragma inside a module is
+strongly discouraged (because the influence of this pragma lasts not
+only for the module but also for the script that uses it).  But if you
+have to, make sure you say C<no encoding> at the end of the module so
+you contain the influence of the pragma within the module.
 
 =head2 DO NOT MIX MULTIPLE ENCODINGS
 
@@ -231,6 +238,53 @@ resort to \x{....} just to spell your name in a native encoding.
 So feel free to put your strings in your encoding in quotes and
 regexes.
 
+=head2 tr/// with ranges remain unaffected
+
+The B<encoding> pragma works by decoding string literals in
+C<q//>, C<qq//>, C<qr//>, C<qw//>, C<qx//> and so forth.  As of perl
+5.8.0, this does not apply to C<tr///>.  Therefore,
+
+  use encoding 'euc-jp';
+  #....
+  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
+  #           -------- -------- -------- --------
+
+does not work as
+
+  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
+
+=over
+
+=item Legend of characters above
+
+  utf8     euc-jp   charnames::viacode()
+  -----------------------------------------
+  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
+  \x{3093} \xA4\xF3 HIRAGANA LETTER N
+  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
+  \x{30f3} \xA5\xF3 KATAKANA LETTER N
+
+=back
+
+=head3 workaround to tr///;
+
+You can, however, achieve the same effect simply as follows:
+
+  use encoding 'euc-jp';
+  # ....
+  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
+
+Note the C<tr///> expression is surrounded by C<qq{}>.  The idea behind
+it is the same as the classic idiom that makes C<tr///> 'interpolate'.
+
+   tr/$from/$to/;            # wrong!
+   eval qq{ tr/$from/$to/ }; # workaround.
+
+Nevertheless, in the case of the B<encoding> pragma even C<q//> is affected,
+so C<tr///> not being decoded was obviously against the will of the Perl5
+Porters.  In a future version of perl, this counter-intuitive behaviour of
+C<tr///> will be fixed so the C<eval qq{}> trick will be unnecessary.
+
 =head1 Non-ASCII Identifiers and Filter option
 
 The magic of C<use encoding> is not applied to the names of