ext/Encode/encoding.pm

   1 package encoding;
   2 our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   3
   4 use Encode;
   5
   6 BEGIN {
   7     if (ord("A") == 193) {
   8         require Carp;
   9         Carp::croak "encoding pragma does not support EBCDIC platforms";
  10     }
  11 }
  12
  13 sub import {
  14     my $class = shift;
  15     my $name  = shift;
  16     my %arg = @_;
  17     $name ||= $ENV{PERL_ENCODING};
  18
  19     my $enc = find_encoding($name);
  20     unless (defined $enc) {
  21         require Carp;
  22         Carp::croak "Unknown encoding '$name'";
  23     }
  24     ${^ENCODING} = $enc; # this is all you need, actually.
  25
  26     # $_OPEN_ORIG = ${^OPEN};
  27     for my $h (qw(STDIN STDOUT STDERR)){
  28         if ($arg{$h}){
  29             unless (defined find_encoding($name)) {
  30                 require Carp;
  31                 Carp::croak "Unknown encoding for $fhname, '$arg{$h}'";
  32             }
  33             eval qq{ binmode($h, ":encoding($arg{h})") };
  34         }else{
  35             eval qq{ binmode($h, ":encoding($name)") };
  36         }
  37         if ($@){
  38             require Carp;
  39             Carp::croak($@);
  40         }
  41     }
  42     return 1; # I doubt if we need it, though
  43 }
  44
  45 sub unimport{
  46     no warnings;
  47     undef ${^ENCODING};
  48     binmode(STDIN,  ":raw");
  49     binmode(STDOUT, ":raw");
  50     # Leaves STDERR alone.
  51     # binmode(STDERR, ":raw");
  52 }
  53
  54 1;
  55 __END__
  56 =pod
  57
  58 =head1 NAME
  59
  60 encoding -  allows you to write your script in non-asii or non-utf8
  61
  62 =head1 SYNOPSIS
  63
  64   use encoding "euc-jp"; # Jperl!
  65
  66   # or you can even do this if your shell supports euc-jp
  67
  68   > perl -Mencoding=euc-jp -e '...'
  69
  70   # or from the shebang line
  71
  72   #!/your/path/to/perl -Mencoding=euc-jp
  73
  74   # more control
  75
  76   # A simple euc-jp => utf-8 converter
  77   use encoding "euc-jp", STDOUT => "utf8";  while(<>){print};
  78
  79   # "no encoding;" supported (but not scoped!)
  80   no encoding;
  81
  82 =head1 ABSTRACT
  83
  84 Perl 5.6.0 has introduced Unicode support.  You could apply
  85 C<substr()> and regexes even to complex CJK characters -- so long as
  86 the script was written in UTF-8.  But back then text editors that
  87 support UTF-8 was still rare and many users rather chose to writer
  88 scripts in legacy encodings, given up whole new feature of Perl 5.6.
  89
  90 With B<encoding> pragma, you can write your script in any encoding you like
  91 (so long as the C<Encode> module supports it) and still enjoy Unicode
  92 support.  You can write a code in EUC-JP as follows;
  93
  94   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
  95                #<-char-><-char->   # 4 octets
  96   s/\bCamel\b/$Rakuda/;
  97
  98 And with C<use encoding "euc-jp"> in effect, it is the same thing as
  99 the code in UTF-8 as follow.
 100
 101   my $Rakuda = "\x{99F1}\x{99DD}"; # who Unicode Characters
 102   s/\bCamel\b/$Rakuda/;
 103
 104 The B<encoding> pragma also modifies the file handle disciplines of
 105 STDIN, STDOUT, and STDERR to the specified encoding.  Therefore,
 106
 107   use encoding "euc-jp";
 108   my $message = "Camel is the symbol of perl.\n";
 109   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 110   $message =~ s/\bCamel\b/$Rakuda/;
 111   print $message;
 112
 113 Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", not
 114 "\x{99F1}\x{99DD} is the symbol of perl.\n".
 115
 116 You can override this by giving extra arguments.  See below.
 117
 118 =head1 USAGE
 119
 120 =over 4
 121
 122 =item use encoding [I<ENCNAME>] ;
 123
 124 Sets the script encoding to I<ENCNAME> and file handle disciplines of
 125 STDIN, STDOUT are set to ":encoding(I<ENCNAME>)". Note STDERR will not
 126 be changed.
 127
 128 If no encoding is specified, the environment variable L<PERL_ENCODING>
 129 is consulted. If no  encoding can be found, C<Unknown encoding 'I<ENCNAME>'>
 130 error will be thrown.
 131
 132 Note that non-STD file handles remain unaffected.  Use C<use open> or
 133 C<binmode> to change disciplines of those.
 134
 135 =item use encoding I<ENCNAME> [ STDIN => I<ENCNAME_IN> ...] ;
 136
 137 You can also individually set encodings of STDIN, STDOUT, and STDERR
 138 via STDI<FH> => I<ENCNAME_FH> form.  In this case, you cannot omit the
 139 first I<ENCNAME>.
 140
 141 =item no encoding;
 142
 143 Unsets the script encoding and the disciplines of STDIN, STDOUT are
 144 reset to ":raw".
 145
 146 =back
 147
 148 =head1 CAVEATS
 149
 150 =head2 NOT SCOPED
 151
 152 The pragma is a per script, not a per block lexical.  Only the last
 153 C<use encoding> or C<matters, and it affects B<the whole script>.
 154 Though <no encoding> pragma is supported and C<use encoding> can
 155 appear as many times as you want in a given script, the multiple use
 156 of this pragma is discouraged.
 157
 158 =head2 DO NOT MIX MULTIPLE ENCODINGS
 159
 160 Notice that only literals (string or regular expression) having only
 161 legacy code points are affected: if you mix data like this
 162
 163         \xDF\x{100}
 164
 165 the data is assumed to be in (Latin 1 and) Unicode, not in your native
 166 encoding.  In other words, this will match in "greek":
 167
 168         "\xDF" =~ /\x{3af}/
 169
 170 but this will not
 171
 172         "\xDF\x{100}" =~ /\x{3af}\x{100}/
 173
 174 since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
 175 because of the C<\x{100}> on the left.  You should not be mixing your
 176 legacy data and Unicode in the same string.
 177
 178 This pragma also affects encoding of the 0x80..0xFF code point range:
 179 normally characters in that range are left as eight-bit bytes (unless
 180 they are combined with characters with code points 0x100 or larger,
 181 in which case all characters need to become UTF-8 encoded), but if
 182 the C<encoding> pragma is present, even the 0x80..0xFF range always
 183 gets UTF-8 encoded.
 184
 185 After all, the best thing about this pragma is that you don't have to
 186 resort to \x... just to spell your name in native encoding.  So feel
 187 free to put your strings in your encoding in quotes and regexes.
 188
 189 =head1 EXAMPLE - Greekperl
 190
 191     use encoding "iso 8859-7";
 192
 193     # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
 194
 195     $a = "\xDF";
 196     $b = "\x{100}";
 197
 198     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
 199
 200     $c = $a . $b;
 201
 202     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
 203
 204     # chr() is affected, and ...
 205
 206     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
 207
 208     # ... ord() is affected by the encoding pragma ...
 209
 210     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
 211
 212     # ... as are eq and cmp ...
 213
 214     print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
 215     print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
 216
 217     # ... but pack/unpack C are not affected, in case you still
 218     # want back to your native encoding
 219
 220     print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 221
 222 =head1 KNOWN PROBLEMS
 223
 224 For native multibyte encodings (either fixed or variable length)
 225 the current implementation of the regular expressions may introduce
 226 recoding errors for longer regular expression literals than 127 bytes.
 227
 228 The encoding pragma is not supported on EBCDIC platforms.
 229 (Porters wanted.)
 230
 231 =head1 SEE ALSO
 232
 233 L<perlunicode>, L<Encode>, L<open>
 234
 235 =cut