ext/Encode/encoding.pm

   1 # $Id: encoding.pm,v 1.44 2003/03/09 20:07:37 dankogai Exp $
   2 package encoding;
   3 our $VERSION = do { my @r = (q$Revision: 1.44 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   4
   5 use Encode;
   6 use strict;
   7 our $DEBUG = 0;
   8
   9 BEGIN {
  10     if (ord("A") == 193) {
  11         require Carp;
  12         Carp::croak("encoding pragma does not support EBCDIC platforms");
  13     }
  14 }
  15
  16 our $HAS_PERLIO = 0;
  17 eval { require PerlIO::encoding };
  18 unless ($@){
  19     $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
  20 }
  21
  22 sub _exception{
  23     my $name = shift;
  24     $] > 5.008 and return 0;             # 5.8.1 then no
  25     my %utfs = map {$_=>1}
  26         qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
  27            UTF-32 UTF-32BE UTF-32LE);
  28     $utfs{$name} or return 0;            # UTFs or no
  29     require Config; Config->import(); our %Config;
  30     return $Config{perl_patchlevel} == 0 # maintperl then no
  31 }
  32
  33 sub import {
  34     my $class = shift;
  35     my $name  = shift;
  36     my %arg = @_;
  37     $name ||= $ENV{PERL_ENCODING};
  38     my $enc = find_encoding($name);
  39     unless (defined $enc) {
  40         require Carp;
  41         Carp::croak("Unknown encoding '$name'");
  42     }
  43     $name = $enc->name; # canonize
  44     unless ($arg{Filter}) {
  45         $DEBUG and warn "_exception($name) = ", _exception($name);
  46         _exception($name) or ${^ENCODING} = $enc;
  47         $HAS_PERLIO or return 1;
  48     }else{
  49         defined(${^ENCODING}) and undef ${^ENCODING};
  50         # implicitly 'use utf8'
  51         require utf8; # to fetch $utf8::hint_bits;
  52         $^H |= $utf8::hint_bits;
  53         eval {
  54             require Filter::Util::Call ;
  55             Filter::Util::Call->import ;
  56             filter_add(sub{
  57                            my $status = filter_read();
  58                            if ($status > 0){
  59                                # $DEBUG and warn $_;
  60                                $_ = $enc->decode($_, 1);
  61                                $DEBUG and warn $_;
  62                            }
  63                            $status ;
  64                        });
  65         };
  66     }   $DEBUG and warn "Filter installed";
  67     defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
  68     for my $h (qw(STDIN STDOUT)){
  69         if ($arg{$h}){
  70             unless (defined find_encoding($arg{$h})) {
  71                 require Carp;
  72                 Carp::croak("Unknown encoding for $h, '$arg{$h}'");
  73             }
  74             eval { binmode($h, ":raw :encoding($arg{$h})") };
  75         }else{
  76             unless (exists $arg{$h}){
  77                 eval {
  78                     no warnings 'uninitialized';
  79                     binmode($h, ":raw :encoding($name)");
  80                 };
  81             }
  82         }
  83         if ($@){
  84             require Carp;
  85             Carp::croak($@);
  86         }
  87     }
  88     return 1; # I doubt if we need it, though
  89 }
  90
  91 sub unimport{
  92     no warnings;
  93     undef ${^ENCODING};
  94     if ($HAS_PERLIO){
  95         binmode(STDIN,  ":raw");
  96         binmode(STDOUT, ":raw");
  97     }else{
  98         binmode(STDIN);
  99         binmode(STDOUT);
 100     }
 101     if ($INC{"Filter/Util/Call.pm"}){
 102         eval { filter_del() };
 103     }
 104 }
 105
 106 1;
 107 __END__
 108
 109 =pod
 110
 111 =head1 NAME
 112
 113 encoding - allows you to write your script in non-ascii or non-utf8
 114
 115 =head1 SYNOPSIS
 116
 117   use encoding "greek";  # Perl like Greek to you?
 118   use encoding "euc-jp"; # Jperl!
 119
 120   # or you can even do this if your shell supports your native encoding
 121
 122   perl -Mencoding=latin2 -e '...' # Feeling centrally European?
 123   perl -Mencoding=euc-kr -e '...' # Or Korean?
 124
 125   # more control
 126
 127   # A simple euc-cn => utf-8 converter
 128   use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
 129
 130   # "no encoding;" supported (but not scoped!)
 131   no encoding;
 132
 133   # an alternate way, Filter
 134   use encoding "euc-jp", Filter=>1;
 135   # now you can use kanji identifiers -- in euc-jp!
 136
 137 =head1 ABSTRACT
 138
 139 Let's start with a bit of history: Perl 5.6.0 introduced Unicode
 140 support.  You could apply C<substr()> and regexes even to complex CJK
 141 characters -- so long as the script was written in UTF-8.  But back
 142 then, text editors that supported UTF-8 were still rare and many users
 143 instead chose to write scripts in legacy encodings, giving up a whole
 144 new feature of Perl 5.6.
 145
 146 Rewind to the future: starting from perl 5.8.0 with the B<encoding>
 147 pragma, you can write your script in any encoding you like (so long
 148 as the C<Encode> module supports it) and still enjoy Unicode support.
 149 This pragma achieves that by doing the following:
 150
 151 =over
 152
 153 =item *
 154
 155 Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
 156 the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
 157 C<tr///> and C<DATA> pseudo-filehandle are also converted.
 158
 159 =item *
 160
 161 Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
 162  specified.
 163
 164 =back
 165
 166 =head2 Literal Conversions
 167
 168 You can write code in EUC-JP as follows:
 169
 170   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 171                #<-char-><-char->   # 4 octets
 172   s/\bCamel\b/$Rakuda/;
 173
 174 And with C<use encoding "euc-jp"> in effect, it is the same thing as
 175 the code in UTF-8:
 176
 177   my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
 178   s/\bCamel\b/$Rakuda/;
 179
 180 =head2 PerlIO layers for C<STD(IN|OUT)>
 181
 182 The B<encoding> pragma also modifies the filehandle layers of
 183 STDIN and STDOUT to the specified encoding.  Therefore,
 184
 185   use encoding "euc-jp";
 186   my $message = "Camel is the symbol of perl.\n";
 187   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 188   $message =~ s/\bCamel\b/$Rakuda/;
 189   print $message;
 190
 191 Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
 192 not "\x{99F1}\x{99DD} is the symbol of perl.\n".
 193
 194 You can override this by giving extra arguments; see below.
 195
 196 =head1 FEATURES THAT REQUIRE 5.8.1
 197
 198 Some of the features offered by this pragma requires perl 5.8.1.  Most
 199 of these are done by Inaba Hiroto.  Any other features and changes
 200 are good for 5.8.0.
 201
 202 =over
 203
 204 =item "NON-EUC" doublebyte encodings
 205
 206 Because perl needs to parse script before applying this pragma, such
 207 encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
 208 \x5c) in the second byte fails because the second byte may
 209 accidentally escape the quoting character that follows.  Perl 5.8.1
 210 or later fixes this problem.
 211
 212 =item tr//
 213
 214 C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
 215 See the section below for details.
 216
 217 =item DATA pseudo-filehandle
 218
 219 Another feature that was overlooked was C<DATA>.
 220
 221 =back
 222
 223 =head1 USAGE
 224
 225 =over 4
 226
 227 =item use encoding [I<ENCNAME>] ;
 228
 229 Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
 230 exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
 231 ":encoding(I<ENCNAME>)".
 232
 233 Note that STDERR WILL NOT be changed.
 234
 235 Also note that non-STD file handles remain unaffected.  Use C<use
 236 open> or C<binmode> to change layers of those.
 237
 238 If no encoding is specified, the environment variable L<PERL_ENCODING>
 239 is consulted.  If no encoding can be found, the error C<Unknown encoding
 240 'I<ENCNAME>'> will be thrown.
 241
 242 =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
 243
 244 You can also individually set encodings of STDIN and STDOUT via the
 245 C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
 246 first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
 247 completely off.
 248
 249 When ${^UNICODE} exists and non-zero, these options will completely
 250 ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
 251 L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
 252 details (perl 5.8.1 and later).
 253
 254 =item use encoding I<ENCNAME> Filter=E<gt>1;
 255
 256 This turns the encoding pragma into a source filter.  While the
 257 default approach just decodes interpolated literals (in qq() and
 258 qr()), this will apply a source filter to the entire source code.  See
 259 L</"The Filter Option"> below for details.
 260
 261 =item no encoding;
 262
 263 Unsets the script encoding. The layers of STDIN, STDOUT are
 264 reset to ":raw" (the default unprocessed raw stream of bytes).
 265
 266 =back
 267
 268 =head1 The Filter Option
 269
 270 The magic of C<use encoding> is not applied to the names of
 271 identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
 272 is a single Han ideograph) work, you still need to write your script
 273 in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.
 274
 275 What does this mean?  Your source code behaves as if it is written in
 276 UTF-8 with 'use utf8' in effect.  So even if your editor only supports
 277 Shift_JIS, for example, you can still try examples in Chapter 15 of
 278 C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
 279 identifiers.
 280
 281 This option is significantly slower and (as of this writing) non-ASCII
 282 identifiers are not very stable WITHOUT this option and with the
 283 source code written in UTF-8.
 284
 285 =head2 Filter-related changes at Encode version 1.87
 286
 287 =over
 288
 289 =item *
 290
 291 The Filter option now sets STDIN and STDOUT like non-filter options.
 292 And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
 293 non-filter version.
 294
 295 =item *
 296
 297 C<use utf8> is implicitly declared so you no longer have to C<use
 298 utf8> to C<${"\x{4eba}"}++>.
 299
 300 =back
 301
 302 =head1 CAVEATS
 303
 304 =head2 NOT SCOPED
 305
 306 The pragma is a per script, not a per block lexical.  Only the last
 307 C<use encoding> or C<no encoding> matters, and it affects
 308 B<the whole script>.  However, the <no encoding> pragma is supported and
 309 B<use encoding> can appear as many times as you want in a given script.
 310 The multiple use of this pragma is discouraged.
 311
 312 By the same reason, the use this pragma inside modules is also
 313 discouraged (though not as strongly discouranged as the case above.
 314 See below).
 315
 316 If you still have to write a module with this pragma, be very careful
 317 of the load order.  See the codes below;
 318
 319   # called module
 320   package Module_IN_BAR;
 321   use encoding "bar";
 322   # stuff in "bar" encoding here
 323   1;
 324
 325   # caller script
 326   use encoding "foo"
 327   use Module_IN_BAR;
 328   # surprise! use encoding "bar" is in effect.
 329
 330 The best way to avoid this oddity is to use this pragma RIGHT AFTER
 331 other modules are loaded.  i.e.
 332
 333   use Module_IN_BAR;
 334   use encoding "foo";
 335
 336 =head2 DO NOT MIX MULTIPLE ENCODINGS
 337
 338 Notice that only literals (string or regular expression) having only
 339 legacy code points are affected: if you mix data like this
 340
 341         \xDF\x{100}
 342
 343 the data is assumed to be in (Latin 1 and) Unicode, not in your native
 344 encoding.  In other words, this will match in "greek":
 345
 346         "\xDF" =~ /\x{3af}/
 347
 348 but this will not
 349
 350         "\xDF\x{100}" =~ /\x{3af}\x{100}/
 351
 352 since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
 353 the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
 354 LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
 355 should not be mixing your legacy data and Unicode in the same string.
 356
 357 This pragma also affects encoding of the 0x80..0xFF code point range:
 358 normally characters in that range are left as eight-bit bytes (unless
 359 they are combined with characters with code points 0x100 or larger,
 360 in which case all characters need to become UTF-8 encoded), but if
 361 the C<encoding> pragma is present, even the 0x80..0xFF range always
 362 gets UTF-8 encoded.
 363
 364 After all, the best thing about this pragma is that you don't have to
 365 resort to \x{....} just to spell your name in a native encoding.
 366 So feel free to put your strings in your encoding in quotes and
 367 regexes.
 368
 369 =head2 tr/// with ranges
 370
 371 The B<encoding> pragma works by decoding string literals in
 372 C<q//,qq//,qr//,qw///, qx//> and so forth.  In perl 5.8.0, this
 373 does not apply to C<tr///>.  Therefore,
 374
 375   use encoding 'euc-jp';
 376   #....
 377   $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
 378   #           -------- -------- -------- --------
 379
 380 Does not work as
 381
 382   $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
 383
 384 =over
 385
 386 =item Legend of characters above
 387
 388   utf8     euc-jp   charnames::viacode()
 389   -----------------------------------------
 390   \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
 391   \x{3093} \xA4\xF3 HIRAGANA LETTER N
 392   \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
 393   \x{30f3} \xA5\xF3 KATAKANA LETTER N
 394
 395 =back
 396
 397 This counterintuitive behavior has been fixed in perl 5.8.1.
 398
 399 =head3 workaround to tr///;
 400
 401 In perl 5.8.0, you can work aroud as follows;
 402
 403   use encoding 'euc-jp';
 404   #  ....
 405   eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
 406
 407 Note the C<tr//> expression is surronded by C<qq{}>.  The idea behind
 408 is the same as classic idiom that makes C<tr///> 'interpolate'.
 409
 410    tr/$from/$to/;            # wrong!
 411    eval qq{ tr/$from/$to/ }; # workaround.
 412
 413 Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
 414 C<tr///> not being decoded was obviously against the will of Perl5
 415 Porters so it has been fixed in Perl 5.8.1 or later.
 416
 417 =head1 EXAMPLE - Greekperl
 418
 419     use encoding "iso 8859-7";
 420
 421     # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
 422
 423     $a = "\xDF";
 424     $b = "\x{100}";
 425
 426     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
 427
 428     $c = $a . $b;
 429
 430     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
 431
 432     # chr() is affected, and ...
 433
 434     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
 435
 436     # ... ord() is affected by the encoding pragma ...
 437
 438     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
 439
 440     # ... as are eq and cmp ...
 441
 442     print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
 443     print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
 444
 445     # ... but pack/unpack C are not affected, in case you still
 446     # want to go back to your native encoding
 447
 448     print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 449
 450 =head1 KNOWN PROBLEMS
 451
 452 =over
 453
 454 =item literals in regex that are longer than 127 bytes
 455
 456 For native multibyte encodings (either fixed or variable length),
 457 the current implementation of the regular expressions may introduce
 458 recoding errors for regular expression literals longer than 127 bytes.
 459
 460 =item EBCDIC
 461
 462 The encoding pragma is not supported on EBCDIC platforms.
 463 (Porters who are willing and able to remove this limitation are
 464 welcome.)
 465
 466 =item format
 467
 468 This pragma doesn't work well with format because PerlIO does not
 469 get along very well with it.  When format contains non-ascii
 470 characters it prints funny or gets "wide character warnings".
 471 To understand it, try the code below.
 472
 473   # Save this one in utf8
 474   # replace *non-ascii* with a non-ascii string
 475   my $camel;
 476   format STDOUT =
 477   *non-ascii*@>>>>>>>
 478   $camel
 479   .
 480   $camel = "*non-ascii*";
 481   binmode(STDOUT=>':encoding(utf8)'); # bang!
 482   write;              # funny
 483   print $camel, "\n"; # fine
 484
 485 Without binmode this happens to work but without binmode, print()
 486 fails instead of write().
 487
 488 At any rate, the very use of format is questionable when it comes to
 489 unicode characters since you have to consider such things as character
 490 width (i.e. double-width for ideographs) and directions (i.e. BIDI for
 491 Arabic and Hebrew).
 492
 493 =back
 494
 495 =head1 HISTORY
 496
 497 This pragma first appeared in Perl 5.8.0.  For features that require
 498 5.8.1 and better, see above.
 499
 500 =head1 SEE ALSO
 501
 502 L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
 503
 504 Ch. 15 of C<Programming Perl (3rd Edition)>
 505 by Larry Wall, Tom Christiansen, Jon Orwant;
 506 O'Reilly & Associates; ISBN 0-596-00027-8
 507
 508 =cut