ext/Encode/encoding.pm

   1 # $Id: encoding.pm,v 1.48 2003/12/29 02:47:16 dankogai Exp dankogai $
   2 package encoding;
   3 our $VERSION = do { my @r = (q$Revision: 1.48 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   4
   5 use Encode;
   6 use strict;
   7 sub DEBUG () { 0 }    # debug switch: make this return a true value to enable the "DEBUG and warn ..." traces below
   8
   9 BEGIN {    # compile-time guard: refuse to load on platforms this pragma does not support
  10     if (ord("A") == 193) {    # "A" has ordinal 193 on EBCDIC platforms (see the croak message)
  11         require Carp;         # Carp is loaded lazily, only when an error is actually reported
  12         Carp::croak("encoding pragma does not support EBCDIC platforms");
  13     }
  14 }
  15
  # $HAS_PERLIO is true only when PerlIO::encoding can be loaded and reports
  # version 0.02 or later; import() and unimport() consult it before touching
  # the layers of STDIN and STDOUT.
  our $HAS_PERLIO = 0;
  if (eval { require PerlIO::encoding; 1 }) {
      $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
  }
  21
  # Tell import() whether ${^ENCODING} must be left unset for encoding $name.
  #
  # Returns 1 only when all of the following hold, and 0 otherwise:
  #   - the running perl is not newer than 5.8.0 ($] <= 5.008),
  #   - $name is one of the UTF/UCS names listed below, and
  #   - $Config{perl_patchlevel} is false.
  sub _exception {
      my ($name) = @_;

      return 0 if $] > 5.008;    # 5.8.1 or higher: never an exception

      my %is_utf;
      $is_utf{$_} = 1 for qw(
          utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
          UTF-32 UTF-32BE UTF-32LE
      );
      return 0 unless $is_utf{$name};    # only the UTFs can be exceptions

      # %Config is pulled in lazily; it is only needed on the 5.8.0 path.
      require Config;
      Config->import();
      our %Config;

      return 0 if $Config{perl_patchlevel};    # maintperl: not an exception
      return 1;
  }
  32
  # Pragma entry point, run for `use encoding NAME, KEY => VALUE, ...`.
  #
  # Arguments after the class name:
  #   NAME  - encoding name; when false, $ENV{PERL_ENCODING} is used instead.
  #   %arg  - options read here:
  #             Filter         true: install a source filter that decodes the
  #                            script text instead of setting ${^ENCODING}
  #             STDIN, STDOUT  encoding name for that handle; a key that is
  #                            present with a false value leaves the handle
  #                            untouched; an absent key means "use NAME"
  #
  # Croaks with "Unknown encoding ..." when a name cannot be resolved by
  # find_encoding(), and with the underlying error when binmode() dies.
  # Warns (carp) when the source filter cannot be installed.  Returns 1.
  sub import {
      my $class = shift;
      my $name  = shift;
      my %arg   = @_;

      $name ||= $ENV{PERL_ENCODING};

      # Do not hand undef to find_encoding(), and do not interpolate it into
      # the message, when neither an argument nor PERL_ENCODING gave a name.
      my $enc = defined $name ? find_encoding($name) : undef;
      unless (defined $enc) {
          require Carp;
          my $shown = defined $name ? $name : '';
          Carp::croak("Unknown encoding '$shown'");
      }
      $name = $enc->name;    # canonize

      if ($arg{Filter}) {
          defined(${^ENCODING}) and undef ${^ENCODING};

          # implicitly 'use utf8'
          require utf8;      # to fetch $utf8::hint_bits;
          $^H |= $utf8::hint_bits;

          my $installed = eval {
              require Filter::Util::Call;
              Filter::Util::Call->import;
              filter_add(sub {
                  my $status = filter_read();
                  if ($status > 0) {
                      $_ = $enc->decode($_, 1);
                      DEBUG and warn $_;
                  }
                  $status;
              });
              1;
          };
          if ($installed) {
              DEBUG and warn "Filter installed";
          }
          else {
              # Report the failure here, while $@ still belongs to the filter
              # setup, rather than losing it or letting it surface later as
              # if it were a binmode() error.  Non-fatal on purpose.
              require Carp;
              Carp::carp("encoding: cannot install source filter: $@");
          }
      }
      else {
          DEBUG and warn "_exception($name) = ", _exception($name);
          _exception($name) or ${^ENCODING} = $enc;
          $HAS_PERLIO or return 1;    # no PerlIO::encoding: leave the handles alone
      }

      # When ${^UNICODE} is set and non-zero, the handle layers are not touched.
      defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;

      for my $h (qw(STDIN STDOUT)) {
          my $handle_enc;
          if ($arg{$h}) {
              unless (defined find_encoding($arg{$h})) {
                  require Carp;
                  Carp::croak("Unknown encoding for $h, '$arg{$h}'");
              }
              $handle_enc = $arg{$h};
          }
          elsif (!exists $arg{$h}) {
              $handle_enc = $name;    # no per-handle override: use the script encoding
          }
          else {
              next;                   # HANDLE => undef: leave this handle alone
          }

          eval {
              no warnings 'uninitialized';
              binmode($h, ":raw :encoding($handle_enc)");
          };
          # $@ is inspected only directly after the eval that may have set it.
          if ($@) {
              require Carp;
              Carp::croak($@);
          }
      }
      return 1;    # I doubt if we need it, though
  }
  89
  # Handler for `no encoding`: clears ${^ENCODING}, resets STDIN and STDOUT
  # with binmode(), and attempts to remove a source filter.
  sub unimport {
      no warnings;
      undef ${^ENCODING};

      for my $handle (\*STDIN, \*STDOUT) {
          # With a usable PerlIO::encoding the handles are put back to ":raw";
          # otherwise plain binmode() is applied.
          if ($HAS_PERLIO) {
              binmode($handle, ":raw");
          }
          else {
              binmode($handle);
          }
      }

      # Only attempt filter_del() when Filter::Util::Call has been loaded;
      # any failure inside the eval is deliberately ignored.
      eval { filter_del() } if $INC{"Filter/Util/Call.pm"};
  }
 104
 105 1;
 106 __END__
 107
 108 =pod
 109
 110 =head1 NAME
 111
 112 encoding - allows you to write your script in non-ascii or non-utf8
 113
 114 =head1 SYNOPSIS
 115
 116   use encoding "greek";  # Perl like Greek to you?
 117   use encoding "euc-jp"; # Jperl!
 118
 119   # or you can even do this if your shell supports your native encoding
 120
 121   perl -Mencoding=latin2 -e '...' # Feeling centrally European?
 122   perl -Mencoding=euc-kr -e '...' # Or Korean?
 123
 124   # more control
 125
 126   # A simple euc-cn => utf-8 converter
 127   use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
 128
 129   # "no encoding;" supported (but not scoped!)
 130   no encoding;
 131
 132   # an alternate way, Filter
 133   use encoding "euc-jp", Filter=>1;
 134   # now you can use kanji identifiers -- in euc-jp!
 135
 136 =head1 ABSTRACT
 137
 138 Let's start with a bit of history: Perl 5.6.0 introduced Unicode
 139 support.  You could apply C<substr()> and regexes even to complex CJK
 140 characters -- so long as the script was written in UTF-8.  But back
 141 then, text editors that supported UTF-8 were still rare and many users
 142 instead chose to write scripts in legacy encodings, giving up a whole
 143 new feature of Perl 5.6.
 144
 145 Rewind to the future: starting from perl 5.8.0 with the B<encoding>
 146 pragma, you can write your script in any encoding you like (so long
 147 as the C<Encode> module supports it) and still enjoy Unicode support.
 148 This pragma achieves that by doing the following:
 149
 150 =over
 151
 152 =item *
 153
 154 Internally converts all literals (C<q//,qq//,qr//,qw//, qx//>) from
 155 the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
 156 C<tr///> and C<DATA> pseudo-filehandle are also converted.
 157
 158 =item *
 159
 160 Changes the PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
 161 specified.
 162
 163 =back
 164
 165 =head2 Literal Conversions
 166
 167 You can write code in EUC-JP as follows:
 168
 169   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 170                #<-char-><-char->   # 4 octets
 171   s/\bCamel\b/$Rakuda/;
 172
 173 And with C<use encoding "euc-jp"> in effect, it is the same thing as
 174 the code in UTF-8:
 175
 176   my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
 177   s/\bCamel\b/$Rakuda/;
 178
 179 =head2 PerlIO layers for C<STD(IN|OUT)>
 180
 181 The B<encoding> pragma also modifies the filehandle layers of
 182 STDIN and STDOUT to the specified encoding.  Therefore,
 183
 184   use encoding "euc-jp";
 185   my $message = "Camel is the symbol of perl.\n";
 186   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 187   $message =~ s/\bCamel\b/$Rakuda/;
 188   print $message;
 189
 190 Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
 191 not "\x{99F1}\x{99DD} is the symbol of perl.\n".
 192
 193 You can override this by giving extra arguments; see below.
 194
 195 =head2 Implicit upgrading for byte strings
 196
 197 By default, if strings operating under byte semantics and strings
 198 with Unicode character data are concatenated, the new string will
 199 be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
 200
 201 The B<encoding> pragma changes this to use the specified encoding
 202 instead.  For example:
 203
 204     use encoding 'utf8';
 205     my $string = chr(20000); # a Unicode string
 206     utf8::encode($string);   # now it's a UTF-8 encoded byte string
 207     # concatenate with another Unicode string
 208     print length($string . chr(20000));
 209
 210 Will print C<2>, because C<$string> is upgraded as UTF-8.  Without
 211 C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
 212 is three octets when interpreted as Latin-1.
 213
 214 =head1 FEATURES THAT REQUIRE 5.8.1
 215
 216 Some of the features offered by this pragma require perl 5.8.1.  Most
 217 of these were done by Inaba Hiroto.  Any other features and changes
 218 are good for 5.8.0.
 219
 220 =over
 221
 222 =item "NON-EUC" doublebyte encodings
 223
 224 Because perl needs to parse the script before applying this pragma, such
 225 encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
 226 \x5c) in the second byte fail because the second byte may
 227 accidentally escape the quoting character that follows.  Perl 5.8.1
 228 or later fixes this problem.
 229
 230 =item tr//
 231
 232 C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0.
 233 See the section below for details.
 234
 235 =item DATA pseudo-filehandle
 236
 237 Another feature that was overlooked was C<DATA>.
 238
 239 =back
 240
 241 =head1 USAGE
 242
 243 =over 4
 244
 245 =item use encoding [I<ENCNAME>] ;
 246
 247 Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
 248 exists and is non-zero, PerlIO layers of STDIN and STDOUT are set to
 249 ":encoding(I<ENCNAME>)".
 250
 251 Note that STDERR WILL NOT be changed.
 252
 253 Also note that non-STD file handles remain unaffected.  Use C<use
 254 open> or C<binmode> to change layers of those.
 255
 256 If no encoding is specified, the environment variable C<PERL_ENCODING>
 257 is consulted.  If no encoding can be found, the error C<Unknown encoding
 258 'I<ENCNAME>'> will be thrown.
 259
 260 =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
 261
 262 You can also individually set encodings of STDIN and STDOUT via the
 263 C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
 264 first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
 265 completely off.
 266
 267 When ${^UNICODE} exists and is non-zero, these options will be completely
 268 ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
 269 L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for details (perl 5.8.1
 270 and later).
 271
 272 =item use encoding I<ENCNAME> Filter=E<gt>1;
 273
 274 This turns the encoding pragma into a source filter.  While the
 275 default approach just decodes interpolated literals (in qq() and
 276 qr()), this will apply a source filter to the entire source code.  See
 277 L</"The Filter Option"> below for details.
 278
 279 =item no encoding;
 280
 281 Unsets the script encoding. The layers of STDIN, STDOUT are
 282 reset to ":raw" (the default unprocessed raw stream of bytes).
 283
 284 =back
 285
 286 =head1 The Filter Option
 287
 288 The magic of C<use encoding> is not applied to the names of
 289 identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
 290 is a single Han ideograph) work, you still need to write your script
 291 in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.
 292
 293 What does this mean?  Your source code behaves as if it is written in
 294 UTF-8 with 'use utf8' in effect.  So even if your editor only supports
 295 Shift_JIS, for example, you can still try examples in Chapter 15 of
 296 C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
 297 identifiers.
 298
 299 This option is significantly slower and (as of this writing) non-ASCII
 300 identifiers are not very stable WITHOUT this option and with the
 301 source code written in UTF-8.
 302
 303 =head2 Filter-related changes at Encode version 1.87
 304
 305 =over
 306
 307 =item *
 308
 309 The Filter option now sets STDIN and STDOUT like non-filter options.
 310 And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
 311 non-filter version.
 312
 313 =item *
 314
 315 C<use utf8> is implicitly declared so you no longer have to C<use
 316 utf8> to C<${"\x{4eba}"}++>.
 317
 318 =back
 319
 320 =head1 CAVEATS
 321
 322 =head2 NOT SCOPED
 323
 324 The pragma is a per script, not a per block lexical.  Only the last
 325 C<use encoding> or C<no encoding> matters, and it affects
 326 B<the whole script>.  However, the C<no encoding> pragma is supported and
 327 B<use encoding> can appear as many times as you want in a given script.
 328 The multiple use of this pragma is discouraged.
 329
 330 For the same reason, the use of this pragma inside modules is also
 331 discouraged (though not as strongly discouraged as in the case above.
 332 See below).
 333
 334 If you still have to write a module with this pragma, be very careful
 335 of the load order.  See the code below:
 336
 337   # called module
 338   package Module_IN_BAR;
 339   use encoding "bar";
 340   # stuff in "bar" encoding here
 341   1;
 342
 343   # caller script
 344   use encoding "foo";
 345   use Module_IN_BAR;
 346   # surprise! use encoding "bar" is in effect.
 347
 348 The best way to avoid this oddity is to use this pragma RIGHT AFTER
 349 other modules are loaded.  i.e.
 350
 351   use Module_IN_BAR;
 352   use encoding "foo";
 353
 354 =head2 DO NOT MIX MULTIPLE ENCODINGS
 355
 356 Notice that only literals (string or regular expression) having only
 357 legacy code points are affected: if you mix data like this
 358
 359         \xDF\x{100}
 360
 361 the data is assumed to be in (Latin 1 and) Unicode, not in your native
 362 encoding.  In other words, this will match in "greek":
 363
 364         "\xDF" =~ /\x{3af}/
 365
 366 but this will not
 367
 368         "\xDF\x{100}" =~ /\x{3af}\x{100}/
 369
 370 since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
 371 the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
 372 LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
 373 should not be mixing your legacy data and Unicode in the same string.
 374
 375 This pragma also affects encoding of the 0x80..0xFF code point range:
 376 normally characters in that range are left as eight-bit bytes (unless
 377 they are combined with characters with code points 0x100 or larger,
 378 in which case all characters need to become UTF-8 encoded), but if
 379 the C<encoding> pragma is present, even the 0x80..0xFF range always
 380 gets UTF-8 encoded.
 381
 382 After all, the best thing about this pragma is that you don't have to
 383 resort to \x{....} just to spell your name in a native encoding.
 384 So feel free to put your strings in your encoding in quotes and
 385 regexes.
 386
 387 =head2 tr/// with ranges
 388
 389 The B<encoding> pragma works by decoding string literals in
 390 C<q//,qq//,qr//,qw//, qx//> and so forth.  In perl 5.8.0, this
 391 does not apply to C<tr///>.  Therefore,
 392
 393   use encoding 'euc-jp';
 394   #....
 395   $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
 396   #           -------- -------- -------- --------
 397
 398 Does not work as
 399
 400   $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
 401
 402 =over
 403
 404 =item Legend of characters above
 405
 406   utf8     euc-jp   charnames::viacode()
 407   -----------------------------------------
 408   \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
 409   \x{3093} \xA4\xF3 HIRAGANA LETTER N
 410   \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
 411   \x{30f3} \xA5\xF3 KATAKANA LETTER N
 412
 413 =back
 414
 415 This counterintuitive behavior has been fixed in perl 5.8.1.
 416
 417 =head3 workaround to tr///;
 418
 419 In perl 5.8.0, you can work around as follows;
 420
 421   use encoding 'euc-jp';
 422   #  ....
 423   eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
 424
 425 Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
 426 it is the same as the classic idiom that makes C<tr///> 'interpolate'.
 427
 428    tr/$from/$to/;            # wrong!
 429    eval qq{ tr/$from/$to/ }; # workaround.
 430
 431 Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
 432 C<tr///> not being decoded was obviously against the will of Perl5
 433 Porters so it has been fixed in Perl 5.8.1 or later.
 434
 435 =head1 EXAMPLE - Greekperl
 436
 437     use encoding "iso 8859-7";
 438
 439     # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
 440
 441     $a = "\xDF";
 442     $b = "\x{100}";
 443
 444     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
 445
 446     $c = $a . $b;
 447
 448     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
 449
 450     # chr() is affected, and ...
 451
 452     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
 453
 454     # ... ord() is affected by the encoding pragma ...
 455
 456     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
 457
 458     # ... as are eq and cmp ...
 459
 460     print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
 461     print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
 462
 463     # ... but pack/unpack C are not affected, in case you still
 464     # want to go back to your native encoding
 465
 466     print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 467
 468 =head1 KNOWN PROBLEMS
 469
 470 =over
 471
 472 =item literals in regex that are longer than 127 bytes
 473
 474 For native multibyte encodings (either fixed or variable length),
 475 the current implementation of the regular expressions may introduce
 476 recoding errors for regular expression literals longer than 127 bytes.
 477
 478 =item EBCDIC
 479
 480 The encoding pragma is not supported on EBCDIC platforms.
 481 (Porters who are willing and able to remove this limitation are
 482 welcome.)
 483
 484 =item format
 485
 486 This pragma doesn't work well with format because PerlIO does not
 487 get along very well with it.  When format contains non-ascii
 488 characters it prints funny or gets "wide character warnings".
 489 To understand it, try the code below.
 490
 491   # Save this one in utf8
 492   # replace *non-ascii* with a non-ascii string
 493   my $camel;
 494   format STDOUT =
 495   *non-ascii*@>>>>>>>
 496   $camel
 497   .
 498   $camel = "*non-ascii*";
 499   binmode(STDOUT=>':encoding(utf8)'); # bang!
 500   write;              # funny
 501   print $camel, "\n"; # fine
 502
 503 Without binmode this happens to work but without binmode, print()
 504 fails instead of write().
 505
 506 At any rate, the very use of format is questionable when it comes to
 507 unicode characters since you have to consider such things as character
 508 width (i.e. double-width for ideographs) and directions (i.e. BIDI for
 509 Arabic and Hebrew).
 510
 511 =back
 512
 513 =head1 HISTORY
 514
 515 This pragma first appeared in Perl 5.8.0.  For features that require
 516 5.8.1 and better, see above.
 517
 518 =head1 SEE ALSO
 519
 520 L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
 521
 522 Ch. 15 of C<Programming Perl (3rd Edition)>
 523 by Larry Wall, Tom Christiansen, Jon Orwant;
 524 O'Reilly & Associates; ISBN 0-596-00027-8
 525
 526 =cut