ext/Encode/Encode.pm

   1 package Encode;
   2 use strict;
   3 our $VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   4
   5 require DynaLoader;
   6 require Exporter;
   7
   8 our @ISA = qw(Exporter DynaLoader);
   9
  10 # Public, encouraged API is exported by default
  11 our @EXPORT = qw (
  12   encode
  13   decode
  14   encode_utf8
  15   decode_utf8
  16   find_encoding
  17   encodings
  18 );
  19
  20 our @EXPORT_OK =
  21     qw(
  22        define_encoding
  23        define_alias
  24        from_to
  25        is_utf8
  26        is_8bit
  27        is_16bit
  28        utf8_upgrade
  29        utf8_downgrade
  30        _utf8_on
  31        _utf8_off
  32       );
  33
  34 bootstrap Encode ();
  35
  36 # Documentation moved after __END__ for speed - NI-S
  37
  38 use Carp;
  39
  40 # Make a %encoding package variable to allow a certain amount of cheating
  41 our %encoding;
  42 my @alias;  # ordered matching list
  43 my %alias;  # cached known aliases
  44
  45                      # 0  1  2  3  4  5   6   7   8   9  10
  46 our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
  47
  48 our %winlatin2cp   = (
  49                       'Latin1'     => 1252,
  50                       'Latin2'     => 1250,
  51                       'Cyrillic'   => 1251,
  52                       'Greek'      => 1253,
  53                       'Turkish'    => 1254,
  54                       'Hebrew'     => 1255,
  55                       'Arabic'     => 1256,
  56                       'Baltic'     => 1257,
  57                       'Vietnamese' => 1258,
  58                      );
  59
  60 our %external_tables =
  61     (
  62         'euc-cn'        => 'Encode/CN.pm',
  63         gb2312          => 'Encode/CN.pm',
  64         gb12345         => 'Encode/CN.pm',
  65         gbk             => 'Encode/CN.pm',
  66         cp936           => 'Encode/CN.pm',
  67         'iso-ir-165'    => 'Encode/CN.pm',
  68         'euc-jp'        => 'Encode/JP.pm',
  69         'iso-2022-jp'   => 'Encode/JP.pm',
  70         '7bit-jis'      => 'Encode/JP.pm',
  71         shiftjis        => 'Encode/JP.pm',
  72         macjapan        => 'Encode/JP.pm',
  73         cp932           => 'Encode/JP.pm',
  74         'euc-kr'        => 'Encode/KR.pm',
  75         ksc5601         => 'Encode/KR.pm',
  76         cp949           => 'Encode/KR.pm',
  77         big5            => 'Encode/TW.pm',
  78         'big5-hkscs'    => 'Encode/TW.pm',
  79         cp950           => 'Encode/TW.pm',
  80         gb18030         => 'Encode/HanExtra.pm',
  81         big5plus        => 'Encode/HanExtra.pm',
  82         'euc-tw'        => 'Encode/HanExtra.pm',
  83     );
  84
  85 sub encodings
  86 {
  87  my ($class) = @_;
  88  return
  89      map { $_->[0] }
  90          sort { $a->[1] cmp $b->[1] }
  91                map { [$_, lc $_] }
  92                    grep { $_ ne 'Internal' }
  93                         keys %encoding;
  94 }
  95
  96 sub findAlias
  97 {
  98     my $class = shift;
  99     local $_ = shift;
 100     # print "# findAlias $_\n";
 101     unless (exists $alias{$_})
 102     {
 103         for (my $i=0; $i < @alias; $i += 2)
 104         {
 105             my $alias = $alias[$i];
 106             my $val   = $alias[$i+1];
 107             my $new;
 108             if (ref($alias) eq 'Regexp' && $_ =~ $alias)
 109             {
 110                 $new = eval $val;
 111             }
 112             elsif (ref($alias) eq 'CODE')
 113             {
 114                 $new = &{$alias}($val)
 115                 }
 116             elsif (lc($_) eq lc($alias))
 117             {
 118                 $new = $val;
 119             }
 120             if (defined($new))
 121             {
 122                 next if $new eq $_; # avoid (direct) recursion on bugs
 123                 my $enc = (ref($new)) ? $new : find_encoding($new);
 124                 if ($enc)
 125                 {
 126                     $alias{$_} = $enc;
 127                     last;
 128                 }
 129             }
 130         }
 131     }
 132     return $alias{$_};
 133 }
 134
 135 sub define_alias
 136 {
 137     while (@_)
 138     {
 139         my ($alias,$name) = splice(@_,0,2);
 140         push(@alias, $alias => $name);
 141     }
 142 }
 143
 144 # Allow variants of iso-8859-1 etc.
 145 define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
 146
 147 # At least HP-UX has these.
 148 define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
 149
 150 # More HP stuff.
 151 define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
 152
 153 # The Official name of ASCII.
 154 define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
 155
 156 # This is a font issue, not an encoding issue.
 157 # (The currency symbol of the Latin 1 upper half
 158 #  has been redefined as the euro symbol.)
 159 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
 160
 161 # Allow latin-1 style names as well
 162 define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
 163
 164 # Allow winlatin1 style names as well
 165 define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|baltic|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );
 166
 167 # Common names for non-latin prefered MIME names
 168 define_alias( 'ascii'    => 'US-ascii',
 169               'cyrillic' => 'iso-8859-5',
 170               'arabic'   => 'iso-8859-6',
 171               'greek'    => 'iso-8859-7',
 172               'hebrew'   => 'iso-8859-8',
 173               'thai'     => 'iso-8859-11',
 174               'tis620'   => 'iso-8859-11',
 175             );
 176
 177 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
 178 # And Microsoft has their own naming (again, surprisingly).
 179 define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
 180
 181 # Sometimes seen with a leading zero.
 182 define_alias( qr/^cp037$/i => '"cp37"');
 183
 184 # Ououououou.
 185 define_alias( qr/^macRomanian$/i => '"macRumanian"');
 186
 187 # Standardize on the dashed versions.
 188 define_alias( qr/^utf8$/i  => 'utf-8' );
 189 define_alias( qr/^koi8r$/i => 'koi8-r' );
 190 define_alias( qr/^koi8u$/i => 'koi8-u' );
 191
 192 # Seen in some Linuxes.
 193 define_alias( qr/^ujis$/i => 'euc-jp' );
 194
 195 # CP936 doesn't have vendor-addon for GBK, so they're identical.
 196 define_alias( qr/^gbk$/i => '"cp936"');
 197
 198 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
 199 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
 200 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
 201 # TODO: Armenian encoding ARMSCII-8
 202 # TODO: Hebrew encoding ISO-8859-8-1
 203 # TODO: Thai encoding TCVN
 204 # TODO: Korean encoding Johab
 205 # TODO: Vietnamese encodings VPS
 206 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
 207 #       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
 208 #       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
 209 #       Kannada Khmer Korean Laotian Malayalam Mongolian
 210 #       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
 211
 212 # Map white space and _ to '-'
 213 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
 214
 215 sub define_encoding
 216 {
 217     my $obj  = shift;
 218     my $name = shift;
 219     $encoding{$name} = $obj;
 220     my $lc = lc($name);
 221     define_alias($lc => $obj) unless $lc eq $name;
 222     while (@_)
 223     {
 224         my $alias = shift;
 225         define_alias($alias,$obj);
 226     }
 227     return $obj;
 228 }
 229
 230 sub getEncoding
 231 {
 232     my ($class,$name,$skip_external) = @_;
 233     my $enc;
 234     if (ref($name) && $name->can('new_sequence'))
 235     {
 236         return $name;
 237     }
 238     my $lc = lc $name;
 239     if (exists $encoding{$name})
 240     {
 241         return $encoding{$name};
 242     }
 243     if (exists $encoding{$lc})
 244     {
 245         return $encoding{$lc};
 246     }
 247
 248     my $oc = $class->findAlias($name);
 249     return $oc if defined $oc;
 250
 251     $oc = $class->findAlias($lc) if $lc ne $name;
 252     return $oc if defined $oc;
 253
 254     if (!$skip_external and exists $external_tables{$lc})
 255     {
 256         require $external_tables{$lc};
 257         return $encoding{$name} if exists $encoding{$name};
 258     }
 259
 260     return;
 261 }
 262
 263 sub find_encoding
 264 {
 265     my ($name,$skip_external) = @_;
 266     return __PACKAGE__->getEncoding($name,$skip_external);
 267 }
 268
 269 sub encode
 270 {
 271     my ($name,$string,$check) = @_;
 272     my $enc = find_encoding($name);
 273     croak("Unknown encoding '$name'") unless defined $enc;
 274     my $octets = $enc->encode($string,$check);
 275     return undef if ($check && length($string));
 276     return $octets;
 277 }
 278
 279 sub decode
 280 {
 281     my ($name,$octets,$check) = @_;
 282     my $enc = find_encoding($name);
 283     croak("Unknown encoding '$name'") unless defined $enc;
 284     my $string = $enc->decode($octets,$check);
 285     $_[1] = $octets if $check;
 286     return $string;
 287 }
 288
 289 sub from_to
 290 {
 291     my ($string,$from,$to,$check) = @_;
 292     my $f = find_encoding($from);
 293     croak("Unknown encoding '$from'") unless defined $f;
 294     my $t = find_encoding($to);
 295     croak("Unknown encoding '$to'") unless defined $t;
 296     my $uni = $f->decode($string,$check);
 297     return undef if ($check && length($string));
 298     $string = $t->encode($uni,$check);
 299     return undef if ($check && length($uni));
 300     return length($_[0] = $string);
 301 }
 302
 303 sub encode_utf8
 304 {
 305     my ($str) = @_;
 306   utf8::encode($str);
 307     return $str;
 308 }
 309
 310 sub decode_utf8
 311 {
 312     my ($str) = @_;
 313     return undef unless utf8::decode($str);
 314     return $str;
 315 }
 316
 317 require Encode::Encoding;
 318 require Encode::XS;
 319 require Encode::Internal;
 320 require Encode::Unicode;
 321 require Encode::utf8;
 322 require Encode::iso10646_1;
 323 require Encode::ucs2_le;
 324
 325 1;
 326
 327 __END__
 328
 329 =head1 NAME
 330
 331 Encode - character encodings
 332
 333 =head1 SYNOPSIS
 334
 335     use Encode;
 336
 337 =head1 DESCRIPTION
 338
 339 The C<Encode> module provides the interfaces between Perl's strings
 340 and the rest of the system.  Perl strings are sequences of B<characters>.
 341
 342 To find more about character encodings, please consult
 343 L<Encode::Description> . This document focuses on programming references.
 344
 345 =head1 PERL ENCODING API
 346
 347 =head2 Generic Encoding Interface
 348
 349 =over 4
 350
 351 =item *
 352
 353         $bytes  = encode(ENCODING, $string[, CHECK])
 354
 355 Encodes string from Perl's internal form into I<ENCODING> and returns
 356 a sequence of octets.  For CHECK see L</"Handling Malformed Data">.
 357
 358 For example to convert (internally UTF-8 encoded) Unicode data
 359 to octets:
 360
 361         $octets = encode("utf8", $unicode);
 362
 363 =item *
 364
 365         $string = decode(ENCODING, $bytes[, CHECK])
 366
 367 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
 368 internal form and returns the resulting string.  For CHECK see
 369 L</"Handling Malformed Data">.
 370
 371 For example to convert ISO-8859-1 data to UTF-8:
 372
 373         $utf8 = decode("latin1", $latin1);
 374
 375 =item *
 376
 377         from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
 378
 379 Convert B<in-place> the data between two encodings.  How did the data
 380 in $string originally get to be in FROM_ENCODING?  Either using
 381 encode() or through PerlIO: See L</"Encoding and IO">.  For CHECK
 382 see L</"Handling Malformed Data">.
 383
 384 For example to convert ISO-8859-1 data to UTF-8:
 385
 386         from_to($data, "iso-8859-1", "utf-8");
 387
 388 and to convert it back:
 389
 390         from_to($data, "utf-8", "iso-8859-1");
 391
 392 Note that because the conversion happens in place, the data to be
 393 converted cannot be a string constant, it must be a scalar variable.
 394
 395 =back
 396
 397 =head2 Handling Malformed Data
 398
 399 If CHECK is not set, C<undef> is returned.  If the data is supposed to
 400 be UTF-8, an optional lexical warning (category utf8) is given.  If
 401 CHECK is true but not a code reference, dies.
 402
 403 It would desirable to have a way to indicate that transform should use
 404 the encodings "replacement character" - no such mechanism is defined yet.
 405
 406 It is also planned to allow I<CHECK> to be a code reference.
 407
 408 This is not yet implemented as there are design issues with what its
 409 arguments should be and how it returns its results.
 410
 411 =over 4
 412
 413 =item Scheme 1
 414
 415 Passed remaining fragment of string being processed.
 416 Modifies it in place to remove bytes/characters it can understand
 417 and returns a string used to represent them.
 418 e.g.
 419
 420  sub fixup {
 421    my $ch = substr($_[0],0,1,'');
 422    return sprintf("\x{%02X}",ord($ch);
 423  }
 424
 425 This scheme is close to how underlying C code for Encode works, but gives
 426 the fixup routine very little context.
 427
 428 =item Scheme 2
 429
 430 Passed original string, and an index into it of the problem area, and
 431 output string so far.  Appends what it will to output string and
 432 returns new index into original string.  For example:
 433
 434  sub fixup {
 435    # my ($s,$i,$d) = @_;
 436    my $ch = substr($_[0],$_[1],1);
 437    $_[2] .= sprintf("\x{%02X}",ord($ch);
 438    return $_[1]+1;
 439  }
 440
 441 This scheme gives maximal control to the fixup routine but is more
 442 complicated to code, and may need internals of Encode to be tweaked to
 443 keep original string intact.
 444
 445 =item Other Schemes
 446
 447 Hybrids of above.
 448
 449 Multiple return values rather than in-place modifications.
 450
 451 Index into the string could be pos($str) allowing s/\G...//.
 452
 453 =back
 454
 455 =head2 UTF-8 / utf8
 456
 457 The Unicode consortium defines the UTF-8 standard as a way of encoding
 458 the entire Unicode repertiore as sequences of octets.  This encoding is
 459 expected to become very widespread. Perl can use this form internaly
 460 to represent strings, so conversions to and from this form are
 461 particularly efficient (as octets in memory do not have to change,
 462 just the meta-data that tells Perl how to treat them).
 463
 464 =over 4
 465
 466 =item *
 467
 468         $bytes = encode_utf8($string);
 469
 470 The characters that comprise string are encoded in Perl's superset of UTF-8
 471 and the resulting octets returned as a sequence of bytes. All possible
 472 characters have a UTF-8 representation so this function cannot fail.
 473
 474 =item *
 475
 476         $string = decode_utf8($bytes [,CHECK]);
 477
 478 The sequence of octets represented by $bytes is decoded from UTF-8
 479 into a sequence of logical characters. Not all sequences of octets
 480 form valid UTF-8 encodings, so it is possible for this call to fail.
 481 For CHECK see L</"Handling Malformed Data">.
 482
 483 =back
 484
 485 =head2 Listing available encodings
 486
 487   use Encode qw(encodings);
 488   @list = encodings();
 489
 490 Returns a list of the canonical names of the available encodings.
 491
 492 =head2 Defining Aliases
 493
 494   use Encode qw(define_alias);
 495   define_alias( newName => ENCODING);
 496
 497 Allows newName to be used as am alias for ENCODING. ENCODING may be
 498 either the name of an encoding or and encoding object (as above).
 499
 500 Currently I<newName> can be specified in the following ways:
 501
 502 =over 4
 503
 504 =item As a simple string.
 505
 506 =item As a qr// compiled regular expression, e.g.:
 507
 508   define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
 509
 510 In this case if I<ENCODING> is not a reference it is C<eval>-ed to
 511 allow C<$1> etc. to be subsituted.  The example is one way to names as
 512 used in X11 font names to alias the MIME names for the iso-8859-*
 513 family.  Note the double quote inside the single quote.  If you are
 514 using regex here, you have to do so or it won't work in this case.
 515
 516 =item As a code reference, e.g.:
 517
 518   define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
 519
 520 In this case C<$_> will be set to the name that is being looked up and
 521 I<ENCODING> is passed to the sub as its first argument.  The example
 522 is another way to names as used in X11 font names to alias the MIME
 523 names for the iso-8859-* family.
 524
 525 =back
 526
 527 =head1 Defining Encodings
 528
 529     use Encode qw(define_alias);
 530     define_encoding( $object, 'canonicalName' [,alias...]);
 531
 532 Causes I<canonicalName> to be associated with I<$object>.  The object
 533 should provide the interface described in L<Encode::Encoding>
 534 below.  If more than two arguments are provided then additional
 535 arguments are taken as aliases for I<$object> as for C<define_alias>.
 536
 537 =head1 Encoding and IO
 538
 539 It is very common to want to do encoding transformations when
 540 reading or writing files, network connections, pipes etc.
 541 If Perl is configured to use the new 'perlio' IO system then
 542 C<Encode> provides a "layer" (See L<perliol>) which can transform
 543 data as it is read or written.
 544
 545 Here is how the blind poet would modernise the encoding:
 546
 547     use Encode;
 548     open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
 549     open(my $utf8,'>:utf8','iliad.utf8');
 550     my @epic = <$iliad>;
 551     print $utf8 @epic;
 552     close($utf8);
 553     close($illiad);
 554
 555 In addition the new IO system can also be configured to read/write
 556 UTF-8 encoded characters (as noted above this is efficient):
 557
 558     open(my $fh,'>:utf8','anything');
 559     print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
 560
 561 Either of the above forms of "layer" specifications can be made the default
 562 for a lexical scope with the C<use open ...> pragma. See L<open>.
 563
 564 Once a handle is open is layers can be altered using C<binmode>.
 565
 566 Without any such configuration, or if Perl itself is built using
 567 system's own IO, then write operations assume that file handle accepts
 568 only I<bytes> and will C<die> if a character larger than 255 is
 569 written to the handle. When reading, each octet from the handle
 570 becomes a byte-in-a-character. Note that this default is the same
 571 behaviour as bytes-only languages (including Perl before v5.6) would
 572 have, and is sufficient to handle native 8-bit encodings
 573 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
 574 other encodings and binary data.
 575
 576 In other cases it is the programs responsibility to transform
 577 characters into bytes using the API above before doing writes, and to
 578 transform the bytes read from a handle into characters before doing
 579 "character operations" (e.g. C<lc>, C</\W+/>, ...).
 580
 581 You can also use PerlIO to convert larger amounts of data you don't
 582 want to bring into memory.  For example to convert between ISO-8859-1
 583 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
 584
 585     open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
 586     open(G, ">:utf8",                 "data.utf") or die $!;
 587     while (<F>) { print G }
 588
 589     # Could also do "print G <F>" but that would pull
 590     # the whole file into memory just to write it out again.
 591
 592 More examples:
 593
 594     open(my $f, "<:encoding(cp1252)")
 595     open(my $g, ">:encoding(iso-8859-2)")
 596     open(my $h, ">:encoding(latin9)")       # iso-8859-15
 597
 598 See L<PerlIO> for more information.
 599
 600 See also L<encoding> for how to change the default encoding of the
 601 data in your script.
 602
 603 =head1 Messing with Perl's Internals
 604
 605 The following API uses parts of Perl's internals in the current
 606 implementation.  As such they are efficient, but may change.
 607
 608 =over 4
 609
 610 =item * is_utf8(STRING [, CHECK])
 611
 612 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 613 If CHECK is true, also checks the data in STRING for being well-formed
 614 UTF-8.  Returns true if successful, false otherwise.
 615
 616 =item *
 617
 618         _utf8_on(STRING)
 619
 620 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 621 B<not> checked for being well-formed UTF-8.  Do not use unless you
 622 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 623 state of the UTF-8 flag (so please don't test the return value as
 624 I<not> success or failure), or C<undef> if STRING is not a string.
 625
 626 =item *
 627
 628         _utf8_off(STRING)
 629
 630 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 631 Returns the previous state of the UTF-8 flag (so please don't test the
 632 return value as I<not> success or failure), or C<undef> if STRING is
 633 not a string.
 634
 635 =back
 636
 637 =head1 SEE ALSO
 638
 639 L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>,
 640 L<utf8>, L<Encode::Description>, L<Encode::Encoding> the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
 641
 642
 643 =cut
 644