ext/Encode/Encode.pm

   1 package Encode;
   2 use strict;
   3 our $VERSION = do { my @r = (q$Revision: 1.11 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   4 our $DEBUG = 0;
   5
   6 require DynaLoader;
   7 require Exporter;
   8
   9 our @ISA = qw(Exporter DynaLoader);
  10
  11 # Public, encouraged API is exported by default
  12 our @EXPORT = qw (
  13   encode
  14   decode
  15   encode_utf8
  16   decode_utf8
  17   find_encoding
  18   encodings
  19 );
  20
  21 our @EXPORT_OK =
  22     qw(
  23        define_encoding
  24        from_to
  25        is_utf8
  26        is_8bit
  27        is_16bit
  28        utf8_upgrade
  29        utf8_downgrade
  30        _utf8_on
  31        _utf8_off
  32       );
  33
  34 bootstrap Encode ();
  35
  36 # Documentation moved after __END__ for speed - NI-S
  37
  38 use Carp;
  39
  40 our $ON_EBCDIC = (ord("A") == 193);
  41 use Encode::Alias;
  42
  43 # Make a %Encoding package variable to allow a certain amount of cheating
  44 our %Encoding;
  45
  46 our %ExtModule =
  47     (
  48      viscii             => 'Encode/Byte.pm',
  49      'koi8-r'           => 'Encode/Byte.pm',
  50      cp1047             => 'Encode/EBCDIC.pm',
  51      cp37               => 'Encode/EBCDIC.pm',
  52      'posix-bc'         => 'Encode/EBCDIC.pm',
  53      symbol             => 'Encode/Symbol.pm',
  54      dingbats           => 'Encode/Symbol.pm',
  55     );
  56
  57 for my $k (2..11,13..16){
  58     $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
  59 }
  60
  61 for my $k (1250..1258){
  62     $ExtModule{"cp$k"} = 'Encode/Byte.pm';
  63 }
  64
  65 unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
  66 %ExtModule =(
  67              %ExtModule,
  68              'euc-cn'           => 'Encode/CN.pm',
  69              gb2312             => 'Encode/CN.pm',
  70              gb12345            => 'Encode/CN.pm',
  71              gbk                => 'Encode/CN.pm',
  72              cp936              => 'Encode/CN.pm',
  73              'iso-ir-165'       => 'Encode/CN.pm',
  74              'euc-jp'           => 'Encode/JP.pm',
  75              'iso-2022-jp'      => 'Encode/JP.pm',
  76              'iso-2022-jp-1'    => 'Encode/JP.pm',
  77              '7bit-jis'         => 'Encode/JP.pm',
  78              shiftjis           => 'Encode/JP.pm',
  79              macjapan           => 'Encode/JP.pm',
  80              cp932              => 'Encode/JP.pm',
  81              'euc-kr'           => 'Encode/KR.pm',
  82              ksc5601            => 'Encode/KR.pm',
  83              cp949              => 'Encode/KR.pm',
  84              big5               => 'Encode/TW.pm',
  85              'big5-hkscs'       => 'Encode/TW.pm',
  86              cp950              => 'Encode/TW.pm',
  87              gb18030            => 'Encode/HanExtra.pm',
  88              big5plus           => 'Encode/HanExtra.pm',
  89              'euc-tw'           => 'Encode/HanExtra.pm',
  90              );
  91 }
  92
  93 for my $k (qw{ CentralEurRoman  Croatian  Cyrillic   Greek
  94                Iceland          Roman     Rumanian   Sami
  95                Thai             Turkish   Ukrainian
  96              })
  97 {
  98     $ExtModule{"mac$k"} = 'Encode/Byte.pm';
  99 }
 100
 101 sub encodings
 102 {
 103     my $class = shift;
 104     my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
 105     for my $m (@modules)
 106     {
 107         $DEBUG and warn "about to require $m;";
 108         eval { require $m; };
 109     }
 110     return
 111         map({$_->[0]}
 112             sort({$a->[1] cmp $b->[1]}
 113                  map({[$_, lc $_]}
 114                      grep({ $_ ne 'Internal' }  keys %Encoding))));
 115 }
 116
 117 sub define_encoding
 118 {
 119     my $obj  = shift;
 120     my $name = shift;
 121     $Encoding{$name} = $obj;
 122     my $lc = lc($name);
 123     define_alias($lc => $obj) unless $lc eq $name;
 124     while (@_)
 125     {
 126         my $alias = shift;
 127         define_alias($alias,$obj);
 128     }
 129     return $obj;
 130 }
 131
 132 sub getEncoding
 133 {
 134     my ($class,$name,$skip_external) = @_;
 135     my $enc;
 136     if (ref($name) && $name->can('new_sequence'))
 137     {
 138         return $name;
 139     }
 140     my $lc = lc $name;
 141     if (exists $Encoding{$name})
 142     {
 143         return $Encoding{$name};
 144     }
 145     if (exists $Encoding{$lc})
 146     {
 147         return $Encoding{$lc};
 148     }
 149
 150     my $oc = $class->find_alias($name);
 151     return $oc if defined $oc;
 152
 153     $oc = $class->find_alias($lc) if $lc ne $name;
 154     return $oc if defined $oc;
 155
 156     if (!$skip_external and exists $ExtModule{$lc})
 157     {
 158         eval{ require $ExtModule{$lc}; };
 159         return $Encoding{$name} if exists $Encoding{$name};
 160     }
 161
 162     return;
 163 }
 164
 165 sub find_encoding
 166 {
 167     my ($name,$skip_external) = @_;
 168     return __PACKAGE__->getEncoding($name,$skip_external);
 169 }
 170
 171 sub encode
 172 {
 173     my ($name,$string,$check) = @_;
 174     my $enc = find_encoding($name);
 175     croak("Unknown encoding '$name'") unless defined $enc;
 176     my $octets = $enc->encode($string,$check);
 177     return undef if ($check && length($string));
 178     return $octets;
 179 }
 180
 181 sub decode
 182 {
 183     my ($name,$octets,$check) = @_;
 184     my $enc = find_encoding($name);
 185     croak("Unknown encoding '$name'") unless defined $enc;
 186     my $string = $enc->decode($octets,$check);
 187     $_[1] = $octets if $check;
 188     return $string;
 189 }
 190
 191 sub from_to
 192 {
 193     my ($string,$from,$to,$check) = @_;
 194     my $f = find_encoding($from);
 195     croak("Unknown encoding '$from'") unless defined $f;
 196     my $t = find_encoding($to);
 197     croak("Unknown encoding '$to'") unless defined $t;
 198     my $uni = $f->decode($string,$check);
 199     return undef if ($check && length($string));
 200     $string = $t->encode($uni,$check);
 201     return undef if ($check && length($uni));
 202     return defined($_[0] = $string) ? length($string) : undef ;
 203 }
 204
 205 sub encode_utf8
 206 {
 207     my ($str) = @_;
 208   utf8::encode($str);
 209     return $str;
 210 }
 211
 212 sub decode_utf8
 213 {
 214     my ($str) = @_;
 215     return undef unless utf8::decode($str);
 216     return $str;
 217 }
 218
 219 require Encode::Encoding;
 220 require Encode::XS;
 221 require Encode::Internal;
 222 require Encode::Unicode;
 223 require Encode::utf8;
 224 require Encode::10646_1;
 225 require Encode::ucs2_le;
 226
 227 1;
 228
 229 __END__
 230
 231 =head1 NAME
 232
 233 Encode - character encodings
 234
 235 =head1 SYNOPSIS
 236
 237     use Encode;
 238
 239
 240 =head2 Table of Contents
 241
 242 Encode consists of a collection of modules which details are too big
 243 to fit in one document.  This POD itself explains the top-level APIs
 244 and general topics at a glance.  For other topics and more details,
 245 see the PODs below;
 246
 247   Name                          Description
 248   --------------------------------------------------------
 249   Encode::Alias         Alias defintions to encodings
 250   Encode::Encoding      Encode Implementation Base Class
 251   Encode::Supported     List of Supported Encodings
 252   Encode::CN            Simplified Chinese Encodings
 253   Encode::JP            Japanese Encodings
 254   Encode::KR            Korean Encodings
 255   Encode::TW            Traditional Chinese Encodings
 256   --------------------------------------------------------
 257
 258 =head1 DESCRIPTION
 259
 260 The C<Encode> module provides the interfaces between Perl's strings
 261 and the rest of the system.  Perl strings are sequences of
 262 B<characters>.
 263
 264 The repertoire of characters that Perl can represent is at least that
 265 defined by the Unicode Consortium. On most platforms the ordinal
 266 values of the characters (as returned by C<ord(ch)>) is the "Unicode
 267 codepoint" for the character (the exceptions are those platforms where
 268 the legacy encoding is some variant of EBCDIC rather than a super-set
 269 of ASCII - see L<perlebcdic>).
 270
 271 Traditionally computer data has been moved around in 8-bit chunks
 272 often called "bytes". These chunks are also known as "octets" in
 273 networking standards. Perl is widely used to manipulate data of many
 274 types - not only strings of characters representing human or computer
 275 languages but also "binary" data being the machines representation of
 276 numbers, pixels in an image - or just about anything.
 277
 278 When Perl is processing "binary data" the programmer wants Perl to
 279 process "sequences of bytes". This is not a problem for Perl - as a
 280 byte has 256 possible values it easily fits in Perl's much larger
 281 "logical character".
 282
 283 =head2 TERMINOLOGY
 284
 285 =over 4
 286
 287 =item *
 288
 289 I<character>: a character in the range 0..(2**32-1) (or more).
 290 (What Perl's strings are made of.)
 291
 292 =item *
 293
 294 I<byte>: a character in the range 0..255
 295 (A special case of a Perl character.)
 296
 297 =item *
 298
 299 I<octet>: 8 bits of data, with ordinal values 0..255
 300 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
 301
 302 =back
 303
 304 The marker [INTERNAL] marks Internal Implementation Details, in
 305 general meant only for those who think they know what they are doing,
 306 and such details may change in future releases.
 307
 308 =head1 PERL ENCODING API
 309
 310 =over 4
 311
 312 =item $bytes  = encode(ENCODING, $string[, CHECK])
 313
 314 Encodes string from Perl's internal form into I<ENCODING> and returns
 315 a sequence of octets.  ENCODING can be either a canonical name or
 316 alias.  For encoding names and aliases, see L</"Defining Aliases">.
 317 For CHECK see L</"Handling Malformed Data">.
 318
 319 For example to convert (internally UTF-8 encoded) Unicode string to
 320 iso-8859-1 (also known as Latin1),
 321
 322   $octets = encode("iso-8859-1", $unicode);
 323
 324 =item $string = decode(ENCODING, $bytes[, CHECK])
 325
 326 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
 327 internal form and returns the resulting string.  as in encode(),
 328 ENCODING can be either a canonical name or alias. For encoding names
 329 and aliases, see L</"Defining Aliases">.  For CHECK see
 330 L</"Handling Malformed Data">.
 331
 332 For example to convert ISO-8859-1 data to UTF-8:
 333
 334   $utf8 = decode("iso-8859-1", $latin1);
 335
 336 =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
 337
 338 Convert B<in-place> the data between two encodings.  How did the data
 339 in $string originally get to be in FROM_ENCODING?  Either using
 340 encode() or through PerlIO: See L</"Encoding and IO">.
 341 For encoding names and aliases, see L</"Defining Aliases">.
 342 For CHECK see L</"Handling Malformed Data">.
 343
 344 For example to convert ISO-8859-1 data to UTF-8:
 345
 346         from_to($data, "iso-8859-1", "utf-8");
 347
 348 and to convert it back:
 349
 350         from_to($data, "utf-8", "iso-8859-1");
 351
 352 Note that because the conversion happens in place, the data to be
 353 converted cannot be a string constant, it must be a scalar variable.
 354
 355 from_to() return the length of the converted string on success, undef
 356 otherwise.
 357
 358 =back
 359
 360 =head2 Listing available encodings
 361
 362   use Encode;
 363   @list = Encode->encodings();
 364
 365 Returns a list of the canonical names of the available encodings that
 366 are loaded.  To get a list of all available encodings including the
 367 ones that are not loaded yet, say
 368
 369   @all_encodings = Encode->encodings(":all");
 370
 371 Or you can give the name of specific module.
 372
 373   @with_jp = Encode->encodings("Encode/JP.pm");
 374
 375 Note in this case you have to say C<"Encode/JP.pm"> instead of
 376 C<"Encode::JP">.
 377
 378 To find which encodings are supported by this package in details,
 379 see L<Encode::Supported>.
 380
 381
 382 =head2 Defining Aliases
 383
 384 To add new alias to a given encoding,  Use;
 385
 386   use Encode;
 387   use Encode::Alias;
 388   define_alias(newName => ENCODING);
 389
 390 After that, newName can be used as an alias for ENCODING.
 391 ENCODING may be either the name of an encoding or an I<encoding
 392  object>
 393
 394 See L<Encode::Alias> on details.
 395
 396 =head1 Encoding and IO
 397
 398 It is very common to want to do encoding transformations when
 399 reading or writing files, network connections, pipes etc.
 400 If Perl is configured to use the new 'perlio' IO system then
 401 C<Encode> provides a "layer" (See L<perliol>) which can transform
 402 data as it is read or written.
 403
 404 Here is how the blind poet would modernise the encoding:
 405
 406     use Encode;
 407     open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
 408     open(my $utf8,'>:utf8','iliad.utf8');
 409     my @epic = <$iliad>;
 410     print $utf8 @epic;
 411     close($utf8);
 412     close($illiad);
 413
 414 In addition the new IO system can also be configured to read/write
 415 UTF-8 encoded characters (as noted above this is efficient):
 416
 417     open(my $fh,'>:utf8','anything');
 418     print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
 419
 420 Either of the above forms of "layer" specifications can be made the default
 421 for a lexical scope with the C<use open ...> pragma. See L<open>.
 422
 423 Once a handle is open is layers can be altered using C<binmode>.
 424
 425 Without any such configuration, or if Perl itself is built using
 426 system's own IO, then write operations assume that file handle accepts
 427 only I<bytes> and will C<die> if a character larger than 255 is
 428 written to the handle. When reading, each octet from the handle
 429 becomes a byte-in-a-character. Note that this default is the same
 430 behaviour as bytes-only languages (including Perl before v5.6) would
 431 have, and is sufficient to handle native 8-bit encodings
 432 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
 433 other encodings and binary data.
 434
 435 In other cases it is the programs responsibility to transform
 436 characters into bytes using the API above before doing writes, and to
 437 transform the bytes read from a handle into characters before doing
 438 "character operations" (e.g. C<lc>, C</\W+/>, ...).
 439
 440 You can also use PerlIO to convert larger amounts of data you don't
 441 want to bring into memory.  For example to convert between ISO-8859-1
 442 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
 443
 444     open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
 445     open(G, ">:utf8",                 "data.utf") or die $!;
 446     while (<F>) { print G }
 447
 448     # Could also do "print G <F>" but that would pull
 449     # the whole file into memory just to write it out again.
 450
 451 More examples:
 452
 453     open(my $f, "<:encoding(cp1252)")
 454     open(my $g, ">:encoding(iso-8859-2)")
 455     open(my $h, ">:encoding(latin9)")       # iso-8859-15
 456
 457 See L<PerlIO> for more information.
 458
 459 See also L<encoding> for how to change the default encoding of the
 460 data in your script.
 461
 462 =head1 Handling Malformed Data
 463
 464 If CHECK is not set, C<undef> is returned.  If the data is supposed to
 465 be UTF-8, an optional lexical warning (category utf8) is given.  If
 466 CHECK is true but not a code reference, dies.
 467
 468 It would desirable to have a way to indicate that transform should use
 469 the encodings "replacement character" - no such mechanism is defined yet.
 470
 471 It is also planned to allow I<CHECK> to be a code reference.
 472
 473 This is not yet implemented as there are design issues with what its
 474 arguments should be and how it returns its results.
 475
 476 =over 4
 477
 478 =item Scheme 1
 479
 480 Passed remaining fragment of string being processed.
 481 Modifies it in place to remove bytes/characters it can understand
 482 and returns a string used to represent them.
 483 e.g.
 484
 485  sub fixup {
 486    my $ch = substr($_[0],0,1,'');
 487    return sprintf("\x{%02X}",ord($ch);
 488  }
 489
 490 This scheme is close to how underlying C code for Encode works, but gives
 491 the fixup routine very little context.
 492
 493 =item Scheme 2
 494
 495 Passed original string, and an index into it of the problem area, and
 496 output string so far.  Appends what it will to output string and
 497 returns new index into original string.  For example:
 498
 499  sub fixup {
 500    # my ($s,$i,$d) = @_;
 501    my $ch = substr($_[0],$_[1],1);
 502    $_[2] .= sprintf("\x{%02X}",ord($ch);
 503    return $_[1]+1;
 504  }
 505
 506 This scheme gives maximal control to the fixup routine but is more
 507 complicated to code, and may need internals of Encode to be tweaked to
 508 keep original string intact.
 509
 510 =item Other Schemes
 511
 512 Hybrids of above.
 513
 514 Multiple return values rather than in-place modifications.
 515
 516 Index into the string could be C<pos($str)> allowing C<s/\G...//>.
 517
 518 =back
 519
 520 =head2 UTF-8 / utf8
 521
 522 The Unicode consortium defines the UTF-8 standard as a way of encoding
 523 the entire Unicode repertoire as sequences of octets.  This encoding is
 524 expected to become very widespread. Perl can use this form internally
 525 to represent strings, so conversions to and from this form are
 526 particularly efficient (as octets in memory do not have to change,
 527 just the meta-data that tells Perl how to treat them).
 528
 529 =over 4
 530
 531 =item $bytes = encode_utf8($string);
 532
 533 The characters that comprise string are encoded in Perl's superset of UTF-8
 534 and the resulting octets returned as a sequence of bytes. All possible
 535 characters have a UTF-8 representation so this function cannot fail.
 536
 537 =item $string = decode_utf8($bytes [, CHECK]);
 538
 539 The sequence of octets represented by $bytes is decoded from UTF-8
 540 into a sequence of logical characters. Not all sequences of octets
 541 form valid UTF-8 encodings, so it is possible for this call to fail.
 542 For CHECK see L</"Handling Malformed Data">.
 543
 544 =back
 545
 546 =head1 Defining Encodings
 547
 548 To define a new encoding, use:
 549
 550     use Encode qw(define_alias);
 551     define_encoding($object, 'canonicalName' [, alias...]);
 552
 553 I<canonicalName> will be associated with I<$object>.  The object
 554 should provide the interface described in L<Encode::Encoding>
 555 If more than two arguments are provided then additional
 556 arguments are taken as aliases for I<$object> as for C<define_alias>.
 557
 558 =head1 Messing with Perl's Internals
 559
 560 The following API uses parts of Perl's internals in the current
 561 implementation.  As such they are efficient, but may change.
 562
 563 =over 4
 564
 565 =item is_utf8(STRING [, CHECK])
 566
 567 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 568 If CHECK is true, also checks the data in STRING for being well-formed
 569 UTF-8.  Returns true if successful, false otherwise.
 570
 571 =item _utf8_on(STRING)
 572
 573 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 574 B<not> checked for being well-formed UTF-8.  Do not use unless you
 575 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 576 state of the UTF-8 flag (so please don't test the return value as
 577 I<not> success or failure), or C<undef> if STRING is not a string.
 578
 579 =item _utf8_off(STRING)
 580
 581 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 582 Returns the previous state of the UTF-8 flag (so please don't test the
 583 return value as I<not> success or failure), or C<undef> if STRING is
 584 not a string.
 585
 586 =back
 587
 588 =head1 SEE ALSO
 589
 590 L<Encode::Encoding>,
 591 L<Encode::Supported>,
 592 L<PerlIO>,
 593 L<encoding>,
 594 L<perlebcdic>,
 595 L<perlfunc/open>,
 596 L<perlunicode>,
 597 L<utf8>,
 598 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
 599
 600 =cut