ext/Encode/Encode.pm

   1 package Encode;
   2 use strict;
   3 our $VERSION = do { my @r = (q$Revision: 1.50 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   4 our $DEBUG = 0;
   5
   6 require DynaLoader;
   7 require Exporter;
   8
   9 our @ISA = qw(Exporter DynaLoader);
  10
  11 # Public, encouraged API is exported by default
  12
  13 our @EXPORT = qw(
  14   decode  decode_utf8  encode  encode_utf8
  15   encodings  find_encoding
  16 );
  17
  18 our @FB_FLAGS  = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ);
  19 our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK);
  20
  21 our @EXPORT_OK =
  22     (
  23      qw(
  24        _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
  25        is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
  26       ),
  27      @FB_FLAGS, @FB_CONSTS,
  28     );
  29
  30 our %EXPORT_TAGS =
  31     (
  32      all          =>  [ @EXPORT, @EXPORT_OK ],
  33      fallbacks    =>  [ @FB_CONSTS ],
  34      fallback_all =>  [ @FB_CONSTS, @FB_FLAGS ],
  35     );
  36
  37
  38 bootstrap Encode ();
  39
  40 # Documentation moved after __END__ for speed - NI-S
  41
  42 use Carp;
  43
  44 our $ON_EBCDIC = (ord("A") == 193);
  45
  46 use Encode::Alias;
  47
  48 # Make a %Encoding package variable to allow a certain amount of cheating
  49 our %Encoding;
  50 our %ExtModule;
  51 require Encode::Config;
  52 eval { require Encode::ConfigLocal };
  53
  54 sub encodings
  55 {
  56     my $class = shift;
  57     my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
  58     for my $mod (@modules){
  59         $mod =~ s,::,/,g or $mod = "Encode/$mod";
  60         $mod .= '.pm';
  61         $DEBUG and warn "about to require $mod;";
  62         eval { require $mod; };
  63     }
  64     my %modules = map {$_ => 1} @modules;
  65     return
  66         sort { lc $a cmp lc $b }
  67              grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
  68 }
  69
  70 sub perlio_ok{
  71     exists $INC{"PerlIO/encoding.pm"} or return 0;
  72     my $stash = ref($_[0]);
  73     $stash ||= ref(find_encoding($_[0]));
  74     return ($stash eq "Encode::XS" || $stash eq "Encode::Unicode");
  75 }
  76
  77 sub define_encoding
  78 {
  79     my $obj  = shift;
  80     my $name = shift;
  81     $Encoding{$name} = $obj;
  82     my $lc = lc($name);
  83     define_alias($lc => $obj) unless $lc eq $name;
  84     while (@_)
  85     {
  86         my $alias = shift;
  87         define_alias($alias,$obj);
  88     }
  89     return $obj;
  90 }
  91
  92 sub getEncoding
  93 {
  94     my ($class,$name,$skip_external) = @_;
  95     my $enc;
  96     if (ref($name) && $name->can('new_sequence'))
  97     {
  98         return $name;
  99     }
 100     my $lc = lc $name;
 101     if (exists $Encoding{$name})
 102     {
 103         return $Encoding{$name};
 104     }
 105     if (exists $Encoding{$lc})
 106     {
 107         return $Encoding{$lc};
 108     }
 109
 110     my $oc = $class->find_alias($name);
 111     return $oc if defined $oc;
 112
 113     $oc = $class->find_alias($lc) if $lc ne $name;
 114     return $oc if defined $oc;
 115
 116     unless ($skip_external)
 117     {
 118         if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
 119             $mod =~ s,::,/,g ; $mod .= '.pm';
 120             eval{ require $mod; };
 121             return $Encoding{$name} if exists $Encoding{$name};
 122         }
 123     }
 124     return;
 125 }
 126
 127 sub find_encoding
 128 {
 129     my ($name,$skip_external) = @_;
 130     return __PACKAGE__->getEncoding($name,$skip_external);
 131 }
 132
 133 sub resolve_alias {
 134     my $obj = find_encoding(shift);
 135     defined $obj and return $obj->name;
 136     return;
 137 }
 138
 139 sub encode($$;$)
 140 {
 141     my ($name,$string,$check) = @_;
 142     $check ||=0;
 143     my $enc = find_encoding($name);
 144     croak("Unknown encoding '$name'") unless defined $enc;
 145     my $octets = $enc->encode($string,$check);
 146     return undef if ($check && length($string));
 147     return $octets;
 148 }
 149
 150 sub decode($$;$)
 151 {
 152     my ($name,$octets,$check) = @_;
 153     $check ||=0;
 154     my $enc = find_encoding($name);
 155     croak("Unknown encoding '$name'") unless defined $enc;
 156     my $string = $enc->decode($octets,$check);
 157     $_[1] = $octets if $check;
 158     return $string;
 159 }
 160
 161 sub from_to($$$;$)
 162 {
 163     my ($string,$from,$to,$check) = @_;
 164     $check ||=0;
 165     my $f = find_encoding($from);
 166     croak("Unknown encoding '$from'") unless defined $f;
 167     my $t = find_encoding($to);
 168     croak("Unknown encoding '$to'") unless defined $t;
 169     my $uni = $f->decode($string,$check);
 170     return undef if ($check && length($string));
 171     $string =  $t->encode($uni,$check);
 172     return undef if ($check && length($uni));
 173     return defined($_[0] = $string) ? length($string) : undef ;
 174 }
 175
 176 sub encode_utf8($)
 177 {
 178     my ($str) = @_;
 179     utf8::encode($str);
 180     return $str;
 181 }
 182
 183 sub decode_utf8($)
 184 {
 185     my ($str) = @_;
 186     return undef unless utf8::decode($str);
 187     return $str;
 188 }
 189
 190 predefine_encodings();
 191
 192 #
 193 # This is to restore %Encoding if really needed;
 194 #
 195 sub predefine_encodings{
 196     if ($ON_EBCDIC) {
 197         # was in Encode::UTF_EBCDIC
 198         package Encode::UTF_EBCDIC;
 199         *name         = sub{ shift->{'Name'} };
 200         *new_sequence = sub{ return $_[0] };
 201         *decode = sub{
 202             my ($obj,$str,$chk) = @_;
 203             my $res = '';
 204             for (my $i = 0; $i < length($str); $i++) {
 205                 $res .=
 206                     chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
 207             }
 208             $_[1] = '' if $chk;
 209             return $res;
 210         };
 211         *encode = sub{
 212             my ($obj,$str,$chk) = @_;
 213             my $res = '';
 214             for (my $i = 0; $i < length($str); $i++) {
 215                 $res .=
 216                     chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
 217             }
 218             $_[1] = '' if $chk;
 219             return $res;
 220         };
 221         $Encode::Encoding{Unicode} =
 222             bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
 223     } else {
 224         # was in Encode::UTF_EBCDIC
 225         package Encode::Internal;
 226         *name         = sub{ shift->{'Name'} };
 227         *new_sequence = sub{ return $_[0] };
 228         *decode = sub{
 229             my ($obj,$str,$chk) = @_;
 230             utf8::upgrade($str);
 231             $_[1] = '' if $chk;
 232             return $str;
 233         };
 234         *encode = \&decode;
 235         $Encode::Encoding{Unicode} =
 236             bless {Name => "Internal"} => "Encode::Internal";
 237     }
 238
 239     {
 240         # was in Encode::utf8
 241         package Encode::utf8;
 242         *name         = sub{ shift->{'Name'} };
 243         *new_sequence = sub{ return $_[0] };
 244         *decode = sub{
 245             my ($obj,$octets,$chk) = @_;
 246             my $str = Encode::decode_utf8($octets);
 247             if (defined $str) {
 248                 $_[1] = '' if $chk;
 249                 return $str;
 250             }
 251             return undef;
 252         };
 253         *encode = sub {
 254             my ($obj,$string,$chk) = @_;
 255             my $octets = Encode::encode_utf8($string);
 256             $_[1] = '' if $chk;
 257             return $octets;
 258         };
 259         $Encode::Encoding{utf8} =
 260             bless {Name => "utf8"} => "Encode::utf8";
 261     }
 262 }
 263
 264 require Encode::Encoding;
 265
 266 eval qq{ use PerlIO::encoding 0.02 };
 267 # warn $@ if $@;
 268
 269 1;
 270
 271 __END__
 272
 273 =head1 NAME
 274
 275 Encode - character encodings
 276
 277 =head1 SYNOPSIS
 278
 279     use Encode;
 280
 281
 282 =head2 Table of Contents
 283
 284 Encode consists of a collection of modules which details are too big
 285 to fit in one document.  This POD itself explains the top-level APIs
 286 and general topics at a glance.  For other topics and more details,
 287 see the PODs below;
 288
 289   Name                          Description
 290   --------------------------------------------------------
 291   Encode::Alias         Alias defintions to encodings
 292   Encode::Encoding      Encode Implementation Base Class
 293   Encode::Supported     List of Supported Encodings
 294   Encode::CN            Simplified Chinese Encodings
 295   Encode::JP            Japanese Encodings
 296   Encode::KR            Korean Encodings
 297   Encode::TW            Traditional Chinese Encodings
 298   --------------------------------------------------------
 299
 300 =head1 DESCRIPTION
 301
 302 The C<Encode> module provides the interfaces between Perl's strings
 303 and the rest of the system.  Perl strings are sequences of
 304 B<characters>.
 305
 306 The repertoire of characters that Perl can represent is at least that
 307 defined by the Unicode Consortium. On most platforms the ordinal
 308 values of the characters (as returned by C<ord(ch)>) is the "Unicode
 309 codepoint" for the character (the exceptions are those platforms where
 310 the legacy encoding is some variant of EBCDIC rather than a super-set
 311 of ASCII - see L<perlebcdic>).
 312
 313 Traditionally computer data has been moved around in 8-bit chunks
 314 often called "bytes". These chunks are also known as "octets" in
 315 networking standards. Perl is widely used to manipulate data of many
 316 types - not only strings of characters representing human or computer
 317 languages but also "binary" data being the machines representation of
 318 numbers, pixels in an image - or just about anything.
 319
 320 When Perl is processing "binary data" the programmer wants Perl to
 321 process "sequences of bytes". This is not a problem for Perl - as a
 322 byte has 256 possible values it easily fits in Perl's much larger
 323 "logical character".
 324
 325 =head2 TERMINOLOGY
 326
 327 =over 4
 328
 329 =item *
 330
 331 I<character>: a character in the range 0..(2**32-1) (or more).
 332 (What Perl's strings are made of.)
 333
 334 =item *
 335
 336 I<byte>: a character in the range 0..255
 337 (A special case of a Perl character.)
 338
 339 =item *
 340
 341 I<octet>: 8 bits of data, with ordinal values 0..255
 342 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
 343
 344 =back
 345
 346 The marker [INTERNAL] marks Internal Implementation Details, in
 347 general meant only for those who think they know what they are doing,
 348 and such details may change in future releases.
 349
 350 =head1 PERL ENCODING API
 351
 352 =over 4
 353
 354 =item $octets  = encode(ENCODING, $string[, CHECK])
 355
 356 Encodes string from Perl's internal form into I<ENCODING> and returns
 357 a sequence of octets.  ENCODING can be either a canonical name or
 358 alias.  For encoding names and aliases, see L</"Defining Aliases">.
 359 For CHECK see L</"Handling Malformed Data">.
 360
 361 For example to convert (internally UTF-8 encoded) Unicode string to
 362 iso-8859-1 (also known as Latin1),
 363
 364   $octets = encode("iso-8859-1", $unicode);
 365
 366 =item $string = decode(ENCODING, $octets[, CHECK])
 367
 368 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
 369 internal form and returns the resulting string.  as in encode(),
 370 ENCODING can be either a canonical name or alias. For encoding names
 371 and aliases, see L</"Defining Aliases">.  For CHECK see
 372 L</"Handling Malformed Data">.
 373
 374 For example to convert ISO-8859-1 data to UTF-8:
 375
 376   $utf8 = decode("iso-8859-1", $latin1);
 377
 378 =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
 379
 380 Convert B<in-place> the data between two encodings.
 381 For example to convert ISO-8859-1 data to UTF-8:
 382
 383         from_to($data, "iso-8859-1", "utf-8");
 384
 385 and to convert it back:
 386
 387         from_to($data, "utf-8", "iso-8859-1");
 388
 389 Note that because the conversion happens in place, the data to be
 390 converted cannot be a string constant, it must be a scalar variable.
 391
 392 from_to() return the length of the converted string on success, undef
 393 otherwise.
 394
 395 =back
 396
 397 =head2 UTF-8 / utf8
 398
 399 The Unicode consortium defines the UTF-8 standard as a way of encoding
 400 the entire Unicode repertoire as sequences of octets.  This encoding is
 401 expected to become very widespread. Perl can use this form internally
 402 to represent strings, so conversions to and from this form are
 403 particularly efficient (as octets in memory do not have to change,
 404 just the meta-data that tells Perl how to treat them).
 405
 406 =over 4
 407
 408 =item $octets = encode_utf8($string);
 409
 410 The characters that comprise string are encoded in Perl's superset of UTF-8
 411 and the resulting octets returned as a sequence of bytes. All possible
 412 characters have a UTF-8 representation so this function cannot fail.
 413
 414 =item $string = decode_utf8($octets [, CHECK]);
 415
 416 The sequence of octets represented by $octets is decoded from UTF-8
 417 into a sequence of logical characters. Not all sequences of octets
 418 form valid UTF-8 encodings, so it is possible for this call to fail.
 419 For CHECK see L</"Handling Malformed Data">.
 420
 421 =back
 422
 423 =head2 Listing available encodings
 424
 425   use Encode;
 426   @list = Encode->encodings();
 427
 428 Returns a list of the canonical names of the available encodings that
 429 are loaded.  To get a list of all available encodings including the
 430 ones that are not loaded yet, say
 431
 432   @all_encodings = Encode->encodings(":all");
 433
 434 Or you can give the name of specific module.
 435
 436   @with_jp = Encode->encodings("Encode::JP");
 437
 438 When "::" is not in the name, "Encode::" is assumed.
 439
 440   @ebcdic = Encode->encodings("EBCDIC");
 441
 442 To find which encodings are supported by this package in details,
 443 see L<Encode::Supported>.
 444
 445 =head2 Defining Aliases
 446
 447 To add new alias to a given encoding,  Use;
 448
 449   use Encode;
 450   use Encode::Alias;
 451   define_alias(newName => ENCODING);
 452
 453 After that, newName can be used as an alias for ENCODING.
 454 ENCODING may be either the name of an encoding or an
 455 I<encoding object>
 456
 457 But before you do so, make sure the alias is nonexistent with
 458 C<resolve_alias()>, which returns the canonical name thereof.
 459 i.e.
 460
 461   Encode::resolve_alias("latin1") eq "iso-8859-1" # true
 462   Encode::resolve_alias("iso-8859-12")   # false; nonexistent
 463   Encode::resolve_alias($name) eq $name  # true if $name is canonical
 464
 465 This resolve_alias() does not need C<use Encode::Alias> and is
 466 exported via C<use encode qw(resolve_alias)>.
 467
 468 See L<Encode::Alias> on details.
 469
 470 =head1 Encoding via PerlIO
 471
 472 If your perl supports I<PerlIO>, you can use PerlIO layer to directly
 473 decode and encode via filehandle.  The following two examples are
 474 totally identical by functionality.
 475
 476   # via PerlIO
 477   open my $in,  "<:encoding(shiftjis)", $infile  or die;
 478   open my $out, ">:encoding(euc-jp)",   $outfile or die;
 479   while(<>){ print; }
 480
 481   # via from_to
 482   open my $in,  $infile  or die;
 483   open my $out, $outfile or die;
 484   while(<>){
 485     from_to($_, "shiftjis", "euc", 1);
 486   }
 487
 488 Unfortunately, not all encodings are PerlIO-savvy.  You can check if
 489 your encoding is supported by PerlIO by C<perlio_ok> method.
 490
 491   Encode::perlio_ok("iso-20220jp");        # false
 492   find_encoding("iso-2022-jp")->perlio_ok; # false
 493   use Encode qw(perlio_ok);                # exported upon request
 494   perlio_ok("euc-jp")                      # true if PerlIO is enabled
 495
 496 For gory details, see L<Encode::PerlIO>;
 497
 498 =head1 Handling Malformed Data
 499
 500 =over 4
 501
 502 THE I<CHECK> argument is used as follows.  When you omit it, it is
 503 identical to I<CHECK> = 0.
 504
 505 =item I<CHECK> = Encode::FB_DEFAULT ( == 0)
 506
 507 If I<CHECK> is 0, (en|de)code will put I<substitution character> in
 508 place of the malformed character.  for UCM-based encodings,
 509 E<lt>subcharE<gt> will be used.  For Unicode, \xFFFD is used.  If the
 510 data is supposed to be UTF-8, an optional lexical warning (category
 511 utf8) is given.
 512
 513 =item I<CHECK> = Encode::DIE_ON_ERROR (== 1)
 514
 515 If I<CHECK> is 1, methods will die immediately  with an error
 516 message.  so when I<CHECK> is set,  you should trap the fatal error
 517 with eval{} unless you really want to let it die on error.
 518
 519 =item I<CHECK> = Encode::FB_QUIET
 520
 521 If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
 522 return proccessed part on error, with data passed via argument
 523 overwritten with unproccessed part.  This is handy when have to
 524 repeatedly call because the source data is chopped in the middle for
 525 some reasons, such as fixed-width buffer.  Here is a sample code that
 526 just does this.
 527
 528   my $data = '';
 529   while(defined(read $fh, $buffer, 256)){
 530     # buffer may end in partial character so we append
 531     $data .= $buffer;
 532     $utf8 .= decode($encoding, $data, ENCODE::FB_QUIET);
 533     # $data now contains unprocessed partial character
 534   }
 535
 536 =item I<CHECK> = Encode::FB_WARN
 537
 538 This is the same as above, except it warns on error.  Handy when you
 539 are debugging the mode above.
 540
 541 =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
 542
 543 For encodings that are implemented by Encode::XS, CHECK ==
 544 Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
 545
 546 When you decode, '\xI<XX>' will be placed where I<XX> is the hex
 547 representation of the octet  that could not be decoded to utf8.  And
 548 when you encode, '\x{I<xxxx>}' will be placed where I<xxxx> is the
 549 Unicode ID of the character that cannot be found in the character
 550 repartoire of the encoding.
 551
 552 =item The bitmask
 553
 554 These modes are actually set via bitmask.  here is how FB_XX are laid
 555 out.  for FB_XX you can import via C<use Encode qw(:fallbacks)> for
 556 generic bitmask constants, you can import via
 557  C<use Encode qw(:fallback_all)>.
 558
 559                        FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 560   DIE_ON_ERR     0x0001             X
 561   WARN_ON_ERR    0x0002                                X
 562   RETURN_ON_ERR  0x0004                      X         X
 563   LEAVE_SRC      0x0008
 564   PERLQQ         0x0100                                        X
 565
 566 =head2 Unemplemented fallback schemes
 567
 568 In future you will be able to use a code reference to a callback
 569 function for the value of I<CHECK> but its API is still undecided.
 570
 571 =head1 Defining Encodings
 572
 573 To define a new encoding, use:
 574
 575     use Encode qw(define_alias);
 576     define_encoding($object, 'canonicalName' [, alias...]);
 577
 578 I<canonicalName> will be associated with I<$object>.  The object
 579 should provide the interface described in L<Encode::Encoding>
 580 If more than two arguments are provided then additional
 581 arguments are taken as aliases for I<$object> as for C<define_alias>.
 582
 583 See L<Encode::Encoding> for more details.
 584
 585 =head1 Messing with Perl's Internals
 586
 587 The following API uses parts of Perl's internals in the current
 588 implementation.  As such they are efficient, but may change.
 589
 590 =over 4
 591
 592 =item is_utf8(STRING [, CHECK])
 593
 594 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 595 If CHECK is true, also checks the data in STRING for being well-formed
 596 UTF-8.  Returns true if successful, false otherwise.
 597
 598 =item _utf8_on(STRING)
 599
 600 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 601 B<not> checked for being well-formed UTF-8.  Do not use unless you
 602 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 603 state of the UTF-8 flag (so please don't test the return value as
 604 I<not> success or failure), or C<undef> if STRING is not a string.
 605
 606 =item _utf8_off(STRING)
 607
 608 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 609 Returns the previous state of the UTF-8 flag (so please don't test the
 610 return value as I<not> success or failure), or C<undef> if STRING is
 611 not a string.
 612
 613 =back
 614
 615 =head1 SEE ALSO
 616
 617 L<Encode::Encoding>,
 618 L<Encode::Supported>,
 619 L<Encode::PerlIO>,
 620 L<encoding>,
 621 L<perlebcdic>,
 622 L<perlfunc/open>,
 623 L<perlunicode>,
 624 L<utf8>,
 625 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
 626
 627 =head1 MAINTAINER
 628
 629 This project was originated by Nick Ing-Simmons and later maintained
 630 by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for full list
 631 of people involved.  For any questions, use
 632 E<lt>perl-unicode@perl.orgE<gt> so others can share.
 633
 634 =cut