ext/Encode/Encode.pm

   1 package Encode;
   2 use strict;
   3 our $VERSION = do { my @r = (q$Revision: 1.51 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   4 our $DEBUG = 0;
   5
   6 require DynaLoader;
   7 require Exporter;
   8
   9 our @ISA = qw(Exporter DynaLoader);
  10
  11 # Public, encouraged API is exported by default
  12
  13 our @EXPORT = qw(
  14   decode  decode_utf8  encode  encode_utf8
  15   encodings  find_encoding
  16 );
  17
  18 our @FB_FLAGS  = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ);
  19 our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK);
  20
  21 our @EXPORT_OK =
  22     (
  23      qw(
  24        _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
  25        is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
  26       ),
  27      @FB_FLAGS, @FB_CONSTS,
  28     );
  29
  30 our %EXPORT_TAGS =
  31     (
  32      all          =>  [ @EXPORT, @EXPORT_OK ],
  33      fallbacks    =>  [ @FB_CONSTS ],
  34      fallback_all =>  [ @FB_CONSTS, @FB_FLAGS ],
  35     );
  36
  37
  38 bootstrap Encode ();
  39
  40 # Documentation moved after __END__ for speed - NI-S
  41
  42 use Carp;
  43
  44 our $ON_EBCDIC = (ord("A") == 193);
  45
  46 use Encode::Alias;
  47
  48 # Make a %Encoding package variable to allow a certain amount of cheating
  49 our %Encoding;
  50 our %ExtModule;
  51 require Encode::Config;
  52 eval { require Encode::ConfigLocal };
  53
  54 sub encodings
  55 {
  56     my $class = shift;
  57     my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
  58     for my $mod (@modules){
  59         $mod =~ s,::,/,g or $mod = "Encode/$mod";
  60         $mod .= '.pm';
  61         $DEBUG and warn "about to require $mod;";
  62         eval { require $mod; };
  63     }
  64     my %modules = map {$_ => 1} @modules;
  65     return
  66         sort { lc $a cmp lc $b }
  67              grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
  68 }
  69
  70 sub perlio_ok{
  71     exists $INC{"PerlIO/encoding.pm"} or return 0;
  72     my $stash = ref($_[0]);
  73     $stash ||= ref(find_encoding($_[0]));
  74     return ($stash eq "Encode::XS" || $stash eq "Encode::Unicode");
  75 }
  76
  77 sub define_encoding
  78 {
  79     my $obj  = shift;
  80     my $name = shift;
  81     $Encoding{$name} = $obj;
  82     my $lc = lc($name);
  83     define_alias($lc => $obj) unless $lc eq $name;
  84     while (@_)
  85     {
  86         my $alias = shift;
  87         define_alias($alias,$obj);
  88     }
  89     return $obj;
  90 }
  91
  92 sub getEncoding
  93 {
  94     my ($class,$name,$skip_external) = @_;
  95     my $enc;
  96     if (ref($name) && $name->can('new_sequence'))
  97     {
  98         return $name;
  99     }
 100     my $lc = lc $name;
 101     if (exists $Encoding{$name})
 102     {
 103         return $Encoding{$name};
 104     }
 105     if (exists $Encoding{$lc})
 106     {
 107         return $Encoding{$lc};
 108     }
 109
 110     my $oc = $class->find_alias($name);
 111     return $oc if defined $oc;
 112
 113     $oc = $class->find_alias($lc) if $lc ne $name;
 114     return $oc if defined $oc;
 115
 116     unless ($skip_external)
 117     {
 118         if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
 119             $mod =~ s,::,/,g ; $mod .= '.pm';
 120             eval{ require $mod; };
 121             return $Encoding{$name} if exists $Encoding{$name};
 122         }
 123     }
 124     return;
 125 }
 126
 127 sub find_encoding
 128 {
 129     my ($name,$skip_external) = @_;
 130     return __PACKAGE__->getEncoding($name,$skip_external);
 131 }
 132
 133 sub resolve_alias {
 134     my $obj = find_encoding(shift);
 135     defined $obj and return $obj->name;
 136     return;
 137 }
 138
 139 sub encode($$;$)
 140 {
 141     my ($name,$string,$check) = @_;
 142     $check ||=0;
 143     my $enc = find_encoding($name);
 144     croak("Unknown encoding '$name'") unless defined $enc;
 145     my $octets = $enc->encode($string,$check);
 146     return undef if ($check && length($string));
 147     return $octets;
 148 }
 149
 150 sub decode($$;$)
 151 {
 152     my ($name,$octets,$check) = @_;
 153     $check ||=0;
 154     my $enc = find_encoding($name);
 155     croak("Unknown encoding '$name'") unless defined $enc;
 156     my $string = $enc->decode($octets,$check);
 157     $_[1] = $octets if $check;
 158     return $string;
 159 }
 160
 161 sub from_to($$$;$)
 162 {
 163     my ($string,$from,$to,$check) = @_;
 164     $check ||=0;
 165     my $f = find_encoding($from);
 166     croak("Unknown encoding '$from'") unless defined $f;
 167     my $t = find_encoding($to);
 168     croak("Unknown encoding '$to'") unless defined $t;
 169     my $uni = $f->decode($string,$check);
 170     return undef if ($check && length($string));
 171     $string =  $t->encode($uni,$check);
 172     return undef if ($check && length($uni));
 173     return defined($_[0] = $string) ? length($string) : undef ;
 174 }
 175
 176 sub encode_utf8($)
 177 {
 178     my ($str) = @_;
 179     utf8::encode($str);
 180     return $str;
 181 }
 182
 183 sub decode_utf8($)
 184 {
 185     my ($str) = @_;
 186     return undef unless utf8::decode($str);
 187     return $str;
 188 }
 189
 190 predefine_encodings();
 191
 192 #
 193 # This is to restore %Encoding if really needed;
 194 #
 195 sub predefine_encodings{
 196     if ($ON_EBCDIC) {
 197         # was in Encode::UTF_EBCDIC
 198         package Encode::UTF_EBCDIC;
 199         *name         = sub{ shift->{'Name'} };
 200         *new_sequence = sub{ return $_[0] };
 201         *decode = sub{
 202             my ($obj,$str,$chk) = @_;
 203             my $res = '';
 204             for (my $i = 0; $i < length($str); $i++) {
 205                 $res .=
 206                     chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
 207             }
 208             $_[1] = '' if $chk;
 209             return $res;
 210         };
 211         *encode = sub{
 212             my ($obj,$str,$chk) = @_;
 213             my $res = '';
 214             for (my $i = 0; $i < length($str); $i++) {
 215                 $res .=
 216                     chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
 217             }
 218             $_[1] = '' if $chk;
 219             return $res;
 220         };
 221         $Encode::Encoding{Unicode} =
 222             bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
 223     } else {
 224         # was in Encode::UTF_EBCDIC
 225         package Encode::Internal;
 226         *name         = sub{ shift->{'Name'} };
 227         *new_sequence = sub{ return $_[0] };
 228         *decode = sub{
 229             my ($obj,$str,$chk) = @_;
 230             utf8::upgrade($str);
 231             $_[1] = '' if $chk;
 232             return $str;
 233         };
 234         *encode = \&decode;
 235         $Encode::Encoding{Unicode} =
 236             bless {Name => "Internal"} => "Encode::Internal";
 237     }
 238
 239     {
 240         # was in Encode::utf8
 241         package Encode::utf8;
 242         *name         = sub{ shift->{'Name'} };
 243         *new_sequence = sub{ return $_[0] };
 244         *decode = sub{
 245             my ($obj,$octets,$chk) = @_;
 246             my $str = Encode::decode_utf8($octets);
 247             if (defined $str) {
 248                 $_[1] = '' if $chk;
 249                 return $str;
 250             }
 251             return undef;
 252         };
 253         *encode = sub {
 254             my ($obj,$string,$chk) = @_;
 255             my $octets = Encode::encode_utf8($string);
 256             $_[1] = '' if $chk;
 257             return $octets;
 258         };
 259         $Encode::Encoding{utf8} =
 260             bless {Name => "utf8"} => "Encode::utf8";
 261     }
 262 }
 263
 264 require Encode::Encoding;
 265
 266 eval {
 267     require PerlIO::encoding;
 268     unless (PerlIO::encoding->VERSION >= 0.02){
 269         delete $INC{"PerlIO/encoding.pm"};
 270     }
 271 };
 272 # warn $@ if $@;
 273
 274 1;
 275
 276 __END__
 277
 278 =head1 NAME
 279
 280 Encode - character encodings
 281
 282 =head1 SYNOPSIS
 283
 284     use Encode;
 285
 286
 287 =head2 Table of Contents
 288
 289 Encode consists of a collection of modules which details are too big
 290 to fit in one document.  This POD itself explains the top-level APIs
 291 and general topics at a glance.  For other topics and more details,
 292 see the PODs below;
 293
 294   Name                          Description
 295   --------------------------------------------------------
 296   Encode::Alias         Alias defintions to encodings
 297   Encode::Encoding      Encode Implementation Base Class
 298   Encode::Supported     List of Supported Encodings
 299   Encode::CN            Simplified Chinese Encodings
 300   Encode::JP            Japanese Encodings
 301   Encode::KR            Korean Encodings
 302   Encode::TW            Traditional Chinese Encodings
 303   --------------------------------------------------------
 304
 305 =head1 DESCRIPTION
 306
 307 The C<Encode> module provides the interfaces between Perl's strings
 308 and the rest of the system.  Perl strings are sequences of
 309 B<characters>.
 310
 311 The repertoire of characters that Perl can represent is at least that
 312 defined by the Unicode Consortium. On most platforms the ordinal
 313 values of the characters (as returned by C<ord(ch)>) is the "Unicode
 314 codepoint" for the character (the exceptions are those platforms where
 315 the legacy encoding is some variant of EBCDIC rather than a super-set
 316 of ASCII - see L<perlebcdic>).
 317
 318 Traditionally computer data has been moved around in 8-bit chunks
 319 often called "bytes". These chunks are also known as "octets" in
 320 networking standards. Perl is widely used to manipulate data of many
 321 types - not only strings of characters representing human or computer
 322 languages but also "binary" data being the machines representation of
 323 numbers, pixels in an image - or just about anything.
 324
 325 When Perl is processing "binary data" the programmer wants Perl to
 326 process "sequences of bytes". This is not a problem for Perl - as a
 327 byte has 256 possible values it easily fits in Perl's much larger
 328 "logical character".
 329
 330 =head2 TERMINOLOGY
 331
 332 =over 4
 333
 334 =item *
 335
 336 I<character>: a character in the range 0..(2**32-1) (or more).
 337 (What Perl's strings are made of.)
 338
 339 =item *
 340
 341 I<byte>: a character in the range 0..255
 342 (A special case of a Perl character.)
 343
 344 =item *
 345
 346 I<octet>: 8 bits of data, with ordinal values 0..255
 347 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
 348
 349 =back
 350
 351 The marker [INTERNAL] marks Internal Implementation Details, in
 352 general meant only for those who think they know what they are doing,
 353 and such details may change in future releases.
 354
 355 =head1 PERL ENCODING API
 356
 357 =over 4
 358
 359 =item $octets  = encode(ENCODING, $string[, CHECK])
 360
 361 Encodes string from Perl's internal form into I<ENCODING> and returns
 362 a sequence of octets.  ENCODING can be either a canonical name or
 363 alias.  For encoding names and aliases, see L</"Defining Aliases">.
 364 For CHECK see L</"Handling Malformed Data">.
 365
 366 For example to convert (internally UTF-8 encoded) Unicode string to
 367 iso-8859-1 (also known as Latin1),
 368
 369   $octets = encode("iso-8859-1", $unicode);
 370
 371 =item $string = decode(ENCODING, $octets[, CHECK])
 372
 373 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
 374 internal form and returns the resulting string.  as in encode(),
 375 ENCODING can be either a canonical name or alias. For encoding names
 376 and aliases, see L</"Defining Aliases">.  For CHECK see
 377 L</"Handling Malformed Data">.
 378
 379 For example to convert ISO-8859-1 data to UTF-8:
 380
 381   $utf8 = decode("iso-8859-1", $latin1);
 382
 383 =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
 384
 385 Convert B<in-place> the data between two encodings.
 386 For example to convert ISO-8859-1 data to UTF-8:
 387
 388         from_to($data, "iso-8859-1", "utf-8");
 389
 390 and to convert it back:
 391
 392         from_to($data, "utf-8", "iso-8859-1");
 393
 394 Note that because the conversion happens in place, the data to be
 395 converted cannot be a string constant, it must be a scalar variable.
 396
 397 from_to() return the length of the converted string on success, undef
 398 otherwise.
 399
 400 =back
 401
 402 =head2 UTF-8 / utf8
 403
 404 The Unicode consortium defines the UTF-8 standard as a way of encoding
 405 the entire Unicode repertoire as sequences of octets.  This encoding is
 406 expected to become very widespread. Perl can use this form internally
 407 to represent strings, so conversions to and from this form are
 408 particularly efficient (as octets in memory do not have to change,
 409 just the meta-data that tells Perl how to treat them).
 410
 411 =over 4
 412
 413 =item $octets = encode_utf8($string);
 414
 415 The characters that comprise string are encoded in Perl's superset of UTF-8
 416 and the resulting octets returned as a sequence of bytes. All possible
 417 characters have a UTF-8 representation so this function cannot fail.
 418
 419 =item $string = decode_utf8($octets [, CHECK]);
 420
 421 The sequence of octets represented by $octets is decoded from UTF-8
 422 into a sequence of logical characters. Not all sequences of octets
 423 form valid UTF-8 encodings, so it is possible for this call to fail.
 424 For CHECK see L</"Handling Malformed Data">.
 425
 426 =back
 427
 428 =head2 Listing available encodings
 429
 430   use Encode;
 431   @list = Encode->encodings();
 432
 433 Returns a list of the canonical names of the available encodings that
 434 are loaded.  To get a list of all available encodings including the
 435 ones that are not loaded yet, say
 436
 437   @all_encodings = Encode->encodings(":all");
 438
 439 Or you can give the name of specific module.
 440
 441   @with_jp = Encode->encodings("Encode::JP");
 442
 443 When "::" is not in the name, "Encode::" is assumed.
 444
 445   @ebcdic = Encode->encodings("EBCDIC");
 446
 447 To find which encodings are supported by this package in details,
 448 see L<Encode::Supported>.
 449
 450 =head2 Defining Aliases
 451
 452 To add new alias to a given encoding,  Use;
 453
 454   use Encode;
 455   use Encode::Alias;
 456   define_alias(newName => ENCODING);
 457
 458 After that, newName can be used as an alias for ENCODING.
 459 ENCODING may be either the name of an encoding or an
 460 I<encoding object>
 461
 462 But before you do so, make sure the alias is nonexistent with
 463 C<resolve_alias()>, which returns the canonical name thereof.
 464 i.e.
 465
 466   Encode::resolve_alias("latin1") eq "iso-8859-1" # true
 467   Encode::resolve_alias("iso-8859-12")   # false; nonexistent
 468   Encode::resolve_alias($name) eq $name  # true if $name is canonical
 469
 470 This resolve_alias() does not need C<use Encode::Alias> and is
 471 exported via C<use encode qw(resolve_alias)>.
 472
 473 See L<Encode::Alias> on details.
 474
 475 =head1 Encoding via PerlIO
 476
 477 If your perl supports I<PerlIO>, you can use PerlIO layer to directly
 478 decode and encode via filehandle.  The following two examples are
 479 totally identical by functionality.
 480
 481   # via PerlIO
 482   open my $in,  "<:encoding(shiftjis)", $infile  or die;
 483   open my $out, ">:encoding(euc-jp)",   $outfile or die;
 484   while(<>){ print; }
 485
 486   # via from_to
 487   open my $in,  $infile  or die;
 488   open my $out, $outfile or die;
 489   while(<>){
 490     from_to($_, "shiftjis", "euc", 1);
 491   }
 492
 493 Unfortunately, not all encodings are PerlIO-savvy.  You can check if
 494 your encoding is supported by PerlIO by C<perlio_ok> method.
 495
 496   Encode::perlio_ok("iso-20220jp");        # false
 497   find_encoding("iso-2022-jp")->perlio_ok; # false
 498   use Encode qw(perlio_ok);                # exported upon request
 499   perlio_ok("euc-jp")                      # true if PerlIO is enabled
 500
 501 For gory details, see L<Encode::PerlIO>;
 502
 503 =head1 Handling Malformed Data
 504
 505 =over 4
 506
 507 THE I<CHECK> argument is used as follows.  When you omit it, it is
 508 identical to I<CHECK> = 0.
 509
 510 =item I<CHECK> = Encode::FB_DEFAULT ( == 0)
 511
 512 If I<CHECK> is 0, (en|de)code will put I<substitution character> in
 513 place of the malformed character.  for UCM-based encodings,
 514 E<lt>subcharE<gt> will be used.  For Unicode, \xFFFD is used.  If the
 515 data is supposed to be UTF-8, an optional lexical warning (category
 516 utf8) is given.
 517
 518 =item I<CHECK> = Encode::DIE_ON_ERROR (== 1)
 519
 520 If I<CHECK> is 1, methods will die immediately  with an error
 521 message.  so when I<CHECK> is set,  you should trap the fatal error
 522 with eval{} unless you really want to let it die on error.
 523
 524 =item I<CHECK> = Encode::FB_QUIET
 525
 526 If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
 527 return proccessed part on error, with data passed via argument
 528 overwritten with unproccessed part.  This is handy when have to
 529 repeatedly call because the source data is chopped in the middle for
 530 some reasons, such as fixed-width buffer.  Here is a sample code that
 531 just does this.
 532
 533   my $data = '';
 534   while(defined(read $fh, $buffer, 256)){
 535     # buffer may end in partial character so we append
 536     $data .= $buffer;
 537     $utf8 .= decode($encoding, $data, ENCODE::FB_QUIET);
 538     # $data now contains unprocessed partial character
 539   }
 540
 541 =item I<CHECK> = Encode::FB_WARN
 542
 543 This is the same as above, except it warns on error.  Handy when you
 544 are debugging the mode above.
 545
 546 =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
 547
 548 For encodings that are implemented by Encode::XS, CHECK ==
 549 Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
 550
 551 When you decode, '\xI<XX>' will be placed where I<XX> is the hex
 552 representation of the octet  that could not be decoded to utf8.  And
 553 when you encode, '\x{I<xxxx>}' will be placed where I<xxxx> is the
 554 Unicode ID of the character that cannot be found in the character
 555 repartoire of the encoding.
 556
 557 =item The bitmask
 558
 559 These modes are actually set via bitmask.  here is how FB_XX are laid
 560 out.  for FB_XX you can import via C<use Encode qw(:fallbacks)> for
 561 generic bitmask constants, you can import via
 562  C<use Encode qw(:fallback_all)>.
 563
 564                      FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 565  DIE_ON_ERR    0x0001             X
 566  WARN_ON_ER    0x0002                               X
 567  RETURN_ON_ERR 0x0004                      X        X
 568  LEAVE_SRC     0x0008
 569  PERLQQ        0x0100                                        X
 570
 571 =head2 Unemplemented fallback schemes
 572
 573 In future you will be able to use a code reference to a callback
 574 function for the value of I<CHECK> but its API is still undecided.
 575
 576 =head1 Defining Encodings
 577
 578 To define a new encoding, use:
 579
 580     use Encode qw(define_alias);
 581     define_encoding($object, 'canonicalName' [, alias...]);
 582
 583 I<canonicalName> will be associated with I<$object>.  The object
 584 should provide the interface described in L<Encode::Encoding>
 585 If more than two arguments are provided then additional
 586 arguments are taken as aliases for I<$object> as for C<define_alias>.
 587
 588 See L<Encode::Encoding> for more details.
 589
 590 =head1 Messing with Perl's Internals
 591
 592 The following API uses parts of Perl's internals in the current
 593 implementation.  As such they are efficient, but may change.
 594
 595 =over 4
 596
 597 =item is_utf8(STRING [, CHECK])
 598
 599 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 600 If CHECK is true, also checks the data in STRING for being well-formed
 601 UTF-8.  Returns true if successful, false otherwise.
 602
 603 =item _utf8_on(STRING)
 604
 605 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 606 B<not> checked for being well-formed UTF-8.  Do not use unless you
 607 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 608 state of the UTF-8 flag (so please don't test the return value as
 609 I<not> success or failure), or C<undef> if STRING is not a string.
 610
 611 =item _utf8_off(STRING)
 612
 613 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 614 Returns the previous state of the UTF-8 flag (so please don't test the
 615 return value as I<not> success or failure), or C<undef> if STRING is
 616 not a string.
 617
 618 =back
 619
 620 =head1 SEE ALSO
 621
 622 L<Encode::Encoding>,
 623 L<Encode::Supported>,
 624 L<Encode::PerlIO>,
 625 L<encoding>,
 626 L<perlebcdic>,
 627 L<perlfunc/open>,
 628 L<perlunicode>,
 629 L<utf8>,
 630 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
 631
 632 =head1 MAINTAINER
 633
 634 This project was originated by Nick Ing-Simmons and later maintained
 635 by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for full list
 636 of people involved.  For any questions, use
 637 E<lt>perl-unicode@perl.orgE<gt> so others can share.
 638
 639 =cut