ext/Encode/Encode.pm

   1 package Encode;
   2 use strict;
   3 our $VERSION = do { my @r = (q$Revision: 1.42 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   4 our $DEBUG = 0;
   5
   6 require DynaLoader;
   7 require Exporter;
   8
   9 our @ISA = qw(Exporter DynaLoader);
  10
  11 # Public, encouraged API is exported by default
  12 our @EXPORT = qw (
  13   decode
  14   decode_utf8
  15   encode
  16   encode_utf8
  17   encodings
  18   find_encoding
  19 );
  20
  21 our @EXPORT_OK =
  22     qw(
  23        _utf8_off
  24        _utf8_on
  25        define_encoding
  26        from_to
  27        is_16bit
  28        is_8bit
  29        is_utf8
  30        resolve_alias
  31        utf8_downgrade
  32        utf8_upgrade
  33       );
  34
  35 bootstrap Encode ();
  36
  37 # Documentation moved after __END__ for speed - NI-S
  38
  39 use Carp;
  40
  41 our $ON_EBCDIC = (ord("A") == 193);
  42
  43 use Encode::Alias;
  44
  45 # Make a %Encoding package variable to allow a certain amount of cheating
  46 our %Encoding;
  47 our %ExtModule;
  48 require Encode::Config;
  49 eval { require Encode::ConfigLocal };
  50
  51 sub encodings
  52 {
  53     my $class = shift;
  54     my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
  55     for my $mod (@modules){
  56         $mod =~ s,::,/,g or $mod = "Encode/$mod";
  57         $mod .= '.pm';
  58         $DEBUG and warn "about to require $mod;";
  59         eval { require $mod; };
  60     }
  61     my %modules = map {$_ => 1} @modules;
  62     return
  63         sort { lc $a cmp lc $b }
  64              grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
  65 }
  66
  67 sub define_encoding
  68 {
  69     my $obj  = shift;
  70     my $name = shift;
  71     $Encoding{$name} = $obj;
  72     my $lc = lc($name);
  73     define_alias($lc => $obj) unless $lc eq $name;
  74     while (@_)
  75     {
  76         my $alias = shift;
  77         define_alias($alias,$obj);
  78     }
  79     return $obj;
  80 }
  81
  82 sub getEncoding
  83 {
  84     my ($class,$name,$skip_external) = @_;
  85     my $enc;
  86     if (ref($name) && $name->can('new_sequence'))
  87     {
  88         return $name;
  89     }
  90     my $lc = lc $name;
  91     if (exists $Encoding{$name})
  92     {
  93         return $Encoding{$name};
  94     }
  95     if (exists $Encoding{$lc})
  96     {
  97         return $Encoding{$lc};
  98     }
  99
 100     my $oc = $class->find_alias($name);
 101     return $oc if defined $oc;
 102
 103     $oc = $class->find_alias($lc) if $lc ne $name;
 104     return $oc if defined $oc;
 105
 106     unless ($skip_external)
 107     {
 108         if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
 109             $mod =~ s,::,/,g ; $mod .= '.pm';
 110             eval{ require $mod; };
 111             return $Encoding{$name} if exists $Encoding{$name};
 112         }
 113     }
 114     return;
 115 }
 116
 117 sub find_encoding
 118 {
 119     my ($name,$skip_external) = @_;
 120     return __PACKAGE__->getEncoding($name,$skip_external);
 121 }
 122
 123 sub resolve_alias {
 124     my $obj = find_encoding(shift);
 125     defined $obj and return $obj->name;
 126     return;
 127 }
 128
 129 sub encode($$;$)
 130 {
 131     my ($name,$string,$check) = @_;
 132     $check ||=0;
 133     my $enc = find_encoding($name);
 134     croak("Unknown encoding '$name'") unless defined $enc;
 135     my $octets = $enc->encode($string,$check);
 136     return undef if ($check && length($string));
 137     return $octets;
 138 }
 139
 140 sub decode($$;$)
 141 {
 142     my ($name,$octets,$check) = @_;
 143     $check ||=0;
 144     my $enc = find_encoding($name);
 145     croak("Unknown encoding '$name'") unless defined $enc;
 146     my $string = $enc->decode($octets,$check);
 147     $_[1] = $octets if $check;
 148     return $string;
 149 }
 150
 151 sub from_to($$$;$)
 152 {
 153     my ($string,$from,$to,$check) = @_;
 154     $check ||=0;
 155     my $f = find_encoding($from);
 156     croak("Unknown encoding '$from'") unless defined $f;
 157     my $t = find_encoding($to);
 158     croak("Unknown encoding '$to'") unless defined $t;
 159     my $uni = $f->decode($string,$check);
 160     return undef if ($check && length($string));
 161     $string =  $t->encode($uni,$check);
 162     return undef if ($check && length($uni));
 163     return defined($_[0] = $string) ? length($string) : undef ;
 164 }
 165
 166 sub encode_utf8($)
 167 {
 168     my ($str) = @_;
 169     utf8::encode($str);
 170     return $str;
 171 }
 172
 173 sub decode_utf8($)
 174 {
 175     my ($str) = @_;
 176     return undef unless utf8::decode($str);
 177     return $str;
 178 }
 179
 180 predefine_encodings();
 181
 182 #
 183 # This is to restore %Encoding if really needed;
 184 #
 185 sub predefine_encodings{
 186     if ($ON_EBCDIC) {
 187         # was in Encode::UTF_EBCDIC
 188         package Encode::UTF_EBCDIC;
 189         *name         = sub{ shift->{'Name'} };
 190         *new_sequence = sub{ return $_[0] };
 191         *decode = sub{
 192             my ($obj,$str,$chk) = @_;
 193             my $res = '';
 194             for (my $i = 0; $i < length($str); $i++) {
 195                 $res .=
 196                     chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
 197             }
 198             $_[1] = '' if $chk;
 199             return $res;
 200         };
 201         *encode = sub{
 202             my ($obj,$str,$chk) = @_;
 203             my $res = '';
 204             for (my $i = 0; $i < length($str); $i++) {
 205                 $res .=
 206                     chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
 207             }
 208             $_[1] = '' if $chk;
 209             return $res;
 210         };
 211         $Encode::Encoding{Unicode} =
 212             bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
 213     } else {
 214         # was in Encode::UTF_EBCDIC
 215         package Encode::Internal;
 216         *name         = sub{ shift->{'Name'} };
 217         *new_sequence = sub{ return $_[0] };
 218         *decode = sub{
 219             my ($obj,$str,$chk) = @_;
 220             utf8::upgrade($str);
 221             $_[1] = '' if $chk;
 222             return $str;
 223         };
 224         *encode = \&decode;
 225         $Encode::Encoding{Unicode} =
 226             bless {Name => "Internal"} => "Encode::Internal";
 227     }
 228
 229     {
 230         # was in Encode::utf8
 231         package Encode::utf8;
 232         *name         = sub{ shift->{'Name'} };
 233         *new_sequence = sub{ return $_[0] };
 234         *decode = sub{
 235             my ($obj,$octets,$chk) = @_;
 236             my $str = Encode::decode_utf8($octets);
 237             if (defined $str) {
 238                 $_[1] = '' if $chk;
 239                 return $str;
 240             }
 241             return undef;
 242         };
 243         *encode = sub {
 244             my ($obj,$string,$chk) = @_;
 245             my $octets = Encode::encode_utf8($string);
 246             $_[1] = '' if $chk;
 247             return $octets;
 248         };
 249         $Encode::Encoding{utf8} =
 250             bless {Name => "utf8"} => "Encode::utf8";
 251     }
 252 }
 253
 254 require Encode::Encoding;
 255
 256 eval { require PerlIO::encoding };
 257
 258 1;
 259
 260 __END__
 261
 262 =head1 NAME
 263
 264 Encode - character encodings
 265
 266 =head1 SYNOPSIS
 267
 268     use Encode;
 269
 270
 271 =head2 Table of Contents
 272
 273 Encode consists of a collection of modules which details are too big
 274 to fit in one document.  This POD itself explains the top-level APIs
 275 and general topics at a glance.  For other topics and more details,
 276 see the PODs below;
 277
 278   Name                          Description
 279   --------------------------------------------------------
 280   Encode::Alias         Alias defintions to encodings
 281   Encode::Encoding      Encode Implementation Base Class
 282   Encode::Supported     List of Supported Encodings
 283   Encode::CN            Simplified Chinese Encodings
 284   Encode::JP            Japanese Encodings
 285   Encode::KR            Korean Encodings
 286   Encode::TW            Traditional Chinese Encodings
 287   --------------------------------------------------------
 288
 289 =head1 DESCRIPTION
 290
 291 The C<Encode> module provides the interfaces between Perl's strings
 292 and the rest of the system.  Perl strings are sequences of
 293 B<characters>.
 294
 295 The repertoire of characters that Perl can represent is at least that
 296 defined by the Unicode Consortium. On most platforms the ordinal
 297 values of the characters (as returned by C<ord(ch)>) is the "Unicode
 298 codepoint" for the character (the exceptions are those platforms where
 299 the legacy encoding is some variant of EBCDIC rather than a super-set
 300 of ASCII - see L<perlebcdic>).
 301
 302 Traditionally computer data has been moved around in 8-bit chunks
 303 often called "bytes". These chunks are also known as "octets" in
 304 networking standards. Perl is widely used to manipulate data of many
 305 types - not only strings of characters representing human or computer
 306 languages but also "binary" data being the machines representation of
 307 numbers, pixels in an image - or just about anything.
 308
 309 When Perl is processing "binary data" the programmer wants Perl to
 310 process "sequences of bytes". This is not a problem for Perl - as a
 311 byte has 256 possible values it easily fits in Perl's much larger
 312 "logical character".
 313
 314 =head2 TERMINOLOGY
 315
 316 =over 4
 317
 318 =item *
 319
 320 I<character>: a character in the range 0..(2**32-1) (or more).
 321 (What Perl's strings are made of.)
 322
 323 =item *
 324
 325 I<byte>: a character in the range 0..255
 326 (A special case of a Perl character.)
 327
 328 =item *
 329
 330 I<octet>: 8 bits of data, with ordinal values 0..255
 331 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
 332
 333 =back
 334
 335 The marker [INTERNAL] marks Internal Implementation Details, in
 336 general meant only for those who think they know what they are doing,
 337 and such details may change in future releases.
 338
 339 =head1 PERL ENCODING API
 340
 341 =over 4
 342
 343 =item $octets  = encode(ENCODING, $string[, CHECK])
 344
 345 Encodes string from Perl's internal form into I<ENCODING> and returns
 346 a sequence of octets.  ENCODING can be either a canonical name or
 347 alias.  For encoding names and aliases, see L</"Defining Aliases">.
 348 For CHECK see L</"Handling Malformed Data">.
 349
 350 For example to convert (internally UTF-8 encoded) Unicode string to
 351 iso-8859-1 (also known as Latin1),
 352
 353   $octets = encode("iso-8859-1", $unicode);
 354
 355 =item $string = decode(ENCODING, $octets[, CHECK])
 356
 357 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
 358 internal form and returns the resulting string.  as in encode(),
 359 ENCODING can be either a canonical name or alias. For encoding names
 360 and aliases, see L</"Defining Aliases">.  For CHECK see
 361 L</"Handling Malformed Data">.
 362
 363 For example to convert ISO-8859-1 data to UTF-8:
 364
 365   $utf8 = decode("iso-8859-1", $latin1);
 366
 367 =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
 368
 369 Convert B<in-place> the data between two encodings.  How did the data
 370 in $string originally get to be in FROM_ENCODING?  Either using
 371 encode() or through PerlIO: See L</"Encoding and IO">.
 372 For encoding names and aliases, see L</"Defining Aliases">.
 373 For CHECK see L</"Handling Malformed Data">.
 374
 375 For example to convert ISO-8859-1 data to UTF-8:
 376
 377         from_to($data, "iso-8859-1", "utf-8");
 378
 379 and to convert it back:
 380
 381         from_to($data, "utf-8", "iso-8859-1");
 382
 383 Note that because the conversion happens in place, the data to be
 384 converted cannot be a string constant, it must be a scalar variable.
 385
 386 from_to() return the length of the converted string on success, undef
 387 otherwise.
 388
 389 =back
 390
 391 =head2 UTF-8 / utf8
 392
 393 The Unicode consortium defines the UTF-8 standard as a way of encoding
 394 the entire Unicode repertoire as sequences of octets.  This encoding is
 395 expected to become very widespread. Perl can use this form internally
 396 to represent strings, so conversions to and from this form are
 397 particularly efficient (as octets in memory do not have to change,
 398 just the meta-data that tells Perl how to treat them).
 399
 400 =over 4
 401
 402 =item $octets = encode_utf8($string);
 403
 404 The characters that comprise string are encoded in Perl's superset of UTF-8
 405 and the resulting octets returned as a sequence of bytes. All possible
 406 characters have a UTF-8 representation so this function cannot fail.
 407
 408 =item $string = decode_utf8($octets [, CHECK]);
 409
 410 The sequence of octets represented by $octets is decoded from UTF-8
 411 into a sequence of logical characters. Not all sequences of octets
 412 form valid UTF-8 encodings, so it is possible for this call to fail.
 413 For CHECK see L</"Handling Malformed Data">.
 414
 415 =back
 416
 417 =head2 Listing available encodings
 418
 419   use Encode;
 420   @list = Encode->encodings();
 421
 422 Returns a list of the canonical names of the available encodings that
 423 are loaded.  To get a list of all available encodings including the
 424 ones that are not loaded yet, say
 425
 426   @all_encodings = Encode->encodings(":all");
 427
 428 Or you can give the name of specific module.
 429
 430   @with_jp = Encode->encodings("Encode::JP");
 431
 432 When "::" is not in the name, "Encode::" is assumed.
 433
 434   @ebcdic = Encode->encodings("EBCDIC");
 435
 436 To find which encodings are supported by this package in details,
 437 see L<Encode::Supported>.
 438
 439 =head2 Defining Aliases
 440
 441 To add new alias to a given encoding,  Use;
 442
 443   use Encode;
 444   use Encode::Alias;
 445   define_alias(newName => ENCODING);
 446
 447 After that, newName can be used as an alias for ENCODING.
 448 ENCODING may be either the name of an encoding or an
 449 I<encoding object>
 450
 451 But before you do so, make sure the alias is nonexistent with
 452 C<resolve_alias()>, which returns the canonical name thereof.
 453 i.e.
 454
 455   Encode::resolve_alias("latin1") eq "iso-8859-1" # true
 456   Encode::resolve_alias("iso-8859-12")   # false; nonexistent
 457   Encode::resolve_alias($name) eq $name  # true if $name is canonical
 458
 459 This resolve_alias() does not need C<use Encode::Alias> and is
 460 exported via C<use encode qw(resolve_alias)>.
 461
 462 See L<Encode::Alias> on details.
 463
 464 =head1 Encoding and IO
 465
 466 It is very common to want to do encoding transformations when
 467 reading or writing files, network connections, pipes etc.
 468 If Perl is configured to use the new 'perlio' IO system then
 469 C<Encode> provides a "layer" (See L<perliol>) which can transform
 470 data as it is read or written.
 471
 472 Here is how the blind poet would modernise the encoding:
 473
 474     use Encode;
 475     open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
 476     open(my $utf8,'>:utf8','iliad.utf8');
 477     my @epic = <$iliad>;
 478     print $utf8 @epic;
 479     close($utf8);
 480     close($illiad);
 481
 482 In addition the new IO system can also be configured to read/write
 483 UTF-8 encoded characters (as noted above this is efficient):
 484
 485     open(my $fh,'>:utf8','anything');
 486     print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
 487
 488 Either of the above forms of "layer" specifications can be made the default
 489 for a lexical scope with the C<use open ...> pragma. See L<open>.
 490
 491 Once a handle is open is layers can be altered using C<binmode>.
 492
 493 Without any such configuration, or if Perl itself is built using
 494 system's own IO, then write operations assume that file handle accepts
 495 only I<bytes> and will C<die> if a character larger than 255 is
 496 written to the handle. When reading, each octet from the handle
 497 becomes a byte-in-a-character. Note that this default is the same
 498 behaviour as bytes-only languages (including Perl before v5.6) would
 499 have, and is sufficient to handle native 8-bit encodings
 500 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
 501 other encodings and binary data.
 502
 503 In other cases it is the programs responsibility to transform
 504 characters into bytes using the API above before doing writes, and to
 505 transform the bytes read from a handle into characters before doing
 506 "character operations" (e.g. C<lc>, C</\W+/>, ...).
 507
 508 You can also use PerlIO to convert larger amounts of data you don't
 509 want to bring into memory.  For example to convert between ISO-8859-1
 510 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
 511
 512     open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
 513     open(G, ">:utf8",                 "data.utf") or die $!;
 514     while (<F>) { print G }
 515
 516     # Could also do "print G <F>" but that would pull
 517     # the whole file into memory just to write it out again.
 518
 519 More examples:
 520
 521     open(my $f, "<:encoding(cp1252)")
 522     open(my $g, ">:encoding(iso-8859-2)")
 523     open(my $h, ">:encoding(latin9)")       # iso-8859-15
 524
 525 See L<PerlIO> for more information.
 526
 527 See also L<encoding> for how to change the default encoding of the
 528 data in your script.
 529
 530 =head1 Handling Malformed Data
 531
 532 If I<CHECK> is not set, (en|de)code will put I<substitution character> in
 533 place of the malformed character.  for UCM-based encodings,
 534 E<lt>subcharE<gt> will be used.  For Unicode, \xFFFD is used.  If the
 535 data is supposed to be UTF-8, an optional lexical warning (category
 536 utf8) is given.
 537
 538 If I<CHECK> is true but not a code reference, dies with an error message.
 539
 540 In future you will be able to use a code reference to a callback
 541 function for the value of I<CHECK> but its API is still undecided.
 542
 543 =head1 Defining Encodings
 544
 545 To define a new encoding, use:
 546
 547     use Encode qw(define_alias);
 548     define_encoding($object, 'canonicalName' [, alias...]);
 549
 550 I<canonicalName> will be associated with I<$object>.  The object
 551 should provide the interface described in L<Encode::Encoding>
 552 If more than two arguments are provided then additional
 553 arguments are taken as aliases for I<$object> as for C<define_alias>.
 554
 555 See L<Encode::Encoding> for more details.
 556
 557 =head1 Messing with Perl's Internals
 558
 559 The following API uses parts of Perl's internals in the current
 560 implementation.  As such they are efficient, but may change.
 561
 562 =over 4
 563
 564 =item is_utf8(STRING [, CHECK])
 565
 566 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 567 If CHECK is true, also checks the data in STRING for being well-formed
 568 UTF-8.  Returns true if successful, false otherwise.
 569
 570 =item _utf8_on(STRING)
 571
 572 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 573 B<not> checked for being well-formed UTF-8.  Do not use unless you
 574 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 575 state of the UTF-8 flag (so please don't test the return value as
 576 I<not> success or failure), or C<undef> if STRING is not a string.
 577
 578 =item _utf8_off(STRING)
 579
 580 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 581 Returns the previous state of the UTF-8 flag (so please don't test the
 582 return value as I<not> success or failure), or C<undef> if STRING is
 583 not a string.
 584
 585 =back
 586
 587 =head1 SEE ALSO
 588
 589 L<Encode::Encoding>,
 590 L<Encode::Supported>,
 591 L<PerlIO>,
 592 L<encoding>,
 593 L<perlebcdic>,
 594 L<perlfunc/open>,
 595 L<perlunicode>,
 596 L<utf8>,
 597 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
 598
 599 head2 MAINTAINER
 600
 601 This project was originated by Nick Ing-Simmons and later maintained
 602 by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for full list
 603 of people involved.  For any questions, use
 604 E<lt>perl-unicode@perl.orgE<gt> so others can share.
 605
 606 =cut