ext/Encode/Encode.pm

   1 #
   2 # $Id: Encode.pm,v 2.9 2004/12/03 19:16:40 dankogai Exp $
   3 #
   4 package Encode;
   5 use strict;
   6 # our $VERSION = do { my @r = (q$Revision: 2.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   7 our $VERSION = '2.0902_01';
   8 sub DEBUG () { 0 }
   9 use XSLoader ();
  10 XSLoader::load(__PACKAGE__, $VERSION);
  11
  12 require Exporter;
  13 use base qw/Exporter/;
  14
  15 # Public, encouraged API is exported by default
  16
  17 our @EXPORT = qw(
  18   decode  decode_utf8  encode  encode_utf8
  19   encodings  find_encoding clone_encoding
  20 );
  21
  22 our @FB_FLAGS  = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  23                     PERLQQ HTMLCREF XMLCREF);
  24 our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  25                     FB_PERLQQ FB_HTMLCREF FB_XMLCREF);
  26
  27 our @EXPORT_OK =
  28     (
  29      qw(
  30        _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
  31        is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
  32       ),
  33      @FB_FLAGS, @FB_CONSTS,
  34     );
  35
  36 our %EXPORT_TAGS =
  37     (
  38      all          =>  [ @EXPORT, @EXPORT_OK ],
  39      fallbacks    =>  [ @FB_CONSTS ],
  40      fallback_all =>  [ @FB_CONSTS, @FB_FLAGS ],
  41     );
  42
  43 # Documentation moved after __END__ for speed - NI-S
  44
  45 our $ON_EBCDIC = (ord("A") == 193);
  46
  47 use Encode::Alias;
  48
  49 # Make a %Encoding package variable to allow a certain amount of cheating
  50 our %Encoding;
  51 our %ExtModule;
  52 require Encode::Config;
  53 eval { require Encode::ConfigLocal };
  54
  55 sub encodings
  56 {
  57     my $class = shift;
  58     my %enc;
  59     if (@_ and $_[0] eq ":all"){
  60         %enc = ( %Encoding, %ExtModule );
  61     }else{
  62         %enc = %Encoding;
  63         for my $mod (map {m/::/o ? $_ : "Encode::$_" } @_){
  64             DEBUG and warn $mod;
  65             for my $enc (keys %ExtModule){
  66                 $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
  67             }
  68         }
  69     }
  70     return
  71         sort { lc $a cmp lc $b }
  72              grep {!/^(?:Internal|Unicode|Guess)$/o} keys %enc;
  73 }
  74
  75 sub perlio_ok{
  76     my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);
  77     $obj->can("perlio_ok") and return $obj->perlio_ok();
  78     return 0; # safety net
  79 }
  80
  81 sub define_encoding
  82 {
  83     my $obj  = shift;
  84     my $name = shift;
  85     $Encoding{$name} = $obj;
  86     my $lc = lc($name);
  87     define_alias($lc => $obj) unless $lc eq $name;
  88     while (@_){
  89         my $alias = shift;
  90         define_alias($alias, $obj);
  91     }
  92     return $obj;
  93 }
  94
  95 sub getEncoding
  96 {
  97     my ($class, $name, $skip_external) = @_;
  98
  99     ref($name) && $name->can('renew') and return $name;
 100     exists $Encoding{$name} and return $Encoding{$name};
 101     my $lc = lc $name;
 102     exists $Encoding{$lc} and return $Encoding{$lc};
 103
 104     my $oc = $class->find_alias($name);
 105     defined($oc) and return $oc;
 106     $lc ne $name and $oc = $class->find_alias($lc);
 107     defined($oc) and return $oc;
 108
 109     unless ($skip_external)
 110     {
 111         if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
 112             $mod =~ s,::,/,g ; $mod .= '.pm';
 113             eval{ require $mod; };
 114             exists $Encoding{$name} and return $Encoding{$name};
 115         }
 116     }
 117     return;
 118 }
 119
 120 sub find_encoding($;$)
 121 {
 122     my ($name, $skip_external) = @_;
 123     return __PACKAGE__->getEncoding($name,$skip_external);
 124 }
 125
 126 sub resolve_alias($){
 127     my $obj = find_encoding(shift);
 128     defined $obj and return $obj->name;
 129     return;
 130 }
 131
 132 sub clone_encoding($){
 133     my $obj = find_encoding(shift);
 134     ref $obj or return;
 135     eval { require Storable };
 136     $@ and return;
 137     return Storable::dclone($obj);
 138 }
 139
 140 sub encode($$;$)
 141 {
 142     my ($name, $string, $check) = @_;
 143     return undef unless defined $string;
 144     $string .= '' if ref $string; # stringify;
 145     $check ||=0;
 146     my $enc = find_encoding($name);
 147     unless(defined $enc){
 148         require Carp;
 149         Carp::croak("Unknown encoding '$name'");
 150     }
 151     my $octets = $enc->encode($string,$check);
 152     $_[1] = $string if $check and !($check & LEAVE_SRC());
 153     return $octets;
 154 }
 155
 156 sub decode($$;$)
 157 {
 158     my ($name,$octets,$check) = @_;
 159     return undef unless defined $octets;
 160     $octets .= '' if ref $octets;
 161     $check ||=0;
 162     my $enc = find_encoding($name);
 163     unless(defined $enc){
 164         require Carp;
 165         Carp::croak("Unknown encoding '$name'");
 166     }
 167     my $string = $enc->decode($octets,$check);
 168     $_[1] = $octets if $check and !($check & LEAVE_SRC());
 169     return $string;
 170 }
 171
 172 sub from_to($$$;$)
 173 {
 174     my ($string,$from,$to,$check) = @_;
 175     return undef unless defined $string;
 176     $check ||=0;
 177     my $f = find_encoding($from);
 178     unless (defined $f){
 179         require Carp;
 180         Carp::croak("Unknown encoding '$from'");
 181     }
 182     my $t = find_encoding($to);
 183     unless (defined $t){
 184         require Carp;
 185         Carp::croak("Unknown encoding '$to'");
 186     }
 187     my $uni = $f->decode($string,$check);
 188     return undef if ($check && length($string));
 189     $string =  $t->encode($uni,$check);
 190     return undef if ($check && length($uni));
 191     return defined($_[0] = $string) ? length($string) : undef ;
 192 }
 193
 194 sub encode_utf8($)
 195 {
 196     my ($str) = @_;
 197     utf8::encode($str);
 198     return $str;
 199 }
 200
 201 sub decode_utf8($;$)
 202 {
 203     my ($str, $check) = @_;
 204     if ($check){
 205         return decode("utf8", $str, $check);
 206     }else{
 207         return undef unless utf8::decode($str);
 208         return $str;
 209     }
 210 }
 211
 212 predefine_encodings(1);
 213
 214 #
 215 # This is to restore %Encoding if really needed;
 216 #
 217
 218 sub predefine_encodings{
 219     use Encode::Encoding;
 220     no warnings 'redefine';
 221     my $use_xs = shift;
 222     if ($ON_EBCDIC) {
 223         # was in Encode::UTF_EBCDIC
 224         package Encode::UTF_EBCDIC;
 225         push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
 226         *decode = sub{
 227             my ($obj,$str,$chk) = @_;
 228             my $res = '';
 229             for (my $i = 0; $i < length($str); $i++) {
 230                 $res .=
 231                     chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
 232             }
 233             $_[1] = '' if $chk;
 234             return $res;
 235         };
 236         *encode = sub{
 237             my ($obj,$str,$chk) = @_;
 238             my $res = '';
 239             for (my $i = 0; $i < length($str); $i++) {
 240                 $res .=
 241                     chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
 242             }
 243             $_[1] = '' if $chk;
 244             return $res;
 245         };
 246         $Encode::Encoding{Unicode} =
 247             bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
 248     } else {
 249         package Encode::Internal;
 250         push @Encode::Internal::ISA, 'Encode::Encoding';
 251         *decode = sub{
 252             my ($obj,$str,$chk) = @_;
 253             utf8::upgrade($str);
 254             $_[1] = '' if $chk;
 255             return $str;
 256         };
 257         *encode = \&decode;
 258         $Encode::Encoding{Unicode} =
 259             bless {Name => "Internal"} => "Encode::Internal";
 260     }
 261
 262     {
 263         # was in Encode::utf8
 264         package Encode::utf8;
 265         push @Encode::utf8::ISA, 'Encode::Encoding';
 266         #
 267         if ($use_xs){
 268             Encode::DEBUG and warn __PACKAGE__, " XS on";
 269             *decode = \&decode_xs;
 270             *encode = \&encode_xs;
 271         }else{
 272             Encode::DEBUG and warn __PACKAGE__, " XS off";
 273             *decode = sub{
 274                 my ($obj,$octets,$chk) = @_;
 275                 my $str = Encode::decode_utf8($octets);
 276                 if (defined $str) {
 277                     $_[1] = '' if $chk;
 278                     return $str;
 279                 }
 280                 return undef;
 281             };
 282             *encode = sub {
 283                 my ($obj,$string,$chk) = @_;
 284                 my $octets = Encode::encode_utf8($string);
 285                 $_[1] = '' if $chk;
 286                 return $octets;
 287             };
 288         }
 289         *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
 290             my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
 291             my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
 292             use bytes;
 293             if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) {
 294                 $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm));
 295                 $$rpos = $npos + length($trm);
 296                 return 1;
 297             }
 298             $$rdst .= substr($$rsrc, $pos);
 299             $$rpos = length($$rsrc);
 300             return '';
 301         };
 302         $Encode::Encoding{utf8} =
 303             bless {Name => "utf8"} => "Encode::utf8";
 304         $Encode::Encoding{"utf-8-strict"} =
 305             bless {Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8";
 306     }
 307 }
 308
 309 1;
 310
 311 __END__
 312
 313 =head1 NAME
 314
 315 Encode - character encodings
 316
 317 =head1 SYNOPSIS
 318
 319     use Encode;
 320
 321 =head2 Table of Contents
 322
 323 Encode consists of a collection of modules whose details are too big
 324 to fit in one document.  This POD itself explains the top-level APIs
 325 and general topics at a glance.  For other topics and more details,
 326 see the PODs below:
 327
 328   Name                          Description
 329   --------------------------------------------------------
 330   Encode::Alias         Alias definitions to encodings
 331   Encode::Encoding      Encode Implementation Base Class
 332   Encode::Supported     List of Supported Encodings
 333   Encode::CN            Simplified Chinese Encodings
 334   Encode::JP            Japanese Encodings
 335   Encode::KR            Korean Encodings
 336   Encode::TW            Traditional Chinese Encodings
 337   --------------------------------------------------------
 338
 339 =head1 DESCRIPTION
 340
 341 The C<Encode> module provides the interfaces between Perl's strings
 342 and the rest of the system.  Perl strings are sequences of
 343 B<characters>.
 344
 345 The repertoire of characters that Perl can represent is at least that
 346 defined by the Unicode Consortium. On most platforms the ordinal
  347 value of a character (as returned by C<ord(ch)>) is the "Unicode
 348 codepoint" for the character (the exceptions are those platforms where
 349 the legacy encoding is some variant of EBCDIC rather than a super-set
 350 of ASCII - see L<perlebcdic>).
 351
 352 Traditionally, computer data has been moved around in 8-bit chunks
 353 often called "bytes". These chunks are also known as "octets" in
 354 networking standards. Perl is widely used to manipulate data of many
 355 types - not only strings of characters representing human or computer
 356 languages but also "binary" data being the machine's representation of
 357 numbers, pixels in an image - or just about anything.
 358
 359 When Perl is processing "binary data", the programmer wants Perl to
 360 process "sequences of bytes". This is not a problem for Perl - as a
 361 byte has 256 possible values, it easily fits in Perl's much larger
 362 "logical character".
 363
 364 =head2 TERMINOLOGY
 365
 366 =over 2
 367
 368 =item *
 369
 370 I<character>: a character in the range 0..(2**32-1) (or more).
 371 (What Perl's strings are made of.)
 372
 373 =item *
 374
 375 I<byte>: a character in the range 0..255
 376 (A special case of a Perl character.)
 377
 378 =item *
 379
 380 I<octet>: 8 bits of data, with ordinal values 0..255
 381 (Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
 382
 383 =back
 384
 385 =head1 PERL ENCODING API
 386
 387 =over 2
 388
 389 =item $octets  = encode(ENCODING, $string [, CHECK])
 390
 391 Encodes a string from Perl's internal form into I<ENCODING> and returns
 392 a sequence of octets.  ENCODING can be either a canonical name or
 393 an alias.  For encoding names and aliases, see L</"Defining Aliases">.
 394 For CHECK, see L</"Handling Malformed Data">.
 395
 396 For example, to convert a string from Perl's internal format to
 397 iso-8859-1 (also known as Latin1),
 398
 399   $octets = encode("iso-8859-1", $string);
 400
 401 B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
 402 B<may not be equal to> $string.  Though they both contain the same data, the utf8 flag
  403 for $octets is B<always> off.  When you encode anything, the utf8 flag
  404 of the result is always off, even when it contains a completely valid
  405 utf8 string. See L</"The UTF-8 flag"> below.
 406
 407 If the $string is C<undef> then C<undef> is returned.
 408
 409 =item $string = decode(ENCODING, $octets [, CHECK])
 410
 411 Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
 412 internal form and returns the resulting string.  As in encode(),
 413 ENCODING can be either a canonical name or an alias. For encoding names
 414 and aliases, see L</"Defining Aliases">.  For CHECK, see
 415 L</"Handling Malformed Data">.
 416
 417 For example, to convert ISO-8859-1 data to a string in Perl's internal format:
 418
 419   $string = decode("iso-8859-1", $octets);
 420
 421 B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
 422 B<may not be equal to> $octets.  Though they both contain the same data,
 423 the utf8 flag for $string is on unless $octets entirely consists of
 424 ASCII data (or EBCDIC on EBCDIC machines).  See L</"The UTF-8 flag">
 425 below.
 426
  427 If $octets is C<undef> then C<undef> is returned.
 428
 429 =item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 430
 431 Converts B<in-place> data between two encodings. The data in $octets
 432 must be encoded as octets and not as characters in Perl's internal
 433 format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
 434 encoding:
 435
 436   from_to($octets, "iso-8859-1", "cp1250");
 437
 438 and to convert it back:
 439
 440   from_to($octets, "cp1250", "iso-8859-1");
 441
 442 Note that because the conversion happens in place, the data to be
 443 converted cannot be a string constant; it must be a scalar variable.
 444
 445 from_to() returns the length of the converted string in octets on
 446 success, I<undef> on error.
 447
 448 B<CAVEAT>: The following operations look the same but are not quite so;
 449
 450   from_to($data, "iso-8859-1", "utf8"); #1
 451   $data = decode("iso-8859-1", $data);  #2
 452
 453 Both #1 and #2 make $data consist of a completely valid UTF-8 string
 454 but only #2 turns utf8 flag on.  #1 is equivalent to
 455
 456   $data = encode("utf8", decode("iso-8859-1", $data));
 457
 458 See L</"The UTF-8 flag"> below.
 459
 460 =item $octets = encode_utf8($string);
 461
 462 Equivalent to C<$octets = encode("utf8", $string);> The characters
 463 that comprise $string are encoded in Perl's internal format and the
 464 result is returned as a sequence of octets. All possible
 465 characters have a UTF-8 representation so this function cannot fail.
 466
 467
 468 =item $string = decode_utf8($octets [, CHECK]);
 469
  470 Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
 471 The sequence of octets represented by
 472 $octets is decoded from UTF-8 into a sequence of logical
 473 characters. Not all sequences of octets form valid UTF-8 encodings, so
 474 it is possible for this call to fail.  For CHECK, see
 475 L</"Handling Malformed Data">.
 476
 477 =back
 478
 479 =head2 Listing available encodings
 480
 481   use Encode;
 482   @list = Encode->encodings();
 483
 484 Returns a list of the canonical names of the available encodings that
 485 are loaded.  To get a list of all available encodings including the
 486 ones that are not loaded yet, say
 487
 488   @all_encodings = Encode->encodings(":all");
 489
 490 Or you can give the name of a specific module.
 491
 492   @with_jp = Encode->encodings("Encode::JP");
 493
 494 When "::" is not in the name, "Encode::" is assumed.
 495
 496   @ebcdic = Encode->encodings("EBCDIC");
 497
 498 To find out in detail which encodings are supported by this package,
 499 see L<Encode::Supported>.
 500
 501 =head2 Defining Aliases
 502
 503 To add a new alias to a given encoding, use:
 504
 505   use Encode;
 506   use Encode::Alias;
 507   define_alias(newName => ENCODING);
 508
 509 After that, newName can be used as an alias for ENCODING.
 510 ENCODING may be either the name of an encoding or an
  511 I<encoding object>.
 512
 513 But before you do so, make sure the alias is nonexistent with
 514 C<resolve_alias()>, which returns the canonical name thereof.
 515 i.e.
 516
 517   Encode::resolve_alias("latin1") eq "iso-8859-1" # true
 518   Encode::resolve_alias("iso-8859-12")   # false; nonexistent
 519   Encode::resolve_alias($name) eq $name  # true if $name is canonical
 520
 521 resolve_alias() does not need C<use Encode::Alias>; it can be
 522 exported via C<use Encode qw(resolve_alias)>.
 523
 524 See L<Encode::Alias> for details.
 525
 526 =head1 Encoding via PerlIO
 527
 528 If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
 529 and encode directly via a filehandle.  The following two examples
 530 are totally identical in their functionality.
 531
 532   # via PerlIO
 533   open my $in,  "<:encoding(shiftjis)", $infile  or die;
 534   open my $out, ">:encoding(euc-jp)",   $outfile or die;
 535   while(<$in>){ print $out $_; }
 536
 537   # via from_to
 538   open my $in,  "<", $infile  or die;
 539   open my $out, ">", $outfile or die;
 540   while(<$in>){
 541     from_to($_, "shiftjis", "euc-jp", 1);
 542     print $out $_;
 543   }
 544
  545 Unfortunately, not all encodings are PerlIO-savvy.  You can check
 546 if your encoding is supported by PerlIO by calling the C<perlio_ok>
 547 method.
 548
 549   Encode::perlio_ok("hz");             # False
 550   find_encoding("euc-cn")->perlio_ok;  # True where PerlIO is available
 551
 552   use Encode qw(perlio_ok);            # exported upon request
 553   perlio_ok("euc-jp")
 554
 555 Fortunately, all encodings that come with Encode core are PerlIO-savvy
 556 except for hz and ISO-2022-kr.  For gory details, see
 557 L<Encode::Encoding> and L<Encode::PerlIO>.
 558
 559 =head1 Handling Malformed Data
 560
 561 The optional I<CHECK> argument is used as follows.  When you omit it,
 562 Encode::FB_DEFAULT ( == 0 ) is assumed.
 563
 564 =over 2
 565
  566 =item B<NOTE:> Not all encodings support this feature
 567
  568 Some encodings ignore the I<CHECK> argument.  For example,
  569 L<Encode::Unicode> ignores I<CHECK> and always croaks on error.
 570
 571 =back
 572
  573 Here is the list of available I<CHECK> values:
 574
 575 =over 2
 576
 577 =item I<CHECK> = Encode::FB_DEFAULT ( == 0)
 578
 579 If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
 580 place of a malformed character.  When you encode, E<lt>subcharE<gt>
 581 will be used.  When you decode the code point C<0xFFFD> is used.  If
 582 the data is supposed to be UTF-8, an optional lexical warning
 583 (category utf8) is given.
 584
 585 =item I<CHECK> = Encode::FB_CROAK ( == 1)
 586
 587 If I<CHECK> is 1, methods will die on error immediately with an error
 588 message.  Therefore, when I<CHECK> is set to 1,  you should trap the
 589 error with eval{} unless you really want to let it die.
 590
 591 =item I<CHECK> = Encode::FB_QUIET
 592
 593 If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
 594 return the portion of the data that has been processed so far when an
 595 error occurs. The data argument will be overwritten with everything
 596 after that point (that is, the unprocessed part of data).  This is
 597 handy when you have to call decode repeatedly in the case where your
 598 source data may contain partial multi-byte character sequences,
 599 (i.e. you are reading with a fixed-width buffer). Here is a sample
 600 code that does exactly this:
 601
 602   my $buffer = ''; my $string = '';
 603   while(read $fh, $buffer, 256, length($buffer)){
 604     $string .= decode($encoding, $buffer, Encode::FB_QUIET);
 605     # $buffer now contains the unprocessed partial character
 606   }
 607
 608 =item I<CHECK> = Encode::FB_WARN
 609
 610 This is the same as above, except that it warns on error.  Handy when
 611 you are debugging the mode above.
 612
 613 =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
 614
 615 =item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
 616
 617 =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
 618
 619 For encodings that are implemented by Encode::XS, CHECK ==
 620 Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
 621
 622 When you decode, C<\xI<HH>> will be inserted for a malformed character,
 623 where I<HH> is the hex representation of the octet  that could not be
 624 decoded to utf8.  And when you encode, C<\x{I<HHHH>}> will be inserted,
 625 where I<HHHH> is the Unicode ID of the character that cannot be found
 626 in the character repertoire of the encoding.
 627
 628 HTML/XML character reference modes are about the same, in place of
 629 C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
 630 XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
 631
 632 In Encode 2.10 or later, C<LEAVE_SRC> is also implied.
 633
 634 =item The bitmask
 635
 636 These modes are actually set via a bitmask.  Here is how the FB_XX
 637 constants are laid out.  You can import the FB_XX constants via
 638 C<use Encode qw(:fallbacks)>; you can import the generic bitmask
 639 constants via C<use Encode qw(:fallback_all)>.
 640
 641                      FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 642  DIE_ON_ERR    0x0001             X
 643  WARN_ON_ERR   0x0002                               X
 644  RETURN_ON_ERR 0x0004                      X        X
 645  LEAVE_SRC     0x0008                                        X
 646  PERLQQ        0x0100                                        X
 647  HTMLCREF      0x0200
 648  XMLCREF       0x0400
 649
 650 =back
 651
 652 =head2 Unimplemented fallback schemes
 653
 654 In the future, you will be able to use a code reference to a callback
 655 function for the value of I<CHECK> but its API is still undecided.
 656
 657 The fallback scheme does not work on EBCDIC platforms.
 658
 659 =head1 Defining Encodings
 660
 661 To define a new encoding, use:
 662
 663     use Encode qw(define_encoding);
 664     define_encoding($object, 'canonicalName' [, alias...]);
 665
 666 I<canonicalName> will be associated with I<$object>.  The object
 667 should provide the interface described in L<Encode::Encoding>.
 668 If more than two arguments are provided then additional
 669 arguments are taken as aliases for I<$object>.
 670
 671 See L<Encode::Encoding> for more details.
 672
 673 =head1 The UTF-8 flag
 674
  675 Before the introduction of utf8 support in perl, the C<eq> operator
 676 just compared the strings represented by two scalars. Beginning with
 677 perl 5.8, C<eq> compares two strings with simultaneous consideration
 678 of I<the utf8 flag>. To explain why we made it so, I will quote page
 679 402 of C<Programming Perl, 3rd ed.>
 680
 681 =over 2
 682
 683 =item Goal #1:
 684
 685 Old byte-oriented programs should not spontaneously break on the old
 686 byte-oriented data they used to work on.
 687
 688 =item Goal #2:
 689
 690 Old byte-oriented programs should magically start working on the new
 691 character-oriented data when appropriate.
 692
 693 =item Goal #3:
 694
 695 Programs should run just as fast in the new character-oriented mode
 696 as in the old byte-oriented mode.
 697
 698 =item Goal #4:
 699
 700 Perl should remain one language, rather than forking into a
 701 byte-oriented Perl and a character-oriented Perl.
 702
 703 =back
 704
  705 Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
  706 had been released, and many features documented in the book remained
  707 unimplemented for a long time.  Perl 5.8 corrected this, and the
  708 introduction of the UTF-8 flag is one of those corrections.  You can think
  709 of this as perl having a byte-oriented mode (utf8 flag off) and a
  710 character-oriented mode (utf8 flag on).
 711
 712 Here is how Encode takes care of the utf8 flag.
 713
 714 =over 2
 715
 716 =item *
 717
 718 When you encode, the resulting utf8 flag is always off.
 719
 720 =item *
 721
  722 When you decode, the resulting utf8 flag is on unless the data can be
  723 represented unambiguously without it.  Here is what "unambiguously"
  724 means.
 725
 726 After C<$utf8 = decode('foo', $octet);>,
 727
 728   When $octet is...   The utf8 flag in $utf8 is
 729   ---------------------------------------------
 730   In ASCII only (or EBCDIC only)            OFF
 731   In ISO-8859-1                              ON
 732   In any other Encoding                      ON
 733   ---------------------------------------------
 734
  735 As you see, there is one exception: ASCII-only data.  That way you can
  736 assume Goal #1.  And with Encode, Goal #2 is assumed, but you still have
  737 to be careful in the cases mentioned in the B<CAVEAT> paragraphs.
 738
 739 This utf8 flag is not visible in perl scripts, exactly for the same
 740 reason you cannot (or you I<don't have to>) see if a scalar contains a
 741 string, integer, or floating point number.   But you can still peek
 742 and poke these if you will.  See the section below.
 743
 744 =back
 745
 746 =head2 Messing with Perl's Internals
 747
 748 The following API uses parts of Perl's internals in the current
 749 implementation.  As such, they are efficient but may change.
 750
 751 =over 2
 752
 753 =item is_utf8(STRING [, CHECK])
 754
 755 [INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
 756 If CHECK is true, also checks the data in STRING for being well-formed
 757 UTF-8.  Returns true if successful, false otherwise.
 758
 759 As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
 760
 761 =item _utf8_on(STRING)
 762
 763 [INTERNAL] Turns on the UTF-8 flag in STRING.  The data in STRING is
 764 B<not> checked for being well-formed UTF-8.  Do not use unless you
 765 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 766 state of the UTF-8 flag (so please don't treat the return value as
 767 indicating success or failure), or C<undef> if STRING is not a string.
 768
 769 =item _utf8_off(STRING)
 770
 771 [INTERNAL] Turns off the UTF-8 flag in STRING.  Do not use frivolously.
 772 Returns the previous state of the UTF-8 flag (so please don't treat the
 773 return value as indicating success or failure), or C<undef> if STRING is
 774 not a string.
 775
 776 =back
 777
 778 =head1 UTF-8 vs. utf8
 779
 780   ....We now view strings not as sequences of bytes, but as sequences
 781   of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
 782   computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
 783
  784 That has been perl's notion of UTF-8, but the official UTF-8 is more
  785 strict: its range is much narrower (0 .. 0x10FFFF) and some sequences
  786 are not allowed (e.g. those used in surrogate pairs, 0xFFFE, et al).
 787
 788 Now that is overruled by Larry Wall himself.
 789
 790   From: Larry Wall <larry@wall.org>
 791   Date: December 04, 2004 11:51:58 JST
 792   To: perl-unicode@perl.org
 793   Subject: Re: Make Encode.pm support the real UTF-8
 794   Message-Id: <20041204025158.GA28754@wall.org>
 795
 796   On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
 797   : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
 798   : but "UTF-8" is the name of the standard and should give the
 799   : corresponding behaviour.
 800
 801   For what it's worth, that's how I've always kept them straight in my
 802   head.
 803
 804   Also for what it's worth, Perl 6 will mostly default to strict but
 805   make it easy to switch back to lax.
 806
 807   Larry
 808
 809 Do you copy?  As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
 810 while B<utf8> means liberal, lax, version thereof.  And Encode version
  811 2.10 or later thus groks the difference between C<UTF-8> and C<utf8>.
 812
 813   encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
 814   encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
 815
  816 C<UTF-8> in Encode is actually an alias; its canonical name is
  817 C<utf-8-strict>.  Yes, the hyphen between "UTF" and "8" is important.
  818 Without it Encode goes "liberal":
 819
 820   find_encoding("UTF-8")->name # is 'utf-8-strict'
 821   find_encoding("utf-8")->name # ditto. names are case insensitive
  822   find_encoding("utf_8")->name # ditto. "_" are treated as "-"
 823   find_encoding("UTF8")->name  # is 'utf8'.
 824
 825
 826 =head1 SEE ALSO
 827
 828 L<Encode::Encoding>,
 829 L<Encode::Supported>,
 830 L<Encode::PerlIO>,
 831 L<encoding>,
 832 L<perlebcdic>,
 833 L<perlfunc/open>,
 834 L<perlunicode>,
 835 L<utf8>,
 836 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
 837
 838 =head1 MAINTAINER
 839
 840 This project was originated by Nick Ing-Simmons and later maintained
 841 by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for a full
 842 list of people involved.  For any questions, use
 843 E<lt>perl-unicode@perl.orgE<gt> so we can all share.
 844
 845 =cut