ext/Encode/Encode.pm

   1 package Encode;
   2 use strict;
   3
   4 our $VERSION = '0.02';
   5
   6 require DynaLoader;
   7 require Exporter;
   8
   9 our @ISA = qw(Exporter DynaLoader);
  10
  11 # Public, encouraged API is exported by default
  12 our @EXPORT = qw (
  13   encode
  14   decode
  15   encode_utf8
  16   decode_utf8
  17   find_encoding
  18   encodings
  19 );
  20
  21 our @EXPORT_OK =
  22     qw(
  23        define_encoding
  24        define_alias
  25        from_to
  26        is_utf8
  27        is_8bit
  28        is_16bit
  29        utf8_upgrade
  30        utf8_downgrade
  31        _utf8_on
  32        _utf8_off
  33       );
  34
  35 bootstrap Encode ();
  36
  37 # Documentation moved after __END__ for speed - NI-S
  38
  39 use Carp;
  40
  41 # Make a %encoding package variable to allow a certain amount of cheating
     # (define_encoding() stores canonical name => encoding object in it)
  42 our %encoding;
     # @alias is a flat list of (alias => target) pairs appended by
     # define_alias() and scanned in order by findAlias(); %alias remembers
     # the names findAlias() has already resolved to an encoding object.
  43 my @alias;  # ordered matching list
  44 my %alias;  # cached known aliases
     # @latin2iso_num is indexed by the N of a "latin-N" style name and yields
     # the number M used to build "iso-8859-M" (see the latin alias pattern
     # further down).  The comment line below shows the index positions.
  45                      # 0  1  2  3  4  5   6   7   8   9  10
  46 our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
  47
  48
  49 sub encodings
  50 {
  51  my ($class) = @_;
  52  return keys %encoding;
  53 }
  54
  55 sub findAlias
  56 {
  57  my $class = shift;
  58  local $_ = shift;
  59  unless (exists $alias{$_})
  60   {
  61    for (my $i=0; $i < @alias; $i += 2)
  62     {
  63      my $alias = $alias[$i];
  64      my $val   = $alias[$i+1];
  65      my $new;
  66      if (ref($alias) eq 'Regexp' && $_ =~ $alias)
  67       {
  68        $new = eval $val;
  69       }
  70      elsif (ref($alias) eq 'CODE')
  71       {
  72        $new = &{$alias}($val)
  73       }
  74      elsif (lc($_) eq lc($alias))
  75       {
  76        $new = $val;
  77       }
  78      if (defined($new))
  79       {
  80        next if $new eq $_; # avoid (direct) recursion on bugs
  81        my $enc = (ref($new)) ? $new : find_encoding($new);
  82        if ($enc)
  83         {
  84          $alias{$_} = $enc;
  85          last;
  86         }
  87       }
  88     }
  89   }
  90  return $alias{$_};
  91 }
  92
  93 sub define_alias
  94 {
  95  while (@_)
  96   {
  97    my ($alias,$name) = splice(@_,0,2);
  98    push(@alias, $alias => $name);
  99   }
 100 }
 101
 102 # Allow variants of iso-8859-1 etc.
 103 define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
 104
 105 # This is a font issue, not an encoding issue.
 106 # (The currency symbol of the Latin 1 upper half is redefined
 107 # as the euro symbol.)
 108 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
 109
 110 # Solaris has this as a generic Latin-1 encoding.
 111 define_alias( qr/^iso_8859_1$/ => 'iso-8859-1' );
 112
 113 # At least HP-UX has these.
 114 define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
 115
 116 # Allow latin-1 style names as well
 117 define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
 118
 119 # Common names for non-latin prefered MIME names
 120 define_alias( 'ascii'    => 'US-ascii',
 121               'cyrillic' => 'iso-8859-5',
 122               'arabic'   => 'iso-8859-6',
 123               'greek'    => 'iso-8859-7',
 124               'hebrew'   => 'iso-8859-8');
 125
 126 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
 127 define_alias( qr/^ibm[-_]?(\d\d\d\d?)$/i => '"cp$1"');
 128
 129 # Standardize on the dashed version.
 130 define_alias( qr/^koi8r$/i => 'koi8-r' );
 131
 132 # Map white space and _ to '-'
 133 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
 134
 135 sub define_encoding
 136 {
 137  my $obj  = shift;
 138  my $name = shift;
 139  $encoding{$name} = $obj;
 140  my $lc = lc($name);
 141  define_alias($lc => $obj) unless $lc eq $name;
 142  while (@_)
 143   {
 144    my $alias = shift;
 145    define_alias($alias,$obj);
 146   }
 147  return $obj;
 148 }
 149
 150 sub getEncoding
 151 {
 152  my ($class,$name) = @_;
 153  my $enc;
 154  if (ref($name) && $name->can('new_sequence'))
 155   {
 156    return $name;
 157   }
 158  if (exists $encoding{$name})
 159   {
 160    return $encoding{$name};
 161   }
 162  else
 163   {
 164    return $class->findAlias($name);
 165   }
 166 }
 167
 168 sub find_encoding
 169 {
 170  my ($name) = @_;
 171  return __PACKAGE__->getEncoding($name);
 172 }
 173
 174 sub encode
 175 {
 176  my ($name,$string,$check) = @_;
 177  my $enc = find_encoding($name);
 178  croak("Unknown encoding '$name'") unless defined $enc;
 179  my $octets = $enc->encode($string,$check);
 180  return undef if ($check && length($string));
 181  return $octets;
 182 }
 183
 184 sub decode
 185 {
 186  my ($name,$octets,$check) = @_;
 187  my $enc = find_encoding($name);
 188  croak("Unknown encoding '$name'") unless defined $enc;
 189  my $string = $enc->decode($octets,$check);
 190  return undef if ($check && length($octets));
 191  return $string;
 192 }
 193
 194 sub from_to
 195 {
 196  my ($string,$from,$to,$check) = @_;
 197  my $f = find_encoding($from);
 198  croak("Unknown encoding '$from'") unless defined $f;
 199  my $t = find_encoding($to);
 200  croak("Unknown encoding '$to'") unless defined $t;
 201  my $uni = $f->decode($string,$check);
 202  return undef if ($check && length($string));
 203  $string = $t->encode($uni,$check);
 204  return undef if ($check && length($uni));
 205  return length($_[0] = $string);
 206 }
 207
 208 sub encode_utf8
 209 {
 210  my ($str) = @_;
 211  utf8::encode($str);
 212  return $str;
 213 }
 214
 215 sub decode_utf8
 216 {
 217  my ($str) = @_;
 218  return undef unless utf8::decode($str);
 219  return $str;
 220 }
 221
 222 package Encode::Encoding;
 223 # Base class for classes which implement encodings
 224
 225 sub Define
 226 {
 227  my $obj = shift;
 228  my $canonical = shift;
 229  $obj = bless { Name => $canonical },$obj unless ref $obj;
 230  # warn "$canonical => $obj\n";
 231  Encode::define_encoding($obj, $canonical, @_);
 232 }
 233
 234 sub name { shift->{'Name'} }
 235
 236 # Temporary legacy methods
 237 sub toUnicode    { shift->decode(@_) }
 238 sub fromUnicode  { shift->encode(@_) }
 239
 240 sub new_sequence { return $_[0] }
 241
 242 package Encode::XS;
     # Nothing is defined for this class here beyond its inheritance from
     # Encode::Encoding.  The POD section "Compiled Encodings" below says
     # its encode/decode interface is provided by Encode.xs.
 243 use base 'Encode::Encoding';
 244
 245 package Encode::Internal;
 246 use base 'Encode::Encoding';
 247
 248 # Dummy package that provides the encode interface but leaves data
 249 # as UTF-X encoded. It is here so that from_to() works.
 250
 251 __PACKAGE__->Define('Internal');
 252
     # Where ord('A') == 65 (ASCII-based platforms) 'Unicode' is simply an
     # alias for 'Internal'; elsewhere a separate class registers 'Unicode'.
 253 Encode::define_alias( 'Unicode' => 'Internal' ) if ord('A') == 65;
 254
 255 sub decode
 256 {
 257  my ($obj,$str,$chk) = @_;
 258  utf8::upgrade($str);
 259  $_[1] = '' if $chk;
 260  return $str;
 261 }
 262
 263 *encode = \&decode;
 264
 265 package Encoding::Unicode;
     # NOTE(review): every other class in this file lives under Encode::,
     # this one under Encoding:: - possibly a typo; confirm before renaming.
 266 use base 'Encode::Encoding';
 267
     # Registered as 'Unicode' only where ord('A') != 65; where it is 65 the
     # name 'Unicode' is defined as an alias for 'Internal' instead.
 268 __PACKAGE__->Define('Unicode') unless ord('A') == 65;
 269
 270 sub decode
 271 {
 272  my ($obj,$str,$chk) = @_;
 273  my $res = '';
 274  for (my $i = 0; $i < length($str); $i++)
 275   {
 276    $res .= chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
 277   }
 278  $_[1] = '' if $chk;
 279  return $res;
 280 }
 281
 282 sub encode
 283 {
 284  my ($obj,$str,$chk) = @_;
 285  my $res = '';
 286  for (my $i = 0; $i < length($str); $i++)
 287   {
 288    $res .= chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
 289   }
 290  $_[1] = '' if $chk;
 291  return $res;
 292 }
 293
 294
 295 package Encode::utf8;
 296 use base 'Encode::Encoding';
 297 # package to allow long-hand
 298 #   $octets = encode( utf8 => $string );
 299 #
 300
 301 __PACKAGE__->Define(qw(UTF-8 utf8));
     # (registers this class under the canonical name 'UTF-8', with 'utf8'
     #  as an additional alias)
 302
 303 sub decode
 304 {
 305  my ($obj,$octets,$chk) = @_;
 306  my $str = Encode::decode_utf8($octets);
 307  if (defined $str)
 308   {
 309    $_[1] = '' if $chk;
 310    return $str;
 311   }
 312  return undef;
 313 }
 314
 315 sub encode
 316 {
 317  my ($obj,$string,$chk) = @_;
 318  my $octets = Encode::encode_utf8($string);
 319  $_[1] = '' if $chk;
 320  return $octets;
 321 }
 322
 323 package Encode::iso10646_1;
 324 use base 'Encode::Encoding';
 325 # Encoding is 16-bit network order Unicode (no surrogates)
 326 # Used for X font encodings
 327
 328 __PACKAGE__->Define(qw(UCS-2 iso-10646-1));
     # (registers this class under the canonical name 'UCS-2', with
     #  'iso-10646-1' as an additional alias)
 329
 330 sub decode
 331 {
 332  my ($obj,$str,$chk) = @_;
 333  my $uni   = '';
 334  while (length($str))
 335   {
 336    my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
 337    $uni .= chr($code);
 338   }
 339  $_[1] = $str if $chk;
 340  utf8::upgrade($uni);
 341  return $uni;
 342 }
 343
 344 sub encode
 345 {
 346  my ($obj,$uni,$chk) = @_;
 347  my $str   = '';
 348  while (length($uni))
 349   {
 350    my $ch = substr($uni,0,1,'');
 351    my $x  = ord($ch);
 352    unless ($x < 32768)
 353     {
 354      last if ($chk);
 355      $x = 0;
 356     }
 357    $str .= pack('n',$x);
 358   }
 359  $_[1] = $uni if $chk;
 360  return $str;
 361 }
 362
 363 # switch back to Encode package in case we ever add AutoLoader
 364 package Encode;
 365
 366 1;
 367
 368 __END__
 369
 370 =head1 NAME
 371
 372 Encode - character encodings
 373
 374 =head1 SYNOPSIS
 375
 376     use Encode;
 377
 378 =head1 DESCRIPTION
 379
 380 The C<Encode> module provides the interfaces between Perl's strings
 381 and the rest of the system.  Perl strings are sequences of B<characters>.
 382
 383 The repertoire of characters that Perl can represent is at least that
 384 defined by the Unicode Consortium. On most platforms the ordinal
 385 values of the characters (as returned by C<ord(ch)>) is the "Unicode
 386 codepoint" for the character (the exceptions are those platforms where
 387 the legacy encoding is some variant of EBCDIC rather than a super-set
 388 of ASCII - see L<perlebcdic>).
 389
 390 Traditionally computer data has been moved around in 8-bit chunks
 391 often called "bytes". These chunks are also known as "octets" in
 392 networking standards. Perl is widely used to manipulate data of
 393 many types - not only strings of characters representing human or
 394 computer languages but also "binary" data being the machine's representation
 395 of numbers, pixels in an image - or just about anything.
 396
 397 When Perl is processing "binary data" the programmer wants Perl to process
 398 "sequences of bytes". This is not a problem for Perl - as a byte has 256
 399 possible values it easily fits in Perl's much larger "logical character".
 400
 401 =head2 TERMINOLOGY
 402
 403 =over 4
 404
 405 =item *
 406
 407 I<character>: a character in the range 0..(2**32-1) (or more).
 408 (What Perl's strings are made of.)
 409
 410 =item *
 411
 412 I<byte>: a character in the range 0..255
 413 (A special case of a Perl character.)
 414
 415 =item *
 416
 417 I<octet>: 8 bits of data, with ordinal values 0..255
 418 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
 419
 420 =back
 421
 422 The marker [INTERNAL] marks Internal Implementation Details, in
 423 general meant only for those who think they know what they are doing,
 424 and such details may change in future releases.
 425
 426 =head1 ENCODINGS
 427
 428 =head2 Characteristics of an Encoding
 429
 430 An encoding has a "repertoire" of characters that it can represent,
 431 and for each representable character there is at least one sequence of
 432 octets that represents it.
 433
 434 =head2 Types of Encodings
 435
 436 Encodings can be divided into the following types:
 437
 438 =over 4
 439
 440 =item * Fixed length 8-bit (or less) encodings.
 441
 442 Each character is a single octet so may have a repertoire of up to
 443 256 characters. ASCII and iso-8859-* are typical examples.
 444
 445 =item * Fixed length 16-bit encodings
 446
 447 Each character is two octets so may have a repertoire of up to
 448 65 536 characters.  Unicode's UCS-2 is an example.  Also used for
 449 encodings for East Asian languages.
 450
 451 =item * Fixed length 32-bit encodings.
 452
 453 Not really very "encoded" encodings. The Unicode code points
 454 are just represented as 4-octet integers. None the less because
 455 different architectures use different representations of integers
 456 (so called "endian") there are at least two distinct encodings.
 457
 458 =item * Multi-byte encodings
 459
 460 The number of octets needed to represent a character varies.
 461 UTF-8 is a particularly complex but regular case of a multi-byte
 462 encoding. Several East Asian countries use a multi-byte encoding
 463 where 1-octet is used to cover western roman characters and Asian
 464 characters get 2-octets.
 465 (UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
 466 to represent a Unicode code point.)
 467
 468 =item * "Escape" encodings.
 469
 470 These encodings embed "escape sequences" into the octet sequence
 471 which describe how the following octets are to be interpreted.
 472 The iso-2022-* family is typical. Following the escape sequence
 473 octets are encoded by an "embedded" encoding (which will be one
 474 of the above types) until another escape sequence switches to
 475 a different "embedded" encoding.
 476
 477 These schemes are very flexible and can handle mixed languages but are
 478 very complex to process (and have state).  No escape encodings are
 479 implemented for Perl yet.
 480
 481 =back
 482
 483 =head2 Specifying Encodings
 484
 485 Encodings can be specified to the API described below in two ways:
 486
 487 =over 4
 488
 489 =item 1. By name
 490
 491 Encoding names are strings with characters taken from a restricted
 492 repertoire.  See L</"Encoding Names">.
 493
 494 =item 2. As an object
 495
 496 Encoding objects are returned by C<find_encoding($name)>.
 497
 498 =back
 499
 500 =head2 Encoding Names
 501
 502 Encoding names are case insensitive. White space in names is ignored.
 503 In addition an encoding may have aliases. Each encoding has one
 504 "canonical" name.  The "canonical" name is chosen from the names of
 505 the encoding by picking the first in the following sequence:
 506
 507 =over 4
 508
 509 =item * The MIME name as defined in IETF RFC-XXXX.
 510
 511 =item * The name in the IANA registry.
 512
 513 =item * The name used by the organization that defined it.
 514
 515 =back
 516
 517 Because of all the alias issues, and because in the general case
 518 encodings have state C<Encode> uses the encoding object internally
 519 once an operation is in progress.
 520
 521 =head1 PERL ENCODING API
 522
 523 =head2 Generic Encoding Interface
 524
 525 =over 4
 526
 527 =item *
 528
 529         $bytes  = encode(ENCODING, $string[, CHECK])
 530
 531 Encodes string from Perl's internal form into I<ENCODING> and returns
 532 a sequence of octets.  For CHECK see L</"Handling Malformed Data">.
 533
 534 =item *
 535
 536         $string = decode(ENCODING, $bytes[, CHECK])
 537
 538 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
 539 internal form and returns the resulting string.  For CHECK see
 540 L</"Handling Malformed Data">.
 541
 542 =item *
 543
 544         from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
 545
 546 Convert B<in-place> the data between two encodings.  How did the data
 547 in $string originally get to be in FROM_ENCODING?  Either using
 548 encode() or through PerlIO: See L</"Encoding and IO">.  For CHECK
 549 see L</"Handling Malformed Data">.
 550
 551 For example to convert ISO 8859-1 data to UTF-8:
 552
 553         from_to($data, "iso-8859-1", "utf-8");
 554
 555 and to convert it back:
 556
 557         from_to($data, "utf-8", "iso-8859-1");
 558
 559 Note that because the conversion happens in place, the data to be
 560 converted cannot be a string constant, it must be a scalar variable.
 561
 562 =back
 563
 564 =head2 Handling Malformed Data
 565
 566 If CHECK is not set, C<undef> is returned.  If the data is supposed to
 567 be UTF-8, an optional lexical warning (category utf8) is given.  If
 568 CHECK is true but not a code reference, dies.
 569
 570 It would be desirable to have a way to indicate that the transform should use
 571 the encoding's "replacement character" - no such mechanism is defined yet.
 572
 573 It is also planned to allow I<CHECK> to be a code reference.
 574
 575 This is not yet implemented as there are design issues with what its
 576 arguments should be and how it returns its results.
 577
 578 =over 4
 579
 580 =item Scheme 1
 581
 582 Passed remaining fragment of string being processed.
 583 Modifies it in place to remove bytes/characters it can understand
 584 and returns a string used to represent them.
 585 e.g.
 586
 587  sub fixup {
 588    my $ch = substr($_[0],0,1,'');
 589    return sprintf('\x{%02X}',ord($ch));
 590  }
 591
 592 This scheme is close to how underlying C code for Encode works, but gives
 593 the fixup routine very little context.
 594
 595 =item Scheme 2
 596
 597 Passed original string, and an index into it of the problem area, and
 598 output string so far.  Appends what it will to output string and
 599 returns new index into original string.  For example:
 600
 601  sub fixup {
 602    # my ($s,$i,$d) = @_;
 603    my $ch = substr($_[0],$_[1],1);
 604    $_[2] .= sprintf('\x{%02X}',ord($ch));
 605    return $_[1]+1;
 606  }
 607
 608 This scheme gives maximal control to the fixup routine but is more
 609 complicated to code, and may need internals of Encode to be tweaked to
 610 keep original string intact.
 611
 612 =item Other Schemes
 613
 614 Hybrids of above.
 615
 616 Multiple return values rather than in-place modifications.
 617
 618 Index into the string could be pos($str) allowing s/\G...//.
 619
 620 =back
 621
 622 =head2 UTF-8 / utf8
 623
 624 The Unicode consortium defines the UTF-8 standard as a way of encoding
 625 the entire Unicode repertoire as sequences of octets.  This encoding is
 626 expected to become very widespread. Perl can use this form internally
 627 to represent strings, so conversions to and from this form are
 628 particularly efficient (as octets in memory do not have to change,
 629 just the meta-data that tells Perl how to treat them).
 630
 631 =over 4
 632
 633 =item *
 634
 635         $bytes = encode_utf8($string);
 636
 637 The characters that comprise string are encoded in Perl's superset of UTF-8
 638 and the resulting octets returned as a sequence of bytes. All possible
 639 characters have a UTF-8 representation so this function cannot fail.
 640
 641 =item *
 642
 643         $string = decode_utf8($bytes [,CHECK]);
 644
 645 The sequence of octets represented by $bytes is decoded from UTF-8
 646 into a sequence of logical characters. Not all sequences of octets
 647 form valid UTF-8 encodings, so it is possible for this call to fail.
 648 For CHECK see L</"Handling Malformed Data">.
 649
 650 =back
 651
 652 =head2 Other Encodings of Unicode
 653
 654 UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks.  UCS-2 can only
 655 represent 0..0xFFFF, while UTF-16 has a "surrogate pair" scheme which
 656 allows it to cover the whole Unicode range.
 657
 658 Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
 659 happens to be the name used by that representation when used with X11
 660 fonts.
 661
 662 UTF-32 or UCS-4 is 32-bit or 4-byte chunks.  Perl's logical characters
 663 can be considered as being in this form without encoding. An encoding
 664 to transfer strings in this form (e.g. to write them to a file) would
 665 need to
 666
 667      pack('L*',map(ord($_),split(//,$string)));   # native
 668   or
 669      pack('V*',map(ord($_),split(//,$string)));   # little-endian
 670   or
 671      pack('N*',map(ord($_),split(//,$string)));   # big-endian
 672
 673 depending on the endian required.
 674
 675 No UTF-32 encodings are implemented yet.
 676
 677 Both UCS-2 and UCS-4 style encodings can have "byte order marks" by
 678 representing the code point 0xFEFF as the very first thing in a file.
 679
 680 =head2 Listing available encodings
 681
 682   use Encode qw(encodings);
 683   @list = encodings();
 684
 685 Returns a list of the canonical names of the available encodings.
 686
 687 =head2 Defining Aliases
 688
 689   use Encode qw(define_alias);
 690   define_alias( newName => ENCODING);
 691
 692 Allows newName to be used as an alias for ENCODING. ENCODING may be
 693 either the name of an encoding or an encoding object (as above).
 694
 695 Currently I<newName> can be specified in the following ways:
 696
 697 =over 4
 698
 699 =item As a simple string.
 700
 701 =item As a qr// compiled regular expression, e.g.:
 702
 703   define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
 704
 705 In this case if I<ENCODING> is not a reference it is C<eval>-ed to
 706 allow C<$1> etc. to be substituted.  The example is one way to make names as
 707 used in X11 font names aliases for the MIME names of the iso-8859-*
 708 family.
 709
 710 =item As a code reference, e.g.:
 711
 712   define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
 713
 714 In this case C<$_> will be set to the name that is being looked up and
 715 I<ENCODING> is passed to the sub as its first argument.  The example
 716 is another way to make names as used in X11 font names aliases for the MIME
 717 names of the iso-8859-* family.
 718
 719 =back
 720
 721 =head2 Defining Encodings
 722
 723     use Encode qw(define_encoding);
 724     define_encoding( $object, 'canonicalName' [,alias...]);
 725
 726 Causes I<canonicalName> to be associated with I<$object>.  The object
 727 should provide the interface described in L</"IMPLEMENTATION CLASSES">
 728 below.  If more than two arguments are provided then additional
 729 arguments are taken as aliases for I<$object> as for C<define_alias>.
 730
 731 =head1 Encoding and IO
 732
 733 It is very common to want to do encoding transformations when
 734 reading or writing files, network connections, pipes etc.
 735 If Perl is configured to use the new 'perlio' IO system then
 736 C<Encode> provides a "layer" (See L<perliol>) which can transform
 737 data as it is read or written.
 738
 739     use Encode;
 740     open(my $ilyad,'>:encoding(iso-8859-7)','ilyad.greek');
 741     print $ilyad @epic;
 742
 743 In addition the new IO system can also be configured to read/write
 744 UTF-8 encoded characters (as noted above this is efficient):
 745
 746     open(my $fh,'>:utf8','anything');
 747     print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
 748
 749 Either of the above forms of "layer" specifications can be made the default
 750 for a lexical scope with the C<use open ...> pragma. See L<open>.
 751
 752 Once a handle is open its layers can be altered using C<binmode>.
 753
 754 Without any such configuration, or if Perl itself is built using
 755 system's own IO, then write operations assume that file handle accepts
 756 only I<bytes> and will C<die> if a character larger than 255 is
 757 written to the handle. When reading, each octet from the handle
 758 becomes a byte-in-a-character. Note that this default is the same
 759 behaviour as bytes-only languages (including Perl before v5.6) would
 760 have, and is sufficient to handle native 8-bit encodings
 761 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
 762 other encodings and binary data.
 763
 764 In other cases it is the program's responsibility to transform
 765 characters into bytes using the API above before doing writes, and to
 766 transform the bytes read from a handle into characters before doing
 767 "character operations" (e.g. C<lc>, C</\W+/>, ...).
 768
 769 You can also use PerlIO to convert larger amounts of data you don't
 770 want to bring into memory.  For example to convert between ISO 8859-1
 771 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
 772
 773     open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
 774     open(G, ">:utf8",                 "data.utf") or die $!;
 775     while (<F>) { print G }
 776
 777     # Could also do "print G <F>" but that would pull
 778     # the whole file into memory just to write it out again.
 779
 780 More examples:
 781
 782     open(my $f, "<:encoding(cp1252)")
 783     open(my $g, ">:encoding(iso-8859-2)")
 784     open(my $h, ">:encoding(latin9)")       # iso-8859-15
 785
 786 See L<PerlIO> for more information.
 787
 788 =head1 Encoding How to ...
 789
 790 To do:
 791
 792 =over 4
 793
 794 =item * IO with mixed content (faking iso-2022-*)
 795
 796 =item * MIME's Content-Length:
 797
 798 =item * UTF-8 strings in binary data.
 799
 800 =item * Perl/Encode wrappers on non-Unicode XS modules.
 801
 802 =back
 803
 804 =head1 Messing with Perl's Internals
 805
 806 The following API uses parts of Perl's internals in the current
 807 implementation.  As such they are efficient, but may change.
 808
 809 =over 4
 810
 811 =item * is_utf8(STRING [, CHECK])
 812
 813 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 814 If CHECK is true, also checks the data in STRING for being well-formed
 815 UTF-8.  Returns true if successful, false otherwise.
 816
 817 =item * valid_utf8(STRING)
 818
 819 [INTERNAL] Test whether STRING is in a consistent state.  Will return
 820 true if string is held as bytes, or is well-formed UTF-8 and has the
 821 UTF-8 flag on.  Main reason for this routine is to allow Perl's
 822 testsuite to check that operations have left strings in a consistent
 823 state.
 824
 825 =item *
 826
 827         _utf8_on(STRING)
 828
 829 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 830 B<not> checked for being well-formed UTF-8.  Do not use unless you
 831 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 832 state of the UTF-8 flag (so please don't test the return value as
 833 I<not> success or failure), or C<undef> if STRING is not a string.
 834
 835 =item *
 836
 837         _utf8_off(STRING)
 838
 839 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 840 Returns the previous state of the UTF-8 flag (so please don't test the
 841 return value as I<not> success or failure), or C<undef> if STRING is
 842 not a string.
 843
 844 =back
 845
 846 =head1 IMPLEMENTATION CLASSES
 847
 848 As mentioned above encodings are (in the current implementation at least)
 849 defined by objects. The mapping of encoding name to object is via the
 850 C<%encoding> hash.
 851
 852 The values of the hash can currently be either strings or objects.
 853 The string form may go away in the future. The string form occurs
 854 when C<encodings()> has scanned C<@INC> for loadable encodings but has
 855 not actually loaded the encoding in question. This is because the
 856 current "loading" process is all Perl and a bit slow.
 857
 858 Once an encoding is loaded the value of the hash is the object which
 859 implements the encoding. The object should provide the following
 860 interface:
 861
 862 =over 4
 863
 864 =item -E<gt>name
 865
 866 Should return the string representing the canonical name of the encoding.
 867
 868 =item -E<gt>new_sequence
 869
 870 This is a placeholder for encodings with state. It should return an
 871 object which implements this interface, all current implementations
 872 return the original object.
 873
 874 =item -E<gt>encode($string,$check)
 875
 876 Should return the octet sequence representing I<$string>. If I<$check>
 877 is true it should modify I<$string> in place to remove the converted
 878 part (i.e.  the whole string unless there is an error).  If an error
 879 occurs it should return the octet sequence for the fragment of string
 880 that has been converted, and modify $string in-place to remove the
 881 converted part leaving it starting with the problem fragment.
 882
 883 If check is false then C<encode> should make a "best effort" to
 884 convert the string - for example by using a replacement character.
 885
 886 =item -E<gt>decode($octets,$check)
 887
 888 Should return the string that I<$octets> represents. If I<$check> is
 889 true it should modify I<$octets> in place to remove the converted part
 890 (i.e.  the whole sequence unless there is an error).  If an error
 891 occurs it should return the fragment of string that has been
 892 converted, and modify $octets in-place to remove the converted part
 893 leaving it starting with the problem fragment.
 894
 895 If check is false then C<decode> should make a "best effort" to
 896 convert the string - for example by using Unicode's "\x{FFFD}" as a
 897 replacement character.
 898
 899 =back
 900
 901 It should be noted that the check behaviour is different from the
 902 outer public API. The logic is that the "unchecked" case is useful
 903 when encoding is part of a stream which may be reporting errors
 904 (e.g. STDERR).  In such cases it is desirable to get everything
 905 through somehow without causing additional errors which obscure the
 906 original one. Also the encoding is best placed to know what the
 907 correct replacement character is, so if that is the desired behaviour
 908 then letting low level code do it is the most efficient.
 909
 910 In contrast if check is true, the scheme above allows the encoding to
 911 do as much as it can and tell layer above how much that was. What is
 912 lacking at present is a mechanism to report what went wrong. The most
 913 likely interface will be an additional method call to the object, or
 914 perhaps (to avoid forcing per-stream objects on otherwise stateless
 915 encodings) an additional parameter.
 916
 917 It is also highly desirable that encoding classes inherit from
 918 C<Encode::Encoding> as a base class. This allows that class to define
 919 additional behaviour for all encoding objects. For example built in
 920 Unicode, UCS-2 and UTF-8 classes use :
 921
 922   package Encode::MyEncoding;
 923   use base qw(Encode::Encoding);
 924
 925   __PACKAGE__->Define(qw(myCanonical myAlias));
 926
 927 To create an object with bless {Name => ...},$class, and call
 928 define_encoding.  They inherit their C<name> method from
 929 C<Encode::Encoding>.
 930
 931 =head2 Compiled Encodings
 932
 933 F<Encode.xs> provides a class C<Encode::XS> which provides the
 934 interface described above. It calls a generic octet-sequence to
 935 octet-sequence "engine" that is driven by tables (defined in
 936 F<encengine.c>). The same engine is used for both encode and
 937 decode. C<Encode::XS>'s C<encode> forces Perl's characters to their
 938 UTF-8 form and then treats them as just another multibyte
 939 encoding. C<Encode::XS>'s C<decode> transforms the sequence and then
 940 turns the UTF-8-ness flag on, as that is the form that the tables are
 941 defined to produce. For details of the engine see the comments in
 942 F<encengine.c>.
 943
 944 The tables are produced by the Perl script F<compile> (the name needs
 945 to change so we can eventually install it somewhere). F<compile> can
 946 currently read two formats:
 947
 948 =over 4
 949
 950 =item *.enc
 951
 952 This is a coined format used by Tcl. It is documented in
 953 Encode/EncodeFormat.pod.
 954
 955 =item *.ucm
 956
 957 This is the semi-standard format used by IBM's ICU package.
 958
 959 =back
 960
 961 F<compile> can write the following forms:
 962
 963 =over 4
 964
 965 =item *.ucm
 966
 967 See above - the F<Encode/*.ucm> files provided with the distribution have
 968 been created from the original Tcl .enc files using this approach.
 969
 970 =item *.c
 971
 972 Produces tables as C data structures - this is used to build in encodings
 973 into F<Encode.so>/F<Encode.dll>.
 974
 975 =item *.xs
 976
 977 In theory this allows encodings to be stand-alone loadable Perl
 978 extensions.  The process has not yet been tested. The plan is to use
 979 this approach for large East Asian encodings.
 980
 981 =back
 982
 983 The set of encodings built-in to F<Encode.so>/F<Encode.dll> is
 984 determined by F<Makefile.PL>.  The current set is as follows:
 985
 986 =over 4
 987
 988 =item ascii and iso-8859-*
 989
 990 That is all the common 8-bit "western" encodings.
 991
 992 =item IBM-1047 and two other variants of EBCDIC.
 993
 994 These are the same variants that are supported by EBCDIC Perl as
 995 "native" encodings.  They are included to prove "reversibility" of
 996 some constructs in EBCDIC Perl.
 997
 998 =item symbol and dingbats as used by Tk on X11.
 999
1000 (The reason Encode got started was to support Perl/Tk.)
1001
1002 =back
1003
1004 That set is rather ad hoc and has been driven by the needs of the
1005 tests rather than the needs of typical applications. It is likely
1006 to be rationalized.
1007
1008 =head1 SEE ALSO
1009
1010 L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>
1011
1012 =cut
1013