ext/Encode/Encode.pm

   1 package Encode;
   2 use strict;
   3
   4 our $VERSION = 0.02;
   5
   6 require DynaLoader;
   7 require Exporter;
   8
   9 our @ISA = qw(Exporter DynaLoader);
  10
  11 # Public, encouraged API is exported by default
  12 our @EXPORT = qw (
  13   encode
  14   decode
  15   encode_utf8
  16   decode_utf8
  17   find_encoding
  18   encodings
  19 );
  20
     # Exported only when the importer asks for them by name.
  21 our @EXPORT_OK =
  22     qw(
  23        define_encoding
  24        define_alias
  25        from_to
  26        is_utf8
  27        is_8bit
  28        is_16bit
  29        utf8_upgrade
  30        utf8_downgrade
  31        _utf8_on
  32        _utf8_off
  33       );
  34
     # Load the compiled half of the module; "bootstrap" is inherited from
     # DynaLoader through @ISA above.
  35 bootstrap Encode ();
  36
  37 # Documentation moved after __END__ for speed - NI-S
  38
  39 use Carp;
  40
  41 # Registry of encodings: name given to define_encoding() => encoding object.
     # Made a package variable (not a lexical) to allow a certain amount of cheating.
  42 our %encoding;
  43 my @alias;  # flat, ordered list of (matcher => target) pairs, see define_alias()
  44 my %alias;  # cache of resolved aliases: looked-up name => encoding object
  45
  46 sub encodings
  47 {
      # List the names under which encodings are currently registered,
      # i.e. the keys of %encoding.  Any invocant/argument is ignored.
  48  my $class = shift;
  49  return keys(%encoding);
  50 }
  51
  52 sub findAlias
  53 {
      # Class method: resolve a name that is not a key of %encoding by
      # walking the alias list in the order the aliases were defined.
      # Returns the encoding object, or undef when no alias leads to a
      # known encoding.  Successful look-ups are cached in %alias under
      # the name exactly as it was given.
  54  my $class = shift;
  55  local $_ = shift;   # CODE matchers see the candidate name in $_
  56  unless (exists $alias{$_})
  57   {
  58    for (my $i=0; $i < @alias; $i += 2)
  59     {
  60      my $alias = $alias[$i];
  61      my $val   = $alias[$i+1];
  62      my $new;
  63      if (ref($alias) eq 'Regexp' && $_ =~ $alias)
  64       {
            # A string target is Perl source, eval'ed so that $1 etc. from
            # the match above get interpolated.  A reference target is
            # already an encoding object and must not be eval'ed.
            # Targets come from define_alias() callers, never from $_.
  65        $new = ref($val) ? $val : eval $val;
  66       }
  67      elsif (ref($alias) eq 'CODE')
  68       {
  69        $new = $alias->($val);
  70       }
  71      elsif (lc($_) eq lc($alias))
  72       {
            # Plain string alias.  Names are case insensitive, so both
            # sides are folded; folding only $_ made aliases defined
            # with upper-case letters unreachable.
  73        $new = $val;
  74       }
  75      if (defined($new))
  76       {
  77        next if $new eq $_; # avoid (direct) recursion on bugs
            # A string result is itself a name and is resolved recursively.
  78        my $enc = (ref($new)) ? $new : find_encoding($new);
  79        if ($enc)
  80         {
  81          $alias{$_} = $enc;
  82          last;
  83         }
  84       }
  85     }
  86   }
  87  return $alias{$_};
  88 }
  89
  90 sub define_alias
  91 {
      # Append (matcher => target) pairs to the ordered alias list that
      # findAlias() scans.  Arguments are consumed two at a time; a
      # trailing unpaired matcher is stored with an undef target.
  92  while (my @pair = splice(@_,0,2))
  93   {
  94    my ($matcher,$target) = @pair;
  95    push(@alias, $matcher, $target);
  96   }
  97 }
  98
  99 our %isolatin2num =   # "Latin-N" number => ISO-8859 part number
 100         (
 101         1 =>  1,
 102         2 =>  2,
 103         3 =>  3,
 104         4 =>  4,
 105         5 =>  9,
 106         6 => 10,
 107         7 => 13,
 108         8 => 14,
 109         9 => 15,
 110        10 => 16,
 111         );
 112
     # String targets of regex aliases are eval'ed inside findAlias(), which
     # was compiled before the table above was declared.  A file lexical is
     # therefore not visible to that eval, so the table is a package variable
     # and is named fully qualified.  Unknown Latin numbers yield undef (no
     # alias) rather than the malformed name "iso-8859-".
 113 define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
 114 define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
 115   'defined $Encode::isolatin2num{$1} ? "iso-8859-$Encode::isolatin2num{$1}" : undef' );
 116 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
 117 #define_alias( sub { return /^iso-(\d+-\d+)$/i  ? "iso$1" : '' } );
 118 define_alias( 'ascii' => 'US-ascii');
 119 define_alias( 'ibm-1047' => 'cp1047');
 120
 121 sub define_encoding
 122 {
      # define_encoding($object, $canonicalName [, alias ...])
      # Register $object under its canonical name and under every extra
      # alias supplied; returns $object.
 123  my ($obj,$name,@extra) = @_;
 124  $encoding{$name} = $obj;
 125  # The lower-cased canonical name becomes an alias as well, so that a
 126  # look-up differing only in case still succeeds via findAlias().
 127  my $lower = lc($name);
 128  define_alias($lower => $obj) if $lower ne $name;
 129  foreach my $alias (@extra)
 130   {
 131    define_alias($alias,$obj);
 132   }
 133  return $obj;
 134 }
 135
 136 sub getEncoding
 137 {
      # Class method: map a name to its encoding object, or undef.
 138  my ($class,$name) = @_;
 139  # A name registered in %encoding always wins over an alias.
 140  if (not exists $encoding{$name})
 141   {
 142    return $class->findAlias($name);
 143   }
 144  else
 145   {
 146    return $encoding{$name};
 147   }
 148 }
 149
 150 sub find_encoding
 151 {
      # Plain-function front end to getEncoding(); returns the encoding
      # object for $name, or undef if the name cannot be resolved.
 152  my $name = $_[0];
 153  return __PACKAGE__->getEncoding($name);
 154 }
 155
 156 sub encode
 157 {
      # $octets = encode(ENCODING, $string [, CHECK])
      # Croaks on an unknown encoding.  With a true CHECK the encoding
      # object strips what it converted from its argument, so anything
      # left in the local copy means failure and undef is returned.
 158  my ($name,$string,$check) = @_;
 159  my $enc = find_encoding($name);
 160  defined($enc) or croak("Unknown encoding '$name'");
 161  my $octets = $enc->encode($string,$check);
 162  my $leftover = $check && length($string);
 163  return $leftover ? undef : $octets;
 164 }
 165
 166 sub decode
 167 {
      # $string = decode(ENCODING, $octets [, CHECK])
      # Mirror image of encode(): croaks on an unknown encoding, and with
      # a true CHECK returns undef when the encoding object left part of
      # the octets unconsumed.
 168  my ($name,$octets,$check) = @_;
 169  my $enc = find_encoding($name);
 170  defined($enc) or croak("Unknown encoding '$name'");
 171  my $string = $enc->decode($octets,$check);
 172  my $leftover = $check && length($octets);
 173  return $leftover ? undef : $string;
 174 }
 175
 176 sub from_to
 177 {
      # from_to($string, FROM, TO [, CHECK])
      # Convert $string from encoding FROM to encoding TO in place: the
      # result is written back through $_[0], i.e. into the caller's
      # variable.  Returns length() of the result, or undef when CHECK is
      # true and either step left input unconsumed.  Croaks on an unknown
      # encoding name (FROM is looked up first).
 178  my ($string,$from,$to,$check) = @_;
 179  my $f = find_encoding($from);
 180  croak("Unknown encoding '$from'") unless defined $f;
 181  my $t = find_encoding($to);
 182  croak("Unknown encoding '$to'") unless defined $t;
      # With a true CHECK each method strips what it converted from its
      # argument, so anything left over afterwards signals an error.
 183  my $uni = $f->decode($string,$check);
 184  return undef if ($check && length($string));
 185  $string = $t->encode($uni,$check);
 186  return undef if ($check && length($uni));
 187  return length($_[0] = $string);
 188 }
 189
 190 sub encode_utf8
 191 {
      # Returns a converted copy of the argument; utf8_encode() (not defined
      # in this file) works on the copy, so the caller's string is untouched.
 192  my $octets = $_[0];
 193  utf8_encode($octets);
 194  return $octets;
 195 }
 196
 197 sub decode_utf8
 198 {
      # Returns a converted copy of the argument, or undef when
      # utf8_decode() (not defined in this file) reports failure.
 199  my $bytes = $_[0];
 200  utf8_decode($bytes) or return undef;
 201  return $bytes;
 202 }
 203
 204 package Encode::Encoding;
 205 # Common ancestor of the classes that implement encodings
 206
 207 sub Define
 208 {
      # Class or instance method: Define($canonicalName [, alias ...]).
      # Registers the invocant with Encode::define_encoding() and returns
      # whatever that returns.
 209  my ($self,$canonical,@aliases) = @_;
 210  # Called on a class name: create the instance that gets registered.
 211  $self = bless({ Name => $canonical }, $self) if !ref($self);
 212  # warn "$canonical => $self\n";
 213  return Encode::define_encoding($self, $canonical, @aliases);
 214 }
 215
 216 sub name { return $_[0]->{Name} }
 217
 218 # Legacy spellings, kept for the time being
 219 sub toUnicode    { my $self = shift; return $self->decode(@_) }
 220 sub fromUnicode  { my $self = shift; return $self->encode(@_) }
 221
 222 sub new_sequence { my ($self) = @_; return $self }
 223
 224 package Encode::XS;
     # No methods are defined for this class in this file; per the POD below
     # its encode/decode are provided by Encode.xs.  Only the inheritance
     # from the common base class is set up here.
 225 use base 'Encode::Encoding';
 226
 227 package Encode::Unicode;
 228 use base 'Encode::Encoding';
 229
 230 # Dummy package that provides the encode interface but leaves data
 231 # as UTF-8 encoded. It is here so that from_to() works.
 232
     # Registers one instance under the canonical name 'Unicode'.
 233 __PACKAGE__->Define('Unicode');
 234
 235 sub decode
 236 {
      # Returns an upgraded copy of the input.  When $chk is true the
      # caller's buffer (aliased by $_[1]) is emptied to report that
      # everything was consumed.
 237  my ($obj,$str,$chk) = @_;
 238  Encode::utf8_upgrade($str);
 239  $_[1] = '' if $chk;
 240  return $str;
 241 }
 242
     # Encoding is the very same operation, so share the routine.
 243 *encode = \&decode;
 244
 245 package Encode::utf8;
 246 use base 'Encode::Encoding';
 247 # Lets callers reach the UTF-8 conversions through the generic API:
 248 #   $octets = encode( utf8 => $string );
 249 #
 250
 251 __PACKAGE__->Define('UTF-8', 'utf8');
 252
 253 sub decode
 254 {
 255  my ($self,$bytes,$check) = @_;
 256  my $chars = Encode::decode_utf8($bytes);
 257  unless (defined $chars)
 258   {
 259    return undef;   # decode_utf8 failed; the caller's buffer is left alone
 260   }
 261  $check and $_[1] = '';   # success consumes the whole input
 262  return $chars;
 263 }
 264
 265 sub encode
 266 {
 267  my ($self,$chars,$check) = @_;
 268  my $bytes = Encode::encode_utf8($chars);
 269  $check and $_[1] = '';   # report the whole string as consumed
 270  return $bytes;
 271 }
 272
 273 package Encode::iso10646_1;
 274 use base 'Encode::Encoding';
 275 # Encoding is 16-bit network order Unicode (no surrogates)
 276 # Used for X font encodings
 277
 278 __PACKAGE__->Define(qw(UCS-2 iso10646-1));
 279
 280 sub decode
 281 {
      # Turn big-endian 16-bit units into characters.  When $chk is true
      # the unconsumed remainder is written back through $_[1].
 282  my ($obj,$str,$chk) = @_;
 283  my $uni   = '';
      # Under $chk a dangling odd byte is not a complete unit: stop in
      # front of it so that it is reported back as unconsumed.  Without
      # $chk the loop condition is the historic one, so a dangling byte
      # is still converted on a best-effort basis.
 284  while (length($str) > ($chk ? 1 : 0))
 285   {
 286    my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
 287    $uni .= chr($code);
 288   }
 289  $_[1] = $str if $chk;
 290  Encode::utf8_upgrade($uni);
 291  return $uni;
 292 }
 293
 294 sub encode
 295 {
      # Turn characters into big-endian 16-bit units.  Characters above
      # 0xFFFF do not fit: with $chk conversion stops in front of the
      # first one, without $chk it is replaced by a zero unit.
 296  my ($obj,$uni,$chk) = @_;
 297  my $str   = '';
 298  while (length($uni))
 299   {
 300    my $x = ord(substr($uni,0,1));
 301    unless ($x < 0x10000)
 302     {
           # Leave the offending character at the head of $uni so the
           # remainder handed back starts with the problem fragment.
 303      last if ($chk);
 304      $x = 0;
 305     }
 306    substr($uni,0,1,'');
 307    $str .= pack('n',$x);
 308   }
 309  $_[1] = $uni if $chk;
 310  return $str;
 311 }
 312
 313 # switch back to Encode package in case we ever add AutoLoader
 314 package Encode;
 315
 316 1;   # a module file must finish with a true value
 317
 318 __END__
 319
 320 =head1 NAME
 321
 322 Encode - character encodings
 323
 324 =head1 SYNOPSIS
 325
 326     use Encode;
 327
 328 =head1 DESCRIPTION
 329
 330 The C<Encode> module provides the interfaces between perl's strings
 331 and the rest of the system. Perl strings are sequences of B<characters>.
 332
 333 The repertoire of characters that Perl can represent is at least that
 334 defined by the Unicode Consortium. On most platforms the ordinal values
 335 of the  characters (as returned by C<ord(ch)>) is the "Unicode codepoint" for
 336 the character (the exceptions are those platforms where the legacy
 337 encoding is some variant of EBCDIC rather than a super-set of ASCII
 338 - see L<perlebcdic>).
 339
 340 Traditionally computer data has been moved around in 8-bit chunks
 341 often called "bytes". These chunks are also known as "octets" in
 342 networking standards. Perl is widely used to manipulate data of
 343 many types - not only strings of characters representing human or
 344 computer languages but also "binary" data being the machine's representation
 345 of numbers, pixels in an image - or just about anything.
 346
 347 When perl is processing "binary data" the programmer wants perl to process
 348 "sequences of bytes". This is not a problem for perl - as a byte has 256
 349 possible values it easily fits in perl's much larger "logical character".
 350
 351 =head2 TERMINOLOGY
 352
 353 =over 4
 354
 355 =item *
 356
 357 I<character>: a character in the range 0..(2**32-1) (or more).
 358 (What perl's strings are made of.)
 359
 360 =item *
 361
 362 I<byte>: a character in the range 0..255
 363 (A special case of a perl character.)
 364
 365 =item *
 366
 367 I<octet>: 8 bits of data, with ordinal values 0..255
 368 (Term for bytes passed to or from a non-perl context, e.g. disk file.)
 369
 370 =back
 371
 372 The marker [INTERNAL] marks Internal Implementation Details, in
 373 general meant only for those who think they know what they are doing,
 374 and such details may change in future releases.
 375
 376 =head1 ENCODINGS
 377
 378 =head2 Characteristics of an Encoding
 379
 380 An encoding has a "repertoire" of characters that it can represent,
 381 and for each representable character there is at least one sequence of
 382 octets that represents it.
 383
 384 =head2 Types of Encodings
 385
 386 Encodings can be divided into the following types:
 387
 388 =over 4
 389
 390 =item * Fixed length 8-bit (or less) encodings.
 391
 392 Each character is a single octet so may have a repertoire of up to
 393 256 characters. ASCII and iso-8859-* are typical examples.
 394
 395 =item * Fixed length 16-bit encodings
 396
 397 Each character is two octets so may have a repertoire of up to
 398 65,536 characters. Unicode's UCS-2 is an example. Also used for
 399 encodings for East Asian languages.
 400
 401 =item * Fixed length 32-bit encodings.
 402
 403 Not really very "encoded" encodings. The Unicode code points
 404 are just represented as 4-octet integers. None the less because
 405 different architectures use different representations of integers
 406 (so called "endian") there are at least two distinct encodings.
 407
 408 =item * Multi-byte encodings
 409
 410 The number of octets needed to represent a character varies.
 411 UTF-8 is a particularly complex but regular case of a multi-byte
 412 encoding. Several East Asian countries use a multi-byte encoding
 413 where 1-octet is used to cover western roman characters and Asian
 414 characters get 2-octets.
 415 (UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
 416 to represent a Unicode code point.)
 417
 418 =item * "Escape" encodings.
 419
 420 These encodings embed "escape sequences" into the octet sequence
 421 which describe how the following octets are to be interpreted.
 422 The iso-2022-* family is typical. Following the escape sequence
 423 octets are encoded by an "embedded" encoding (which will be one
 424 of the above types) until another escape sequence switches to
 425 a different "embedded" encoding.
 426
 427 These schemes are very flexible and can handle mixed languages but are
 428 very complex to process (and have state).
 429 No escape encodings are implemented for perl yet.
 430
 431 =back
 432
 433 =head2 Specifying Encodings
 434
 435 Encodings can be specified to the API described below in two ways:
 436
 437 =over 4
 438
 439 =item 1. By name
 440
 441 Encoding names are strings with characters taken from a restricted repertoire.
 442 See L</"Encoding Names">.
 443
 444 =item 2. As an object
 445
 446 Encoding objects are returned by C<find_encoding($name)>.
 447
 448 =back
 449
 450 =head2 Encoding Names
 451
 452 Encoding names are case insensitive. White space in names is ignored.
 453 In addition an encoding may have aliases. Each encoding has one "canonical" name.
 454 The "canonical" name is chosen from the names of the encoding by picking
 455 the first in the following sequence:
 456
 457 =over 4
 458
 459 =item * The MIME name as defined in IETF RFC-XXXX.
 460
 461 =item * The name in the IANA registry.
 462
 463 =item * The name used by the organization that defined it.
 464
 465 =back
 466
 467 Because of all the alias issues, and because in the general case
 468 encodings have state C<Encode> uses the encoding object internally
 469 once an operation is in progress.
 470
 471 =head1 PERL ENCODING API
 472
 473 =head2 Generic Encoding Interface
 474
 475 =over 4
 476
 477 =item *
 478
 479         $bytes  = encode(ENCODING, $string[, CHECK])
 480
 481 Encodes string from perl's internal form into I<ENCODING> and returns a
 482 sequence of octets.
 483 See L</"Handling Malformed Data">.
 484
 485 =item *
 486
 487         $string = decode(ENCODING, $bytes[, CHECK])
 488
 489 Decodes a sequence of octets assumed to be in I<ENCODING> into perl's internal
 490 form and returns the resulting string.
 491 See L</"Handling Malformed Data">.
 492
 493 =back
 494
 495 =head2 Handling Malformed Data
 496
 497 If CHECK is not set, C<undef> is returned.  If the data is supposed to
 498 be UTF-8, an optional lexical warning (category utf8) is given.
 499 If CHECK is true but not a code reference, dies.
 500
 501 It would be desirable to have a way to indicate that the transform should use the
 502 encoding's "replacement character" - no such mechanism is defined yet.
 503
 504 It is also planned to allow I<CHECK> to be a code reference.
 505
 506 This is not yet implemented as there are design issues with what its arguments
 507 should be and how it returns its results.
 508
 509 =over 4
 510
 511 =item Scheme 1
 512
 513 Passed remaining fragment of string being processed.
 514 Modifies it in place to remove bytes/characters it can understand
 515 and returns a string used to represent them.
 516 e.g.
 517
 518  sub fixup {
 519    my $ch = substr($_[0],0,1,'');
 520    return sprintf('\x{%02X}',ord($ch));
 521  }
 522
 523 This scheme is close to how underlying C code for Encode works, but gives
 524 the fixup routine very little context.
 525
 526 =item Scheme 2
 527
 528 Passed original string, and an index into it of the problem area,
 529 and output string so far.
 530 Appends what it will to output string and returns new index into
 531 original string.
 532 e.g.
 533
 534  sub fixup {
 535    # my ($s,$i,$d) = @_;
 536    my $ch = substr($_[0],$_[1],1);
 537    $_[2] .= sprintf('\x{%02X}',ord($ch));
 538    return $_[1]+1;
 539  }
 540
 541 This scheme gives maximal control to the fixup routine but is more complicated
 542 to code, and may need internals of Encode to be tweaked to keep original
 543 string intact.
 544
 545 =item Other Schemes
 546
 547 Hybrids of above.
 548
 549 Multiple return values rather than in-place modifications.
 550
 551 Index into the string could be pos($str) allowing s/\G...//.
 552
 553 =back
 554
 555 =head2 UTF-8 / utf8
 556
 557 The Unicode consortium defines the UTF-8 standard as a way of encoding
 558 the entire Unicode repertoire as sequences of octets. This encoding
 559 is expected to become very widespread. Perl can use this form internally
 560 to represent strings, so conversions to and from this form are particularly
 561 efficient (as octets in memory do not have to change, just the meta-data
 562 that tells perl how to treat them).
 563
 564 =over 4
 565
 566 =item *
 567
 568         $bytes = encode_utf8($string);
 569
 570 The characters that comprise string are encoded in perl's superset of UTF-8
 571 and the resulting octets returned as a sequence of bytes. All possible
 572 characters have a UTF-8 representation so this function cannot fail.
 573
 574 =item *
 575
 576         $string = decode_utf8($bytes [,CHECK]);
 577
 578 The sequence of octets represented by $bytes is decoded from UTF-8 into
 579 a sequence of logical characters. Not all sequences of octets form valid
 580 UTF-8 encodings, so it is possible for this call to fail.
 581 See L</"Handling Malformed Data">.
 582
 583 =back
 584
 585 =head2 Other Encodings of Unicode
 586
 587 UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks.
 588 UCS-2 can only represent 0..0xFFFF, while UTF-16 has a "surrogate pair"
 589 scheme which allows it to cover the whole Unicode range.
 590
 591 Encode implements big-endian UCS-2 aliased to "iso10646-1" as that
 592 happens to be the name used by that representation when used with X11 fonts.
 593
 594 UTF-32 or UCS-4 is 32-bit or 4-byte chunks.  Perl's logical characters
 595 can be considered as being in this form without encoding. An encoding
 596 to transfer strings in this form (e.g. to write them to a file) would need to
 597
 598      pack('L*',map(ord($_),split(//,$string)));  # native
 599   or
 600      pack('V*',map(ord($_),split(//,$string)));  # little-endian
 601   or
 602      pack('N*',map(ord($_),split(//,$string)));  # big-endian
 603
 604 depending on the endian required.
 605
 606 No UTF-32 encodings are implemented yet.
 607
 608 Both UCS-2 and UCS-4 style encodings can have "byte order marks" by representing
 609 the code point 0xFEFF as the very first thing in a file.
 610
 611 =head2 Listing available encodings
 612
 613   use Encode qw(encodings);
 614   @list = encodings();
 615
 616 Returns a list of the canonical names of the available encodings.
 617
 618 =head2 Defining Aliases
 619
 620   use Encode qw(define_alias);
 621   define_alias( newName => ENCODING);
 622
 623 Allows newName to be used as an alias for ENCODING. ENCODING may be either the
 624 name of an encoding or an encoding object (as above).
 625
 626 Currently I<newName> can be specified in the following ways:
 627
 628 =over 4
 629
 630 =item As a simple string.
 631
 632 =item As a qr// compiled regular expression, e.g.:
 633
 634   define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
 635
 636 In this case if I<ENCODING> is not a reference it is C<eval>-ed to allow
 637 C<$1> etc. to be substituted.
 638 The example is one way to alias names as used in X11 font names to the MIME names for the
 639 iso-8859-* family.
 640
 641 =item As a code reference, e.g.:
 642
 643   define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
 644
 645 In this case C<$_> will be set to the name that is being looked up and
 646 I<ENCODING> is passed to the sub as its first argument.
 647 The example is another way to alias names as used in X11 font names to the MIME names for
 648 the iso-8859-* family.
 649
 650 =back
 651
 652 =head2 Defining Encodings
 653
 654   use Encode qw(define_encoding);
 655   define_encoding( $object, 'canonicalName' [,alias...]);
 656
 657 Causes I<canonicalName> to be associated with I<$object>.
 658 The object should provide the interface described in L</"IMPLEMENTATION CLASSES"> below.
 659 If more than two arguments are provided then additional arguments are taken
 660 as aliases for I<$object> as for C<define_alias>.
 661
 662 =head1 Encoding and IO
 663
 664 It is very common to want to do encoding transformations when
 665 reading or writing files, network connections, pipes etc.
 666 If perl is configured to use the new 'perlio' IO system then
 667 C<Encode> provides a "layer" (See L<perliol>) which can transform
 668 data as it is read or written.
 669
 670      open(my $ilyad,'>:encoding(iso-8859-7)','ilyad.greek');
 671      print $ilyad @epic;
 672
 673 In addition the new IO system can also be configured to read/write
 674 UTF-8 encoded characters (as noted above this is efficient):
 675
 676      open(my $fh,'>:utf8','anything');
 677      print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
 678
 679 Either of the above forms of "layer" specifications can be made the default
 680 for a lexical scope with the C<use open ...> pragma. See L<open>.
 681
 682 Once a handle is open its layers can be altered using C<binmode>.
 683
 684 Without any such configuration, or if perl itself is built using
 685 system's own IO, then write operations assume that file handle accepts
 686 only I<bytes> and will C<die> if a character larger than 255 is
 687 written to the handle. When reading, each octet from the handle
 688 becomes a byte-in-a-character. Note that this default is the same
 689 behaviour as bytes-only languages (including perl before v5.6) would have,
 690 and is sufficient to handle native 8-bit encodings e.g. iso-8859-1,
 691 EBCDIC etc. and any legacy mechanisms for handling other encodings
 692 and binary data.
 693
 694 In other cases it is the program's responsibility
 695 to transform characters into bytes using the API above before
 696 doing writes, and to transform the bytes read from a handle into characters
 697 before doing "character operations" (e.g. C<lc>, C</\W+/>, ...).
 698
 699 =head1 Encoding How to ...
 700
 701 To do:
 702
 703 =over 4
 704
 705 =item * IO with mixed content (faking iso-2022-*)
 706
 707 =item * MIME's Content-Length:
 708
 709 =item * UTF-8 strings in binary data.
 710
 711 =item * perl/Encode wrappers on non-Unicode XS modules.
 712
 713 =back
 714
 715 =head1 Messing with Perl's Internals
 716
 717 The following API uses parts of perl's internals in the current implementation.
 718 As such they are efficient, but may change.
 719
 720 =over 4
 721
 722 =item *
 723
 724         $num_octets = utf8_upgrade($string);
 725
 726 Converts internal representation of string to the UTF-8 form.
 727 Returns the number of octets necessary to represent the string as UTF-8.
 728
 729 =item * utf8_downgrade($string[, CHECK])
 730
 731 Converts internal representation of string to be un-encoded bytes.
 732
 733 =item * is_utf8(STRING [, CHECK])
 734
 735 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 736 If CHECK is true, also checks the data in STRING for being
 737 well-formed UTF-8.  Returns true if successful, false otherwise.
 738
 739 =item * valid_utf8(STRING)
 740
 741 [INTERNAL] Test whether STRING is in a consistent state.
 742 Will return true if string is held as bytes, or is well-formed UTF-8
 743 and has the UTF-8 flag on.
 744 Main reason for this routine is to allow perl's testsuite to check
 745 that operations have left strings in a consistent state.
 746
 747 =item *
 748
 749         _utf8_on(STRING)
 750
 751 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 752 B<not> checked for being well-formed UTF-8.  Do not use unless you
 753 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 754 state of the UTF-8 flag (so please don't test the return value as
 755 I<not> success or failure), or C<undef> if STRING is not a string.
 756
 757 =item *
 758
 759         _utf8_off(STRING)
 760
 761 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 762 Returns the previous state of the UTF-8 flag (so please don't test the
 763 return value as I<not> success or failure), or C<undef> if STRING is
 764 not a string.
 765
 766 =back
 767
 768 =head1 IMPLEMENTATION CLASSES
 769
 770 As mentioned above encodings are (in the current implementation at least)
 771 defined by objects. The mapping of encoding name to object is via the
 772 C<%encoding> hash.
 773
 774 The values of the hash can currently be either strings or objects.
 775 The string form may go away in the future. The string form occurs
 776 when C<encodings()> has scanned C<@INC> for loadable encodings but has
 777 not actually loaded the encoding in question. This is because the
 778 current "loading" process is all perl and a bit slow.
 779
 780 Once an encoding is loaded then the value of the hash is an object which implements
 781 the encoding. The object should provide the following interface:
 782
 783 =over 4
 784
 785 =item -E<gt>name
 786
 787 Should return the string representing the canonical name of the encoding.
 788
 789 =item -E<gt>new_sequence
 790
 791 This is a placeholder for encodings with state. It should return an object
 792 which implements this interface, all current implementations return the
 793 original object.
 794
 795 =item -E<gt>encode($string,$check)
 796
 797 Should return the octet sequence representing I<$string>. If I<$check> is true
 798 it should modify I<$string> in place to remove the converted part (i.e.
 799 the whole string unless there is an error).
 800 If an error occurs it should return the octet sequence for the
 801 fragment of string that has been converted, and modify $string in-place
 802 to remove the converted part leaving it starting with the problem fragment.
 803
 804 If check is false then C<encode> should make a "best effort" to convert
 805 the string - for example by using a replacement character.
 806
 807 =item -E<gt>decode($octets,$check)
 808
 809 Should return the string that I<$octets> represents. If I<$check> is true
 810 it should modify I<$octets> in place to remove the converted part (i.e.
 811 the whole sequence unless there is an error).
 812 If an error occurs it should return the fragment of string
 813 that has been converted, and modify $octets in-place to remove the converted part
 814 leaving it starting with the problem fragment.
 815
 816 If check is false then C<decode> should make a "best effort" to convert
 817 the string - for example by using Unicode's "\x{FFFD}" as a replacement character.
 818
 819 =back
 820
 821 It should be noted that the check behaviour is different from the outer
 822 public API. The logic is that the "unchecked" case is useful when
 823 encoding is part of a stream which may be reporting errors (e.g. STDERR).
 824 In such cases it is desirable to get everything through somehow without
 825 causing additional errors which obscure the original one. Also the encoding
 826 is best placed to know what the correct replacement character is, so if that
 827 is the desired behaviour then letting low level code do it is the most efficient.
 828
 829 In contrast if check is true, the scheme above allows the encoding to do as
 830 much as it can and tell layer above how much that was. What is lacking
 831 at present is a mechanism to report what went wrong. The most likely interface
 832 will be an additional method call to the object, or perhaps
 833 (to avoid forcing per-stream objects on otherwise stateless encodings)
 834 an additional parameter.
 835
 836 It is also highly desirable that encoding classes inherit from C<Encode::Encoding>
 837 as a base class. This allows that class to define additional behaviour for
 838 all encoding objects. For example built in Unicode, UCS-2 and UTF-8 classes
 839 use :
 840
 841   package Encode::MyEncoding;
 842   use base qw(Encode::Encoding);
 843
 844   __PACKAGE__->Define(qw(myCanonical myAlias));
 845
 846 To create an object with bless {Name => ...},$class, and call define_encoding.
 847 They inherit their C<name> method from C<Encode::Encoding>.
 848
 849 =head2 Compiled Encodings
 850
 851 F<Encode.xs> provides a class C<Encode::XS> which provides the interface described
 852 above. It calls a generic octet-sequence to octet-sequence "engine" that is
 853 driven by tables (defined in F<encengine.c>). The same engine is used for both
 854 encode and decode. C<Encode::XS>'s C<encode> forces perl's characters to their UTF-8 form
 855 and then treats them as just another multibyte encoding. C<Encode::XS>'s C<decode> transforms
 856 the sequence and then turns the UTF-8-ness flag on as that is the form that the tables
 857 are defined to produce. For details of the engine see the comments in F<encengine.c>.
 858
 859 The tables are produced by the perl script F<compile> (the name needs to change so
 860 we can eventually install it somewhere). F<compile> can currently read two formats:
 861
 862 =over 4
 863
 864 =item *.enc
 865
 866 This is a coined format used by Tcl. It is documented in Encode/EncodeFormat.pod.
 867
 868 =item *.ucm
 869
 870 This is the semi-standard format used by IBM's ICU package.
 871
 872 =back
 873
 874 F<compile> can write the following forms:
 875
 876 =over 4
 877
 878 =item *.ucm
 879
 880 See above - the F<Encode/*.ucm> files provided with the distribution have
 881 been created from the original Tcl .enc files using this approach.
 882
 883 =item *.c
 884
 885 Produces tables as C data structures - this is used to build in encodings
 886 into F<Encode.so>/F<Encode.dll>.
 887
 888 =item *.xs
 889
 890 In theory this allows encodings to be stand-alone loadable perl extensions.
 891 The process has not yet been tested. The plan is to use this approach
 892 for large East Asian encodings.
 893
 894 =back
 895
 896 The set of encodings built-in to F<Encode.so>/F<Encode.dll> is determined by
 897 F<Makefile.PL>. The current set is as follows:
 898
 899 =over 4
 900
 901 =item ascii and iso-8859-*
 902
 903 That is all the common 8-bit "western" encodings.
 904
 905 =item IBM-1047 and two other variants of EBCDIC.
 906
 907 These are the same variants that are supported by EBCDIC perl as "native" encodings.
 908 They are included to prove "reversibility" of some constructs in EBCDIC perl.
 909
 910 =item symbol and dingbats as used by Tk on X11.
 911
 912 (The reason Encode got started was to support perl/Tk.)
 913
 914 =back
 915
 916 That set is rather ad hoc and has been driven by the needs of the tests rather
 917 than the needs of typical applications. It is likely to be rationalized.
 918
 919 =head1 SEE ALSO
 920
 921 L<perlunicode>, L<perlebcdic>, L<perlfunc/open>
 922
 923 =cut
 924
 925
 926