8 our @ISA = qw(Exporter DynaLoader);
10 # Public, encouraged API is exported by default
36 # Documentation moved after __END__ for speed - NI-S
40 # Make a %encoding package variable to allow a certain amount of cheating
42 my @alias; # ordered matching list
43 my %alias; # cached known aliases
45 # 0 1 2 3 4 5 6 7 8 9 10
46 our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
65 sort { $a->[1] cmp $b->[1] }
67 grep { $_ ne 'Internal' }
75 # print "# findAlias $_\n";
76 unless (exists $alias{$_})
78 for (my $i=0; $i < @alias; $i += 2)
80 my $alias = $alias[$i];
81 my $val = $alias[$i+1];
83 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
87 elsif (ref($alias) eq 'CODE')
89 $new = &{$alias}($val)
91 elsif (lc($_) eq lc($alias))
97 next if $new eq $_; # avoid (direct) recursion on bugs
98 my $enc = (ref($new)) ? $new : find_encoding($new);
114 my ($alias,$name) = splice(@_,0,2);
115 push(@alias, $alias => $name);
119 # Allow variants of iso-8859-1 etc.
120 define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
122 # At least HP-UX has these.
123 define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
126 define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
128 # The Official name of ASCII.
129 define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
131 # This is a font issue, not an encoding issue.
132 # (The currency symbol of the Latin 1 upper half
133 # has been redefined as the euro symbol.)
134 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
136 # Allow latin-1 style names as well
137 define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
139 # Allow winlatin1 style names as well
140 define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|baltic|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );
142 # Common names for non-latin preferred MIME names
143 define_alias( 'ascii' => 'US-ascii',
144 'cyrillic' => 'iso-8859-5',
145 'arabic' => 'iso-8859-6',
146 'greek' => 'iso-8859-7',
147 'hebrew' => 'iso-8859-8',
148 'thai' => 'iso-8859-11',
149 'tis620' => 'iso-8859-11',
152 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
153 define_alias( qr/^ibm[-_]?(\d\d\d\d?)$/i => '"cp$1"');
155 # Standardize on the dashed versions.
156 define_alias( qr/^utf8$/i => 'utf-8' );
157 define_alias( qr/^koi8r$/i => 'koi8-r' );
158 define_alias( qr/^koi8u$/i => 'koi8-u' );
160 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
161 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
162 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
163 # TODO: Chinese encodings GB18030 GBK Big5-HSKCS EUC-TW
164 # TODO: Armenian encoding ARMSCII-8
165 # TODO: Hebrew encoding ISO-8859-8-1
166 # TODO: Thai encoding TCVN
167 # TODO: Korean encoding Johab
168 # TODO: Vietnamese encodings VPS
169 # TODO: Japanese encoding JIS (not the same as SJIS)
170 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
171 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
172 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
173 # Kannada Khmer Korean Laotian Malayalam Mongolian
174 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
175 # TODO: what is the Japanese 'UJIS' encoding seen in some Linuxes?
176 # Answer: euc-jp <dankogai@dan.co.jp>
177 # Map white space and _ to '-'
179 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
185 $encoding{$name} = $obj;
187 define_alias($lc => $obj) unless $lc eq $name;
191 define_alias($alias,$obj);
198 my ($class,$name) = @_;
200 if (ref($name) && $name->can('new_sequence'))
205 if (exists $encoding{$name})
207 return $encoding{$name};
209 if (exists $encoding{$lc})
211 return $encoding{$lc};
214 my $oc = $class->findAlias($name);
215 return $oc if defined $oc;
216 return $class->findAlias($lc) if $lc ne $name;
224 return __PACKAGE__->getEncoding($name);
229 my ($name,$string,$check) = @_;
230 my $enc = find_encoding($name);
231 croak("Unknown encoding '$name'") unless defined $enc;
232 my $octets = $enc->encode($string,$check);
233 return undef if ($check && length($string));
239 my ($name,$octets,$check) = @_;
240 my $enc = find_encoding($name);
241 croak("Unknown encoding '$name'") unless defined $enc;
242 my $string = $enc->decode($octets,$check);
243 $_[1] = $octets if $check;
249 my ($string,$from,$to,$check) = @_;
250 my $f = find_encoding($from);
251 croak("Unknown encoding '$from'") unless defined $f;
252 my $t = find_encoding($to);
253 croak("Unknown encoding '$to'") unless defined $t;
254 my $uni = $f->decode($string,$check);
255 return undef if ($check && length($string));
256 $string = $t->encode($uni,$check);
257 return undef if ($check && length($uni));
258 return length($_[0] = $string);
271 return undef unless utf8::decode($str);
275 require Encode::Encoding;
277 require Encode::Internal;
278 require Encode::Unicode;
279 require Encode::utf8;
280 require Encode::iso10646_1;
281 require Encode::ucs2_le;
289 Encode - character encodings
297 The C<Encode> module provides the interfaces between Perl's strings
298 and the rest of the system. Perl strings are sequences of B<characters>.
300 The repertoire of characters that Perl can represent is at least that
301 defined by the Unicode Consortium. On most platforms the ordinal
302 value of a character (as returned by C<ord(ch)>) is the "Unicode
303 codepoint" for the character (the exceptions are those platforms where
304 the legacy encoding is some variant of EBCDIC rather than a super-set
305 of ASCII - see L<perlebcdic>).
307 Traditionally computer data has been moved around in 8-bit chunks
308 often called "bytes". These chunks are also known as "octets" in
309 networking standards. Perl is widely used to manipulate data of
310 many types - not only strings of characters representing human or
311 computer languages but also "binary" data being the machine's representation
312 of numbers, pixels in an image - or just about anything.
314 When Perl is processing "binary data" the programmer wants Perl to process
315 "sequences of bytes". This is not a problem for Perl - as a byte has 256
316 possible values it easily fits in Perl's much larger "logical character".
324 I<character>: a character in the range 0..(2**32-1) (or more).
325 (What Perl's strings are made of.)
329 I<byte>: a character in the range 0..255
330 (A special case of a Perl character.)
334 I<octet>: 8 bits of data, with ordinal values 0..255
335 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
339 The marker [INTERNAL] marks Internal Implementation Details, in
340 general meant only for those who think they know what they are doing,
341 and such details may change in future releases.
345 =head2 Characteristics of an Encoding
347 An encoding has a "repertoire" of characters that it can represent,
348 and for each representable character there is at least one sequence of
349 octets that represents it.
351 =head2 Types of Encodings
353 Encodings can be divided into the following types:
357 =item * Fixed length 8-bit (or less) encodings.
359 Each character is a single octet so may have a repertoire of up to
360 256 characters. ASCII and iso-8859-* are typical examples.
362 =item * Fixed length 16-bit encodings
364 Each character is two octets so may have a repertoire of up to
365 65 536 characters. Unicode's UCS-2 is an example. Also used for
366 encodings for East Asian languages.
368 =item * Fixed length 32-bit encodings.
370 Not really very "encoded" encodings. The Unicode code points
371 are just represented as 4-octet integers. None the less because
372 different architectures use different representations of integers
373 (so called "endian") there are at least two distinct encodings.
375 =item * Multi-byte encodings
377 The number of octets needed to represent a character varies.
378 UTF-8 is a particularly complex but regular case of a multi-byte
379 encoding. Several East Asian countries use a multi-byte encoding
380 where 1-octet is used to cover western roman characters and Asian
381 characters get 2-octets.
382 (UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
383 to represent a Unicode code point.)
385 =item * "Escape" encodings.
387 These encodings embed "escape sequences" into the octet sequence
388 which describe how the following octets are to be interpreted.
389 The iso-2022-* family is typical. Following the escape sequence
390 octets are encoded by an "embedded" encoding (which will be one
391 of the above types) until another escape sequence switches to
392 a different "embedded" encoding.
394 These schemes are very flexible and can handle mixed languages but are
395 very complex to process (and have state). No escape encodings are
396 implemented for Perl yet.
400 =head2 Specifying Encodings
402 Encodings can be specified to the API described below in two ways:
408 Encoding names are strings with characters taken from a restricted
409 repertoire. See L</"Encoding Names">.
411 =item 2. As an object
413 Encoding objects are returned by C<find_encoding($name)>.
417 =head2 Encoding Names
419 Encoding names are case insensitive. White space in names is ignored.
420 In addition an encoding may have aliases. Each encoding has one
421 "canonical" name. The "canonical" name is chosen from the names of
422 the encoding by picking the first in the following sequence:
426 =item * The MIME name as defined in IETF RFCs.
428 =item * The name in the IANA registry.
430 =item * The name used by the organization that defined it.
434 Because of all the alias issues, and because in the general case
435 encodings have state C<Encode> uses the encoding object internally
436 once an operation is in progress.
438 As of Perl 5.8.0, at least the following encodings are recognized
439 (the => marks aliases):
453 The ISO 8859 and KOI:
455 ISO 8859-1 ISO 8859-6 ISO 8859-11 KOI8-F
456 ISO 8859-2 ISO 8859-7 (12 doesn't exist) KOI8-R
457 ISO 8859-3 ISO 8859-8 ISO 8859-13 KOI8-U
458 ISO 8859-4 ISO 8859-9 ISO 8859-14
459 ISO 8859-5 ISO 8859-10 ISO 8859-15
462 Latin1 => 8859-1 Latin6 => 8859-10
463 Latin2 => 8859-2 Latin7 => 8859-13
464 Latin3 => 8859-3 Latin8 => 8859-14
465 Latin4 => 8859-4 Latin9 => 8859-15
466 Latin5 => 8859-9 Latin10 => 8859-16
475 The CJKV: Chinese, Japanese, Korean, Vietnamese:
477 ISO 2022 ISO 2022 JP-1 JIS 0201 GB 1988 Big5 EUC-CN
478 ISO 2022 CN ISO 2022 JP-2 JIS 0208 GB 2312 HZ EUC-JP
479 ISO 2022 JP ISO 2022 KR JIS 0210 GB 12345 CNS 11643 EUC-JP-0212
485 CP37 CP852 CP861 CP866 CP949 CP1251 CP1256
486 CP424 CP855 CP862 CP869 CP950 CP1252 CP1257
487 CP737 CP856 CP863 CP874 CP1006 CP1253 CP1258
488 CP775 CP857 CP864 CP932 CP1047 CP1254
489 CP850 CP860 CP865 CP936 CP1250 CP1255
493 WinCyrillic => CP1251
495 WinTurkish => CP1254
499 WinVietnamese => CP1258
501 (All the CPI<NNN...> are available also as IBMI<NNN...>.)
505 MacCentralEuropean MacJapanese
507 MacCyrillic MacRumanian
510 MacIcelandic MacTurkish
521 =head1 PERL ENCODING API
523 =head2 Generic Encoding Interface
529 $bytes = encode(ENCODING, $string[, CHECK])
531 Encodes string from Perl's internal form into I<ENCODING> and returns
532 a sequence of octets. For CHECK see L</"Handling Malformed Data">.
534 For example to convert (internally UTF-8 encoded) Unicode data
537 $octets = encode("utf8", $unicode);
541 $string = decode(ENCODING, $bytes[, CHECK])
543 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
544 internal form and returns the resulting string. For CHECK see
545 L</"Handling Malformed Data">.
547 For example to convert ISO 8859-1 data to UTF-8:
549 $utf8 = decode("latin1", $latin1);
553 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
555 Convert B<in-place> the data between two encodings. How did the data
556 in $string originally get to be in FROM_ENCODING? Either using
557 encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
558 see L</"Handling Malformed Data">.
560 For example to convert ISO 8859-1 data to UTF-8:
562 from_to($data, "iso-8859-1", "utf-8");
564 and to convert it back:
566 from_to($data, "utf-8", "iso-8859-1");
568 Note that because the conversion happens in place, the data to be
569 converted cannot be a string constant, it must be a scalar variable.
573 =head2 Handling Malformed Data
575 If CHECK is not set, C<undef> is returned. If the data is supposed to
576 be UTF-8, an optional lexical warning (category utf8) is given. If
577 CHECK is true but not a code reference, dies.
579 It would be desirable to have a way to indicate that transform should use
580 the encoding's "replacement character" - no such mechanism is defined yet.
582 It is also planned to allow I<CHECK> to be a code reference.
584 This is not yet implemented as there are design issues with what its
585 arguments should be and how it returns its results.
591 Passed remaining fragment of string being processed.
592 Modifies it in place to remove bytes/characters it can understand
593 and returns a string used to represent them.
597 my $ch = substr($_[0],0,1,'');
598 return sprintf("\x{%02X}",ord($ch));
601 This scheme is close to how underlying C code for Encode works, but gives
602 the fixup routine very little context.
606 Passed original string, and an index into it of the problem area, and
607 output string so far. Appends what it will to output string and
608 returns new index into original string. For example:
611 # my ($s,$i,$d) = @_;
612 my $ch = substr($_[0],$_[1],1);
613 $_[2] .= sprintf("\x{%02X}",ord($ch));
617 This scheme gives maximal control to the fixup routine but is more
618 complicated to code, and may need internals of Encode to be tweaked to
619 keep original string intact.
625 Multiple return values rather than in-place modifications.
627 Index into the string could be pos($str) allowing s/\G...//.
633 The Unicode consortium defines the UTF-8 standard as a way of encoding
634 the entire Unicode repertoire as sequences of octets. This encoding is
635 expected to become very widespread. Perl can use this form internally
636 to represent strings, so conversions to and from this form are
637 particularly efficient (as octets in memory do not have to change,
638 just the meta-data that tells Perl how to treat them).
644 $bytes = encode_utf8($string);
646 The characters that comprise string are encoded in Perl's superset of UTF-8
647 and the resulting octets returned as a sequence of bytes. All possible
648 characters have a UTF-8 representation so this function cannot fail.
652 $string = decode_utf8($bytes [,CHECK]);
654 The sequence of octets represented by $bytes is decoded from UTF-8
655 into a sequence of logical characters. Not all sequences of octets
656 form valid UTF-8 encodings, so it is possible for this call to fail.
657 For CHECK see L</"Handling Malformed Data">.
661 =head2 Other Encodings of Unicode
663 UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. UCS-2 can only
664 represent 0..0xFFFF, while UTF-16 has a I<surrogate pair> scheme which
665 allows it to cover the whole Unicode range.
667 Surrogates are code points set aside to encode the 0x10000..0x10FFFF
668 range of Unicode code points in pairs of 16-bit units. The I<high
669 surrogates> are the range 0xD800..0xDBFF, and the I<low surrogates>
670 are the range 0xDC00..0xDFFF. The surrogate encoding is
672 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
673 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
677 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
679 Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
680 happens to be the name used by that representation when used with X11
683 UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
684 can be considered as being in this form without encoding. An encoding
685 to transfer strings in this form (e.g. to write them to a file) would
688 pack('L*', unpack('U*', $string)); # native
690 pack('V*', unpack('U*', $string)); # little-endian
692 pack('N*', unpack('U*', $string)); # big-endian
694 depending on the endianness required.
696 No UTF-32 encodings are implemented yet.
698 Both UCS-2 and UCS-4 style encodings can have "byte order marks" by
699 representing the code point 0xFEFF as the very first thing in a file.
701 =head2 Listing available encodings
703 use Encode qw(encodings);
706 Returns a list of the canonical names of the available encodings.
708 =head2 Defining Aliases
710 use Encode qw(define_alias);
711 define_alias( newName => ENCODING);
713 Allows newName to be used as an alias for ENCODING. ENCODING may be
714 either the name of an encoding or an encoding object (as above).
716 Currently I<newName> can be specified in the following ways:
720 =item As a simple string.
722 =item As a qr// compiled regular expression, e.g.:
724 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
726 In this case if I<ENCODING> is not a reference it is C<eval>-ed to
727 allow C<$1> etc. to be substituted. The example is one way to alias names as
728 used in X11 fonts to the MIME names for the iso-8859-*
731 =item As a code reference, e.g.:
733 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
735 In this case C<$_> will be set to the name that is being looked up and
736 I<ENCODING> is passed to the sub as its first argument. The example
737 is another way to alias names as used in X11 fonts to the MIME
738 names for the iso-8859-* family.
742 =head2 Defining Encodings
744 use Encode qw(define_encoding);
745 define_encoding( $object, 'canonicalName' [,alias...]);
747 Causes I<canonicalName> to be associated with I<$object>. The object
748 should provide the interface described in L</"IMPLEMENTATION CLASSES">
749 below. If more than two arguments are provided then additional
750 arguments are taken as aliases for I<$object> as for C<define_alias>.
752 =head1 Encoding and IO
754 It is very common to want to do encoding transformations when
755 reading or writing files, network connections, pipes etc.
756 If Perl is configured to use the new 'perlio' IO system then
757 C<Encode> provides a "layer" (See L<perliol>) which can transform
758 data as it is read or written.
760 Here is how the blind poet would modernise the encoding:
763 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
764 open(my $utf8,'>:utf8','iliad.utf8');
770 In addition the new IO system can also be configured to read/write
771 UTF-8 encoded characters (as noted above this is efficient):
773 open(my $fh,'>:utf8','anything');
774 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
776 Either of the above forms of "layer" specifications can be made the default
777 for a lexical scope with the C<use open ...> pragma. See L<open>.
779 Once a handle is open its layers can be altered using C<binmode>.
781 Without any such configuration, or if Perl itself is built using
782 system's own IO, then write operations assume that file handle accepts
783 only I<bytes> and will C<die> if a character larger than 255 is
784 written to the handle. When reading, each octet from the handle
785 becomes a byte-in-a-character. Note that this default is the same
786 behaviour as bytes-only languages (including Perl before v5.6) would
787 have, and is sufficient to handle native 8-bit encodings
788 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
789 other encodings and binary data.
791 In other cases it is the program's responsibility to transform
792 characters into bytes using the API above before doing writes, and to
793 transform the bytes read from a handle into characters before doing
794 "character operations" (e.g. C<lc>, C</\W+/>, ...).
796 You can also use PerlIO to convert larger amounts of data you don't
797 want to bring into memory. For example to convert between ISO 8859-1
798 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
800 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
801 open(G, ">:utf8", "data.utf") or die $!;
802 while (<F>) { print G }
804 # Could also do "print G <F>" but that would pull
805 # the whole file into memory just to write it out again.
809 open(my $f, "<:encoding(cp1252)")
810 open(my $g, ">:encoding(iso-8859-2)")
811 open(my $h, ">:encoding(latin9)") # iso-8859-15
813 See L<PerlIO> for more information.
815 See also L<encoding> for how to change the default encoding of the
818 =head1 Encoding How to ...
824 =item * IO with mixed content (faking iso-2022-*)
826 =item * MIME's Content-Length:
828 =item * UTF-8 strings in binary data.
830 =item * Perl/Encode wrappers on non-Unicode XS modules.
834 =head1 Messing with Perl's Internals
836 The following API uses parts of Perl's internals in the current
837 implementation. As such they are efficient, but may change.
841 =item * is_utf8(STRING [, CHECK])
843 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
844 If CHECK is true, also checks the data in STRING for being well-formed
845 UTF-8. Returns true if successful, false otherwise.
847 =item * valid_utf8(STRING)
849 [INTERNAL] Test whether STRING is in a consistent state. Will return
850 true if string is held as bytes, or is well-formed UTF-8 and has the
851 UTF-8 flag on. Main reason for this routine is to allow Perl's
852 testsuite to check that operations have left strings in a consistent
859 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
860 B<not> checked for being well-formed UTF-8. Do not use unless you
861 B<know> that the STRING is well-formed UTF-8. Returns the previous
862 state of the UTF-8 flag (so please don't test the return value as
863 I<not> success or failure), or C<undef> if STRING is not a string.
869 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
870 Returns the previous state of the UTF-8 flag (so please don't test the
871 return value as I<not> success or failure), or C<undef> if STRING is
876 =head1 IMPLEMENTATION CLASSES
878 As mentioned above encodings are (in the current implementation at least)
879 defined by objects. The mapping of encoding name to object is via the
882 The values of the hash can currently be either strings or objects.
883 The string form may go away in the future. The string form occurs
884 when C<encodings()> has scanned C<@INC> for loadable encodings but has
885 not actually loaded the encoding in question. This is because the
886 current "loading" process is all Perl and a bit slow.
888 Once an encoding is loaded then the value of the hash is the object which
889 implements the encoding. The object should provide the following
896 Should return the string representing the canonical name of the encoding.
898 =item -E<gt>new_sequence
900 This is a placeholder for encodings with state. It should return an
901 object which implements this interface, all current implementations
902 return the original object.
904 =item -E<gt>encode($string,$check)
906 Should return the octet sequence representing I<$string>. If I<$check>
907 is true it should modify I<$string> in place to remove the converted
908 part (i.e. the whole string unless there is an error). If an error
909 occurs it should return the octet sequence for the fragment of string
910 that has been converted, and modify $string in-place to remove the
911 converted part leaving it starting with the problem fragment.
913 If check is false then C<encode> should make a "best effort" to
914 convert the string - for example by using a replacement character.
916 =item -E<gt>decode($octets,$check)
918 Should return the string that I<$octets> represents. If I<$check> is
919 true it should modify I<$octets> in place to remove the converted part
920 (i.e. the whole sequence unless there is an error). If an error
921 occurs it should return the fragment of string that has been
922 converted, and modify $octets in-place to remove the converted part
923 leaving it starting with the problem fragment.
925 If check is false then C<decode> should make a "best effort" to
926 convert the string - for example by using Unicode's "\x{FFFD}" as a
927 replacement character.
931 It should be noted that the check behaviour is different from the
932 outer public API. The logic is that the "unchecked" case is useful
933 when encoding is part of a stream which may be reporting errors
934 (e.g. STDERR). In such cases it is desirable to get everything
935 through somehow without causing additional errors which obscure the
936 original one. Also the encoding is best placed to know what the
937 correct replacement character is, so if that is the desired behaviour
938 then letting low level code do it is the most efficient.
940 In contrast if check is true, the scheme above allows the encoding to
941 do as much as it can and tell layer above how much that was. What is
942 lacking at present is a mechanism to report what went wrong. The most
943 likely interface will be an additional method call to the object, or
944 perhaps (to avoid forcing per-stream objects on otherwise stateless
945 encodings) an additional parameter.
947 It is also highly desirable that encoding classes inherit from
948 C<Encode::Encoding> as a base class. This allows that class to define
949 additional behaviour for all encoding objects. For example built in
950 Unicode, UCS-2 and UTF-8 classes use :
952 package Encode::MyEncoding;
953 use base qw(Encode::Encoding);
955 __PACKAGE__->Define(qw(myCanonical myAlias));
957 To create an object with bless {Name => ...},$class, and call
958 define_encoding. They inherit their C<name> method from
961 =head2 Compiled Encodings
963 F<Encode.xs> provides a class C<Encode::XS> which provides the
964 interface described above. It calls a generic octet-sequence to
965 octet-sequence "engine" that is driven by tables (defined in
966 F<encengine.c>). The same engine is used for both encode and
967 decode. C<Encode::XS>'s C<encode> forces Perl's characters to their
968 UTF-8 form and then treats them as just another multibyte
969 encoding. C<Encode::XS>'s C<decode> transforms the sequence and then
970 turns the UTF-8-ness flag on as that is the form that the tables are
971 defined to produce. For details of the engine see the comments in
974 The tables are produced by the Perl script F<compile> (the name needs
975 to change so we can eventually install it somewhere). F<compile> can
976 currently read two formats:
982 This is a coined format used by Tcl. It is documented in
983 Encode/EncodeFormat.pod.
987 This is the semi-standard format used by IBM's ICU package.
991 F<compile> can write the following forms:
997 See above - the F<Encode/*.ucm> files provided with the distribution have
998 been created from the original Tcl .enc files using this approach.
1002 Produces tables as C data structures - this is used to build in encodings
1003 into F<Encode.so>/F<Encode.dll>.
1007 In theory this allows encodings to be stand-alone loadable Perl
1008 extensions. The process has not yet been tested. The plan is to use
1009 this approach for large East Asian encodings.
1013 The set of encodings built-in to F<Encode.so>/F<Encode.dll> is
1014 determined by F<Makefile.PL>. The current set is as follows:
1018 =item ascii and iso-8859-*
1020 That is all the common 8-bit "western" encodings.
1022 =item IBM-1047 and two other variants of EBCDIC.
1024 These are the same variants that are supported by EBCDIC Perl as
1025 "native" encodings. They are included to prove "reversibility" of
1026 some constructs in EBCDIC Perl.
1028 =item symbol and dingbats as used by Tk on X11.
1030 (The reason Encode got started was to support Perl/Tk.)
1034 That set is rather ad hoc and has been driven by the needs of the
1035 tests rather than the needs of typical applications. It is likely
1040 L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>