3 our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9 our @ISA = qw(Exporter DynaLoader);
11 # Public, encouraged API is exported by default
36 # Documentation moved after __END__ for speed - NI-S
40 our $ON_EBCDIC = (ord("A") == 193);
44 # Make a %Encoding package variable to allow a certain amount of cheating
49 37 424 437 500 737 775 850 852 855
50 856 857 860 861 862 863 864 865 866
51 869 874 875 932 936 949 950 1006 1026
52 1047 1250 1251 1252 1253 1254 1255 1256 1257
57 CentralEurRoman Croatian Cyrillic Greek
58 Iceland Roman Rumanian Sami
59 Thai Turkish Ukrainian
62 for my $k (2..11,13..16){
63 $ExtModule{"iso-8859-$k"} = 'Encode::Byte';
66 for my $k (@codepages){
67 $ExtModule{"cp$k"} = 'Encode::Byte';
70 for my $k (@macintosh)
72 $ExtModule{"mac$k"} = 'Encode::Byte';
75 for my $k (qw(UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
76 UTF-32 UTF-32BE UTF-32LE)){
77 $ExtModule{$k} = 'Encode::Unicode';
82 'koi8-r' => 'Encode::Byte',
83 'posix-bc' => 'Encode::EBCDIC',
84 cp37 => 'Encode::EBCDIC',
85 cp1026 => 'Encode::EBCDIC',
86 cp1047 => 'Encode::EBCDIC',
87 cp500 => 'Encode::EBCDIC',
88 cp875 => 'Encode::EBCDIC',
89 dingbats => 'Encode::Symbol',
90 macDingbats => 'Encode::Symbol',
91 macSymbol => 'Encode::Symbol',
92 symbol => 'Encode::Symbol',
93 viscii => 'Encode::Byte',
96 unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
100 'cp936' => 'Encode::CN',
101 'euc-cn' => 'Encode::CN',
102 'gb12345-raw' => 'Encode::CN',
103 'gb2312-raw' => 'Encode::CN',
104 'gbk' => 'Encode::CN',
105 'iso-ir-165' => 'Encode::CN',
107 '7bit-jis' => 'Encode::JP',
108 'cp932' => 'Encode::JP',
109 'euc-jp' => 'Encode::JP',
110 'iso-2022-jp' => 'Encode::JP',
111 'iso-2022-jp-1' => 'Encode::JP',
112 'jis0201-raw' => 'Encode::JP',
113 'jis0208-raw' => 'Encode::JP',
114 'jis0212-raw' => 'Encode::JP',
115 'macJapanese' => 'Encode::JP',
116 'shiftjis' => 'Encode::JP',
118 'cp949' => 'Encode::KR',
119 'euc-kr' => 'Encode::KR',
120 'ksc5601' => 'Encode::KR',
121 'macKorean' => 'Encode::KR',
123 'big5' => 'Encode::TW',
124 'big5-hkscs' => 'Encode::TW',
125 'cp950' => 'Encode::TW',
127 'big5plus' => 'Encode::HanExtra',
128 'euc-tw' => 'Encode::HanExtra',
129 'gb18030' => 'Encode::HanExtra',
136 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
137 for my $mod (@modules){
138 $mod =~ s,::,/,g or $mod = "Encode/$mod";
140 $DEBUG and warn "about to require $mod;";
141 eval { require $mod; };
143 my %modules = map {$_ => 1} @modules;
145 sort grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
152 $Encoding{$name} = $obj;
154 define_alias($lc => $obj) unless $lc eq $name;
158 define_alias($alias,$obj);
165 my ($class,$name,$skip_external) = @_;
167 if (ref($name) && $name->can('new_sequence'))
172 if (exists $Encoding{$name})
174 return $Encoding{$name};
176 if (exists $Encoding{$lc})
178 return $Encoding{$lc};
181 my $oc = $class->find_alias($name);
182 return $oc if defined $oc;
184 $oc = $class->find_alias($lc) if $lc ne $name;
185 return $oc if defined $oc;
187 unless ($skip_external)
189 if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
190 $mod =~ s,::,/,g ; $mod .= '.pm';
191 eval{ require $mod; };
192 return $Encoding{$name} if exists $Encoding{$name};
200 my ($name,$skip_external) = @_;
201 return __PACKAGE__->getEncoding($name,$skip_external);
206 my ($name,$string,$check) = @_;
207 my $enc = find_encoding($name);
208 croak("Unknown encoding '$name'") unless defined $enc;
209 my $octets = $enc->encode($string,$check);
210 return undef if ($check && length($string));
216 my ($name,$octets,$check) = @_;
217 my $enc = find_encoding($name);
218 croak("Unknown encoding '$name'") unless defined $enc;
219 my $string = $enc->decode($octets,$check);
220 $_[1] = $octets if $check;
226 my ($string,$from,$to,$check) = @_;
227 my $f = find_encoding($from);
228 croak("Unknown encoding '$from'") unless defined $f;
229 my $t = find_encoding($to);
230 croak("Unknown encoding '$to'") unless defined $t;
231 my $uni = $f->decode($string,$check);
232 return undef if ($check && length($string));
233 $string = $t->encode($uni,$check);
234 return undef if ($check && length($uni));
235 return defined($_[0] = $string) ? length($string) : undef ;
248 return undef unless utf8::decode($str);
252 predefine_encodings();
255 # This is to restore %Encoding if really needed;
257 sub predefine_encodings{
259 # was in Encode::UTF_EBCDIC
260 package Encode::UTF_EBCDIC;
261 *name = sub{ shift->{'Name'} };
262 *new_sequence = sub{ return $_[0] };
264 my ($obj,$str,$chk) = @_;
266 for (my $i = 0; $i < length($str); $i++) {
268 chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
274 my ($obj,$str,$chk) = @_;
276 for (my $i = 0; $i < length($str); $i++) {
278 chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
283 $Encode::Encoding{Internal} =
284 bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
286 # was in Encode::UTF_EBCDIC
287 package Encode::Internal;
288 *name = sub{ shift->{'Name'} };
289 *new_sequence = sub{ return $_[0] };
291 my ($obj,$str,$chk) = @_;
297 $Encode::Encoding{Unicode} =
298 bless {Name => "Internal"} => "Encode::Internal";
302 # was in Encode::utf8
303 package Encode::utf8;
304 *name = sub{ shift->{'Name'} };
305 *new_sequence = sub{ return $_[0] };
307 my ($obj,$octets,$chk) = @_;
308 my $str = Encode::decode_utf8($octets);
316 my ($obj,$string,$chk) = @_;
317 my $octets = Encode::encode_utf8($string);
321 $Encode::Encoding{utf8} =
322 bless {Name => "utf8"} => "Encode::utf8";
324 # do externals if necessary
325 require File::Basename;
329 File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}),
330 "Encode", "$ext.pm");
335 require Encode::Encoding;
344 Encode - character encodings
351 =head2 Table of Contents
353 Encode consists of a collection of modules which details are too big
354 to fit in one document. This POD itself explains the top-level APIs
355 and general topics at a glance. For other topics and more details,
359 --------------------------------------------------------
360 Encode::Alias Alias defintions to encodings
361 Encode::Encoding Encode Implementation Base Class
362 Encode::Supported List of Supported Encodings
363 Encode::CN Simplified Chinese Encodings
364 Encode::JP Japanese Encodings
365 Encode::KR Korean Encodings
366 Encode::TW Traditional Chinese Encodings
367 --------------------------------------------------------
371 The C<Encode> module provides the interfaces between Perl's strings
372 and the rest of the system. Perl strings are sequences of
375 The repertoire of characters that Perl can represent is at least that
376 defined by the Unicode Consortium. On most platforms the ordinal
377 values of the characters (as returned by C<ord(ch)>) is the "Unicode
378 codepoint" for the character (the exceptions are those platforms where
379 the legacy encoding is some variant of EBCDIC rather than a super-set
380 of ASCII - see L<perlebcdic>).
382 Traditionally computer data has been moved around in 8-bit chunks
383 often called "bytes". These chunks are also known as "octets" in
384 networking standards. Perl is widely used to manipulate data of many
385 types - not only strings of characters representing human or computer
386 languages but also "binary" data being the machines representation of
387 numbers, pixels in an image - or just about anything.
389 When Perl is processing "binary data" the programmer wants Perl to
390 process "sequences of bytes". This is not a problem for Perl - as a
391 byte has 256 possible values it easily fits in Perl's much larger
400 I<character>: a character in the range 0..(2**32-1) (or more).
401 (What Perl's strings are made of.)
405 I<byte>: a character in the range 0..255
406 (A special case of a Perl character.)
410 I<octet>: 8 bits of data, with ordinal values 0..255
411 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
415 The marker [INTERNAL] marks Internal Implementation Details, in
416 general meant only for those who think they know what they are doing,
417 and such details may change in future releases.
419 =head1 PERL ENCODING API
423 =item $octets = encode(ENCODING, $string[, CHECK])
425 Encodes string from Perl's internal form into I<ENCODING> and returns
426 a sequence of octets. ENCODING can be either a canonical name or
427 alias. For encoding names and aliases, see L</"Defining Aliases">.
428 For CHECK see L</"Handling Malformed Data">.
430 For example to convert (internally UTF-8 encoded) Unicode string to
431 iso-8859-1 (also known as Latin1),
433 $octets = encode("iso-8859-1", $unicode);
435 =item $string = decode(ENCODING, $octets[, CHECK])
437 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
438 internal form and returns the resulting string. as in encode(),
439 ENCODING can be either a canonical name or alias. For encoding names
440 and aliases, see L</"Defining Aliases">. For CHECK see
441 L</"Handling Malformed Data">.
443 For example to convert ISO-8859-1 data to UTF-8:
445 $utf8 = decode("iso-8859-1", $latin1);
447 =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
449 Convert B<in-place> the data between two encodings. How did the data
450 in $string originally get to be in FROM_ENCODING? Either using
451 encode() or through PerlIO: See L</"Encoding and IO">.
452 For encoding names and aliases, see L</"Defining Aliases">.
453 For CHECK see L</"Handling Malformed Data">.
455 For example to convert ISO-8859-1 data to UTF-8:
457 from_to($data, "iso-8859-1", "utf-8");
459 and to convert it back:
461 from_to($data, "utf-8", "iso-8859-1");
463 Note that because the conversion happens in place, the data to be
464 converted cannot be a string constant, it must be a scalar variable.
466 from_to() return the length of the converted string on success, undef
473 The Unicode consortium defines the UTF-8 standard as a way of encoding
474 the entire Unicode repertoire as sequences of octets. This encoding is
475 expected to become very widespread. Perl can use this form internally
476 to represent strings, so conversions to and from this form are
477 particularly efficient (as octets in memory do not have to change,
478 just the meta-data that tells Perl how to treat them).
482 =item $octets = encode_utf8($string);
484 The characters that comprise string are encoded in Perl's superset of UTF-8
485 and the resulting octets returned as a sequence of bytes. All possible
486 characters have a UTF-8 representation so this function cannot fail.
488 =item $string = decode_utf8($octets [, CHECK]);
490 The sequence of octets represented by $octets is decoded from UTF-8
491 into a sequence of logical characters. Not all sequences of octets
492 form valid UTF-8 encodings, so it is possible for this call to fail.
493 For CHECK see L</"Handling Malformed Data">.
497 =head2 Listing available encodings
500 @list = Encode->encodings();
502 Returns a list of the canonical names of the available encodings that
503 are loaded. To get a list of all available encodings including the
504 ones that are not loaded yet, say
506 @all_encodings = Encode->encodings(":all");
508 Or you can give the name of specific module.
510 @with_jp = Encode->encodings("Encode::JP");
512 When "::" is not in the name, "Encode::" is assumed.
514 @ebcdic = Encode->encodings("EBCDIC");
516 To find which encodings are supported by this package in details,
517 see L<Encode::Supported>.
519 =head2 Defining Aliases
521 To add new alias to a given encoding, Use;
525 define_alias(newName => ENCODING);
527 After that, newName can be used as an alias for ENCODING.
528 ENCODING may be either the name of an encoding or an
531 See L<Encode::Alias> on details.
533 =head1 Encoding and IO
535 It is very common to want to do encoding transformations when
536 reading or writing files, network connections, pipes etc.
537 If Perl is configured to use the new 'perlio' IO system then
538 C<Encode> provides a "layer" (See L<perliol>) which can transform
539 data as it is read or written.
541 Here is how the blind poet would modernise the encoding:
544 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
545 open(my $utf8,'>:utf8','iliad.utf8');
551 In addition the new IO system can also be configured to read/write
552 UTF-8 encoded characters (as noted above this is efficient):
554 open(my $fh,'>:utf8','anything');
555 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
557 Either of the above forms of "layer" specifications can be made the default
558 for a lexical scope with the C<use open ...> pragma. See L<open>.
560 Once a handle is open is layers can be altered using C<binmode>.
562 Without any such configuration, or if Perl itself is built using
563 system's own IO, then write operations assume that file handle accepts
564 only I<bytes> and will C<die> if a character larger than 255 is
565 written to the handle. When reading, each octet from the handle
566 becomes a byte-in-a-character. Note that this default is the same
567 behaviour as bytes-only languages (including Perl before v5.6) would
568 have, and is sufficient to handle native 8-bit encodings
569 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
570 other encodings and binary data.
572 In other cases it is the programs responsibility to transform
573 characters into bytes using the API above before doing writes, and to
574 transform the bytes read from a handle into characters before doing
575 "character operations" (e.g. C<lc>, C</\W+/>, ...).
577 You can also use PerlIO to convert larger amounts of data you don't
578 want to bring into memory. For example to convert between ISO-8859-1
579 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
581 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
582 open(G, ">:utf8", "data.utf") or die $!;
583 while (<F>) { print G }
585 # Could also do "print G <F>" but that would pull
586 # the whole file into memory just to write it out again.
590 open(my $f, "<:encoding(cp1252)")
591 open(my $g, ">:encoding(iso-8859-2)")
592 open(my $h, ">:encoding(latin9)") # iso-8859-15
594 See L<PerlIO> for more information.
596 See also L<encoding> for how to change the default encoding of the
599 =head1 Handling Malformed Data
601 If I<CHECK> is not set, (en|de)code will put I<substitution character> in
602 place of the malformed character. for UCM-based encodings,
603 E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
604 data is supposed to be UTF-8, an optional lexical warning (category
607 If I<CHECK> is true but not a code reference, dies with an error message.
609 In future you will be able to use a code reference to a callback
610 function for the value of I<CHECK> but its API is still undecided.
612 =head1 Defining Encodings
614 To define a new encoding, use:
616 use Encode qw(define_alias);
617 define_encoding($object, 'canonicalName' [, alias...]);
619 I<canonicalName> will be associated with I<$object>. The object
620 should provide the interface described in L<Encode::Encoding>
621 If more than two arguments are provided then additional
622 arguments are taken as aliases for I<$object> as for C<define_alias>.
624 See L<Encode::Encoding> for more details.
626 =head1 Messing with Perl's Internals
628 The following API uses parts of Perl's internals in the current
629 implementation. As such they are efficient, but may change.
633 =item is_utf8(STRING [, CHECK])
635 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
636 If CHECK is true, also checks the data in STRING for being well-formed
637 UTF-8. Returns true if successful, false otherwise.
639 =item _utf8_on(STRING)
641 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
642 B<not> checked for being well-formed UTF-8. Do not use unless you
643 B<know> that the STRING is well-formed UTF-8. Returns the previous
644 state of the UTF-8 flag (so please don't test the return value as
645 I<not> success or failure), or C<undef> if STRING is not a string.
647 =item _utf8_off(STRING)
649 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
650 Returns the previous state of the UTF-8 flag (so please don't test the
651 return value as I<not> success or failure), or C<undef> if STRING is
659 L<Encode::Supported>,
666 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>