3 our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9 our @ISA = qw(Exporter DynaLoader);
11 # Public, encouraged API is exported by default
36 # Documentation moved after __END__ for speed - NI-S
40 our $ON_EBCDIC = (ord("A") == 193);
44 # Make a %Encoding package variable to allow a certain amount of cheating
49 37 424 437 500 737 775 850 852 855
50 856 857 860 861 862 863 864 865 866
51 869 874 875 932 936 949 950 1006 1026
52 1047 1250 1251 1252 1253 1254 1255 1256 1257
57 CentralEurRoman Croatian Cyrillic Greek
58 Iceland Roman Rumanian Sami
59 Thai Turkish Ukrainian
62 for my $k (2..11,13..16){
63 $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
66 for my $k (@codepages){
67 $ExtModule{"cp$k"} = 'Encode/Byte.pm';
70 for my $k (@macintosh)
72 $ExtModule{"mac$k"} = 'Encode/Byte.pm';
77 'koi8-r' => 'Encode/Byte.pm',
78 'posix-bc' => 'Encode/EBCDIC.pm',
79 cp037 => 'Encode/EBCDIC.pm',
80 cp1026 => 'Encode/EBCDIC.pm',
81 cp1047 => 'Encode/EBCDIC.pm',
82 cp500 => 'Encode/EBCDIC.pm',
83 cp875 => 'Encode/EBCDIC.pm',
84 dingbats => 'Encode/Symbol.pm',
85 macDingbats => 'Encode/Symbol.pm',
86 macSymbol => 'Encode/Symbol.pm',
87 symbol => 'Encode/Symbol.pm',
88 viscii => 'Encode/Byte.pm',
91 unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
95 'cp936' => 'Encode/CN.pm',
96 'euc-cn' => 'Encode/CN.pm',
97 'gb12345-raw' => 'Encode/CN.pm',
98 'gb2312-raw' => 'Encode/CN.pm',
99 'gbk' => 'Encode/CN.pm',
100 'iso-ir-165' => 'Encode/CN.pm',
102 '7bit-jis' => 'Encode/JP.pm',
103 'cp932' => 'Encode/JP.pm',
104 'euc-jp' => 'Encode/JP.pm',
105 'iso-2022-jp' => 'Encode/JP.pm',
106 'iso-2022-jp-1' => 'Encode/JP.pm',
107 'jis0201-raw' => 'Encode/JP.pm',
108 'jis0208-raw' => 'Encode/JP.pm',
109 'jis0212-raw' => 'Encode/JP.pm',
110 'macJapanese' => 'Encode/JP.pm',
111 'shiftjis' => 'Encode/JP.pm',
113 'cp949' => 'Encode/KR.pm',
114 'euc-kr' => 'Encode/KR.pm',
115 'ksc5601' => 'Encode/KR.pm',
116 'macKorean' => 'Encode/KR.pm',
118 'big5' => 'Encode/TW.pm',
119 'big5-hkscs' => 'Encode/TW.pm',
120 'cp950' => 'Encode/TW.pm',
122 'big5plus' => 'Encode/HanExtra.pm',
123 'euc-tw' => 'Encode/HanExtra.pm',
124 'gb18030' => 'Encode/HanExtra.pm',
131 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
134 $DEBUG and warn "about to require $m;";
135 eval { require $m; };
139 sort({$a->[1] cmp $b->[1]}
141 grep({ $_ ne 'Internal' } keys %Encoding))));
148 $Encoding{$name} = $obj;
150 define_alias($lc => $obj) unless $lc eq $name;
154 define_alias($alias,$obj);
161 my ($class,$name,$skip_external) = @_;
163 if (ref($name) && $name->can('new_sequence'))
168 if (exists $Encoding{$name})
170 return $Encoding{$name};
172 if (exists $Encoding{$lc})
174 return $Encoding{$lc};
177 my $oc = $class->find_alias($name);
178 return $oc if defined $oc;
180 $oc = $class->find_alias($lc) if $lc ne $name;
181 return $oc if defined $oc;
183 if (!$skip_external and exists $ExtModule{$lc})
185 eval{ require $ExtModule{$lc}; };
186 return $Encoding{$name} if exists $Encoding{$name};
194 my ($name,$skip_external) = @_;
195 return __PACKAGE__->getEncoding($name,$skip_external);
200 my ($name,$string,$check) = @_;
201 my $enc = find_encoding($name);
202 croak("Unknown encoding '$name'") unless defined $enc;
203 my $octets = $enc->encode($string,$check);
204 return undef if ($check && length($string));
210 my ($name,$octets,$check) = @_;
211 my $enc = find_encoding($name);
212 croak("Unknown encoding '$name'") unless defined $enc;
213 my $string = $enc->decode($octets,$check);
214 $_[1] = $octets if $check;
220 my ($string,$from,$to,$check) = @_;
221 my $f = find_encoding($from);
222 croak("Unknown encoding '$from'") unless defined $f;
223 my $t = find_encoding($to);
224 croak("Unknown encoding '$to'") unless defined $t;
225 my $uni = $f->decode($string,$check);
226 return undef if ($check && length($string));
227 $string = $t->encode($uni,$check);
228 return undef if ($check && length($uni));
229 return defined($_[0] = $string) ? length($string) : undef ;
242 return undef unless utf8::decode($str);
246 predefine_encodings();
249 # This is to restore %Encoding if really needed;
251 sub predefine_encodings{
253 # was in Encode::UTF_EBCDIC
254 package Encode::UTF_EBCDIC;
255 *name = sub{ shift->{'Name'} };
256 *new_sequence = sub{ return $_[0] };
258 my ($obj,$str,$chk) = @_;
260 for (my $i = 0; $i < length($str); $i++) {
262 chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
268 my ($obj,$str,$chk) = @_;
270 for (my $i = 0; $i < length($str); $i++) {
272 chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
277 $Encode::Encoding{Unicode} =
278 bless {Name => "UTF_EBCDIC"}, "Encode::UTF_EBCDIC";
280 # was in Encode::UTF_EBCDIC
281 package Encode::Internal;
282 *name = sub{ shift->{'Name'} };
283 *new_sequence = sub{ return $_[0] };
285 my ($obj,$str,$chk) = @_;
291 $Encode::Encoding{Unicode} =
292 bless {Name => "Internal"}, "Encode::Internal";
296 # was in Encode::utf8
297 package Encode::utf8;
298 *name = sub{ shift->{'Name'} };
299 *new_sequence = sub{ return $_[0] };
301 my ($obj,$octets,$chk) = @_;
302 my $str = Encode::decode_utf8($octets);
310 my ($obj,$string,$chk) = @_;
311 my $octets = Encode::encode_utf8($string);
315 $Encode::Encoding{utf8} =
316 bless {Name => "utf8"}, "Encode::utf8";
318 # do externals if necessary
319 require File::Basename;
321 for my $ext (qw(Unicode)){
323 File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}),
324 "Encode", "$ext.pm");
329 require Encode::Encoding;
338 Encode - character encodings
345 =head2 Table of Contents
347 Encode consists of a collection of modules which details are too big
348 to fit in one document. This POD itself explains the top-level APIs
349 and general topics at a glance. For other topics and more details,
353 --------------------------------------------------------
354 Encode::Alias Alias defintions to encodings
355 Encode::Encoding Encode Implementation Base Class
356 Encode::Supported List of Supported Encodings
357 Encode::CN Simplified Chinese Encodings
358 Encode::JP Japanese Encodings
359 Encode::KR Korean Encodings
360 Encode::TW Traditional Chinese Encodings
361 --------------------------------------------------------
365 The C<Encode> module provides the interfaces between Perl's strings
366 and the rest of the system. Perl strings are sequences of
369 The repertoire of characters that Perl can represent is at least that
370 defined by the Unicode Consortium. On most platforms the ordinal
371 values of the characters (as returned by C<ord(ch)>) is the "Unicode
372 codepoint" for the character (the exceptions are those platforms where
373 the legacy encoding is some variant of EBCDIC rather than a super-set
374 of ASCII - see L<perlebcdic>).
376 Traditionally computer data has been moved around in 8-bit chunks
377 often called "bytes". These chunks are also known as "octets" in
378 networking standards. Perl is widely used to manipulate data of many
379 types - not only strings of characters representing human or computer
380 languages but also "binary" data being the machines representation of
381 numbers, pixels in an image - or just about anything.
383 When Perl is processing "binary data" the programmer wants Perl to
384 process "sequences of bytes". This is not a problem for Perl - as a
385 byte has 256 possible values it easily fits in Perl's much larger
394 I<character>: a character in the range 0..(2**32-1) (or more).
395 (What Perl's strings are made of.)
399 I<byte>: a character in the range 0..255
400 (A special case of a Perl character.)
404 I<octet>: 8 bits of data, with ordinal values 0..255
405 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
409 The marker [INTERNAL] marks Internal Implementation Details, in
410 general meant only for those who think they know what they are doing,
411 and such details may change in future releases.
413 =head1 PERL ENCODING API
417 =item $octets = encode(ENCODING, $string[, CHECK])
419 Encodes string from Perl's internal form into I<ENCODING> and returns
420 a sequence of octets. ENCODING can be either a canonical name or
421 alias. For encoding names and aliases, see L</"Defining Aliases">.
422 For CHECK see L</"Handling Malformed Data">.
424 For example to convert (internally UTF-8 encoded) Unicode string to
425 iso-8859-1 (also known as Latin1),
427 $octets = encode("iso-8859-1", $unicode);
429 =item $string = decode(ENCODING, $octets[, CHECK])
431 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
432 internal form and returns the resulting string. as in encode(),
433 ENCODING can be either a canonical name or alias. For encoding names
434 and aliases, see L</"Defining Aliases">. For CHECK see
435 L</"Handling Malformed Data">.
437 For example to convert ISO-8859-1 data to UTF-8:
439 $utf8 = decode("iso-8859-1", $latin1);
441 =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
443 Convert B<in-place> the data between two encodings. How did the data
444 in $string originally get to be in FROM_ENCODING? Either using
445 encode() or through PerlIO: See L</"Encoding and IO">.
446 For encoding names and aliases, see L</"Defining Aliases">.
447 For CHECK see L</"Handling Malformed Data">.
449 For example to convert ISO-8859-1 data to UTF-8:
451 from_to($data, "iso-8859-1", "utf-8");
453 and to convert it back:
455 from_to($data, "utf-8", "iso-8859-1");
457 Note that because the conversion happens in place, the data to be
458 converted cannot be a string constant, it must be a scalar variable.
460 from_to() return the length of the converted string on success, undef
467 The Unicode consortium defines the UTF-8 standard as a way of encoding
468 the entire Unicode repertoire as sequences of octets. This encoding is
469 expected to become very widespread. Perl can use this form internally
470 to represent strings, so conversions to and from this form are
471 particularly efficient (as octets in memory do not have to change,
472 just the meta-data that tells Perl how to treat them).
476 =item $octets = encode_utf8($string);
478 The characters that comprise string are encoded in Perl's superset of UTF-8
479 and the resulting octets returned as a sequence of bytes. All possible
480 characters have a UTF-8 representation so this function cannot fail.
482 =item $string = decode_utf8($octets [, CHECK]);
484 The sequence of octets represented by $octets is decoded from UTF-8
485 into a sequence of logical characters. Not all sequences of octets
486 form valid UTF-8 encodings, so it is possible for this call to fail.
487 For CHECK see L</"Handling Malformed Data">.
491 =head2 Listing available encodings
494 @list = Encode->encodings();
496 Returns a list of the canonical names of the available encodings that
497 are loaded. To get a list of all available encodings including the
498 ones that are not loaded yet, say
500 @all_encodings = Encode->encodings(":all");
502 Or you can give the name of specific module.
504 @with_jp = Encode->encodings("Encode/JP.pm");
506 Note in this case you have to say C<"Encode/JP.pm"> instead of
509 To find which encodings are supported by this package in details,
510 see L<Encode::Supported>.
512 =head2 Defining Aliases
514 To add new alias to a given encoding, Use;
518 define_alias(newName => ENCODING);
520 After that, newName can be used as an alias for ENCODING.
521 ENCODING may be either the name of an encoding or an
524 See L<Encode::Alias> on details.
526 =head1 Encoding and IO
528 It is very common to want to do encoding transformations when
529 reading or writing files, network connections, pipes etc.
530 If Perl is configured to use the new 'perlio' IO system then
531 C<Encode> provides a "layer" (See L<perliol>) which can transform
532 data as it is read or written.
534 Here is how the blind poet would modernise the encoding:
537 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
538 open(my $utf8,'>:utf8','iliad.utf8');
544 In addition the new IO system can also be configured to read/write
545 UTF-8 encoded characters (as noted above this is efficient):
547 open(my $fh,'>:utf8','anything');
548 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
550 Either of the above forms of "layer" specifications can be made the default
551 for a lexical scope with the C<use open ...> pragma. See L<open>.
553 Once a handle is open is layers can be altered using C<binmode>.
555 Without any such configuration, or if Perl itself is built using
556 system's own IO, then write operations assume that file handle accepts
557 only I<bytes> and will C<die> if a character larger than 255 is
558 written to the handle. When reading, each octet from the handle
559 becomes a byte-in-a-character. Note that this default is the same
560 behaviour as bytes-only languages (including Perl before v5.6) would
561 have, and is sufficient to handle native 8-bit encodings
562 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
563 other encodings and binary data.
565 In other cases it is the programs responsibility to transform
566 characters into bytes using the API above before doing writes, and to
567 transform the bytes read from a handle into characters before doing
568 "character operations" (e.g. C<lc>, C</\W+/>, ...).
570 You can also use PerlIO to convert larger amounts of data you don't
571 want to bring into memory. For example to convert between ISO-8859-1
572 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
574 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
575 open(G, ">:utf8", "data.utf") or die $!;
576 while (<F>) { print G }
578 # Could also do "print G <F>" but that would pull
579 # the whole file into memory just to write it out again.
583 open(my $f, "<:encoding(cp1252)")
584 open(my $g, ">:encoding(iso-8859-2)")
585 open(my $h, ">:encoding(latin9)") # iso-8859-15
587 See L<PerlIO> for more information.
589 See also L<encoding> for how to change the default encoding of the
592 =head1 Handling Malformed Data
594 If I<CHECK> is not set, (en|de)code will put I<substitution character> in
595 place of the malformed character. for UCM-based encodings,
596 E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
597 data is supposed to be UTF-8, an optional lexical warning (category
600 If I<CHECK> is true but not a code reference, dies with an error message.
602 In future you will be able to use a code reference to a callback
603 function for the value of I<CHECK> but its API is still undecided.
605 =head1 Defining Encodings
607 To define a new encoding, use:
609 use Encode qw(define_alias);
610 define_encoding($object, 'canonicalName' [, alias...]);
612 I<canonicalName> will be associated with I<$object>. The object
613 should provide the interface described in L<Encode::Encoding>
614 If more than two arguments are provided then additional
615 arguments are taken as aliases for I<$object> as for C<define_alias>.
617 See L<Encode::Encoding> for more details.
619 =head1 Messing with Perl's Internals
621 The following API uses parts of Perl's internals in the current
622 implementation. As such they are efficient, but may change.
626 =item is_utf8(STRING [, CHECK])
628 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
629 If CHECK is true, also checks the data in STRING for being well-formed
630 UTF-8. Returns true if successful, false otherwise.
632 =item _utf8_on(STRING)
634 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
635 B<not> checked for being well-formed UTF-8. Do not use unless you
636 B<know> that the STRING is well-formed UTF-8. Returns the previous
637 state of the UTF-8 flag (so please don't test the return value as
638 I<not> success or failure), or C<undef> if STRING is not a string.
640 =item _utf8_off(STRING)
642 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
643 Returns the previous state of the UTF-8 flag (so please don't test the
644 return value as I<not> success or failure), or C<undef> if STRING is
652 L<Encode::Supported>,
659 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>