3 our $VERSION = do { my @r = (q$Revision: 1.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9 our @ISA = qw(Exporter DynaLoader);
11 # Public, encouraged API is exported by default
36 # Documentation moved after __END__ for speed - NI-S
40 our $ON_EBCDIC = (ord("A") == 193);
43 # Make a %Encoding package variable to allow a certain amount of cheating
48 37 424 437 500 737 775 850 852 855
49 856 857 860 861 862 863 864 865 866
50 869 874 875 932 936 949 950 1006 1026
51 1047 1250 1251 1252 1253 1254 1255 1256 1257
56 CentralEurRoman Croatian Cyrillic Greek
57 Iceland Roman Rumanian Sami
58 Thai Turkish Ukrainian
61 for my $k (2..11,13..16){
62 $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
65 for my $k (@codepages){
66 $ExtModule{"cp$k"} = 'Encode/Byte.pm';
69 for my $k (@macintosh)
71 $ExtModule{"mac$k"} = 'Encode/Byte.pm';
76 'koi8-r' => 'Encode/Byte.pm',
77 'posix-bc' => 'Encode/EBCDIC.pm',
78 cp037 => 'Encode/EBCDIC.pm',
79 cp1026 => 'Encode/EBCDIC.pm',
80 cp1047 => 'Encode/EBCDIC.pm',
81 cp500 => 'Encode/EBCDIC.pm',
82 cp875 => 'Encode/EBCDIC.pm',
83 dingbats => 'Encode/Symbol.pm',
84 macDingbats => 'Encode/Symbol.pm',
85 macSymbol => 'Encode/Symbol.pm',
86 symbol => 'Encode/Symbol.pm',
87 viscii => 'Encode/Byte.pm',
90 unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
91 %ExtModule =(%ExtModule,
92 'euc-cn' => 'Encode/CN.pm',
93 gb2312 => 'Encode/CN.pm',
94 gb12345 => 'Encode/CN.pm',
95 gbk => 'Encode/CN.pm',
96 cp936 => 'Encode/CN.pm',
97 'iso-ir-165' => 'Encode/CN.pm',
98 'euc-jp' => 'Encode/JP.pm',
99 'iso-2022-jp' => 'Encode/JP.pm',
100 'iso-2022-jp-1' => 'Encode/JP.pm',
101 '7bit-jis' => 'Encode/JP.pm',
102 shiftjis => 'Encode/JP.pm',
103 macJapanese => 'Encode/JP.pm',
104 cp932 => 'Encode/JP.pm',
105 'euc-kr' => 'Encode/KR.pm',
106 ksc5601 => 'Encode/KR.pm',
107 macKorean => 'Encode/KR.pm',
108 cp949 => 'Encode/KR.pm',
109 big5 => 'Encode/TW.pm',
110 'big5-hkscs' => 'Encode/TW.pm',
111 cp950 => 'Encode/TW.pm',
112 gb18030 => 'Encode/HanExtra.pm',
113 big5plus => 'Encode/HanExtra.pm',
114 'euc-tw' => 'Encode/HanExtra.pm',
124 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
127 $DEBUG and warn "about to require $m;";
128 eval { require $m; };
132 sort({$a->[1] cmp $b->[1]}
134 grep({ $_ ne 'Internal' } keys %Encoding))));
141 $Encoding{$name} = $obj;
143 define_alias($lc => $obj) unless $lc eq $name;
147 define_alias($alias,$obj);
154 my ($class,$name,$skip_external) = @_;
156 if (ref($name) && $name->can('new_sequence'))
161 if (exists $Encoding{$name})
163 return $Encoding{$name};
165 if (exists $Encoding{$lc})
167 return $Encoding{$lc};
170 my $oc = $class->find_alias($name);
171 return $oc if defined $oc;
173 $oc = $class->find_alias($lc) if $lc ne $name;
174 return $oc if defined $oc;
176 if (!$skip_external and exists $ExtModule{$lc})
178 eval{ require $ExtModule{$lc}; };
179 return $Encoding{$name} if exists $Encoding{$name};
187 my ($name,$skip_external) = @_;
188 return __PACKAGE__->getEncoding($name,$skip_external);
193 my ($name,$string,$check) = @_;
194 my $enc = find_encoding($name);
195 croak("Unknown encoding '$name'") unless defined $enc;
196 my $octets = $enc->encode($string,$check);
197 return undef if ($check && length($string));
203 my ($name,$octets,$check) = @_;
204 my $enc = find_encoding($name);
205 croak("Unknown encoding '$name'") unless defined $enc;
206 my $string = $enc->decode($octets,$check);
207 $_[1] = $octets if $check;
213 my ($string,$from,$to,$check) = @_;
214 my $f = find_encoding($from);
215 croak("Unknown encoding '$from'") unless defined $f;
216 my $t = find_encoding($to);
217 croak("Unknown encoding '$to'") unless defined $t;
218 my $uni = $f->decode($string,$check);
219 return undef if ($check && length($string));
220 $string = $t->encode($uni,$check);
221 return undef if ($check && length($uni));
222 return defined($_[0] = $string) ? length($string) : undef ;
235 return undef unless utf8::decode($str);
239 require Encode::Encoding;
241 require Encode::Internal;
242 require Encode::Unicode;
243 require Encode::utf8;
244 require Encode::10646_1;
245 require Encode::ucs2_le;
253 Encode - character encodings
260 =head2 Table of Contents
262 Encode consists of a collection of modules which details are too big
263 to fit in one document. This POD itself explains the top-level APIs
264 and general topics at a glance. For other topics and more details,
268 --------------------------------------------------------
269 Encode::Alias Alias defintions to encodings
270 Encode::Encoding Encode Implementation Base Class
271 Encode::Supported List of Supported Encodings
272 Encode::CN Simplified Chinese Encodings
273 Encode::JP Japanese Encodings
274 Encode::KR Korean Encodings
275 Encode::TW Traditional Chinese Encodings
276 --------------------------------------------------------
280 The C<Encode> module provides the interfaces between Perl's strings
281 and the rest of the system. Perl strings are sequences of
284 The repertoire of characters that Perl can represent is at least that
285 defined by the Unicode Consortium. On most platforms the ordinal
286 values of the characters (as returned by C<ord(ch)>) is the "Unicode
287 codepoint" for the character (the exceptions are those platforms where
288 the legacy encoding is some variant of EBCDIC rather than a super-set
289 of ASCII - see L<perlebcdic>).
291 Traditionally computer data has been moved around in 8-bit chunks
292 often called "bytes". These chunks are also known as "octets" in
293 networking standards. Perl is widely used to manipulate data of many
294 types - not only strings of characters representing human or computer
295 languages but also "binary" data being the machines representation of
296 numbers, pixels in an image - or just about anything.
298 When Perl is processing "binary data" the programmer wants Perl to
299 process "sequences of bytes". This is not a problem for Perl - as a
300 byte has 256 possible values it easily fits in Perl's much larger
309 I<character>: a character in the range 0..(2**32-1) (or more).
310 (What Perl's strings are made of.)
314 I<byte>: a character in the range 0..255
315 (A special case of a Perl character.)
319 I<octet>: 8 bits of data, with ordinal values 0..255
320 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
324 The marker [INTERNAL] marks Internal Implementation Details, in
325 general meant only for those who think they know what they are doing,
326 and such details may change in future releases.
328 =head1 PERL ENCODING API
332 =item $bytes = encode(ENCODING, $string[, CHECK])
334 Encodes string from Perl's internal form into I<ENCODING> and returns
335 a sequence of octets. ENCODING can be either a canonical name or
336 alias. For encoding names and aliases, see L</"Defining Aliases">.
337 For CHECK see L</"Handling Malformed Data">.
339 For example to convert (internally UTF-8 encoded) Unicode string to
340 iso-8859-1 (also known as Latin1),
342 $octets = encode("iso-8859-1", $unicode);
344 =item $string = decode(ENCODING, $bytes[, CHECK])
346 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
347 internal form and returns the resulting string. as in encode(),
348 ENCODING can be either a canonical name or alias. For encoding names
349 and aliases, see L</"Defining Aliases">. For CHECK see
350 L</"Handling Malformed Data">.
352 For example to convert ISO-8859-1 data to UTF-8:
354 $utf8 = decode("iso-8859-1", $latin1);
356 =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
358 Convert B<in-place> the data between two encodings. How did the data
359 in $string originally get to be in FROM_ENCODING? Either using
360 encode() or through PerlIO: See L</"Encoding and IO">.
361 For encoding names and aliases, see L</"Defining Aliases">.
362 For CHECK see L</"Handling Malformed Data">.
364 For example to convert ISO-8859-1 data to UTF-8:
366 from_to($data, "iso-8859-1", "utf-8");
368 and to convert it back:
370 from_to($data, "utf-8", "iso-8859-1");
372 Note that because the conversion happens in place, the data to be
373 converted cannot be a string constant, it must be a scalar variable.
375 from_to() return the length of the converted string on success, undef
380 =head2 Listing available encodings
383 @list = Encode->encodings();
385 Returns a list of the canonical names of the available encodings that
386 are loaded. To get a list of all available encodings including the
387 ones that are not loaded yet, say
389 @all_encodings = Encode->encodings(":all");
391 Or you can give the name of specific module.
393 @with_jp = Encode->encodings("Encode/JP.pm");
395 Note in this case you have to say C<"Encode/JP.pm"> instead of
398 To find which encodings are supported by this package in details,
399 see L<Encode::Supported>.
402 =head2 Defining Aliases
404 To add new alias to a given encoding, Use;
408 define_alias(newName => ENCODING);
410 After that, newName can be used as an alias for ENCODING.
411 ENCODING may be either the name of an encoding or an I<encoding
414 See L<Encode::Alias> on details.
416 =head1 Encoding and IO
418 It is very common to want to do encoding transformations when
419 reading or writing files, network connections, pipes etc.
420 If Perl is configured to use the new 'perlio' IO system then
421 C<Encode> provides a "layer" (See L<perliol>) which can transform
422 data as it is read or written.
424 Here is how the blind poet would modernise the encoding:
427 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
428 open(my $utf8,'>:utf8','iliad.utf8');
434 In addition the new IO system can also be configured to read/write
435 UTF-8 encoded characters (as noted above this is efficient):
437 open(my $fh,'>:utf8','anything');
438 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
440 Either of the above forms of "layer" specifications can be made the default
441 for a lexical scope with the C<use open ...> pragma. See L<open>.
443 Once a handle is open is layers can be altered using C<binmode>.
445 Without any such configuration, or if Perl itself is built using
446 system's own IO, then write operations assume that file handle accepts
447 only I<bytes> and will C<die> if a character larger than 255 is
448 written to the handle. When reading, each octet from the handle
449 becomes a byte-in-a-character. Note that this default is the same
450 behaviour as bytes-only languages (including Perl before v5.6) would
451 have, and is sufficient to handle native 8-bit encodings
452 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
453 other encodings and binary data.
455 In other cases it is the programs responsibility to transform
456 characters into bytes using the API above before doing writes, and to
457 transform the bytes read from a handle into characters before doing
458 "character operations" (e.g. C<lc>, C</\W+/>, ...).
460 You can also use PerlIO to convert larger amounts of data you don't
461 want to bring into memory. For example to convert between ISO-8859-1
462 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
464 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
465 open(G, ">:utf8", "data.utf") or die $!;
466 while (<F>) { print G }
468 # Could also do "print G <F>" but that would pull
469 # the whole file into memory just to write it out again.
473 open(my $f, "<:encoding(cp1252)")
474 open(my $g, ">:encoding(iso-8859-2)")
475 open(my $h, ">:encoding(latin9)") # iso-8859-15
477 See L<PerlIO> for more information.
479 See also L<encoding> for how to change the default encoding of the
482 =head1 Handling Malformed Data
484 If CHECK is not set, C<undef> is returned. If the data is supposed to
485 be UTF-8, an optional lexical warning (category utf8) is given. If
486 CHECK is true but not a code reference, dies.
488 It would desirable to have a way to indicate that transform should use
489 the encodings "replacement character" - no such mechanism is defined yet.
491 It is also planned to allow I<CHECK> to be a code reference.
493 This is not yet implemented as there are design issues with what its
494 arguments should be and how it returns its results.
500 Passed remaining fragment of string being processed.
501 Modifies it in place to remove bytes/characters it can understand
502 and returns a string used to represent them.
506 my $ch = substr($_[0],0,1,'');
507 return sprintf("\x{%02X}",ord($ch);
510 This scheme is close to how underlying C code for Encode works, but gives
511 the fixup routine very little context.
515 Passed original string, and an index into it of the problem area, and
516 output string so far. Appends what it will to output string and
517 returns new index into original string. For example:
520 # my ($s,$i,$d) = @_;
521 my $ch = substr($_[0],$_[1],1);
522 $_[2] .= sprintf("\x{%02X}",ord($ch);
526 This scheme gives maximal control to the fixup routine but is more
527 complicated to code, and may need internals of Encode to be tweaked to
528 keep original string intact.
534 Multiple return values rather than in-place modifications.
536 Index into the string could be C<pos($str)> allowing C<s/\G...//>.
542 The Unicode consortium defines the UTF-8 standard as a way of encoding
543 the entire Unicode repertoire as sequences of octets. This encoding is
544 expected to become very widespread. Perl can use this form internally
545 to represent strings, so conversions to and from this form are
546 particularly efficient (as octets in memory do not have to change,
547 just the meta-data that tells Perl how to treat them).
551 =item $bytes = encode_utf8($string);
553 The characters that comprise string are encoded in Perl's superset of UTF-8
554 and the resulting octets returned as a sequence of bytes. All possible
555 characters have a UTF-8 representation so this function cannot fail.
557 =item $string = decode_utf8($bytes [, CHECK]);
559 The sequence of octets represented by $bytes is decoded from UTF-8
560 into a sequence of logical characters. Not all sequences of octets
561 form valid UTF-8 encodings, so it is possible for this call to fail.
562 For CHECK see L</"Handling Malformed Data">.
566 =head1 Defining Encodings
568 To define a new encoding, use:
570 use Encode qw(define_alias);
571 define_encoding($object, 'canonicalName' [, alias...]);
573 I<canonicalName> will be associated with I<$object>. The object
574 should provide the interface described in L<Encode::Encoding>
575 If more than two arguments are provided then additional
576 arguments are taken as aliases for I<$object> as for C<define_alias>.
578 =head1 Messing with Perl's Internals
580 The following API uses parts of Perl's internals in the current
581 implementation. As such they are efficient, but may change.
585 =item is_utf8(STRING [, CHECK])
587 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
588 If CHECK is true, also checks the data in STRING for being well-formed
589 UTF-8. Returns true if successful, false otherwise.
591 =item _utf8_on(STRING)
593 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
594 B<not> checked for being well-formed UTF-8. Do not use unless you
595 B<know> that the STRING is well-formed UTF-8. Returns the previous
596 state of the UTF-8 flag (so please don't test the return value as
597 I<not> success or failure), or C<undef> if STRING is not a string.
599 =item _utf8_off(STRING)
601 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
602 Returns the previous state of the UTF-8 flag (so please don't test the
603 return value as I<not> success or failure), or C<undef> if STRING is
611 L<Encode::Supported>,
618 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>