3 our $VERSION = do { my @r = (q$Revision: 1.11 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9 our @ISA = qw(Exporter DynaLoader);
11 # Public, encouraged API is exported by default
36 # Documentation moved after __END__ for speed - NI-S
40 our $ON_EBCDIC = (ord("A") == 193);
43 # Make a %Encoding package variable to allow a certain amount of cheating
48 viscii => 'Encode/Byte.pm',
49 'koi8-r' => 'Encode/Byte.pm',
50 cp1047 => 'Encode/EBCDIC.pm',
51 cp37 => 'Encode/EBCDIC.pm',
52 'posix-bc' => 'Encode/EBCDIC.pm',
53 symbol => 'Encode/Symbol.pm',
54 dingbats => 'Encode/Symbol.pm',
57 for my $k (2..11,13..16){
58 $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
61 for my $k (1250..1258){
62 $ExtModule{"cp$k"} = 'Encode/Byte.pm';
65 unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
68 'euc-cn' => 'Encode/CN.pm',
69 gb2312 => 'Encode/CN.pm',
70 gb12345 => 'Encode/CN.pm',
71 gbk => 'Encode/CN.pm',
72 cp936 => 'Encode/CN.pm',
73 'iso-ir-165' => 'Encode/CN.pm',
74 'euc-jp' => 'Encode/JP.pm',
75 'iso-2022-jp' => 'Encode/JP.pm',
76 'iso-2022-jp-1' => 'Encode/JP.pm',
77 '7bit-jis' => 'Encode/JP.pm',
78 shiftjis => 'Encode/JP.pm',
79 macjapan => 'Encode/JP.pm',
80 cp932 => 'Encode/JP.pm',
81 'euc-kr' => 'Encode/KR.pm',
82 ksc5601 => 'Encode/KR.pm',
83 cp949 => 'Encode/KR.pm',
84 big5 => 'Encode/TW.pm',
85 'big5-hkscs' => 'Encode/TW.pm',
86 cp950 => 'Encode/TW.pm',
87 gb18030 => 'Encode/HanExtra.pm',
88 big5plus => 'Encode/HanExtra.pm',
89 'euc-tw' => 'Encode/HanExtra.pm',
93 for my $k (qw{ CentralEurRoman Croatian Cyrillic Greek
94 Iceland Roman Rumanian Sami
95 Thai Turkish Ukrainian
98 $ExtModule{"mac$k"} = 'Encode/Byte.pm';
104 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
107 $DEBUG and warn "about to require $m;";
108 eval { require $m; };
112 sort({$a->[1] cmp $b->[1]}
114 grep({ $_ ne 'Internal' } keys %Encoding))));
121 $Encoding{$name} = $obj;
123 define_alias($lc => $obj) unless $lc eq $name;
127 define_alias($alias,$obj);
134 my ($class,$name,$skip_external) = @_;
136 if (ref($name) && $name->can('new_sequence'))
141 if (exists $Encoding{$name})
143 return $Encoding{$name};
145 if (exists $Encoding{$lc})
147 return $Encoding{$lc};
150 my $oc = $class->find_alias($name);
151 return $oc if defined $oc;
153 $oc = $class->find_alias($lc) if $lc ne $name;
154 return $oc if defined $oc;
156 if (!$skip_external and exists $ExtModule{$lc})
158 eval{ require $ExtModule{$lc}; };
159 return $Encoding{$name} if exists $Encoding{$name};
167 my ($name,$skip_external) = @_;
168 return __PACKAGE__->getEncoding($name,$skip_external);
173 my ($name,$string,$check) = @_;
174 my $enc = find_encoding($name);
175 croak("Unknown encoding '$name'") unless defined $enc;
176 my $octets = $enc->encode($string,$check);
177 return undef if ($check && length($string));
183 my ($name,$octets,$check) = @_;
184 my $enc = find_encoding($name);
185 croak("Unknown encoding '$name'") unless defined $enc;
186 my $string = $enc->decode($octets,$check);
187 $_[1] = $octets if $check;
193 my ($string,$from,$to,$check) = @_;
194 my $f = find_encoding($from);
195 croak("Unknown encoding '$from'") unless defined $f;
196 my $t = find_encoding($to);
197 croak("Unknown encoding '$to'") unless defined $t;
198 my $uni = $f->decode($string,$check);
199 return undef if ($check && length($string));
200 $string = $t->encode($uni,$check);
201 return undef if ($check && length($uni));
202 return defined($_[0] = $string) ? length($string) : undef ;
215 return undef unless utf8::decode($str);
219 require Encode::Encoding;
221 require Encode::Internal;
222 require Encode::Unicode;
223 require Encode::utf8;
224 require Encode::10646_1;
225 require Encode::ucs2_le;
233 Encode - character encodings
240 =head2 Table of Contents
242 Encode consists of a collection of modules which details are too big
243 to fit in one document. This POD itself explains the top-level APIs
244 and general topics at a glance. For other topics and more details,
248 --------------------------------------------------------
249 Encode::Alias Alias defintions to encodings
250 Encode::Encoding Encode Implementation Base Class
251 Encode::Supported List of Supported Encodings
252 Encode::CN Simplified Chinese Encodings
253 Encode::JP Japanese Encodings
254 Encode::KR Korean Encodings
255 Encode::TW Traditional Chinese Encodings
256 --------------------------------------------------------
260 The C<Encode> module provides the interfaces between Perl's strings
261 and the rest of the system. Perl strings are sequences of
264 The repertoire of characters that Perl can represent is at least that
265 defined by the Unicode Consortium. On most platforms the ordinal
266 values of the characters (as returned by C<ord(ch)>) is the "Unicode
267 codepoint" for the character (the exceptions are those platforms where
268 the legacy encoding is some variant of EBCDIC rather than a super-set
269 of ASCII - see L<perlebcdic>).
271 Traditionally computer data has been moved around in 8-bit chunks
272 often called "bytes". These chunks are also known as "octets" in
273 networking standards. Perl is widely used to manipulate data of many
274 types - not only strings of characters representing human or computer
275 languages but also "binary" data being the machines representation of
276 numbers, pixels in an image - or just about anything.
278 When Perl is processing "binary data" the programmer wants Perl to
279 process "sequences of bytes". This is not a problem for Perl - as a
280 byte has 256 possible values it easily fits in Perl's much larger
289 I<character>: a character in the range 0..(2**32-1) (or more).
290 (What Perl's strings are made of.)
294 I<byte>: a character in the range 0..255
295 (A special case of a Perl character.)
299 I<octet>: 8 bits of data, with ordinal values 0..255
300 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
304 The marker [INTERNAL] marks Internal Implementation Details, in
305 general meant only for those who think they know what they are doing,
306 and such details may change in future releases.
308 =head1 PERL ENCODING API
312 =item $bytes = encode(ENCODING, $string[, CHECK])
314 Encodes string from Perl's internal form into I<ENCODING> and returns
315 a sequence of octets. ENCODING can be either a canonical name or
316 alias. For encoding names and aliases, see L</"Defining Aliases">.
317 For CHECK see L</"Handling Malformed Data">.
319 For example to convert (internally UTF-8 encoded) Unicode string to
320 iso-8859-1 (also known as Latin1),
322 $octets = encode("iso-8859-1", $unicode);
324 =item $string = decode(ENCODING, $bytes[, CHECK])
326 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
327 internal form and returns the resulting string. as in encode(),
328 ENCODING can be either a canonical name or alias. For encoding names
329 and aliases, see L</"Defining Aliases">. For CHECK see
330 L</"Handling Malformed Data">.
332 For example to convert ISO-8859-1 data to UTF-8:
334 $utf8 = decode("iso-8859-1", $latin1);
336 =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
338 Convert B<in-place> the data between two encodings. How did the data
339 in $string originally get to be in FROM_ENCODING? Either using
340 encode() or through PerlIO: See L</"Encoding and IO">.
341 For encoding names and aliases, see L</"Defining Aliases">.
342 For CHECK see L</"Handling Malformed Data">.
344 For example to convert ISO-8859-1 data to UTF-8:
346 from_to($data, "iso-8859-1", "utf-8");
348 and to convert it back:
350 from_to($data, "utf-8", "iso-8859-1");
352 Note that because the conversion happens in place, the data to be
353 converted cannot be a string constant, it must be a scalar variable.
355 from_to() return the length of the converted string on success, undef
360 =head2 Listing available encodings
363 @list = Encode->encodings();
365 Returns a list of the canonical names of the available encodings that
366 are loaded. To get a list of all available encodings including the
367 ones that are not loaded yet, say
369 @all_encodings = Encode->encodings(":all");
371 Or you can give the name of specific module.
373 @with_jp = Encode->encodings("Encode/JP.pm");
375 Note in this case you have to say C<"Encode/JP.pm"> instead of
378 To find which encodings are supported by this package in details,
379 see L<Encode::Supported>.
382 =head2 Defining Aliases
384 To add new alias to a given encoding, Use;
388 define_alias(newName => ENCODING);
390 After that, newName can be used as an alias for ENCODING.
391 ENCODING may be either the name of an encoding or an I<encoding
394 See L<Encode::Alias> on details.
396 =head1 Encoding and IO
398 It is very common to want to do encoding transformations when
399 reading or writing files, network connections, pipes etc.
400 If Perl is configured to use the new 'perlio' IO system then
401 C<Encode> provides a "layer" (See L<perliol>) which can transform
402 data as it is read or written.
404 Here is how the blind poet would modernise the encoding:
407 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
408 open(my $utf8,'>:utf8','iliad.utf8');
414 In addition the new IO system can also be configured to read/write
415 UTF-8 encoded characters (as noted above this is efficient):
417 open(my $fh,'>:utf8','anything');
418 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
420 Either of the above forms of "layer" specifications can be made the default
421 for a lexical scope with the C<use open ...> pragma. See L<open>.
423 Once a handle is open is layers can be altered using C<binmode>.
425 Without any such configuration, or if Perl itself is built using
426 system's own IO, then write operations assume that file handle accepts
427 only I<bytes> and will C<die> if a character larger than 255 is
428 written to the handle. When reading, each octet from the handle
429 becomes a byte-in-a-character. Note that this default is the same
430 behaviour as bytes-only languages (including Perl before v5.6) would
431 have, and is sufficient to handle native 8-bit encodings
432 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
433 other encodings and binary data.
435 In other cases it is the programs responsibility to transform
436 characters into bytes using the API above before doing writes, and to
437 transform the bytes read from a handle into characters before doing
438 "character operations" (e.g. C<lc>, C</\W+/>, ...).
440 You can also use PerlIO to convert larger amounts of data you don't
441 want to bring into memory. For example to convert between ISO-8859-1
442 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
444 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
445 open(G, ">:utf8", "data.utf") or die $!;
446 while (<F>) { print G }
448 # Could also do "print G <F>" but that would pull
449 # the whole file into memory just to write it out again.
453 open(my $f, "<:encoding(cp1252)")
454 open(my $g, ">:encoding(iso-8859-2)")
455 open(my $h, ">:encoding(latin9)") # iso-8859-15
457 See L<PerlIO> for more information.
459 See also L<encoding> for how to change the default encoding of the
462 =head1 Handling Malformed Data
464 If CHECK is not set, C<undef> is returned. If the data is supposed to
465 be UTF-8, an optional lexical warning (category utf8) is given. If
466 CHECK is true but not a code reference, dies.
468 It would desirable to have a way to indicate that transform should use
469 the encodings "replacement character" - no such mechanism is defined yet.
471 It is also planned to allow I<CHECK> to be a code reference.
473 This is not yet implemented as there are design issues with what its
474 arguments should be and how it returns its results.
480 Passed remaining fragment of string being processed.
481 Modifies it in place to remove bytes/characters it can understand
482 and returns a string used to represent them.
486 my $ch = substr($_[0],0,1,'');
487 return sprintf("\x{%02X}",ord($ch);
490 This scheme is close to how underlying C code for Encode works, but gives
491 the fixup routine very little context.
495 Passed original string, and an index into it of the problem area, and
496 output string so far. Appends what it will to output string and
497 returns new index into original string. For example:
500 # my ($s,$i,$d) = @_;
501 my $ch = substr($_[0],$_[1],1);
502 $_[2] .= sprintf("\x{%02X}",ord($ch);
506 This scheme gives maximal control to the fixup routine but is more
507 complicated to code, and may need internals of Encode to be tweaked to
508 keep original string intact.
514 Multiple return values rather than in-place modifications.
516 Index into the string could be C<pos($str)> allowing C<s/\G...//>.
522 The Unicode consortium defines the UTF-8 standard as a way of encoding
523 the entire Unicode repertoire as sequences of octets. This encoding is
524 expected to become very widespread. Perl can use this form internally
525 to represent strings, so conversions to and from this form are
526 particularly efficient (as octets in memory do not have to change,
527 just the meta-data that tells Perl how to treat them).
531 =item $bytes = encode_utf8($string);
533 The characters that comprise string are encoded in Perl's superset of UTF-8
534 and the resulting octets returned as a sequence of bytes. All possible
535 characters have a UTF-8 representation so this function cannot fail.
537 =item $string = decode_utf8($bytes [, CHECK]);
539 The sequence of octets represented by $bytes is decoded from UTF-8
540 into a sequence of logical characters. Not all sequences of octets
541 form valid UTF-8 encodings, so it is possible for this call to fail.
542 For CHECK see L</"Handling Malformed Data">.
546 =head1 Defining Encodings
548 To define a new encoding, use:
550 use Encode qw(define_alias);
551 define_encoding($object, 'canonicalName' [, alias...]);
553 I<canonicalName> will be associated with I<$object>. The object
554 should provide the interface described in L<Encode::Encoding>
555 If more than two arguments are provided then additional
556 arguments are taken as aliases for I<$object> as for C<define_alias>.
558 =head1 Messing with Perl's Internals
560 The following API uses parts of Perl's internals in the current
561 implementation. As such they are efficient, but may change.
565 =item is_utf8(STRING [, CHECK])
567 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
568 If CHECK is true, also checks the data in STRING for being well-formed
569 UTF-8. Returns true if successful, false otherwise.
571 =item _utf8_on(STRING)
573 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
574 B<not> checked for being well-formed UTF-8. Do not use unless you
575 B<know> that the STRING is well-formed UTF-8. Returns the previous
576 state of the UTF-8 flag (so please don't test the return value as
577 I<not> success or failure), or C<undef> if STRING is not a string.
579 =item _utf8_off(STRING)
581 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
582 Returns the previous state of the UTF-8 flag (so please don't test the
583 return value as I<not> success or failure), or C<undef> if STRING is
591 L<Encode::Supported>,
598 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>