3 our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9 our @ISA = qw(Exporter DynaLoader);
11 # Public, encouraged API is exported by default
36 # Documentation moved after __END__ for speed - NI-S
42 # Make a %Encoding package variable to allow a certain amount of cheating
47 viscii => 'Encode/Byte.pm',
48 'koi8-r' => 'Encode/Byte.pm',
49 cp1047 => 'Encode/EBCDIC.pm',
50 cp37 => 'Encode/EBCDIC.pm',
51 'posix-bc' => 'Encode/EBCDIC.pm',
52 symbol => 'Encode/Symbol.pm',
53 dingbats => 'Encode/Symbol.pm',
54 'euc-cn' => 'Encode/CN.pm',
55 gb2312 => 'Encode/CN.pm',
56 gb12345 => 'Encode/CN.pm',
57 gbk => 'Encode/CN.pm',
58 cp936 => 'Encode/CN.pm',
59 'iso-ir-165' => 'Encode/CN.pm',
60 'euc-jp' => 'Encode/JP.pm',
61 'iso-2022-jp' => 'Encode/JP.pm',
62 '7bit-jis' => 'Encode/JP.pm',
63 shiftjis => 'Encode/JP.pm',
64 macjapan => 'Encode/JP.pm',
65 cp932 => 'Encode/JP.pm',
66 'euc-kr' => 'Encode/KR.pm',
67 ksc5601 => 'Encode/KR.pm',
68 cp949 => 'Encode/KR.pm',
69 big5 => 'Encode/TW.pm',
70 'big5-hkscs' => 'Encode/TW.pm',
71 cp950 => 'Encode/TW.pm',
72 gb18030 => 'Encode/HanExtra.pm',
73 big5plus => 'Encode/HanExtra.pm',
74 'euc-tw' => 'Encode/HanExtra.pm',
77 for my $k (2..11,13..16){
78 $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
81 for my $k (1250..1258){
82 $ExtModule{"cp$k"} = 'Encode/Byte.pm';
85 for my $k (qw(centeuro croatian cyrillic dingbats greek
86 iceland roman rumanian sami
87 thai turkish ukraine))
89 $ExtModule{"mac$k"} = 'Encode/Byte.pm';
96 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
99 $DEBUG and warn "about to require $m;";
100 eval { require $m; };
104 sort({$a->[1] cmp $b->[1]}
106 grep({ $_ ne 'Internal' } keys %Encoding))));
113 $Encoding{$name} = $obj;
115 define_alias($lc => $obj) unless $lc eq $name;
119 define_alias($alias,$obj);
126 my ($class,$name,$skip_external) = @_;
128 if (ref($name) && $name->can('new_sequence'))
133 if (exists $Encoding{$name})
135 return $Encoding{$name};
137 if (exists $Encoding{$lc})
139 return $Encoding{$lc};
142 my $oc = $class->find_alias($name);
143 return $oc if defined $oc;
145 $oc = $class->find_alias($lc) if $lc ne $name;
146 return $oc if defined $oc;
148 if (!$skip_external and exists $ExtModule{$lc})
150 eval{ require $ExtModule{$lc}; };
151 return $Encoding{$name} if exists $Encoding{$name};
159 my ($name,$skip_external) = @_;
160 return __PACKAGE__->getEncoding($name,$skip_external);
165 my ($name,$string,$check) = @_;
166 my $enc = find_encoding($name);
167 croak("Unknown encoding '$name'") unless defined $enc;
168 my $octets = $enc->encode($string,$check);
169 return undef if ($check && length($string));
175 my ($name,$octets,$check) = @_;
176 my $enc = find_encoding($name);
177 croak("Unknown encoding '$name'") unless defined $enc;
178 my $string = $enc->decode($octets,$check);
179 $_[1] = $octets if $check;
185 my ($string,$from,$to,$check) = @_;
186 my $f = find_encoding($from);
187 croak("Unknown encoding '$from'") unless defined $f;
188 my $t = find_encoding($to);
189 croak("Unknown encoding '$to'") unless defined $t;
190 my $uni = $f->decode($string,$check);
191 return undef if ($check && length($string));
192 $string = $t->encode($uni,$check);
193 return undef if ($check && length($uni));
194 return length($_[0] = $string);
207 return undef unless utf8::decode($str);
211 require Encode::Encoding;
213 require Encode::Internal;
214 require Encode::Unicode;
215 require Encode::utf8;
216 require Encode::iso10646_1;
217 require Encode::ucs2_le;
225 Encode - character encodings
233 The C<Encode> module provides the interfaces between Perl's strings
234 and the rest of the system. Perl strings are sequences of B<characters>.
236 To find more about character encodings, please consult
237 L<Encode::Details> . This document focuses on programming references.
239 =head1 PERL ENCODING API
241 =head2 Generic Encoding Interface
247 $bytes = encode(ENCODING, $string[, CHECK])
249 Encodes string from Perl's internal form into I<ENCODING> and returns
250 a sequence of octets. For CHECK see L</"Handling Malformed Data">.
252 For example to convert (internally UTF-8 encoded) Unicode data
255 $octets = encode("utf8", $unicode);
259 $string = decode(ENCODING, $bytes[, CHECK])
261 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
262 internal form and returns the resulting string. For CHECK see
263 L</"Handling Malformed Data">.
265 For example to convert ISO-8859-1 data to UTF-8:
267 $utf8 = decode("latin1", $latin1);
271 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
273 Convert B<in-place> the data between two encodings. How did the data
274 in $string originally get to be in FROM_ENCODING? Either using
275 encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
276 see L</"Handling Malformed Data">.
278 For example to convert ISO-8859-1 data to UTF-8:
280 from_to($data, "iso-8859-1", "utf-8");
282 and to convert it back:
284 from_to($data, "utf-8", "iso-8859-1");
286 Note that because the conversion happens in place, the data to be
287 converted cannot be a string constant, it must be a scalar variable.
291 =head2 Handling Malformed Data
293 If CHECK is not set, C<undef> is returned. If the data is supposed to
294 be UTF-8, an optional lexical warning (category utf8) is given. If
295 CHECK is true but not a code reference, dies.
297 It would desirable to have a way to indicate that transform should use
298 the encodings "replacement character" - no such mechanism is defined yet.
300 It is also planned to allow I<CHECK> to be a code reference.
302 This is not yet implemented as there are design issues with what its
303 arguments should be and how it returns its results.
309 Passed remaining fragment of string being processed.
310 Modifies it in place to remove bytes/characters it can understand
311 and returns a string used to represent them.
315 my $ch = substr($_[0],0,1,'');
316 return sprintf("\x{%02X}",ord($ch);
319 This scheme is close to how underlying C code for Encode works, but gives
320 the fixup routine very little context.
324 Passed original string, and an index into it of the problem area, and
325 output string so far. Appends what it will to output string and
326 returns new index into original string. For example:
329 # my ($s,$i,$d) = @_;
330 my $ch = substr($_[0],$_[1],1);
331 $_[2] .= sprintf("\x{%02X}",ord($ch);
335 This scheme gives maximal control to the fixup routine but is more
336 complicated to code, and may need internals of Encode to be tweaked to
337 keep original string intact.
343 Multiple return values rather than in-place modifications.
345 Index into the string could be pos($str) allowing s/\G...//.
351 The Unicode consortium defines the UTF-8 standard as a way of encoding
352 the entire Unicode repertiore as sequences of octets. This encoding is
353 expected to become very widespread. Perl can use this form internaly
354 to represent strings, so conversions to and from this form are
355 particularly efficient (as octets in memory do not have to change,
356 just the meta-data that tells Perl how to treat them).
362 $bytes = encode_utf8($string);
364 The characters that comprise string are encoded in Perl's superset of UTF-8
365 and the resulting octets returned as a sequence of bytes. All possible
366 characters have a UTF-8 representation so this function cannot fail.
370 $string = decode_utf8($bytes [,CHECK]);
372 The sequence of octets represented by $bytes is decoded from UTF-8
373 into a sequence of logical characters. Not all sequences of octets
374 form valid UTF-8 encodings, so it is possible for this call to fail.
375 For CHECK see L</"Handling Malformed Data">.
379 =head2 Listing available encodings
382 @list = Encode->encodings();
384 Returns a list of the canonical names of the available encodings that
385 are loaded. To get a list of all available encodings including the
386 ones that are not loaded yet, say
388 @all_encodings = Encode->encodings(":all");
390 Or you can give the name of specific module.
392 @with_jp = Encode->encodings("Encode/JP.pm");
394 Note in this case you have to say "Encode/JP.pm instead of Encode::JP.
396 To find which encodings are suppoted by this package in details,
397 see L<Encode::Supported>.
399 =head2 Defining Aliases
403 define_alias( newName => ENCODING);
405 Allows newName to be used as am alias for ENCODING. ENCODING may be
406 either the name of an encoding or and encoding object (as above).
408 See L<Encode::Alias> on details.
410 =head1 Defining Encodings
412 use Encode qw(define_alias);
413 define_encoding( $object, 'canonicalName' [,alias...]);
415 Causes I<canonicalName> to be associated with I<$object>. The object
416 should provide the interface described in L<Encode::Encoding>
417 below. If more than two arguments are provided then additional
418 arguments are taken as aliases for I<$object> as for C<define_alias>.
420 =head1 Encoding and IO
422 It is very common to want to do encoding transformations when
423 reading or writing files, network connections, pipes etc.
424 If Perl is configured to use the new 'perlio' IO system then
425 C<Encode> provides a "layer" (See L<perliol>) which can transform
426 data as it is read or written.
428 Here is how the blind poet would modernise the encoding:
431 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
432 open(my $utf8,'>:utf8','iliad.utf8');
438 In addition the new IO system can also be configured to read/write
439 UTF-8 encoded characters (as noted above this is efficient):
441 open(my $fh,'>:utf8','anything');
442 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
444 Either of the above forms of "layer" specifications can be made the default
445 for a lexical scope with the C<use open ...> pragma. See L<open>.
447 Once a handle is open is layers can be altered using C<binmode>.
449 Without any such configuration, or if Perl itself is built using
450 system's own IO, then write operations assume that file handle accepts
451 only I<bytes> and will C<die> if a character larger than 255 is
452 written to the handle. When reading, each octet from the handle
453 becomes a byte-in-a-character. Note that this default is the same
454 behaviour as bytes-only languages (including Perl before v5.6) would
455 have, and is sufficient to handle native 8-bit encodings
456 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
457 other encodings and binary data.
459 In other cases it is the programs responsibility to transform
460 characters into bytes using the API above before doing writes, and to
461 transform the bytes read from a handle into characters before doing
462 "character operations" (e.g. C<lc>, C</\W+/>, ...).
464 You can also use PerlIO to convert larger amounts of data you don't
465 want to bring into memory. For example to convert between ISO-8859-1
466 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
468 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
469 open(G, ">:utf8", "data.utf") or die $!;
470 while (<F>) { print G }
472 # Could also do "print G <F>" but that would pull
473 # the whole file into memory just to write it out again.
477 open(my $f, "<:encoding(cp1252)")
478 open(my $g, ">:encoding(iso-8859-2)")
479 open(my $h, ">:encoding(latin9)") # iso-8859-15
481 See L<PerlIO> for more information.
483 See also L<encoding> for how to change the default encoding of the
486 =head1 Messing with Perl's Internals
488 The following API uses parts of Perl's internals in the current
489 implementation. As such they are efficient, but may change.
493 =item * is_utf8(STRING [, CHECK])
495 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
496 If CHECK is true, also checks the data in STRING for being well-formed
497 UTF-8. Returns true if successful, false otherwise.
503 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
504 B<not> checked for being well-formed UTF-8. Do not use unless you
505 B<know> that the STRING is well-formed UTF-8. Returns the previous
506 state of the UTF-8 flag (so please don't test the return value as
507 I<not> success or failure), or C<undef> if STRING is not a string.
513 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
514 Returns the previous state of the UTF-8 flag (so please don't test the
515 return value as I<not> success or failure), or C<undef> if STRING is
524 L<Encode::Supported>,
531 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>