3 our $VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
8 our @ISA = qw(Exporter DynaLoader);
10 # Public, encouraged API is exported by default
36 # Documentation moved after __END__ for speed - NI-S
40 # Make a %encoding package variable to allow a certain amount of cheating
42 my @alias; # ordered matching list
43 my %alias; # cached known aliases
45 # 0 1 2 3 4 5 6 7 8 9 10
46 our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
60 our %external_tables =
62 'euc-cn' => 'Encode/CN.pm',
63 gb2312 => 'Encode/CN.pm',
64 gb12345 => 'Encode/CN.pm',
65 gbk => 'Encode/CN.pm',
66 cp936 => 'Encode/CN.pm',
67 'iso-ir-165' => 'Encode/CN.pm',
68 'euc-jp' => 'Encode/JP.pm',
69 'iso-2022-jp' => 'Encode/JP.pm',
70 '7bit-jis' => 'Encode/JP.pm',
71 shiftjis => 'Encode/JP.pm',
72 macjapan => 'Encode/JP.pm',
73 cp932 => 'Encode/JP.pm',
74 'euc-kr' => 'Encode/KR.pm',
75 ksc5601 => 'Encode/KR.pm',
76 cp949 => 'Encode/KR.pm',
77 big5 => 'Encode/TW.pm',
78 'big5-hkscs' => 'Encode/TW.pm',
79 cp950 => 'Encode/TW.pm',
80 gb18030 => 'Encode/HanExtra.pm',
81 big5plus => 'Encode/HanExtra.pm',
82 'euc-tw' => 'Encode/HanExtra.pm',
90 sort { $a->[1] cmp $b->[1] }
92 grep { $_ ne 'Internal' }
100 # print "# findAlias $_\n";
101 unless (exists $alias{$_})
103 for (my $i=0; $i < @alias; $i += 2)
105 my $alias = $alias[$i];
106 my $val = $alias[$i+1];
108 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
112 elsif (ref($alias) eq 'CODE')
114 $new = &{$alias}($val)
116 elsif (lc($_) eq lc($alias))
122 next if $new eq $_; # avoid (direct) recursion on bugs
123 my $enc = (ref($new)) ? $new : find_encoding($new);
139 my ($alias,$name) = splice(@_,0,2);
140 push(@alias, $alias => $name);
# Built-in alias registrations. Each define_alias() call pairs a pattern
# (a qr// regexp or a plain string) with a target encoding name. Per the
# "Defining Aliases" POD, a non-reference target of a regexp alias is
# eval-ed so that $1 etc. can be substituted, which is why those targets
# are written as '"..."' (double quotes inside single quotes).
144 # Allow variants of iso-8859-1 etc.
145 define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
147 # At least HP-UX has these.
148 define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
151 define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
153 # The Official name of ASCII.
154 define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
156 # This is a font issue, not an encoding issue.
157 # (The currency symbol of the Latin 1 upper half
158 # has been redefined as the euro symbol.)
159 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
161 # Allow latin-1 style names as well
# The captured Latin-N number indexes @latin2iso_num to obtain the
# ISO-8859 part number.
162 define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
164 # Allow winlatin1 style names as well
# NOTE(review): 'baltic' appears twice in the alternation below, and
# %winlatin2cp is not visible in this excerpt - confirm its keys match the
# \u-capitalised capture.
165 define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|baltic|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );
167 # Common names for non-latin preferred MIME names
168 define_alias( 'ascii' => 'US-ascii',
169 'cyrillic' => 'iso-8859-5',
170 'arabic' => 'iso-8859-6',
171 'greek' => 'iso-8859-7',
172 'hebrew' => 'iso-8859-8',
173 'thai' => 'iso-8859-11',
174 'tis620' => 'iso-8859-11',
177 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
178 # And Microsoft has their own naming (again, surprisingly).
179 define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
181 # Sometimes seen with a leading zero.
182 define_alias( qr/^cp037$/i => '"cp37"');
185 define_alias( qr/^macRomanian$/i => '"macRumanian"');
187 # Standardize on the dashed versions.
# NOTE(review): unlike the entries above, these regexp aliases (and the
# ujis entry further down) use targets without the inner double quotes
# that the "Defining Aliases" POD says regexp aliases need - confirm they
# resolve as intended.
188 define_alias( qr/^utf8$/i => 'utf-8' );
189 define_alias( qr/^koi8r$/i => 'koi8-r' );
190 define_alias( qr/^koi8u$/i => 'koi8-u' );
192 # Seen in some Linuxes.
193 define_alias( qr/^ujis$/i => 'euc-jp' );
195 # CP936 doesn't have vendor-addon for GBK, so they're identical.
196 define_alias( qr/^gbk$/i => '"cp936"');
198 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
199 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
200 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
201 # TODO: Armenian encoding ARMSCII-8
202 # TODO: Hebrew encoding ISO-8859-8-1
203 # TODO: Thai encoding TCVN
204 # TODO: Korean encoding Johab
205 # TODO: Vietnamese encodings VPS
206 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
207 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
208 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
209 # Kannada Khmer Korean Laotian Malayalam Mongolian
210 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
212 # Map white space and _ to '-'
# The first run of whitespace/underscores after the leading non-space
# token is replaced by a single '-'.
213 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
219 $encoding{$name} = $obj;
221 define_alias($lc => $obj) unless $lc eq $name;
225 define_alias($alias,$obj);
232 my ($class,$name,$skip_external) = @_;
234 if (ref($name) && $name->can('new_sequence'))
239 if (exists $encoding{$name})
241 return $encoding{$name};
243 if (exists $encoding{$lc})
245 return $encoding{$lc};
248 my $oc = $class->findAlias($name);
249 return $oc if defined $oc;
251 $oc = $class->findAlias($lc) if $lc ne $name;
252 return $oc if defined $oc;
254 if (!$skip_external and exists $external_tables{$lc})
256 require $external_tables{$lc};
257 return $encoding{$name} if exists $encoding{$name};
265 my ($name,$skip_external) = @_;
266 return __PACKAGE__->getEncoding($name,$skip_external);
271 my ($name,$string,$check) = @_;
272 my $enc = find_encoding($name);
273 croak("Unknown encoding '$name'") unless defined $enc;
274 my $octets = $enc->encode($string,$check);
275 return undef if ($check && length($string));
281 my ($name,$octets,$check) = @_;
282 my $enc = find_encoding($name);
283 croak("Unknown encoding '$name'") unless defined $enc;
284 my $string = $enc->decode($octets,$check);
285 $_[1] = $octets if $check;
# Body of the in-place transcoder. The argument order matches the documented
# from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK]) call; the enclosing
# sub header is not visible in this excerpt.
# Looks up both encodings (croaking on an unknown name), decodes the first
# argument from $from, encodes the result as $to, writes it back into the
# caller's variable through $_[0], and returns the length of what was stored.
291 my ($string,$from,$to,$check) = @_;
292 my $f = find_encoding($from);
293 croak("Unknown encoding '$from'") unless defined $f;
294 my $t = find_encoding($to);
295 croak("Unknown encoding '$to'") unless defined $t;
296 my $uni = $f->decode($string,$check);
# With CHECK set, any input left in $string is treated as a failure.
# NOTE(review): this presumes decode() removes what it converts from its
# argument - confirm against the encoding objects.
297 return undef if ($check && length($string));
298 $string = $t->encode($uni,$check);
# Same leftover test for the encode step, this time on $uni.
299 return undef if ($check && length($uni));
# Assigning to $_[0] modifies the caller's variable (the in-place result).
300 return length($_[0] = $string);
313 return undef unless utf8::decode($str);
317 require Encode::Encoding;
319 require Encode::Internal;
320 require Encode::Unicode;
321 require Encode::utf8;
322 require Encode::iso10646_1;
323 require Encode::ucs2_le;
331 Encode - character encodings
339 The C<Encode> module provides the interfaces between Perl's strings
340 and the rest of the system. Perl strings are sequences of B<characters>.
342 To find more about character encodings, please consult
343 L<Encode::Description> . This document focuses on programming references.
345 =head1 PERL ENCODING API
347 =head2 Generic Encoding Interface
353 $bytes = encode(ENCODING, $string[, CHECK])
355 Encodes string from Perl's internal form into I<ENCODING> and returns
356 a sequence of octets. For CHECK see L</"Handling Malformed Data">.
358 For example to convert (internally UTF-8 encoded) Unicode data
361 $octets = encode("utf8", $unicode);
365 $string = decode(ENCODING, $bytes[, CHECK])
367 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
368 internal form and returns the resulting string. For CHECK see
369 L</"Handling Malformed Data">.
371 For example to convert ISO-8859-1 data to UTF-8:
373 $utf8 = decode("latin1", $latin1);
377 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
379 Convert B<in-place> the data between two encodings. How did the data
380 in $string originally get to be in FROM_ENCODING? Either using
381 encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
382 see L</"Handling Malformed Data">.
384 For example to convert ISO-8859-1 data to UTF-8:
386 from_to($data, "iso-8859-1", "utf-8");
388 and to convert it back:
390 from_to($data, "utf-8", "iso-8859-1");
392 Note that because the conversion happens in place, the data to be
393 converted cannot be a string constant, it must be a scalar variable.
397 =head2 Handling Malformed Data
399 If CHECK is not set, C<undef> is returned. If the data is supposed to
400 be UTF-8, an optional lexical warning (category utf8) is given. If
401 CHECK is true but not a code reference, dies.
403 It would be desirable to have a way to indicate that the transform should use
404 the encoding's "replacement character" - no such mechanism is defined yet.
406 It is also planned to allow I<CHECK> to be a code reference.
408 This is not yet implemented as there are design issues with what its
409 arguments should be and how it returns its results.
415 Passed remaining fragment of string being processed.
416 Modifies it in place to remove bytes/characters it can understand
417 and returns a string used to represent them.
421 my $ch = substr($_[0],0,1,'');
422 return sprintf('\x{%02X}',ord($ch));
425 This scheme is close to how underlying C code for Encode works, but gives
426 the fixup routine very little context.
430 Passed original string, and an index into it of the problem area, and
431 output string so far. Appends what it will to output string and
432 returns new index into original string. For example:
435 # my ($s,$i,$d) = @_;
436 my $ch = substr($_[0],$_[1],1);
437 $_[2] .= sprintf('\x{%02X}',ord($ch));
441 This scheme gives maximal control to the fixup routine but is more
442 complicated to code, and may need internals of Encode to be tweaked to
443 keep original string intact.
449 Multiple return values rather than in-place modifications.
451 Index into the string could be pos($str) allowing s/\G...//.
457 The Unicode consortium defines the UTF-8 standard as a way of encoding
458 the entire Unicode repertoire as sequences of octets. This encoding is
459 expected to become very widespread. Perl can use this form internally
460 to represent strings, so conversions to and from this form are
461 particularly efficient (as octets in memory do not have to change,
462 just the meta-data that tells Perl how to treat them).
468 $bytes = encode_utf8($string);
470 The characters that comprise string are encoded in Perl's superset of UTF-8
471 and the resulting octets returned as a sequence of bytes. All possible
472 characters have a UTF-8 representation so this function cannot fail.
476 $string = decode_utf8($bytes [,CHECK]);
478 The sequence of octets represented by $bytes is decoded from UTF-8
479 into a sequence of logical characters. Not all sequences of octets
480 form valid UTF-8 encodings, so it is possible for this call to fail.
481 For CHECK see L</"Handling Malformed Data">.
485 =head2 Listing available encodings
487 use Encode qw(encodings);
490 Returns a list of the canonical names of the available encodings.
492 =head2 Defining Aliases
494 use Encode qw(define_alias);
495 define_alias( newName => ENCODING);
497 Allows newName to be used as an alias for ENCODING. ENCODING may be
498 either the name of an encoding or an encoding object (as above).
500 Currently I<newName> can be specified in the following ways:
504 =item As a simple string.
506 =item As a qr// compiled regular expression, e.g.:
508 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
510 In this case if I<ENCODING> is not a reference it is C<eval>-ed to
511 allow C<$1> etc. to be substituted. The example is one way to alias names as
512 used in X11 fonts to the MIME names for the iso-8859-*
513 family. Note the double quote inside the single quote. If you are
514 using regex here, you have to do so or it won't work in this case.
516 =item As a code reference, e.g.:
518 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
520 In this case C<$_> will be set to the name that is being looked up and
521 I<ENCODING> is passed to the sub as its first argument. The example
522 is another way to alias names as used in X11 fonts to the MIME
523 names for the iso-8859-* family.
527 =head1 Defining Encodings
529 use Encode qw(define_encoding);
530 define_encoding( $object, 'canonicalName' [,alias...]);
532 Causes I<canonicalName> to be associated with I<$object>. The object
533 should provide the interface described in L<Encode::Encoding>
534 below. If more than two arguments are provided then additional
535 arguments are taken as aliases for I<$object> as for C<define_alias>.
537 =head1 Encoding and IO
539 It is very common to want to do encoding transformations when
540 reading or writing files, network connections, pipes etc.
541 If Perl is configured to use the new 'perlio' IO system then
542 C<Encode> provides a "layer" (See L<perliol>) which can transform
543 data as it is read or written.
545 Here is how the blind poet would modernise the encoding:
548 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
549 open(my $utf8,'>:utf8','iliad.utf8');
555 In addition the new IO system can also be configured to read/write
556 UTF-8 encoded characters (as noted above this is efficient):
558 open(my $fh,'>:utf8','anything');
559 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
561 Either of the above forms of "layer" specifications can be made the default
562 for a lexical scope with the C<use open ...> pragma. See L<open>.
564 Once a handle is open, its layers can be altered using C<binmode>.
566 Without any such configuration, or if Perl itself is built using
567 system's own IO, then write operations assume that file handle accepts
568 only I<bytes> and will C<die> if a character larger than 255 is
569 written to the handle. When reading, each octet from the handle
570 becomes a byte-in-a-character. Note that this default is the same
571 behaviour as bytes-only languages (including Perl before v5.6) would
572 have, and is sufficient to handle native 8-bit encodings
573 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
574 other encodings and binary data.
576 In other cases it is the program's responsibility to transform
577 characters into bytes using the API above before doing writes, and to
578 transform the bytes read from a handle into characters before doing
579 "character operations" (e.g. C<lc>, C</\W+/>, ...).
581 You can also use PerlIO to convert larger amounts of data you don't
582 want to bring into memory. For example to convert between ISO-8859-1
583 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
585 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
586 open(G, ">:utf8", "data.utf") or die $!;
587 while (<F>) { print G }
589 # Could also do "print G <F>" but that would pull
590 # the whole file into memory just to write it out again.
594 open(my $f, "<:encoding(cp1252)")
595 open(my $g, ">:encoding(iso-8859-2)")
596 open(my $h, ">:encoding(latin9)") # iso-8859-15
598 See L<PerlIO> for more information.
600 See also L<encoding> for how to change the default encoding of the
603 =head1 Messing with Perl's Internals
605 The following API uses parts of Perl's internals in the current
606 implementation. As such they are efficient, but may change.
610 =item * is_utf8(STRING [, CHECK])
612 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
613 If CHECK is true, also checks the data in STRING for being well-formed
614 UTF-8. Returns true if successful, false otherwise.
620 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
621 B<not> checked for being well-formed UTF-8. Do not use unless you
622 B<know> that the STRING is well-formed UTF-8. Returns the previous
623 state of the UTF-8 flag (so please don't test the return value as
624 I<not> success or failure), or C<undef> if STRING is not a string.
630 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
631 Returns the previous state of the UTF-8 flag (so please don't test the
632 return value as I<not> success or failure), or C<undef> if STRING is
639 L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>,
640 L<utf8>, L<Encode::Description>, L<Encode::Encoding>, the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>