Upgrade to Encode 1.26, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
f2a2953c 3our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 encode
14 decode
15 encode_utf8
16 decode_utf8
17 find_encoding
51ef4e11 18 encodings
4411f3b6 19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
51ef4e11 23 define_encoding
2c674647 24 from_to
25 is_utf8
4411f3b6 26 is_8bit
27 is_16bit
a12c0f56 28 utf8_upgrade
29 utf8_downgrade
4411f3b6 30 _utf8_on
31 _utf8_off
2c674647 32 );
33
34bootstrap Encode ();
35
4411f3b6 36# Documentation moved after __END__ for speed - NI-S
2c674647 37
bf230f3d 38use Carp;
39
a63c962f 40our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 41
5d030b67 42use Encode::Alias;
43
5129552c 44# Make a %Encoding package variable to allow a certain amount of cheating
45our %Encoding;
a999c27c 46our %ExtModule;
47
48my @codepages = qw(
49 37 424 437 500 737 775 850 852 855
50 856 857 860 861 862 863 864 865 866
51 869 874 875 932 936 949 950 1006 1026
52 1047 1250 1251 1252 1253 1254 1255 1256 1257
53 1258
54 );
55
56my @macintosh = qw(
57 CentralEurRoman Croatian Cyrillic Greek
58 Iceland Roman Rumanian Sami
59 Thai Turkish Ukrainian
60 );
d1ed7747 61
5129552c 62for my $k (2..11,13..16){
63 $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
64}
65
a999c27c 66for my $k (@codepages){
5129552c 67 $ExtModule{"cp$k"} = 'Encode/Byte.pm';
68}
69
a999c27c 70for my $k (@macintosh)
71{
72 $ExtModule{"mac$k"} = 'Encode/Byte.pm';
73}
74
75%ExtModule =
76 (%ExtModule,
77 'koi8-r' => 'Encode/Byte.pm',
78 'posix-bc' => 'Encode/EBCDIC.pm',
79 cp037 => 'Encode/EBCDIC.pm',
80 cp1026 => 'Encode/EBCDIC.pm',
81 cp1047 => 'Encode/EBCDIC.pm',
82 cp500 => 'Encode/EBCDIC.pm',
83 cp875 => 'Encode/EBCDIC.pm',
84 dingbats => 'Encode/Symbol.pm',
85 macDingbats => 'Encode/Symbol.pm',
86 macSymbol => 'Encode/Symbol.pm',
87 symbol => 'Encode/Symbol.pm',
88 viscii => 'Encode/Byte.pm',
89);
90
a63c962f 91unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
f2a2953c 92%ExtModule =
93 (%ExtModule,
a999c27c 94
f2a2953c 95 'cp936' => 'Encode/CN.pm',
96 'euc-cn' => 'Encode/CN.pm',
97 'gb12345-raw' => 'Encode/CN.pm',
98 'gb2312-raw' => 'Encode/CN.pm',
99 'gbk' => 'Encode/CN.pm',
100 'iso-ir-165' => 'Encode/CN.pm',
101
102 '7bit-jis' => 'Encode/JP.pm',
103 'cp932' => 'Encode/JP.pm',
104 'euc-jp' => 'Encode/JP.pm',
105 'iso-2022-jp' => 'Encode/JP.pm',
106 'iso-2022-jp-1' => 'Encode/JP.pm',
107 'jis0201-raw' => 'Encode/JP.pm',
108 'jis0208-raw' => 'Encode/JP.pm',
109 'jis0212-raw' => 'Encode/JP.pm',
110 'macJapanese' => 'Encode/JP.pm',
111 'shiftjis' => 'Encode/JP.pm',
112
113 'cp949' => 'Encode/KR.pm',
114 'euc-kr' => 'Encode/KR.pm',
115 'ksc5601' => 'Encode/KR.pm',
116 'macKorean' => 'Encode/KR.pm',
117
118 'big5' => 'Encode/TW.pm',
119 'big5-hkscs' => 'Encode/TW.pm',
120 'cp950' => 'Encode/TW.pm',
121
122 'big5plus' => 'Encode/HanExtra.pm',
123 'euc-tw' => 'Encode/HanExtra.pm',
124 'gb18030' => 'Encode/HanExtra.pm',
125 );
126}
5129552c 127
656753f8 128sub encodings
129{
5129552c 130 my $class = shift;
071db25d 131 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
5129552c 132 for my $m (@modules)
133 {
134 $DEBUG and warn "about to require $m;";
135 eval { require $m; };
136 }
137 return
138 map({$_->[0]}
139 sort({$a->[1] cmp $b->[1]}
140 map({[$_, lc $_]}
141 grep({ $_ ne 'Internal' } keys %Encoding))));
51ef4e11 142}
143
51ef4e11 144sub define_encoding
145{
18586f54 146 my $obj = shift;
147 my $name = shift;
5129552c 148 $Encoding{$name} = $obj;
18586f54 149 my $lc = lc($name);
150 define_alias($lc => $obj) unless $lc eq $name;
151 while (@_)
152 {
153 my $alias = shift;
154 define_alias($alias,$obj);
155 }
156 return $obj;
656753f8 157}
158
656753f8 159sub getEncoding
160{
dd9703c9 161 my ($class,$name,$skip_external) = @_;
18586f54 162 my $enc;
163 if (ref($name) && $name->can('new_sequence'))
164 {
165 return $name;
166 }
167 my $lc = lc $name;
5129552c 168 if (exists $Encoding{$name})
18586f54 169 {
5129552c 170 return $Encoding{$name};
18586f54 171 }
5129552c 172 if (exists $Encoding{$lc})
18586f54 173 {
5129552c 174 return $Encoding{$lc};
18586f54 175 }
c50d192e 176
5129552c 177 my $oc = $class->find_alias($name);
c50d192e 178 return $oc if defined $oc;
179
5129552c 180 $oc = $class->find_alias($lc) if $lc ne $name;
c50d192e 181 return $oc if defined $oc;
182
5129552c 183 if (!$skip_external and exists $ExtModule{$lc})
d1ed7747 184 {
5129552c 185 eval{ require $ExtModule{$lc}; };
186 return $Encoding{$name} if exists $Encoding{$name};
d1ed7747 187 }
18586f54 188
18586f54 189 return;
656753f8 190}
191
4411f3b6 192sub find_encoding
193{
dd9703c9 194 my ($name,$skip_external) = @_;
195 return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6 196}
197
198sub encode
199{
18586f54 200 my ($name,$string,$check) = @_;
201 my $enc = find_encoding($name);
202 croak("Unknown encoding '$name'") unless defined $enc;
203 my $octets = $enc->encode($string,$check);
204 return undef if ($check && length($string));
205 return $octets;
4411f3b6 206}
207
208sub decode
209{
18586f54 210 my ($name,$octets,$check) = @_;
211 my $enc = find_encoding($name);
212 croak("Unknown encoding '$name'") unless defined $enc;
213 my $string = $enc->decode($octets,$check);
214 $_[1] = $octets if $check;
215 return $string;
4411f3b6 216}
217
218sub from_to
219{
18586f54 220 my ($string,$from,$to,$check) = @_;
221 my $f = find_encoding($from);
222 croak("Unknown encoding '$from'") unless defined $f;
223 my $t = find_encoding($to);
224 croak("Unknown encoding '$to'") unless defined $t;
225 my $uni = $f->decode($string,$check);
226 return undef if ($check && length($string));
a999c27c 227 $string = $t->encode($uni,$check);
18586f54 228 return undef if ($check && length($uni));
3ef515df 229 return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6 230}
231
232sub encode_utf8
233{
18586f54 234 my ($str) = @_;
235 utf8::encode($str);
236 return $str;
4411f3b6 237}
238
239sub decode_utf8
240{
18586f54 241 my ($str) = @_;
242 return undef unless utf8::decode($str);
243 return $str;
5ad8ef52 244}
245
f2a2953c 246predefine_encodings();
247
248#
249# This is to restore %Encoding if really needed;
250#
251sub predefine_encodings{
252 if ($ON_EBCDIC) {
253 # was in Encode::UTF_EBCDIC
254 package Encode::UTF_EBCDIC;
255 *name = sub{ shift->{'Name'} };
256 *new_sequence = sub{ return $_[0] };
257 *decode = sub{
258 my ($obj,$str,$chk) = @_;
259 my $res = '';
260 for (my $i = 0; $i < length($str); $i++) {
261 $res .=
262 chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
263 }
264 $_[1] = '' if $chk;
265 return $res;
266 };
267 *encode = sub{
268 my ($obj,$str,$chk) = @_;
269 my $res = '';
270 for (my $i = 0; $i < length($str); $i++) {
271 $res .=
272 chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
273 }
274 $_[1] = '' if $chk;
275 return $res;
276 };
277 $Encode::Encoding{Unicode} =
278 bless {Name => "UTF_EBCDIC"}, "Encode::UTF_EBCDIC";
279 } else {
280 # was in Encode::UTF_EBCDIC
281 package Encode::Internal;
282 *name = sub{ shift->{'Name'} };
283 *new_sequence = sub{ return $_[0] };
284 *decode = sub{
285 my ($obj,$str,$chk) = @_;
286 utf8::upgrade($str);
287 $_[1] = '' if $chk;
288 return $str;
289 };
290 *encode = \&decode;
291 $Encode::Encoding{Unicode} =
292 bless {Name => "Internal"}, "Encode::Internal";
293 }
294
295 {
296 # was in Encode::utf8
297 package Encode::utf8;
298 *name = sub{ shift->{'Name'} };
299 *new_sequence = sub{ return $_[0] };
300 *decode = sub{
301 my ($obj,$octets,$chk) = @_;
302 my $str = Encode::decode_utf8($octets);
303 if (defined $str) {
304 $_[1] = '' if $chk;
305 return $str;
306 }
307 return undef;
308 };
309 *encode = sub {
310 my ($obj,$string,$chk) = @_;
311 my $octets = Encode::encode_utf8($string);
312 $_[1] = '' if $chk;
313 return $octets;
314 };
315 $Encode::Encoding{utf8} =
316 bless {Name => "utf8"}, "Encode::utf8";
317 }
318 # do externals if necessary
319 require File::Basename;
320 require File::Spec;
321 for my $ext (qw(Unicode)){
322 my $pm =
323 File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}),
324 "Encode", "$ext.pm");
325 do $pm;
326 }
327}
328
18586f54 329require Encode::Encoding;
330require Encode::XS;
4411f3b6 331
656753f8 3321;
333
2a936312 334__END__
335
4411f3b6 336=head1 NAME
337
338Encode - character encodings
339
340=head1 SYNOPSIS
341
342 use Encode;
343
67d7b5ef 344
345=head2 Table of Contents
346
347Encode consists of a collection of modules which details are too big
348to fit in one document. This POD itself explains the top-level APIs
349and general topics at a glance. For other topics and more details,
350see the PODs below;
351
352 Name Description
353 --------------------------------------------------------
354 Encode::Alias Alias defintions to encodings
355 Encode::Encoding Encode Implementation Base Class
356 Encode::Supported List of Supported Encodings
357 Encode::CN Simplified Chinese Encodings
358 Encode::JP Japanese Encodings
359 Encode::KR Korean Encodings
360 Encode::TW Traditional Chinese Encodings
361 --------------------------------------------------------
362
4411f3b6 363=head1 DESCRIPTION
364
47bfe92f 365The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 366and the rest of the system. Perl strings are sequences of
367B<characters>.
368
369The repertoire of characters that Perl can represent is at least that
370defined by the Unicode Consortium. On most platforms the ordinal
371values of the characters (as returned by C<ord(ch)>) is the "Unicode
372codepoint" for the character (the exceptions are those platforms where
373the legacy encoding is some variant of EBCDIC rather than a super-set
374of ASCII - see L<perlebcdic>).
375
376Traditionally computer data has been moved around in 8-bit chunks
377often called "bytes". These chunks are also known as "octets" in
378networking standards. Perl is widely used to manipulate data of many
379types - not only strings of characters representing human or computer
380languages but also "binary" data being the machines representation of
381numbers, pixels in an image - or just about anything.
382
383When Perl is processing "binary data" the programmer wants Perl to
384process "sequences of bytes". This is not a problem for Perl - as a
385byte has 256 possible values it easily fits in Perl's much larger
386"logical character".
387
388=head2 TERMINOLOGY
4411f3b6 389
67d7b5ef 390=over 4
21938dfa 391
67d7b5ef 392=item *
393
394I<character>: a character in the range 0..(2**32-1) (or more).
395(What Perl's strings are made of.)
396
397=item *
398
399I<byte>: a character in the range 0..255
400(A special case of a Perl character.)
401
402=item *
403
404I<octet>: 8 bits of data, with ordinal values 0..255
405(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
406
407=back
4411f3b6 408
67d7b5ef 409The marker [INTERNAL] marks Internal Implementation Details, in
410general meant only for those who think they know what they are doing,
411and such details may change in future releases.
412
413=head1 PERL ENCODING API
4411f3b6 414
415=over 4
416
f2a2953c 417=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6 418
47bfe92f 419Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 420a sequence of octets. ENCODING can be either a canonical name or
421alias. For encoding names and aliases, see L</"Defining Aliases">.
422For CHECK see L</"Handling Malformed Data">.
4411f3b6 423
67d7b5ef 424For example to convert (internally UTF-8 encoded) Unicode string to
425iso-8859-1 (also known as Latin1),
681a7c68 426
67d7b5ef 427 $octets = encode("iso-8859-1", $unicode);
681a7c68 428
f2a2953c 429=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6 430
47bfe92f 431Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef 432internal form and returns the resulting string. as in encode(),
433ENCODING can be either a canonical name or alias. For encoding names
434and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f 435L</"Handling Malformed Data">.
436
1b2c56c8 437For example to convert ISO-8859-1 data to UTF-8:
681a7c68 438
67d7b5ef 439 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 440
f2a2953c 441=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f 442
2b106fbe 443Convert B<in-place> the data between two encodings. How did the data
444in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef 445encode() or through PerlIO: See L</"Encoding and IO">.
446For encoding names and aliases, see L</"Defining Aliases">.
447For CHECK see L</"Handling Malformed Data">.
2b106fbe 448
1b2c56c8 449For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 450
451 from_to($data, "iso-8859-1", "utf-8");
452
453and to convert it back:
454
455 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 456
ab97ca19 457Note that because the conversion happens in place, the data to be
458converted cannot be a string constant, it must be a scalar variable.
459
3ef515df 460from_to() return the length of the converted string on success, undef
461otherwise.
462
4411f3b6 463=back
464
f2a2953c 465=head2 UTF-8 / utf8
466
467The Unicode consortium defines the UTF-8 standard as a way of encoding
468the entire Unicode repertoire as sequences of octets. This encoding is
469expected to become very widespread. Perl can use this form internally
470to represent strings, so conversions to and from this form are
471particularly efficient (as octets in memory do not have to change,
472just the meta-data that tells Perl how to treat them).
473
474=over 4
475
476=item $octets = encode_utf8($string);
477
478The characters that comprise string are encoded in Perl's superset of UTF-8
479and the resulting octets returned as a sequence of bytes. All possible
480characters have a UTF-8 representation so this function cannot fail.
481
482=item $string = decode_utf8($octets [, CHECK]);
483
484The sequence of octets represented by $octets is decoded from UTF-8
485into a sequence of logical characters. Not all sequences of octets
486form valid UTF-8 encodings, so it is possible for this call to fail.
487For CHECK see L</"Handling Malformed Data">.
488
489=back
490
51ef4e11 491=head2 Listing available encodings
492
5129552c 493 use Encode;
494 @list = Encode->encodings();
495
496Returns a list of the canonical names of the available encodings that
497are loaded. To get a list of all available encodings including the
498ones that are not loaded yet, say
499
500 @all_encodings = Encode->encodings(":all");
501
502Or you can give the name of specific module.
503
504 @with_jp = Encode->encodings("Encode/JP.pm");
51ef4e11 505
a63c962f 506Note in this case you have to say C<"Encode/JP.pm"> instead of
507C<"Encode::JP">.
5d030b67 508
a63c962f 509To find which encodings are supported by this package in details,
5d030b67 510see L<Encode::Supported>.
51ef4e11 511
512=head2 Defining Aliases
513
67d7b5ef 514To add new alias to a given encoding, Use;
515
5129552c 516 use Encode;
517 use Encode::Alias;
a63c962f 518 define_alias(newName => ENCODING);
51ef4e11 519
3ef515df 520After that, newName can be used as an alias for ENCODING.
f2a2953c 521ENCODING may be either the name of an encoding or an
522I<encoding object>
51ef4e11 523
5d030b67 524See L<Encode::Alias> on details.
51ef4e11 525
4411f3b6 526=head1 Encoding and IO
527
528It is very common to want to do encoding transformations when
529reading or writing files, network connections, pipes etc.
47bfe92f 530If Perl is configured to use the new 'perlio' IO system then
4411f3b6 531C<Encode> provides a "layer" (See L<perliol>) which can transform
532data as it is read or written.
533
8e86646e 534Here is how the blind poet would modernise the encoding:
535
42234700 536 use Encode;
8e86646e 537 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
538 open(my $utf8,'>:utf8','iliad.utf8');
539 my @epic = <$iliad>;
540 print $utf8 @epic;
541 close($utf8);
542 close($illiad);
4411f3b6 543
544In addition the new IO system can also be configured to read/write
545UTF-8 encoded characters (as noted above this is efficient):
546
e9692b5b 547 open(my $fh,'>:utf8','anything');
548 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 549
550Either of the above forms of "layer" specifications can be made the default
551for a lexical scope with the C<use open ...> pragma. See L<open>.
552
553Once a handle is open is layers can be altered using C<binmode>.
554
47bfe92f 555Without any such configuration, or if Perl itself is built using
4411f3b6 556system's own IO, then write operations assume that file handle accepts
557only I<bytes> and will C<die> if a character larger than 255 is
558written to the handle. When reading, each octet from the handle
559becomes a byte-in-a-character. Note that this default is the same
47bfe92f 560behaviour as bytes-only languages (including Perl before v5.6) would
561have, and is sufficient to handle native 8-bit encodings
562e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
563other encodings and binary data.
564
565In other cases it is the programs responsibility to transform
566characters into bytes using the API above before doing writes, and to
567transform the bytes read from a handle into characters before doing
568"character operations" (e.g. C<lc>, C</\W+/>, ...).
569
47bfe92f 570You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 571want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 572(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
573
e9692b5b 574 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
575 open(G, ">:utf8", "data.utf") or die $!;
576 while (<F>) { print G }
577
578 # Could also do "print G <F>" but that would pull
579 # the whole file into memory just to write it out again.
580
581More examples:
47bfe92f 582
e9692b5b 583 open(my $f, "<:encoding(cp1252)")
584 open(my $g, ">:encoding(iso-8859-2)")
585 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 586
587See L<PerlIO> for more information.
4411f3b6 588
1768d7eb 589See also L<encoding> for how to change the default encoding of the
d521382b 590data in your script.
1768d7eb 591
67d7b5ef 592=head1 Handling Malformed Data
593
f2a2953c 594If I<CHECK> is not set, (en|de)code will put I<substitution character> in
595place of the malformed character. for UCM-based encodings,
596E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
597data is supposed to be UTF-8, an optional lexical warning (category
598utf8) is given.
67d7b5ef 599
f2a2953c 600If I<CHECK> is true but not a code reference, dies with an error message.
67d7b5ef 601
f2a2953c 602In future you will be able to use a code reference to a callback
603function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 604
605=head1 Defining Encodings
606
607To define a new encoding, use:
608
609 use Encode qw(define_alias);
610 define_encoding($object, 'canonicalName' [, alias...]);
611
612I<canonicalName> will be associated with I<$object>. The object
613should provide the interface described in L<Encode::Encoding>
614If more than two arguments are provided then additional
615arguments are taken as aliases for I<$object> as for C<define_alias>.
616
f2a2953c 617See L<Encode::Encoding> for more details.
618
4411f3b6 619=head1 Messing with Perl's Internals
620
47bfe92f 621The following API uses parts of Perl's internals in the current
622implementation. As such they are efficient, but may change.
4411f3b6 623
624=over 4
625
a63c962f 626=item is_utf8(STRING [, CHECK])
4411f3b6 627
628[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 629If CHECK is true, also checks the data in STRING for being well-formed
630UTF-8. Returns true if successful, false otherwise.
4411f3b6 631
a63c962f 632=item _utf8_on(STRING)
4411f3b6 633
634[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
635B<not> checked for being well-formed UTF-8. Do not use unless you
636B<know> that the STRING is well-formed UTF-8. Returns the previous
637state of the UTF-8 flag (so please don't test the return value as
638I<not> success or failure), or C<undef> if STRING is not a string.
639
a63c962f 640=item _utf8_off(STRING)
4411f3b6 641
642[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
643Returns the previous state of the UTF-8 flag (so please don't test the
644return value as I<not> success or failure), or C<undef> if STRING is
645not a string.
646
647=back
648
649=head1 SEE ALSO
650
5d030b67 651L<Encode::Encoding>,
652L<Encode::Supported>,
653L<PerlIO>,
654L<encoding>,
655L<perlebcdic>,
656L<perlfunc/open>,
657L<perlunicode>,
658L<utf8>,
659the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 660
661=cut