Integrate mainline
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
package Encode;
use strict;

# The version number is assembled from the digits of the RCS revision
# keyword, e.g. "Revision: 1.20" yields "1.20".
our $VERSION = do { my @r = (q$Revision: 1.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

# When true, encodings() announces every module it is about to require.
our $DEBUG = 0;

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw(
    encode         decode
    encode_utf8    decode_utf8
    find_encoding  encodings
);

# Everything else has to be asked for by name.
our @EXPORT_OK = qw(
    define_encoding
    from_to
    is_utf8
    is_8bit
    is_16bit
    utf8_upgrade
    utf8_downgrade
    _utf8_on
    _utf8_off
);

# bootstrap() is inherited through @ISA (DynaLoader).
Encode->bootstrap();

# Documentation moved after __END__ for speed - NI-S

use Carp;

# True when "A" has ordinal 193, i.e. on an EBCDIC platform.
our $ON_EBCDIC = (ord("A") == 193);
use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;

# Maps an encoding name to the file that has to be required to get it.
our %ExtModule;
46
# Code page numbers; each becomes a "cpNNN" entry served by Encode/Byte.pm
# unless a later assignment below points it at a different file.
my @codepages = qw(
      37  424  437  500  737  775  850  852  855
     856  857  860  861  862  863  864  865  866
     869  874  875  932  936  949  950 1006 1026
    1047 1250 1251 1252 1253 1254 1255 1256 1257
    1258
);

# Suffixes of the "macXXX" encodings served by Encode/Byte.pm.
my @macintosh = qw(
    CentralEurRoman Croatian Cyrillic  Greek
    Iceland         Roman    Rumanian  Sami
    Thai            Turkish  Ukrainian
);

# Defaults: everything generated here lives in Encode/Byte.pm.
# (There is deliberately no iso-8859-12 entry.)
$ExtModule{"iso-8859-$_"} = 'Encode/Byte.pm' for (2 .. 11, 13 .. 16);
$ExtModule{"cp$_"}        = 'Encode/Byte.pm' for @codepages;
$ExtModule{"mac$_"}       = 'Encode/Byte.pm' for @macintosh;

# Explicit entries.  These are assigned after the defaults, so a name
# that appears in both places (cp500, cp875, cp1026, cp1047) ends up
# pointing at the file named here.
@ExtModule{ 'koi8-r', 'viscii' } = ('Encode/Byte.pm') x 2;

@ExtModule{ 'posix-bc', qw(cp037 cp1026 cp1047 cp500 cp875) }
    = ('Encode/EBCDIC.pm') x 6;

@ExtModule{ qw(dingbats macDingbats macSymbol symbol) }
    = ('Encode/Symbol.pm') x 4;

unless ($ON_EBCDIC) {    # CJK added to autoload unless EBCDIC env
    my @cjk = (
        [ 'Encode/CN.pm' => qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165) ],
        [ 'Encode/JP.pm' => qw(euc-jp iso-2022-jp iso-2022-jp-1 7bit-jis
                               shiftjis macJapanese cp932) ],
        [ 'Encode/KR.pm' => qw(euc-kr ksc5601 macKorean cp949) ],
        [ 'Encode/TW.pm' => qw(big5 big5-hkscs cp950) ],
        [ 'Encode/HanExtra.pm' => qw(gb18030 big5plus euc-tw) ],
    );
    for my $group (@cjk) {
        my ($module, @names) = @$group;
        $ExtModule{$_} = $module for @names;
    }
}
117
a999c27c 118
119
5129552c 120
# encodings(CLASS [, ":all" | MODULE_FILE ...])
#
# Returns the names of all encodings currently registered in %Encoding
# (except 'Internal'), sorted case-insensitively.
#
# The first argument is always discarded as the invocant, so this is
# meant to be called as Encode->encodings(...).  The remaining arguments
# are files to require first (e.g. "Encode/JP.pm"); the single argument
# ":all" stands for every file listed in %ExtModule.
#
# Each file is required at most once per call: %ExtModule maps many names
# to the same file, and a file that fails to load would otherwise be
# searched for again for every duplicate.  Load failures are not fatal;
# they are reported only when $DEBUG is true.
sub encodings
{
    my $class = shift;
    my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;

    my %seen;
    for my $m (grep { !$seen{$_}++ } @modules)
    {
        $DEBUG and warn "about to require $m;";
        unless (eval { require $m; 1 })
        {
            $DEBUG and warn "failed to require $m: $@";
        }
    }

    # Sort on the lower-cased name but hand back the original spelling.
    return
        map  { $_->[0] }
        sort { $a->[1] cmp $b->[1] }
        map  { [$_, lc $_] }
        grep { $_ ne 'Internal' } keys %Encoding;
}
136
# define_encoding(OBJ, NAME [, ALIAS ...])
#
# Registers OBJ in %Encoding under NAME.  If NAME is not already all
# lower case, its lower-cased form is defined as an alias for OBJ, and
# so is every additional ALIAS given.  Returns OBJ.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $lc = lc($name);
    if ($lc ne $name)
    {
        define_alias($lc => $obj);
    }

    for my $alias (@aliases)
    {
        define_alias($alias,$obj);
    }

    return $obj;
}
151
# CLASS->getEncoding(NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  Lookup order:
#   1. NAME itself, if it is a reference that can('new_sequence');
#   2. %Encoding under NAME, then under lc(NAME);
#   3. CLASS->find_alias for NAME, then for lc(NAME);
#   4. unless SKIP_EXTERNAL is true, the file listed in %ExtModule for
#      NAME or lc(NAME) is required and steps 2-3 are tried once more.
#
# Returns the encoding object, or nothing (empty list / undef) when the
# name cannot be resolved.  A failing require is not fatal.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    return $name if ref($name) && $name->can('new_sequence');

    my $lc = lc $name;
    return $Encoding{$name} if exists $Encoding{$name};
    return $Encoding{$lc}   if exists $Encoding{$lc};

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external)
    {
        # %ExtModule has mixed-case keys (macJapanese, macSymbol, ...), so
        # the name has to be tried as given as well as lower-cased.
        my $module = exists $ExtModule{$name} ? $ExtModule{$name}
                   : exists $ExtModule{$lc}   ? $ExtModule{$lc}
                   :                            undef;
        if (defined $module)
        {
            eval { require $module; };
            # The module registers its encodings under their own
            # spelling, which need not match the case the caller used:
            # redo the in-memory lookups, this time without loading.
            return $class->getEncoding($name, 1);
        }
    }

    return;
}
184
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Function-style front end to getEncoding(): performs the lookup with
# this package as the invocant and passes the result straight back.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name,$skip_external);
}
190
# encode(NAME, STRING [, CHECK])
#
# Looks up the encoding called NAME and asks it to encode STRING.
# Croaks if NAME is unknown.  Returns the encoder's result, except that
# undef is returned when CHECK is true and the local copy of STRING
# still has length afterwards (presumably the encoder strips what it
# consumed from its argument - confirm against the encoding classes).
sub encode
{
    my ($name,$string,$check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string,$check);
    if ($check && length($string))
    {
        return undef;
    }
    return $octets;
}
200
# decode(NAME, OCTETS [, CHECK])
#
# Looks up the encoding called NAME and asks it to decode OCTETS.
# Croaks if NAME is unknown.  When CHECK is true the caller's second
# argument is overwritten with the local copy of OCTETS as it stands
# after the decoder has run.  Returns the decoder's result.
sub decode
{
    my ($name,$octets,$check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets,$check);
    if ($check)
    {
        # @_ aliases the caller's variables, so this writes through.
        $_[1] = $octets;
    }
    return $string;
}
210
# from_to(STRING, FROM, TO [, CHECK])
#
# Converts STRING in place: decodes it with the FROM encoding and
# re-encodes the result with the TO encoding, storing the outcome in the
# caller's first argument.  Croaks if either encoding name is unknown
# (FROM is looked up and reported first).
#
# Returns the length of the converted data, or undef if the result is
# undefined.  With CHECK true, undef is also returned - and the caller's
# variable left untouched - if either step leaves its input non-empty.
sub from_to
{
    my ($string,$from,$to,$check) = @_;

    my $f = find_encoding($from);
    defined $f or croak("Unknown encoding '$from'");
    my $t = find_encoding($to);
    defined $t or croak("Unknown encoding '$to'");

    my $uni = $f->decode($string,$check);
    if ($check && length($string))
    {
        return undef;
    }

    $string = $t->encode($uni,$check);
    if ($check && length($uni))
    {
        return undef;
    }

    # @_ aliases the caller's variable; this is the in-place update.
    $_[0] = $string;
    return defined($_[0]) ? length($string) : undef ;
}
224
# encode_utf8(STRING)
#
# Returns a copy of STRING after utf8::encode() has been applied to it.
# The caller's variable is not modified.
sub encode_utf8
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
231
# decode_utf8(OCTETS)
#
# Returns a copy of OCTETS after utf8::decode() has been applied to it,
# or undef when utf8::decode() reports failure.  The caller's variable
# is not modified.
sub decode_utf8
{
    my $chars = $_[0];
    utf8::decode($chars) or return undef;
    return $chars;
}
238
18586f54 239require Encode::Encoding;
240require Encode::XS;
241require Encode::Internal;
242require Encode::Unicode;
243require Encode::utf8;
64ffdd5e 244require Encode::10646_1;
18586f54 245require Encode::ucs2_le;
4411f3b6 246
656753f8 2471;
248
2a936312 249__END__
250
4411f3b6 251=head1 NAME
252
253Encode - character encodings
254
255=head1 SYNOPSIS
256
257 use Encode;
258
67d7b5ef 259
260=head2 Table of Contents
261
Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the PODs below:
266
267 Name Description
268 --------------------------------------------------------
  Encode::Alias         Alias definitions to encodings
270 Encode::Encoding Encode Implementation Base Class
271 Encode::Supported List of Supported Encodings
272 Encode::CN Simplified Chinese Encodings
273 Encode::JP Japanese Encodings
274 Encode::KR Korean Encodings
275 Encode::TW Traditional Chinese Encodings
276 --------------------------------------------------------
277
4411f3b6 278=head1 DESCRIPTION
279
47bfe92f 280The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 281and the rest of the system. Perl strings are sequences of
282B<characters>.
283
284The repertoire of characters that Perl can represent is at least that
285defined by the Unicode Consortium. On most platforms the ordinal
286values of the characters (as returned by C<ord(ch)>) is the "Unicode
287codepoint" for the character (the exceptions are those platforms where
288the legacy encoding is some variant of EBCDIC rather than a super-set
289of ASCII - see L<perlebcdic>).
290
291Traditionally computer data has been moved around in 8-bit chunks
292often called "bytes". These chunks are also known as "octets" in
293networking standards. Perl is widely used to manipulate data of many
294types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.
297
298When Perl is processing "binary data" the programmer wants Perl to
299process "sequences of bytes". This is not a problem for Perl - as a
300byte has 256 possible values it easily fits in Perl's much larger
301"logical character".
302
303=head2 TERMINOLOGY
4411f3b6 304
67d7b5ef 305=over 4
21938dfa 306
67d7b5ef 307=item *
308
309I<character>: a character in the range 0..(2**32-1) (or more).
310(What Perl's strings are made of.)
311
312=item *
313
314I<byte>: a character in the range 0..255
315(A special case of a Perl character.)
316
317=item *
318
319I<octet>: 8 bits of data, with ordinal values 0..255
320(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
321
322=back
4411f3b6 323
67d7b5ef 324The marker [INTERNAL] marks Internal Implementation Details, in
325general meant only for those who think they know what they are doing,
326and such details may change in future releases.
327
328=head1 PERL ENCODING API
4411f3b6 329
330=over 4
331
a63c962f 332=item $bytes = encode(ENCODING, $string[, CHECK])
4411f3b6 333
47bfe92f 334Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 335a sequence of octets. ENCODING can be either a canonical name or
336alias. For encoding names and aliases, see L</"Defining Aliases">.
337For CHECK see L</"Handling Malformed Data">.
4411f3b6 338
67d7b5ef 339For example to convert (internally UTF-8 encoded) Unicode string to
340iso-8859-1 (also known as Latin1),
681a7c68 341
67d7b5ef 342 $octets = encode("iso-8859-1", $unicode);
681a7c68 343
a63c962f 344=item $string = decode(ENCODING, $bytes[, CHECK])
4411f3b6 345
47bfe92f 346Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
348ENCODING can be either a canonical name or alias. For encoding names
349and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f 350L</"Handling Malformed Data">.
351
1b2c56c8 352For example to convert ISO-8859-1 data to UTF-8:
681a7c68 353
67d7b5ef 354 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 355
3ef515df 356=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
47bfe92f 357
2b106fbe 358Convert B<in-place> the data between two encodings. How did the data
359in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef 360encode() or through PerlIO: See L</"Encoding and IO">.
361For encoding names and aliases, see L</"Defining Aliases">.
362For CHECK see L</"Handling Malformed Data">.
2b106fbe 363
1b2c56c8 364For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 365
366 from_to($data, "iso-8859-1", "utf-8");
367
368and to convert it back:
369
370 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 371
ab97ca19 372Note that because the conversion happens in place, the data to be
373converted cannot be a string constant, it must be a scalar variable.
374
from_to() returns the length of the converted string on success, undef
otherwise.
377
4411f3b6 378=back
379
51ef4e11 380=head2 Listing available encodings
381
5129552c 382 use Encode;
383 @list = Encode->encodings();
384
385Returns a list of the canonical names of the available encodings that
386are loaded. To get a list of all available encodings including the
387ones that are not loaded yet, say
388
389 @all_encodings = Encode->encodings(":all");
390
391Or you can give the name of specific module.
392
393 @with_jp = Encode->encodings("Encode/JP.pm");
51ef4e11 394
a63c962f 395Note in this case you have to say C<"Encode/JP.pm"> instead of
396C<"Encode::JP">.
5d030b67 397
To find out in detail which encodings are supported by this package,
5d030b67 399see L<Encode::Supported>.
51ef4e11 400
67d7b5ef 401
51ef4e11 402=head2 Defining Aliases
403
To add a new alias to a given encoding, use:
405
5129552c 406 use Encode;
407 use Encode::Alias;
a63c962f 408 define_alias(newName => ENCODING);
51ef4e11 409
3ef515df 410After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>.
51ef4e11 413
See L<Encode::Alias> for details.
51ef4e11 415
4411f3b6 416=head1 Encoding and IO
417
418It is very common to want to do encoding transformations when
419reading or writing files, network connections, pipes etc.
47bfe92f 420If Perl is configured to use the new 'perlio' IO system then
4411f3b6 421C<Encode> provides a "layer" (See L<perliol>) which can transform
422data as it is read or written.
423
8e86646e 424Here is how the blind poet would modernise the encoding:
425
42234700 426 use Encode;
8e86646e 427 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
428 open(my $utf8,'>:utf8','iliad.utf8');
429 my @epic = <$iliad>;
430 print $utf8 @epic;
431 close($utf8);
    close($iliad);
4411f3b6 433
434In addition the new IO system can also be configured to read/write
435UTF-8 encoded characters (as noted above this is efficient):
436
e9692b5b 437 open(my $fh,'>:utf8','anything');
438 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 439
440Either of the above forms of "layer" specifications can be made the default
441for a lexical scope with the C<use open ...> pragma. See L<open>.
442
Once a handle is open, its layers can be altered using C<binmode>.
444
47bfe92f 445Without any such configuration, or if Perl itself is built using
4411f3b6 446system's own IO, then write operations assume that file handle accepts
447only I<bytes> and will C<die> if a character larger than 255 is
448written to the handle. When reading, each octet from the handle
449becomes a byte-in-a-character. Note that this default is the same
47bfe92f 450behaviour as bytes-only languages (including Perl before v5.6) would
451have, and is sufficient to handle native 8-bit encodings
452e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
453other encodings and binary data.
454
In other cases it is the program's responsibility to transform
456characters into bytes using the API above before doing writes, and to
457transform the bytes read from a handle into characters before doing
458"character operations" (e.g. C<lc>, C</\W+/>, ...).
459
47bfe92f 460You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 461want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 462(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
463
e9692b5b 464 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
465 open(G, ">:utf8", "data.utf") or die $!;
466 while (<F>) { print G }
467
468 # Could also do "print G <F>" but that would pull
469 # the whole file into memory just to write it out again.
470
471More examples:
47bfe92f 472
e9692b5b 473 open(my $f, "<:encoding(cp1252)")
474 open(my $g, ">:encoding(iso-8859-2)")
475 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 476
477See L<PerlIO> for more information.
4411f3b6 478
1768d7eb 479See also L<encoding> for how to change the default encoding of the
d521382b 480data in your script.
1768d7eb 481
67d7b5ef 482=head1 Handling Malformed Data
483
484If CHECK is not set, C<undef> is returned. If the data is supposed to
485be UTF-8, an optional lexical warning (category utf8) is given. If
486CHECK is true but not a code reference, dies.
487
It would be desirable to have a way to indicate that transform should use
the encoding's "replacement character" - no such mechanism is defined yet.
490
491It is also planned to allow I<CHECK> to be a code reference.
492
493This is not yet implemented as there are design issues with what its
494arguments should be and how it returns its results.
495
496=over 4
497
498=item Scheme 1
499
500Passed remaining fragment of string being processed.
501Modifies it in place to remove bytes/characters it can understand
502and returns a string used to represent them.
503e.g.
504
505 sub fixup {
506 my $ch = substr($_[0],0,1,'');
        return sprintf("\\x{%02X}",ord($ch));
508 }
509
510This scheme is close to how underlying C code for Encode works, but gives
511the fixup routine very little context.
512
513=item Scheme 2
514
515Passed original string, and an index into it of the problem area, and
516output string so far. Appends what it will to output string and
517returns new index into original string. For example:
518
519 sub fixup {
520 # my ($s,$i,$d) = @_;
521 my $ch = substr($_[0],$_[1],1);
        $_[2] .= sprintf("\\x{%02X}",ord($ch));
523 return $_[1]+1;
524 }
525
526This scheme gives maximal control to the fixup routine but is more
527complicated to code, and may need internals of Encode to be tweaked to
528keep original string intact.
529
530=item Other Schemes
531
532Hybrids of above.
533
534Multiple return values rather than in-place modifications.
535
536Index into the string could be C<pos($str)> allowing C<s/\G...//>.
537
538=back
539
540=head2 UTF-8 / utf8
541
542The Unicode consortium defines the UTF-8 standard as a way of encoding
543the entire Unicode repertoire as sequences of octets. This encoding is
544expected to become very widespread. Perl can use this form internally
545to represent strings, so conversions to and from this form are
546particularly efficient (as octets in memory do not have to change,
547just the meta-data that tells Perl how to treat them).
548
549=over 4
550
551=item $bytes = encode_utf8($string);
552
553The characters that comprise string are encoded in Perl's superset of UTF-8
554and the resulting octets returned as a sequence of bytes. All possible
555characters have a UTF-8 representation so this function cannot fail.
556
557=item $string = decode_utf8($bytes [, CHECK]);
558
559The sequence of octets represented by $bytes is decoded from UTF-8
560into a sequence of logical characters. Not all sequences of octets
561form valid UTF-8 encodings, so it is possible for this call to fail.
562For CHECK see L</"Handling Malformed Data">.
563
564=back
565
566=head1 Defining Encodings
567
568To define a new encoding, use:
569
    use Encode qw(define_encoding);
571 define_encoding($object, 'canonicalName' [, alias...]);
572
573I<canonicalName> will be associated with I<$object>. The object
should provide the interface described in L<Encode::Encoding>.
575If more than two arguments are provided then additional
576arguments are taken as aliases for I<$object> as for C<define_alias>.
577
4411f3b6 578=head1 Messing with Perl's Internals
579
47bfe92f 580The following API uses parts of Perl's internals in the current
581implementation. As such they are efficient, but may change.
4411f3b6 582
583=over 4
584
a63c962f 585=item is_utf8(STRING [, CHECK])
4411f3b6 586
587[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 588If CHECK is true, also checks the data in STRING for being well-formed
589UTF-8. Returns true if successful, false otherwise.
4411f3b6 590
a63c962f 591=item _utf8_on(STRING)
4411f3b6 592
593[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
594B<not> checked for being well-formed UTF-8. Do not use unless you
595B<know> that the STRING is well-formed UTF-8. Returns the previous
596state of the UTF-8 flag (so please don't test the return value as
597I<not> success or failure), or C<undef> if STRING is not a string.
598
a63c962f 599=item _utf8_off(STRING)
4411f3b6 600
601[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
602Returns the previous state of the UTF-8 flag (so please don't test the
603return value as I<not> success or failure), or C<undef> if STRING is
604not a string.
605
606=back
607
608=head1 SEE ALSO
609
5d030b67 610L<Encode::Encoding>,
611L<Encode::Supported>,
612L<PerlIO>,
613L<encoding>,
614L<perlebcdic>,
615L<perlfunc/open>,
616L<perlunicode>,
617L<utf8>,
618the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 619
620=cut