Integrate mainline
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
package Encode;
use strict;
our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

# Set to a true value to get diagnostic warnings while modules are loaded.
our $DEBUG = 0;

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw(
    encode          decode
    encode_utf8     decode_utf8
    find_encoding
    encodings
);

# Available on request only.
our @EXPORT_OK = qw(
    define_encoding
    from_to
    is_utf8
    is_8bit
    is_16bit
    utf8_upgrade
    utf8_downgrade
    _utf8_on
    _utf8_off
);

# Load the compiled (XS) part of the module via DynaLoader.
Encode->bootstrap();

# Documentation moved after __END__ for speed - NI-S

use Carp;

# True when the native character set is EBCDIC ("A" is 193 there).
our $ON_EBCDIC = (ord("A") == 193);
use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;
5345d506 45
# Maps a lower-case encoding name to the file that must be require()d
# before that encoding becomes available.
our %ExtModule = (
    viscii     => 'Encode/Byte.pm',
    'koi8-r'   => 'Encode/Byte.pm',
    cp1047     => 'Encode/EBCDIC.pm',
    cp37       => 'Encode/EBCDIC.pm',
    'posix-bc' => 'Encode/EBCDIC.pm',
    symbol     => 'Encode/Symbol.pm',
    dingbats   => 'Encode/Symbol.pm',
);

# Single-byte families that all live in Encode/Byte.pm.
$ExtModule{"iso-8859-$_"} = 'Encode/Byte.pm' for 2 .. 11, 13 .. 16;
$ExtModule{"cp$_"}        = 'Encode/Byte.pm' for 1250 .. 1258;

# CJK added to autoload unless EBCDIC env
if (!$ON_EBCDIC) {
    my %cjk_names_for = (
        'Encode/CN.pm' => [qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165)],
        'Encode/JP.pm' => [qw(euc-jp iso-2022-jp iso-2022-jp-1 7bit-jis
                              shiftjis macjapan cp932)],
        'Encode/KR.pm' => [qw(euc-kr ksc5601 cp949)],
        'Encode/TW.pm' => [qw(big5 big5-hkscs cp950)],
        'Encode/HanExtra.pm' => [qw(gb18030 big5plus euc-tw)],
    );
    for my $module (keys %cjk_names_for) {
        my @names = @{ $cjk_names_for{$module} };
        @ExtModule{@names} = ($module) x @names;
    }
}

# Macintosh single-byte encodings, also in Encode/Byte.pm.
$ExtModule{"mac$_"} = 'Encode/Byte.pm'
    for qw(centeuro croatian cyrillic dingbats greek
           iceland roman rumanian sami
           thai turkish ukraine);
99
100
# Return the canonical names of all encodings currently registered in
# %Encoding, except the one named 'Internal', sorted case-insensitively.
#
# Must be invoked as a class method (Encode->encodings(...)): the first
# argument is always discarded as the invocant.
#
# Arguments after the invocant:
#   ":all"         - first try to load every module listed in %ExtModule
#   anything else  - each argument is a module file name such as
#                    "Encode/JP.pm" that is loaded first
#
# A module that fails to load is skipped; the failure is reported only
# when $DEBUG is true.
sub encodings
{
    my $class = shift;    # invocant, not otherwise used
    my @wanted = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;

    # Many encoding names map to the same file, so try each file only once.
    my %tried;
    for my $file (grep { !$tried{$_}++ } @wanted)
    {
        $DEBUG and warn "about to require $file;";
        next if eval { require $file; 1 };
        $DEBUG and warn "could not load $file: $@";
    }

    my @names = grep { $_ ne 'Internal' } keys %Encoding;
    return
        map  { $_->[0] }
        sort { $a->[1] cmp $b->[1] }
        map  { [ $_, lc $_ ] } @names;
}
116
# Register $obj as the encoding called $name and return $obj.
#
#   define_encoding($obj, $name, @aliases)
#
# The object is stored in %Encoding under $name exactly as given.  If the
# lower-cased name differs, it is registered as an alias, as is every
# additional argument.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc($name);
    define_alias($lower => $obj) if $lower ne $name;

    for my $alias (@aliases)
    {
        define_alias($alias, $obj);
    }
    return $obj;
}
131
# Resolve $name to an encoding object.
#
#   $class->getEncoding($name [, $skip_external])
#
# Lookup order:
#   1. $name is already an object that can('new_sequence'): returned as is
#   2. %Encoding, by exact name and then by lower-cased name
#   3. $class->find_alias, by exact name and then by lower-cased name
#   4. unless $skip_external is true, the module listed for the
#      lower-cased name in %ExtModule is loaded and %Encoding is
#      consulted again
#
# Returns the encoding object, or nothing (undef in scalar context, the
# empty list in list context) when the name cannot be resolved.
sub getEncoding
{
    my ($class, $name, $skip_external) = @_;

    if (ref($name))
    {
        # eval guards against unblessed references, on which ->can dies.
        my $is_encoding = do {
            local $@;
            eval { $name->can('new_sequence') };
        };
        return $name if $is_encoding;
    }

    my $lc = lc $name;
    return $Encoding{$name} if exists $Encoding{$name};
    return $Encoding{$lc}   if exists $Encoding{$lc};

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    if (!$skip_external and exists $ExtModule{$lc})
    {
        # Best effort: a module that fails to load leaves the name unknown.
        eval { require $ExtModule{$lc}; };

        # The module was selected by the lower-cased name, so the freshly
        # registered encoding may be stored under $lc rather than under
        # the spelling the caller used.
        for my $key ($name, $lc)
        {
            return $Encoding{$key} if exists $Encoding{$key};
        }
    }

    return;
}
164
# Functional front end to getEncoding():
#
#   find_encoding($name [, $skip_external])
#
# Returns whatever Encode->getEncoding returns for the same arguments.
sub find_encoding
{
    my ($wanted, $no_autoload) = @_;
    return __PACKAGE__->getEncoding($wanted, $no_autoload);
}
170
# Encode $string with the encoding called $name.
#
#   $octets = encode($name, $string [, $check])
#
# Croaks when $name cannot be resolved by find_encoding().  The caller's
# string is not modified: the encoding object works on a private copy.
# With a true $check, undef is returned if that copy still has length
# after the call; otherwise the encoder's result is returned.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string, $check);
    return ($check && length($string)) ? undef : $octets;
}
180
# Decode $octets with the encoding called $name.
#
#   $string = decode($name, $octets [, $check])
#
# Croaks when $name cannot be resolved by find_encoding().  With a true
# $check, the copy of the octets handed to the decoder is written back
# into the caller's second argument after the call.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);
    if ($check)
    {
        $_[1] = $octets;    # $_[1] aliases the caller's variable
    }
    return $string;
}
190
# Convert the caller's first argument in place from one encoding to another.
#
#   from_to($data, $from, $to [, $check])
#
# Croaks when either encoding name cannot be resolved.  With a true
# $check, returns undef (leaving $data untouched) if the input is left
# with length after decoding, or the intermediate string is left with
# length after encoding.  Otherwise stores the converted data in the
# caller's variable and returns its length.
sub from_to
{
    my ($data, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $uni = $source->decode($data, $check);
    if ($check && length($data))
    {
        return undef;
    }

    $data = $target->encode($uni, $check);
    if ($check && length($uni))
    {
        return undef;
    }

    $_[0] = $data;    # $_[0] aliases the caller's variable
    return length($_[0]);
}
204
# Return a copy of the argument run through utf8::encode().
# The caller's variable is left unchanged.
sub encode_utf8
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
211
# Return a copy of the argument run through utf8::decode(), or undef
# when utf8::decode() reports failure.  The caller's variable is left
# unchanged.
sub decode_utf8
{
    my $chars = $_[0];
    utf8::decode($chars) or return undef;
    return $chars;
}
218
18586f54 219require Encode::Encoding;
220require Encode::XS;
221require Encode::Internal;
222require Encode::Unicode;
223require Encode::utf8;
64ffdd5e 224require Encode::10646_1;
18586f54 225require Encode::ucs2_le;
4411f3b6 226
656753f8 2271;
228
2a936312 229__END__
230
4411f3b6 231=head1 NAME
232
233Encode - character encodings
234
235=head1 SYNOPSIS
236
237 use Encode;
238
48e3bbdd 239
240=head2 Table of Contents
241
Encode consists of a collection of modules whose details are too extensive
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the PODs below:
246
  Name                  Description
  --------------------------------------------------------
  Encode::Alias         Alias definitions to encodings
  Encode::Encoding      Encode Implementation Base Class
  Encode::Supported     List of Supported Encodings
  Encode::CN            Simplified Chinese Encodings
  Encode::JP            Japanese Encodings
  Encode::KR            Korean Encodings
  Encode::TW            Traditional Chinese Encodings
  --------------------------------------------------------
257
4411f3b6 258=head1 DESCRIPTION
259
47bfe92f 260The C<Encode> module provides the interfaces between Perl's strings
48e3bbdd 261and the rest of the system. Perl strings are sequences of
262B<characters>.
263
The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium. On most platforms the ordinal
values of the characters (as returned by C<ord(ch)>) are the "Unicode
codepoints" for the characters (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).
270
Traditionally computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.
277
278When Perl is processing "binary data" the programmer wants Perl to
279process "sequences of bytes". This is not a problem for Perl - as a
280byte has 256 possible values it easily fits in Perl's much larger
281"logical character".
282
283=head2 TERMINOLOGY
4411f3b6 284
48e3bbdd 285=over 4
21938dfa 286
48e3bbdd 287=item *
288
289I<character>: a character in the range 0..(2**32-1) (or more).
290(What Perl's strings are made of.)
291
292=item *
293
294I<byte>: a character in the range 0..255
295(A special case of a Perl character.)
296
297=item *
298
299I<octet>: 8 bits of data, with ordinal values 0..255
300(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
301
302=back
4411f3b6 303
48e3bbdd 304The marker [INTERNAL] marks Internal Implementation Details, in
305general meant only for those who think they know what they are doing,
306and such details may change in future releases.
307
308=head1 PERL ENCODING API
4411f3b6 309
310=over 4
311
a63c962f 312=item $bytes = encode(ENCODING, $string[, CHECK])
4411f3b6 313
47bfe92f 314Encodes string from Perl's internal form into I<ENCODING> and returns
48e3bbdd 315a sequence of octets. ENCODING can be either a canonical name or
316alias. For encoding names and aliases, see L</"Defining Aliases">.
317For CHECK see L</"Handling Malformed Data">.
4411f3b6 318
For example, to convert an (internally UTF-8 encoded) Unicode string to
iso-8859-1 (also known as Latin1),
681a7c68 321
48e3bbdd 322 $octets = encode("iso-8859-1", $unicode);
681a7c68 323
a63c962f 324=item $string = decode(ENCODING, $bytes[, CHECK])
4411f3b6 325
Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
ENCODING can be either a canonical name or an alias. For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK see
L</"Handling Malformed Data">.
331
1b2c56c8 332For example to convert ISO-8859-1 data to UTF-8:
681a7c68 333
48e3bbdd 334 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 335
a63c962f 336=item from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
47bfe92f 337
2b106fbe 338Convert B<in-place> the data between two encodings. How did the data
339in $string originally get to be in FROM_ENCODING? Either using
48e3bbdd 340encode() or through PerlIO: See L</"Encoding and IO">.
341For encoding names and aliases, see L</"Defining Aliases">.
342For CHECK see L</"Handling Malformed Data">.
2b106fbe 343
1b2c56c8 344For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 345
346 from_to($data, "iso-8859-1", "utf-8");
347
348and to convert it back:
349
350 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 351
ab97ca19 352Note that because the conversion happens in place, the data to be
353converted cannot be a string constant, it must be a scalar variable.
354
4411f3b6 355=back
356
51ef4e11 357=head2 Listing available encodings
358
5129552c 359 use Encode;
360 @list = Encode->encodings();
361
362Returns a list of the canonical names of the available encodings that
363are loaded. To get a list of all available encodings including the
364ones that are not loaded yet, say
365
366 @all_encodings = Encode->encodings(":all");
367
Or you can give the name of a specific module.
369
370 @with_jp = Encode->encodings("Encode/JP.pm");
51ef4e11 371
a63c962f 372Note in this case you have to say C<"Encode/JP.pm"> instead of
373C<"Encode::JP">.
5d030b67 374
To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.
51ef4e11 377
48e3bbdd 378
51ef4e11 379=head2 Defining Aliases
380
To add a new alias to a given encoding, use:
382
5129552c 383 use Encode;
384 use Encode::Alias;
a63c962f 385 define_alias(newName => ENCODING);
51ef4e11 386
After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an I<encoding
object>.
51ef4e11 390
See L<Encode::Alias> for details.
51ef4e11 392
4411f3b6 393=head1 Encoding and IO
394
395It is very common to want to do encoding transformations when
396reading or writing files, network connections, pipes etc.
47bfe92f 397If Perl is configured to use the new 'perlio' IO system then
4411f3b6 398C<Encode> provides a "layer" (See L<perliol>) which can transform
399data as it is read or written.
400
8e86646e 401Here is how the blind poet would modernise the encoding:
402
42234700 403 use Encode;
8e86646e 404 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
405 open(my $utf8,'>:utf8','iliad.utf8');
406 my @epic = <$iliad>;
407 print $utf8 @epic;
408 close($utf8);
    close($iliad);
4411f3b6 410
411In addition the new IO system can also be configured to read/write
412UTF-8 encoded characters (as noted above this is efficient):
413
e9692b5b 414 open(my $fh,'>:utf8','anything');
415 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 416
417Either of the above forms of "layer" specifications can be made the default
418for a lexical scope with the C<use open ...> pragma. See L<open>.
419
Once a handle is open, its layers can be altered using C<binmode>.
421
47bfe92f 422Without any such configuration, or if Perl itself is built using
4411f3b6 423system's own IO, then write operations assume that file handle accepts
424only I<bytes> and will C<die> if a character larger than 255 is
425written to the handle. When reading, each octet from the handle
426becomes a byte-in-a-character. Note that this default is the same
47bfe92f 427behaviour as bytes-only languages (including Perl before v5.6) would
428have, and is sufficient to handle native 8-bit encodings
429e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
430other encodings and binary data.
431
In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).
436
47bfe92f 437You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 438want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 439(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
440
e9692b5b 441 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
442 open(G, ">:utf8", "data.utf") or die $!;
443 while (<F>) { print G }
444
445 # Could also do "print G <F>" but that would pull
446 # the whole file into memory just to write it out again.
447
448More examples:
47bfe92f 449
e9692b5b 450 open(my $f, "<:encoding(cp1252)")
451 open(my $g, ">:encoding(iso-8859-2)")
452 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 453
454See L<PerlIO> for more information.
4411f3b6 455
1768d7eb 456See also L<encoding> for how to change the default encoding of the
d521382b 457data in your script.
1768d7eb 458
48e3bbdd 459=head1 Handling Malformed Data
460
461If CHECK is not set, C<undef> is returned. If the data is supposed to
462be UTF-8, an optional lexical warning (category utf8) is given. If
463CHECK is true but not a code reference, dies.
464
It would be desirable to have a way to indicate that the transform should
use the encoding's "replacement character" - no such mechanism is defined yet.
467
468It is also planned to allow I<CHECK> to be a code reference.
469
470This is not yet implemented as there are design issues with what its
471arguments should be and how it returns its results.
472
473=over 4
474
475=item Scheme 1
476
477Passed remaining fragment of string being processed.
478Modifies it in place to remove bytes/characters it can understand
479and returns a string used to represent them.
480e.g.
481
482 sub fixup {
483 my $ch = substr($_[0],0,1,'');
        return sprintf("\x{%02X}",ord($ch));
485 }
486
487This scheme is close to how underlying C code for Encode works, but gives
488the fixup routine very little context.
489
490=item Scheme 2
491
492Passed original string, and an index into it of the problem area, and
493output string so far. Appends what it will to output string and
494returns new index into original string. For example:
495
496 sub fixup {
497 # my ($s,$i,$d) = @_;
498 my $ch = substr($_[0],$_[1],1);
        $_[2] .= sprintf("\x{%02X}",ord($ch));
500 return $_[1]+1;
501 }
502
503This scheme gives maximal control to the fixup routine but is more
504complicated to code, and may need internals of Encode to be tweaked to
505keep original string intact.
506
507=item Other Schemes
508
509Hybrids of above.
510
511Multiple return values rather than in-place modifications.
512
513Index into the string could be C<pos($str)> allowing C<s/\G...//>.
514
515=back
516
517=head2 UTF-8 / utf8
518
519The Unicode consortium defines the UTF-8 standard as a way of encoding
520the entire Unicode repertoire as sequences of octets. This encoding is
521expected to become very widespread. Perl can use this form internally
522to represent strings, so conversions to and from this form are
523particularly efficient (as octets in memory do not have to change,
524just the meta-data that tells Perl how to treat them).
525
526=over 4
527
528=item $bytes = encode_utf8($string);
529
530The characters that comprise string are encoded in Perl's superset of UTF-8
531and the resulting octets returned as a sequence of bytes. All possible
532characters have a UTF-8 representation so this function cannot fail.
533
534=item $string = decode_utf8($bytes [, CHECK]);
535
536The sequence of octets represented by $bytes is decoded from UTF-8
537into a sequence of logical characters. Not all sequences of octets
538form valid UTF-8 encodings, so it is possible for this call to fail.
539For CHECK see L</"Handling Malformed Data">.
540
541=back
542
543=head1 Defining Encodings
544
545To define a new encoding, use:
546
    use Encode qw(define_encoding);
548 define_encoding($object, 'canonicalName' [, alias...]);
549
I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.
554
4411f3b6 555=head1 Messing with Perl's Internals
556
47bfe92f 557The following API uses parts of Perl's internals in the current
558implementation. As such they are efficient, but may change.
4411f3b6 559
560=over 4
561
a63c962f 562=item is_utf8(STRING [, CHECK])
4411f3b6 563
564[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 565If CHECK is true, also checks the data in STRING for being well-formed
566UTF-8. Returns true if successful, false otherwise.
4411f3b6 567
a63c962f 568=item _utf8_on(STRING)
4411f3b6 569
570[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
571B<not> checked for being well-formed UTF-8. Do not use unless you
572B<know> that the STRING is well-formed UTF-8. Returns the previous
573state of the UTF-8 flag (so please don't test the return value as
574I<not> success or failure), or C<undef> if STRING is not a string.
575
a63c962f 576=item _utf8_off(STRING)
4411f3b6 577
578[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
579Returns the previous state of the UTF-8 flag (so please don't test the
580return value as I<not> success or failure), or C<undef> if STRING is
581not a string.
582
583=back
584
585=head1 SEE ALSO
586
5d030b67 587L<Encode::Encoding>,
588L<Encode::Supported>,
589L<PerlIO>,
590L<encoding>,
591L<perlebcdic>,
592L<perlfunc/open>,
593L<perlunicode>,
594L<utf8>,
595the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 596
597=cut