Update Changes.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
fdd579e2 3our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 encode
14 decode
15 encode_utf8
16 decode_utf8
17 find_encoding
51ef4e11 18 encodings
4411f3b6 19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
51ef4e11 23 define_encoding
2c674647 24 from_to
25 is_utf8
4411f3b6 26 is_8bit
27 is_16bit
a12c0f56 28 utf8_upgrade
29 utf8_downgrade
4411f3b6 30 _utf8_on
31 _utf8_off
2c674647 32 );
33
34bootstrap Encode ();
35
4411f3b6 36# Documentation moved after __END__ for speed - NI-S
2c674647 37
bf230f3d 38use Carp;
39
a63c962f 40our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 41
5d030b67 42use Encode::Alias;
43
5129552c 44# Make a %Encoding package variable to allow a certain amount of cheating
45our %Encoding;
fdd579e2 46use Encode::Config;
5129552c 47
# encodings(CLASS [, MODULE...])
#
# Class method (documented usage: Encode->encodings(...)).  Optionally loads
# extra encoding modules, then returns the sorted names of every encoding
# currently registered in %Encoding, leaving out the "Internal" and
# "Unicode" entries.
#
#   MODULE...  module names to load first.  ":all" as the first argument
#              selects every value of %ExtModule instead.  A name that
#              contains no "::" is looked up under "Encode/".
#
# Loading is best-effort: a module that cannot be loaded is skipped so the
# listing still works; the failure is reported only when $DEBUG is true.
sub encodings
{
    my $class = shift;
    my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
    my %seen;
    for my $mod (@modules) {
        # Foo::Bar -> Foo/Bar.pm; a bare name lives under Encode/.
        $mod =~ s,::,/,g or $mod = "Encode/$mod";
        $mod .= '.pm';
        # The same file may be requested more than once; try it only once.
        next if $seen{$mod}++;
        $DEBUG and warn "about to require $mod;";
        unless (eval { require $mod; 1 }) {
            $DEBUG and warn "failed to require $mod: $@";
        }
    }
    return
        sort grep {!/^(?:Internal|Unicode)$/} keys %Encoding;
}
62
# define_encoding(OBJ, NAME [, ALIAS...])
#
# Registers the encoding object OBJ in %Encoding under NAME and returns OBJ.
# If NAME is not already all lower-case, its lower-cased form is added as an
# alias through define_alias(); every further argument is added as an alias
# for OBJ the same way.
sub define_encoding {
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc $name;
    if ($lower ne $name) {
        define_alias($lower => $obj);
    }

    for my $alias (@aliases) {
        define_alias($alias, $obj);
    }

    return $obj;
}
77
# getEncoding(CLASS, NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  Lookup order:
#   1. NAME is itself an object that can('new_sequence'): returned as is.
#   2. %Encoding under NAME, then under lc(NAME).
#   3. CLASS->find_alias() for NAME, then for lc(NAME).
#   4. Unless SKIP_EXTERNAL is true: the module listed in %ExtModule for
#      NAME or lc(NAME) is loaded and %Encoding is consulted again.
#
# Returns the encoding object, or nothing (empty list / undef) when NAME is
# undefined or cannot be resolved.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    # No encoding can be registered under an undefined name.
    return unless defined $name;

    # Already an encoding object: hand it straight back.
    return $name if ref($name) && $name->can('new_sequence');

    my $lc = lc $name;
    for my $key ($name, $lc) {
        return $Encoding{$key} if exists $Encoding{$key};
    }

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external) {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';
            # Best effort: a module that fails to load just yields no match.
            eval { require $mod; };
            # The module may have been found via the lower-cased name, so
            # look the freshly loaded encoding up under both spellings.
            for my $key ($name, $lc) {
                return $Encoding{$key} if exists $Encoding{$key};
            }
        }
    }
    return;
}
112
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Plain-function front end to the getEncoding() class method: forwards both
# arguments, using this package as the class, and returns whatever
# getEncoding() returns.
sub find_encoding {
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}
118
# encode(ENCODING, STRING [, CHECK])
#
# Looks ENCODING up with find_encoding() and asks the resulting object to
# encode STRING.  Croaks with "Unknown encoding '...'" when the lookup
# fails.  Returns the octets, or undef when CHECK is true and the local
# copy of STRING is still non-empty after the encoder has run.
sub encode {
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc
        or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string, $check);

    # NOTE(review): presumably the encoder consumes what it managed to
    # convert from $string, so leftover text means failure - confirm
    # against the encoder implementations.
    if ($check and length($string)) {
        return undef;
    }
    return $octets;
}
128
# decode(ENCODING, OCTETS [, CHECK])
#
# Looks ENCODING up with find_encoding() and asks the resulting object to
# decode OCTETS.  Croaks with "Unknown encoding '...'" when the lookup
# fails.  Returns the decoded string.  When CHECK is true, the caller's
# OCTETS variable is overwritten with the local copy as the decoder left it.
sub decode {
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc
        or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);

    # $_[1] aliases the caller's variable, so this writes back to the caller.
    if ($check) {
        $_[1] = $octets;
    }
    return $string;
}
138
# from_to(STRING, FROM_ENCODING, TO_ENCODING [, CHECK])
#
# Converts STRING in place from FROM_ENCODING to TO_ENCODING by decoding it
# with the first encoding and encoding the result with the second.  Croaks
# with "Unknown encoding '...'" if either name cannot be resolved by
# find_encoding().
#
# Returns the length of the converted data on success and undef otherwise.
# The caller's variable is only overwritten once both steps have produced a
# result; on every undef return before that it is left untouched.
sub from_to
{
    my ($string,$from,$to,$check) = @_;
    my $f = find_encoding($from);
    croak("Unknown encoding '$from'") unless defined $f;
    my $t = find_encoding($to);
    croak("Unknown encoding '$to'") unless defined $t;
    my $uni = $f->decode($string,$check);
    # A decoder may signal failure by returning undef (Encode::utf8's decode
    # in this file does so for malformed input).  Stop here rather than
    # encoding undef and clobbering the caller's data with the result.
    return undef unless defined $uni;
    return undef if ($check && length($string));
    $string = $t->encode($uni,$check);
    return undef if ($check && length($uni));
    # $_[0] aliases the caller's variable: this is the in-place update.
    return defined($_[0] = $string) ? length($string) : undef ;
}
152
# encode_utf8(STRING)
#
# Returns a copy of STRING whose characters have been converted to UTF-8
# octets by utf8::encode().  The caller's variable is not modified.
sub encode_utf8 {
    my $octets = $_[0];     # work on a copy; utf8::encode converts in place
    utf8::encode($octets);
    return $octets;
}
159
# decode_utf8(OCTETS)
#
# Returns a copy of OCTETS decoded from UTF-8 by utf8::decode(), or undef
# when utf8::decode() reports failure.  The caller's variable is not
# modified.
sub decode_utf8 {
    my $chars = $_[0];      # work on a copy; utf8::decode converts in place
    return utf8::decode($chars) ? $chars : undef;
}
166
f2a2953c 167predefine_encodings();
168
169#
170# This is to restore %Encoding if really needed;
171#
172sub predefine_encodings{
173 if ($ON_EBCDIC) {
174 # was in Encode::UTF_EBCDIC
175 package Encode::UTF_EBCDIC;
176 *name = sub{ shift->{'Name'} };
177 *new_sequence = sub{ return $_[0] };
178 *decode = sub{
179 my ($obj,$str,$chk) = @_;
180 my $res = '';
181 for (my $i = 0; $i < length($str); $i++) {
182 $res .=
183 chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
184 }
185 $_[1] = '' if $chk;
186 return $res;
187 };
188 *encode = sub{
189 my ($obj,$str,$chk) = @_;
190 my $res = '';
191 for (my $i = 0; $i < length($str); $i++) {
192 $res .=
193 chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
194 }
195 $_[1] = '' if $chk;
196 return $res;
197 };
c731e18e 198 $Encode::Encoding{Internal} =
199 bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
f2a2953c 200 } else {
201 # was in Encode::UTF_EBCDIC
202 package Encode::Internal;
203 *name = sub{ shift->{'Name'} };
204 *new_sequence = sub{ return $_[0] };
205 *decode = sub{
206 my ($obj,$str,$chk) = @_;
207 utf8::upgrade($str);
208 $_[1] = '' if $chk;
209 return $str;
210 };
211 *encode = \&decode;
212 $Encode::Encoding{Unicode} =
c731e18e 213 bless {Name => "Internal"} => "Encode::Internal";
f2a2953c 214 }
215
216 {
217 # was in Encode::utf8
218 package Encode::utf8;
219 *name = sub{ shift->{'Name'} };
220 *new_sequence = sub{ return $_[0] };
221 *decode = sub{
222 my ($obj,$octets,$chk) = @_;
223 my $str = Encode::decode_utf8($octets);
224 if (defined $str) {
225 $_[1] = '' if $chk;
226 return $str;
227 }
228 return undef;
229 };
230 *encode = sub {
231 my ($obj,$string,$chk) = @_;
232 my $octets = Encode::encode_utf8($string);
233 $_[1] = '' if $chk;
234 return $octets;
235 };
236 $Encode::Encoding{utf8} =
c731e18e 237 bless {Name => "utf8"} => "Encode::utf8";
f2a2953c 238 }
239 # do externals if necessary
240 require File::Basename;
241 require File::Spec;
c731e18e 242 for my $ext (qw()){
f2a2953c 243 my $pm =
244 File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}),
245 "Encode", "$ext.pm");
246 do $pm;
247 }
248}
249
18586f54 250require Encode::Encoding;
251require Encode::XS;
4411f3b6 252
656753f8 2531;
254
2a936312 255__END__
256
4411f3b6 257=head1 NAME
258
259Encode - character encodings
260
261=head1 SYNOPSIS
262
263 use Encode;
264
67d7b5ef 265
266=head2 Table of Contents
267
268Encode consists of a collection of modules whose details are too big
269to fit in one document. This POD itself explains the top-level APIs
270and general topics at a glance. For other topics and more details,
271see the PODs below;
272
273 Name Description
274 --------------------------------------------------------
275 Encode::Alias Alias defintions to encodings
276 Encode::Encoding Encode Implementation Base Class
277 Encode::Supported List of Supported Encodings
278 Encode::CN Simplified Chinese Encodings
279 Encode::JP Japanese Encodings
280 Encode::KR Korean Encodings
281 Encode::TW Traditional Chinese Encodings
282 --------------------------------------------------------
283
4411f3b6 284=head1 DESCRIPTION
285
47bfe92f 286The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 287and the rest of the system. Perl strings are sequences of
288B<characters>.
289
290The repertoire of characters that Perl can represent is at least that
291defined by the Unicode Consortium. On most platforms the ordinal
292values of the characters (as returned by C<ord(ch)>) is the "Unicode
293codepoint" for the character (the exceptions are those platforms where
294the legacy encoding is some variant of EBCDIC rather than a super-set
295of ASCII - see L<perlebcdic>).
296
297Traditionally computer data has been moved around in 8-bit chunks
298often called "bytes". These chunks are also known as "octets" in
299networking standards. Perl is widely used to manipulate data of many
300types - not only strings of characters representing human or computer
301languages but also "binary" data being the machine's representation of
302numbers, pixels in an image - or just about anything.
303
304When Perl is processing "binary data" the programmer wants Perl to
305process "sequences of bytes". This is not a problem for Perl - as a
306byte has 256 possible values it easily fits in Perl's much larger
307"logical character".
308
309=head2 TERMINOLOGY
4411f3b6 310
67d7b5ef 311=over 4
21938dfa 312
67d7b5ef 313=item *
314
315I<character>: a character in the range 0..(2**32-1) (or more).
316(What Perl's strings are made of.)
317
318=item *
319
320I<byte>: a character in the range 0..255
321(A special case of a Perl character.)
322
323=item *
324
325I<octet>: 8 bits of data, with ordinal values 0..255
326(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
327
328=back
4411f3b6 329
67d7b5ef 330The marker [INTERNAL] marks Internal Implementation Details, in
331general meant only for those who think they know what they are doing,
332and such details may change in future releases.
333
334=head1 PERL ENCODING API
4411f3b6 335
336=over 4
337
f2a2953c 338=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6 339
47bfe92f 340Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 341a sequence of octets. ENCODING can be either a canonical name or
342alias. For encoding names and aliases, see L</"Defining Aliases">.
343For CHECK see L</"Handling Malformed Data">.
4411f3b6 344
67d7b5ef 345For example to convert (internally UTF-8 encoded) Unicode string to
346iso-8859-1 (also known as Latin1),
681a7c68 347
67d7b5ef 348 $octets = encode("iso-8859-1", $unicode);
681a7c68 349
f2a2953c 350=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6 351
47bfe92f 352Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef 353internal form and returns the resulting string. As in encode(),
354ENCODING can be either a canonical name or alias. For encoding names
355and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f 356L</"Handling Malformed Data">.
357
1b2c56c8 358For example to convert ISO-8859-1 data to UTF-8:
681a7c68 359
67d7b5ef 360 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 361
f2a2953c 362=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f 363
2b106fbe 364Convert B<in-place> the data between two encodings. How did the data
365in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef 366encode() or through PerlIO: See L</"Encoding and IO">.
367For encoding names and aliases, see L</"Defining Aliases">.
368For CHECK see L</"Handling Malformed Data">.
2b106fbe 369
1b2c56c8 370For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 371
372 from_to($data, "iso-8859-1", "utf-8");
373
374and to convert it back:
375
376 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 377
ab97ca19 378Note that because the conversion happens in place, the data to be
379converted cannot be a string constant, it must be a scalar variable.
380
3ef515df 381from_to() returns the length of the converted string on success, undef
382otherwise.
383
4411f3b6 384=back
385
f2a2953c 386=head2 UTF-8 / utf8
387
388The Unicode consortium defines the UTF-8 standard as a way of encoding
389the entire Unicode repertoire as sequences of octets. This encoding is
390expected to become very widespread. Perl can use this form internally
391to represent strings, so conversions to and from this form are
392particularly efficient (as octets in memory do not have to change,
393just the meta-data that tells Perl how to treat them).
394
395=over 4
396
397=item $octets = encode_utf8($string);
398
399The characters that comprise string are encoded in Perl's superset of UTF-8
400and the resulting octets returned as a sequence of bytes. All possible
401characters have a UTF-8 representation so this function cannot fail.
402
403=item $string = decode_utf8($octets [, CHECK]);
404
405The sequence of octets represented by $octets is decoded from UTF-8
406into a sequence of logical characters. Not all sequences of octets
407form valid UTF-8 encodings, so it is possible for this call to fail.
408For CHECK see L</"Handling Malformed Data">.
409
410=back
411
51ef4e11 412=head2 Listing available encodings
413
5129552c 414 use Encode;
415 @list = Encode->encodings();
416
417Returns a list of the canonical names of the available encodings that
418are loaded. To get a list of all available encodings including the
419ones that are not loaded yet, say
420
421 @all_encodings = Encode->encodings(":all");
422
423Or you can give the name of specific module.
424
c731e18e 425 @with_jp = Encode->encodings("Encode::JP");
426
427When "::" is not in the name, "Encode::" is assumed.
51ef4e11 428
c731e18e 429 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 430
a63c962f 431To find which encodings are supported by this package in detail,
5d030b67 432see L<Encode::Supported>.
51ef4e11 433
434=head2 Defining Aliases
435
67d7b5ef 436To add a new alias to a given encoding, use:
437
5129552c 438 use Encode;
439 use Encode::Alias;
a63c962f 440 define_alias(newName => ENCODING);
51ef4e11 441
3ef515df 442After that, newName can be used as an alias for ENCODING.
f2a2953c 443ENCODING may be either the name of an encoding or an
444I<encoding object>
51ef4e11 445
5d030b67 446See L<Encode::Alias> for details.
51ef4e11 447
4411f3b6 448=head1 Encoding and IO
449
450It is very common to want to do encoding transformations when
451reading or writing files, network connections, pipes etc.
47bfe92f 452If Perl is configured to use the new 'perlio' IO system then
4411f3b6 453C<Encode> provides a "layer" (See L<perliol>) which can transform
454data as it is read or written.
455
8e86646e 456Here is how the blind poet would modernise the encoding:
457
42234700 458 use Encode;
8e86646e 459 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
460 open(my $utf8,'>:utf8','iliad.utf8');
461 my @epic = <$iliad>;
462 print $utf8 @epic;
463 close($utf8);
464 close($iliad);
4411f3b6 465
466In addition the new IO system can also be configured to read/write
467UTF-8 encoded characters (as noted above this is efficient):
468
e9692b5b 469 open(my $fh,'>:utf8','anything');
470 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 471
472Either of the above forms of "layer" specifications can be made the default
473for a lexical scope with the C<use open ...> pragma. See L<open>.
474
475Once a handle is open, its layers can be altered using C<binmode>.
476
47bfe92f 477Without any such configuration, or if Perl itself is built using
4411f3b6 478system's own IO, then write operations assume that file handle accepts
479only I<bytes> and will C<die> if a character larger than 255 is
480written to the handle. When reading, each octet from the handle
481becomes a byte-in-a-character. Note that this default is the same
47bfe92f 482behaviour as bytes-only languages (including Perl before v5.6) would
483have, and is sufficient to handle native 8-bit encodings
484e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
485other encodings and binary data.
486
487In other cases it is the program's responsibility to transform
488characters into bytes using the API above before doing writes, and to
489transform the bytes read from a handle into characters before doing
490"character operations" (e.g. C<lc>, C</\W+/>, ...).
491
47bfe92f 492You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 493want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 494(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
495
e9692b5b 496 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
497 open(G, ">:utf8", "data.utf") or die $!;
498 while (<F>) { print G }
499
500 # Could also do "print G <F>" but that would pull
501 # the whole file into memory just to write it out again.
502
503More examples:
47bfe92f 504
e9692b5b 505 open(my $f, "<:encoding(cp1252)")
506 open(my $g, ">:encoding(iso-8859-2)")
507 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 508
509See L<PerlIO> for more information.
4411f3b6 510
1768d7eb 511See also L<encoding> for how to change the default encoding of the
d521382b 512data in your script.
1768d7eb 513
67d7b5ef 514=head1 Handling Malformed Data
515
f2a2953c 516If I<CHECK> is not set, (en|de)code will put a I<substitution character> in
517place of the malformed character. For UCM-based encodings,
518E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
519data is supposed to be UTF-8, an optional lexical warning (category
520utf8) is given.
67d7b5ef 521
f2a2953c 522If I<CHECK> is true but not a code reference, dies with an error message.
67d7b5ef 523
f2a2953c 524In future you will be able to use a code reference to a callback
525function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 526
527=head1 Defining Encodings
528
529To define a new encoding, use:
530
531 use Encode qw(define_encoding);
532 define_encoding($object, 'canonicalName' [, alias...]);
533
534I<canonicalName> will be associated with I<$object>. The object
535should provide the interface described in L<Encode::Encoding>.
536If more than two arguments are provided then additional
537arguments are taken as aliases for I<$object> as for C<define_alias>.
538
f2a2953c 539See L<Encode::Encoding> for more details.
540
4411f3b6 541=head1 Messing with Perl's Internals
542
47bfe92f 543The following API uses parts of Perl's internals in the current
544implementation. As such they are efficient, but may change.
4411f3b6 545
546=over 4
547
a63c962f 548=item is_utf8(STRING [, CHECK])
4411f3b6 549
550[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 551If CHECK is true, also checks the data in STRING for being well-formed
552UTF-8. Returns true if successful, false otherwise.
4411f3b6 553
a63c962f 554=item _utf8_on(STRING)
4411f3b6 555
556[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
557B<not> checked for being well-formed UTF-8. Do not use unless you
558B<know> that the STRING is well-formed UTF-8. Returns the previous
559state of the UTF-8 flag (so please don't test the return value as
560I<not> success or failure), or C<undef> if STRING is not a string.
561
a63c962f 562=item _utf8_off(STRING)
4411f3b6 563
564[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
565Returns the previous state of the UTF-8 flag (so please don't test the
566return value as I<not> success or failure), or C<undef> if STRING is
567not a string.
568
569=back
570
571=head1 SEE ALSO
572
5d030b67 573L<Encode::Encoding>,
574L<Encode::Supported>,
575L<PerlIO>,
576L<encoding>,
577L<perlebcdic>,
578L<perlfunc/open>,
579L<perlunicode>,
580L<utf8>,
581the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 582
583=cut