Upgrade to Encode 1.42, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
b2704119 3our $VERSION = do { my @r = (q$Revision: 1.42 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 decode
4411f3b6 14 decode_utf8
fcb875d4 15 encode
16 encode_utf8
51ef4e11 17 encodings
fcb875d4 18 find_encoding
4411f3b6 19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
fcb875d4 23 _utf8_off
24 _utf8_on
51ef4e11 25 define_encoding
2c674647 26 from_to
4411f3b6 27 is_16bit
fcb875d4 28 is_8bit
29 is_utf8
30 resolve_alias
a12c0f56 31 utf8_downgrade
fcb875d4 32 utf8_upgrade
2c674647 33 );
34
35bootstrap Encode ();
36
4411f3b6 37# Documentation moved after __END__ for speed - NI-S
2c674647 38
bf230f3d 39use Carp;
40
a63c962f 41our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 42
5d030b67 43use Encode::Alias;
44
5129552c 45# Make a %Encoding package variable to allow a certain amount of cheating
46our %Encoding;
aae85ceb 47our %ExtModule;
48require Encode::Config;
49eval { require Encode::ConfigLocal };
5129552c 50
656753f8 51sub encodings
52{
5129552c 53 my $class = shift;
071db25d 54 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
c731e18e 55 for my $mod (@modules){
56 $mod =~ s,::,/,g or $mod = "Encode/$mod";
57 $mod .= '.pm';
58 $DEBUG and warn "about to require $mod;";
59 eval { require $mod; };
5129552c 60 }
c731e18e 61 my %modules = map {$_ => 1} @modules;
5129552c 62 return
ce912cd4 63 sort { lc $a cmp lc $b }
64 grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
51ef4e11 65}
66
51ef4e11 67sub define_encoding
68{
18586f54 69 my $obj = shift;
70 my $name = shift;
5129552c 71 $Encoding{$name} = $obj;
18586f54 72 my $lc = lc($name);
73 define_alias($lc => $obj) unless $lc eq $name;
74 while (@_)
75 {
76 my $alias = shift;
77 define_alias($alias,$obj);
78 }
79 return $obj;
656753f8 80}
81
656753f8 82sub getEncoding
83{
dd9703c9 84 my ($class,$name,$skip_external) = @_;
18586f54 85 my $enc;
86 if (ref($name) && $name->can('new_sequence'))
87 {
88 return $name;
89 }
90 my $lc = lc $name;
5129552c 91 if (exists $Encoding{$name})
18586f54 92 {
5129552c 93 return $Encoding{$name};
18586f54 94 }
5129552c 95 if (exists $Encoding{$lc})
18586f54 96 {
5129552c 97 return $Encoding{$lc};
18586f54 98 }
c50d192e 99
5129552c 100 my $oc = $class->find_alias($name);
c50d192e 101 return $oc if defined $oc;
102
5129552c 103 $oc = $class->find_alias($lc) if $lc ne $name;
c50d192e 104 return $oc if defined $oc;
105
c731e18e 106 unless ($skip_external)
d1ed7747 107 {
c731e18e 108 if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
109 $mod =~ s,::,/,g ; $mod .= '.pm';
110 eval{ require $mod; };
111 return $Encoding{$name} if exists $Encoding{$name};
112 }
d1ed7747 113 }
18586f54 114 return;
656753f8 115}
116
4411f3b6 117sub find_encoding
118{
dd9703c9 119 my ($name,$skip_external) = @_;
120 return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6 121}
122
fcb875d4 123sub resolve_alias {
124 my $obj = find_encoding(shift);
125 defined $obj and return $obj->name;
126 return;
127}
128
b2704119 129sub encode($$;$)
4411f3b6 130{
18586f54 131 my ($name,$string,$check) = @_;
b2704119 132 $check ||=0;
18586f54 133 my $enc = find_encoding($name);
134 croak("Unknown encoding '$name'") unless defined $enc;
135 my $octets = $enc->encode($string,$check);
136 return undef if ($check && length($string));
137 return $octets;
4411f3b6 138}
139
b2704119 140sub decode($$;$)
4411f3b6 141{
18586f54 142 my ($name,$octets,$check) = @_;
b2704119 143 $check ||=0;
18586f54 144 my $enc = find_encoding($name);
145 croak("Unknown encoding '$name'") unless defined $enc;
146 my $string = $enc->decode($octets,$check);
147 $_[1] = $octets if $check;
148 return $string;
4411f3b6 149}
150
b2704119 151sub from_to($$$;$)
4411f3b6 152{
18586f54 153 my ($string,$from,$to,$check) = @_;
b2704119 154 $check ||=0;
18586f54 155 my $f = find_encoding($from);
156 croak("Unknown encoding '$from'") unless defined $f;
157 my $t = find_encoding($to);
158 croak("Unknown encoding '$to'") unless defined $t;
159 my $uni = $f->decode($string,$check);
160 return undef if ($check && length($string));
a999c27c 161 $string = $t->encode($uni,$check);
18586f54 162 return undef if ($check && length($uni));
3ef515df 163 return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6 164}
165
b2704119 166sub encode_utf8($)
4411f3b6 167{
18586f54 168 my ($str) = @_;
c731e18e 169 utf8::encode($str);
18586f54 170 return $str;
4411f3b6 171}
172
b2704119 173sub decode_utf8($)
4411f3b6 174{
18586f54 175 my ($str) = @_;
176 return undef unless utf8::decode($str);
177 return $str;
5ad8ef52 178}
179
f2a2953c 180predefine_encodings();
181
182#
183# This is to restore %Encoding if really needed;
184#
185sub predefine_encodings{
186 if ($ON_EBCDIC) {
187 # was in Encode::UTF_EBCDIC
188 package Encode::UTF_EBCDIC;
189 *name = sub{ shift->{'Name'} };
190 *new_sequence = sub{ return $_[0] };
191 *decode = sub{
192 my ($obj,$str,$chk) = @_;
193 my $res = '';
194 for (my $i = 0; $i < length($str); $i++) {
195 $res .=
196 chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
197 }
198 $_[1] = '' if $chk;
199 return $res;
200 };
201 *encode = sub{
202 my ($obj,$str,$chk) = @_;
203 my $res = '';
204 for (my $i = 0; $i < length($str); $i++) {
205 $res .=
206 chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
207 }
208 $_[1] = '' if $chk;
209 return $res;
210 };
77ea6967 211 $Encode::Encoding{Unicode} =
c731e18e 212 bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
f2a2953c 213 } else {
214 # was in Encode::UTF_EBCDIC
215 package Encode::Internal;
216 *name = sub{ shift->{'Name'} };
217 *new_sequence = sub{ return $_[0] };
218 *decode = sub{
219 my ($obj,$str,$chk) = @_;
220 utf8::upgrade($str);
221 $_[1] = '' if $chk;
222 return $str;
223 };
224 *encode = \&decode;
225 $Encode::Encoding{Unicode} =
c731e18e 226 bless {Name => "Internal"} => "Encode::Internal";
f2a2953c 227 }
228
229 {
230 # was in Encode::utf8
231 package Encode::utf8;
232 *name = sub{ shift->{'Name'} };
233 *new_sequence = sub{ return $_[0] };
234 *decode = sub{
235 my ($obj,$octets,$chk) = @_;
236 my $str = Encode::decode_utf8($octets);
237 if (defined $str) {
238 $_[1] = '' if $chk;
239 return $str;
240 }
241 return undef;
242 };
243 *encode = sub {
244 my ($obj,$string,$chk) = @_;
245 my $octets = Encode::encode_utf8($string);
246 $_[1] = '' if $chk;
247 return $octets;
248 };
249 $Encode::Encoding{utf8} =
c731e18e 250 bless {Name => "utf8"} => "Encode::utf8";
f2a2953c 251 }
f2a2953c 252}
253
18586f54 254require Encode::Encoding;
b2704119 255
256eval { require PerlIO::encoding };
4411f3b6 257
656753f8 2581;
259
2a936312 260__END__
261
4411f3b6 262=head1 NAME
263
264Encode - character encodings
265
266=head1 SYNOPSIS
267
268 use Encode;
269
67d7b5ef 270
271=head2 Table of Contents
272
273Encode consists of a collection of modules which details are too big
274to fit in one document. This POD itself explains the top-level APIs
275and general topics at a glance. For other topics and more details,
276see the PODs below;
277
278 Name Description
279 --------------------------------------------------------
280 Encode::Alias Alias defintions to encodings
281 Encode::Encoding Encode Implementation Base Class
282 Encode::Supported List of Supported Encodings
283 Encode::CN Simplified Chinese Encodings
284 Encode::JP Japanese Encodings
285 Encode::KR Korean Encodings
286 Encode::TW Traditional Chinese Encodings
287 --------------------------------------------------------
288
4411f3b6 289=head1 DESCRIPTION
290
47bfe92f 291The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 292and the rest of the system. Perl strings are sequences of
293B<characters>.
294
295The repertoire of characters that Perl can represent is at least that
296defined by the Unicode Consortium. On most platforms the ordinal
297values of the characters (as returned by C<ord(ch)>) is the "Unicode
298codepoint" for the character (the exceptions are those platforms where
299the legacy encoding is some variant of EBCDIC rather than a super-set
300of ASCII - see L<perlebcdic>).
301
302Traditionally computer data has been moved around in 8-bit chunks
303often called "bytes". These chunks are also known as "octets" in
304networking standards. Perl is widely used to manipulate data of many
305types - not only strings of characters representing human or computer
306languages but also "binary" data being the machines representation of
307numbers, pixels in an image - or just about anything.
308
309When Perl is processing "binary data" the programmer wants Perl to
310process "sequences of bytes". This is not a problem for Perl - as a
311byte has 256 possible values it easily fits in Perl's much larger
312"logical character".
313
314=head2 TERMINOLOGY
4411f3b6 315
67d7b5ef 316=over 4
21938dfa 317
67d7b5ef 318=item *
319
320I<character>: a character in the range 0..(2**32-1) (or more).
321(What Perl's strings are made of.)
322
323=item *
324
325I<byte>: a character in the range 0..255
326(A special case of a Perl character.)
327
328=item *
329
330I<octet>: 8 bits of data, with ordinal values 0..255
331(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
332
333=back
4411f3b6 334
67d7b5ef 335The marker [INTERNAL] marks Internal Implementation Details, in
336general meant only for those who think they know what they are doing,
337and such details may change in future releases.
338
339=head1 PERL ENCODING API
4411f3b6 340
341=over 4
342
f2a2953c 343=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6 344
47bfe92f 345Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 346a sequence of octets. ENCODING can be either a canonical name or
347alias. For encoding names and aliases, see L</"Defining Aliases">.
348For CHECK see L</"Handling Malformed Data">.
4411f3b6 349
67d7b5ef 350For example to convert (internally UTF-8 encoded) Unicode string to
351iso-8859-1 (also known as Latin1),
681a7c68 352
67d7b5ef 353 $octets = encode("iso-8859-1", $unicode);
681a7c68 354
f2a2953c 355=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6 356
47bfe92f 357Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef 358internal form and returns the resulting string. as in encode(),
359ENCODING can be either a canonical name or alias. For encoding names
360and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f 361L</"Handling Malformed Data">.
362
1b2c56c8 363For example to convert ISO-8859-1 data to UTF-8:
681a7c68 364
67d7b5ef 365 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 366
f2a2953c 367=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f 368
2b106fbe 369Convert B<in-place> the data between two encodings. How did the data
370in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef 371encode() or through PerlIO: See L</"Encoding and IO">.
372For encoding names and aliases, see L</"Defining Aliases">.
373For CHECK see L</"Handling Malformed Data">.
2b106fbe 374
1b2c56c8 375For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 376
377 from_to($data, "iso-8859-1", "utf-8");
378
379and to convert it back:
380
381 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 382
ab97ca19 383Note that because the conversion happens in place, the data to be
384converted cannot be a string constant, it must be a scalar variable.
385
3ef515df 386from_to() return the length of the converted string on success, undef
387otherwise.
388
4411f3b6 389=back
390
f2a2953c 391=head2 UTF-8 / utf8
392
393The Unicode consortium defines the UTF-8 standard as a way of encoding
394the entire Unicode repertoire as sequences of octets. This encoding is
395expected to become very widespread. Perl can use this form internally
396to represent strings, so conversions to and from this form are
397particularly efficient (as octets in memory do not have to change,
398just the meta-data that tells Perl how to treat them).
399
400=over 4
401
402=item $octets = encode_utf8($string);
403
404The characters that comprise string are encoded in Perl's superset of UTF-8
405and the resulting octets returned as a sequence of bytes. All possible
406characters have a UTF-8 representation so this function cannot fail.
407
408=item $string = decode_utf8($octets [, CHECK]);
409
410The sequence of octets represented by $octets is decoded from UTF-8
411into a sequence of logical characters. Not all sequences of octets
412form valid UTF-8 encodings, so it is possible for this call to fail.
413For CHECK see L</"Handling Malformed Data">.
414
415=back
416
51ef4e11 417=head2 Listing available encodings
418
5129552c 419 use Encode;
420 @list = Encode->encodings();
421
422Returns a list of the canonical names of the available encodings that
423are loaded. To get a list of all available encodings including the
424ones that are not loaded yet, say
425
426 @all_encodings = Encode->encodings(":all");
427
428Or you can give the name of specific module.
429
c731e18e 430 @with_jp = Encode->encodings("Encode::JP");
431
432When "::" is not in the name, "Encode::" is assumed.
51ef4e11 433
c731e18e 434 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 435
a63c962f 436To find which encodings are supported by this package in details,
5d030b67 437see L<Encode::Supported>.
51ef4e11 438
439=head2 Defining Aliases
440
67d7b5ef 441To add new alias to a given encoding, Use;
442
5129552c 443 use Encode;
444 use Encode::Alias;
a63c962f 445 define_alias(newName => ENCODING);
51ef4e11 446
3ef515df 447After that, newName can be used as an alias for ENCODING.
f2a2953c 448ENCODING may be either the name of an encoding or an
449I<encoding object>
51ef4e11 450
fcb875d4 451But before you do so, make sure the alias is nonexistent with
452C<resolve_alias()>, which returns the canonical name thereof.
453i.e.
454
455 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
456 Encode::resolve_alias("iso-8859-12") # false; nonexistent
457 Encode::resolve_alias($name) eq $name # true if $name is canonical
458
459This resolve_alias() does not need C<use Encode::Alias> and is
460exported via C<use encode qw(resolve_alias)>.
461
5d030b67 462See L<Encode::Alias> on details.
51ef4e11 463
4411f3b6 464=head1 Encoding and IO
465
466It is very common to want to do encoding transformations when
467reading or writing files, network connections, pipes etc.
47bfe92f 468If Perl is configured to use the new 'perlio' IO system then
4411f3b6 469C<Encode> provides a "layer" (See L<perliol>) which can transform
470data as it is read or written.
471
8e86646e 472Here is how the blind poet would modernise the encoding:
473
42234700 474 use Encode;
8e86646e 475 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
476 open(my $utf8,'>:utf8','iliad.utf8');
477 my @epic = <$iliad>;
478 print $utf8 @epic;
479 close($utf8);
480 close($illiad);
4411f3b6 481
482In addition the new IO system can also be configured to read/write
483UTF-8 encoded characters (as noted above this is efficient):
484
e9692b5b 485 open(my $fh,'>:utf8','anything');
486 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 487
488Either of the above forms of "layer" specifications can be made the default
489for a lexical scope with the C<use open ...> pragma. See L<open>.
490
491Once a handle is open is layers can be altered using C<binmode>.
492
47bfe92f 493Without any such configuration, or if Perl itself is built using
4411f3b6 494system's own IO, then write operations assume that file handle accepts
495only I<bytes> and will C<die> if a character larger than 255 is
496written to the handle. When reading, each octet from the handle
497becomes a byte-in-a-character. Note that this default is the same
47bfe92f 498behaviour as bytes-only languages (including Perl before v5.6) would
499have, and is sufficient to handle native 8-bit encodings
500e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
501other encodings and binary data.
502
503In other cases it is the programs responsibility to transform
504characters into bytes using the API above before doing writes, and to
505transform the bytes read from a handle into characters before doing
506"character operations" (e.g. C<lc>, C</\W+/>, ...).
507
47bfe92f 508You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 509want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 510(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
511
e9692b5b 512 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
513 open(G, ">:utf8", "data.utf") or die $!;
514 while (<F>) { print G }
515
516 # Could also do "print G <F>" but that would pull
517 # the whole file into memory just to write it out again.
518
519More examples:
47bfe92f 520
e9692b5b 521 open(my $f, "<:encoding(cp1252)")
522 open(my $g, ">:encoding(iso-8859-2)")
523 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 524
525See L<PerlIO> for more information.
4411f3b6 526
1768d7eb 527See also L<encoding> for how to change the default encoding of the
d521382b 528data in your script.
1768d7eb 529
67d7b5ef 530=head1 Handling Malformed Data
531
f2a2953c 532If I<CHECK> is not set, (en|de)code will put I<substitution character> in
533place of the malformed character. for UCM-based encodings,
534E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
535data is supposed to be UTF-8, an optional lexical warning (category
536utf8) is given.
67d7b5ef 537
f2a2953c 538If I<CHECK> is true but not a code reference, dies with an error message.
67d7b5ef 539
f2a2953c 540In future you will be able to use a code reference to a callback
541function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 542
543=head1 Defining Encodings
544
545To define a new encoding, use:
546
547 use Encode qw(define_alias);
548 define_encoding($object, 'canonicalName' [, alias...]);
549
550I<canonicalName> will be associated with I<$object>. The object
551should provide the interface described in L<Encode::Encoding>
552If more than two arguments are provided then additional
553arguments are taken as aliases for I<$object> as for C<define_alias>.
554
f2a2953c 555See L<Encode::Encoding> for more details.
556
4411f3b6 557=head1 Messing with Perl's Internals
558
47bfe92f 559The following API uses parts of Perl's internals in the current
560implementation. As such they are efficient, but may change.
4411f3b6 561
562=over 4
563
a63c962f 564=item is_utf8(STRING [, CHECK])
4411f3b6 565
566[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 567If CHECK is true, also checks the data in STRING for being well-formed
568UTF-8. Returns true if successful, false otherwise.
4411f3b6 569
a63c962f 570=item _utf8_on(STRING)
4411f3b6 571
572[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
573B<not> checked for being well-formed UTF-8. Do not use unless you
574B<know> that the STRING is well-formed UTF-8. Returns the previous
575state of the UTF-8 flag (so please don't test the return value as
576I<not> success or failure), or C<undef> if STRING is not a string.
577
a63c962f 578=item _utf8_off(STRING)
4411f3b6 579
580[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
581Returns the previous state of the UTF-8 flag (so please don't test the
582return value as I<not> success or failure), or C<undef> if STRING is
583not a string.
584
585=back
586
587=head1 SEE ALSO
588
5d030b67 589L<Encode::Encoding>,
590L<Encode::Supported>,
591L<PerlIO>,
592L<encoding>,
593L<perlebcdic>,
594L<perlfunc/open>,
595L<perlunicode>,
596L<utf8>,
597the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 598
aae85ceb 599head2 MAINTAINER
600
601This project was originated by Nick Ing-Simmons and later maintained
602by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for full list
603of people involved. For any questions, use
604E<lt>perl-unicode@perl.orgE<gt> so others can share.
605
4411f3b6 606=cut