Re: [PATCH 5.7.3 docs] The question deals with a bug that was fixed
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
b0b300a3 3our $VERSION = do { my @r = (q$Revision: 1.51 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
85982a32 12
13our @EXPORT = qw(
14 decode decode_utf8 encode encode_utf8
15 encodings find_encoding
4411f3b6 16);
17
85982a32 18our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ);
19our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK);
20
51ef4e11 21our @EXPORT_OK =
85982a32 22 (
23 qw(
24 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
25 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
26 ),
27 @FB_FLAGS, @FB_CONSTS,
28 );
29
30our %EXPORT_TAGS =
31 (
32 all => [ @EXPORT, @EXPORT_OK ],
33 fallbacks => [ @FB_CONSTS ],
34 fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
35 );
36
2c674647 37
38bootstrap Encode ();
39
4411f3b6 40# Documentation moved after __END__ for speed - NI-S
2c674647 41
bf230f3d 42use Carp;
43
a63c962f 44our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 45
5d030b67 46use Encode::Alias;
47
5129552c 48# Make a %Encoding package variable to allow a certain amount of cheating
49our %Encoding;
aae85ceb 50our %ExtModule;
51require Encode::Config;
52eval { require Encode::ConfigLocal };
5129552c 53
656753f8 54sub encodings
55{
5129552c 56 my $class = shift;
071db25d 57 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
c731e18e 58 for my $mod (@modules){
59 $mod =~ s,::,/,g or $mod = "Encode/$mod";
60 $mod .= '.pm';
61 $DEBUG and warn "about to require $mod;";
62 eval { require $mod; };
5129552c 63 }
c731e18e 64 my %modules = map {$_ => 1} @modules;
5129552c 65 return
ce912cd4 66 sort { lc $a cmp lc $b }
67 grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
51ef4e11 68}
69
85982a32 70sub perlio_ok{
71 exists $INC{"PerlIO/encoding.pm"} or return 0;
72 my $stash = ref($_[0]);
73 $stash ||= ref(find_encoding($_[0]));
74 return ($stash eq "Encode::XS" || $stash eq "Encode::Unicode");
75}
76
51ef4e11 77sub define_encoding
78{
18586f54 79 my $obj = shift;
80 my $name = shift;
5129552c 81 $Encoding{$name} = $obj;
18586f54 82 my $lc = lc($name);
83 define_alias($lc => $obj) unless $lc eq $name;
84 while (@_)
85 {
86 my $alias = shift;
87 define_alias($alias,$obj);
88 }
89 return $obj;
656753f8 90}
91
656753f8 92sub getEncoding
93{
dd9703c9 94 my ($class,$name,$skip_external) = @_;
18586f54 95 my $enc;
96 if (ref($name) && $name->can('new_sequence'))
97 {
98 return $name;
99 }
100 my $lc = lc $name;
5129552c 101 if (exists $Encoding{$name})
18586f54 102 {
5129552c 103 return $Encoding{$name};
18586f54 104 }
5129552c 105 if (exists $Encoding{$lc})
18586f54 106 {
5129552c 107 return $Encoding{$lc};
18586f54 108 }
c50d192e 109
5129552c 110 my $oc = $class->find_alias($name);
c50d192e 111 return $oc if defined $oc;
112
5129552c 113 $oc = $class->find_alias($lc) if $lc ne $name;
c50d192e 114 return $oc if defined $oc;
115
c731e18e 116 unless ($skip_external)
d1ed7747 117 {
c731e18e 118 if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
119 $mod =~ s,::,/,g ; $mod .= '.pm';
120 eval{ require $mod; };
121 return $Encoding{$name} if exists $Encoding{$name};
122 }
d1ed7747 123 }
18586f54 124 return;
656753f8 125}
126
4411f3b6 127sub find_encoding
128{
dd9703c9 129 my ($name,$skip_external) = @_;
130 return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6 131}
132
fcb875d4 133sub resolve_alias {
134 my $obj = find_encoding(shift);
135 defined $obj and return $obj->name;
136 return;
137}
138
b2704119 139sub encode($$;$)
4411f3b6 140{
18586f54 141 my ($name,$string,$check) = @_;
b2704119 142 $check ||=0;
18586f54 143 my $enc = find_encoding($name);
144 croak("Unknown encoding '$name'") unless defined $enc;
145 my $octets = $enc->encode($string,$check);
146 return undef if ($check && length($string));
147 return $octets;
4411f3b6 148}
149
b2704119 150sub decode($$;$)
4411f3b6 151{
18586f54 152 my ($name,$octets,$check) = @_;
b2704119 153 $check ||=0;
18586f54 154 my $enc = find_encoding($name);
155 croak("Unknown encoding '$name'") unless defined $enc;
156 my $string = $enc->decode($octets,$check);
157 $_[1] = $octets if $check;
158 return $string;
4411f3b6 159}
160
b2704119 161sub from_to($$$;$)
4411f3b6 162{
18586f54 163 my ($string,$from,$to,$check) = @_;
b2704119 164 $check ||=0;
18586f54 165 my $f = find_encoding($from);
166 croak("Unknown encoding '$from'") unless defined $f;
167 my $t = find_encoding($to);
168 croak("Unknown encoding '$to'") unless defined $t;
169 my $uni = $f->decode($string,$check);
170 return undef if ($check && length($string));
a999c27c 171 $string = $t->encode($uni,$check);
18586f54 172 return undef if ($check && length($uni));
3ef515df 173 return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6 174}
175
b2704119 176sub encode_utf8($)
4411f3b6 177{
18586f54 178 my ($str) = @_;
c731e18e 179 utf8::encode($str);
18586f54 180 return $str;
4411f3b6 181}
182
b2704119 183sub decode_utf8($)
4411f3b6 184{
18586f54 185 my ($str) = @_;
186 return undef unless utf8::decode($str);
187 return $str;
5ad8ef52 188}
189
f2a2953c 190predefine_encodings();
191
192#
193# This is to restore %Encoding if really needed;
194#
195sub predefine_encodings{
196 if ($ON_EBCDIC) {
197 # was in Encode::UTF_EBCDIC
198 package Encode::UTF_EBCDIC;
199 *name = sub{ shift->{'Name'} };
200 *new_sequence = sub{ return $_[0] };
201 *decode = sub{
202 my ($obj,$str,$chk) = @_;
203 my $res = '';
204 for (my $i = 0; $i < length($str); $i++) {
205 $res .=
206 chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
207 }
208 $_[1] = '' if $chk;
209 return $res;
210 };
211 *encode = sub{
212 my ($obj,$str,$chk) = @_;
213 my $res = '';
214 for (my $i = 0; $i < length($str); $i++) {
215 $res .=
216 chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
217 }
218 $_[1] = '' if $chk;
219 return $res;
220 };
77ea6967 221 $Encode::Encoding{Unicode} =
c731e18e 222 bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
f2a2953c 223 } else {
224 # was in Encode::UTF_EBCDIC
225 package Encode::Internal;
226 *name = sub{ shift->{'Name'} };
227 *new_sequence = sub{ return $_[0] };
228 *decode = sub{
229 my ($obj,$str,$chk) = @_;
230 utf8::upgrade($str);
231 $_[1] = '' if $chk;
232 return $str;
233 };
234 *encode = \&decode;
235 $Encode::Encoding{Unicode} =
c731e18e 236 bless {Name => "Internal"} => "Encode::Internal";
f2a2953c 237 }
238
239 {
240 # was in Encode::utf8
241 package Encode::utf8;
242 *name = sub{ shift->{'Name'} };
243 *new_sequence = sub{ return $_[0] };
244 *decode = sub{
245 my ($obj,$octets,$chk) = @_;
246 my $str = Encode::decode_utf8($octets);
247 if (defined $str) {
248 $_[1] = '' if $chk;
249 return $str;
250 }
251 return undef;
252 };
253 *encode = sub {
254 my ($obj,$string,$chk) = @_;
255 my $octets = Encode::encode_utf8($string);
256 $_[1] = '' if $chk;
257 return $octets;
258 };
259 $Encode::Encoding{utf8} =
c731e18e 260 bless {Name => "utf8"} => "Encode::utf8";
f2a2953c 261 }
f2a2953c 262}
263
18586f54 264require Encode::Encoding;
b2704119 265
b0b300a3 266eval {
267 require PerlIO::encoding;
268 unless (PerlIO::encoding->VERSION >= 0.02){
269 delete $INC{"PerlIO/encoding.pm"};
270 }
271};
85982a32 272# warn $@ if $@;
4411f3b6 273
656753f8 2741;
275
2a936312 276__END__
277
4411f3b6 278=head1 NAME
279
280Encode - character encodings
281
282=head1 SYNOPSIS
283
284 use Encode;
285
67d7b5ef 286
287=head2 Table of Contents
288
289Encode consists of a collection of modules which details are too big
290to fit in one document. This POD itself explains the top-level APIs
291and general topics at a glance. For other topics and more details,
292see the PODs below;
293
294 Name Description
295 --------------------------------------------------------
296 Encode::Alias Alias defintions to encodings
297 Encode::Encoding Encode Implementation Base Class
298 Encode::Supported List of Supported Encodings
299 Encode::CN Simplified Chinese Encodings
300 Encode::JP Japanese Encodings
301 Encode::KR Korean Encodings
302 Encode::TW Traditional Chinese Encodings
303 --------------------------------------------------------
304
4411f3b6 305=head1 DESCRIPTION
306
47bfe92f 307The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 308and the rest of the system. Perl strings are sequences of
309B<characters>.
310
311The repertoire of characters that Perl can represent is at least that
312defined by the Unicode Consortium. On most platforms the ordinal
313values of the characters (as returned by C<ord(ch)>) is the "Unicode
314codepoint" for the character (the exceptions are those platforms where
315the legacy encoding is some variant of EBCDIC rather than a super-set
316of ASCII - see L<perlebcdic>).
317
318Traditionally computer data has been moved around in 8-bit chunks
319often called "bytes". These chunks are also known as "octets" in
320networking standards. Perl is widely used to manipulate data of many
321types - not only strings of characters representing human or computer
322languages but also "binary" data being the machines representation of
323numbers, pixels in an image - or just about anything.
324
325When Perl is processing "binary data" the programmer wants Perl to
326process "sequences of bytes". This is not a problem for Perl - as a
327byte has 256 possible values it easily fits in Perl's much larger
328"logical character".
329
330=head2 TERMINOLOGY
4411f3b6 331
67d7b5ef 332=over 4
21938dfa 333
67d7b5ef 334=item *
335
336I<character>: a character in the range 0..(2**32-1) (or more).
337(What Perl's strings are made of.)
338
339=item *
340
341I<byte>: a character in the range 0..255
342(A special case of a Perl character.)
343
344=item *
345
346I<octet>: 8 bits of data, with ordinal values 0..255
347(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
348
349=back
4411f3b6 350
67d7b5ef 351The marker [INTERNAL] marks Internal Implementation Details, in
352general meant only for those who think they know what they are doing,
353and such details may change in future releases.
354
355=head1 PERL ENCODING API
4411f3b6 356
357=over 4
358
f2a2953c 359=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6 360
47bfe92f 361Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 362a sequence of octets. ENCODING can be either a canonical name or
363alias. For encoding names and aliases, see L</"Defining Aliases">.
364For CHECK see L</"Handling Malformed Data">.
4411f3b6 365
67d7b5ef 366For example to convert (internally UTF-8 encoded) Unicode string to
367iso-8859-1 (also known as Latin1),
681a7c68 368
67d7b5ef 369 $octets = encode("iso-8859-1", $unicode);
681a7c68 370
f2a2953c 371=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6 372
47bfe92f 373Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef 374internal form and returns the resulting string. as in encode(),
375ENCODING can be either a canonical name or alias. For encoding names
376and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f 377L</"Handling Malformed Data">.
378
1b2c56c8 379For example to convert ISO-8859-1 data to UTF-8:
681a7c68 380
67d7b5ef 381 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 382
f2a2953c 383=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f 384
85982a32 385Convert B<in-place> the data between two encodings.
1b2c56c8 386For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 387
388 from_to($data, "iso-8859-1", "utf-8");
389
390and to convert it back:
391
392 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 393
ab97ca19 394Note that because the conversion happens in place, the data to be
395converted cannot be a string constant, it must be a scalar variable.
396
3ef515df 397from_to() return the length of the converted string on success, undef
398otherwise.
399
4411f3b6 400=back
401
f2a2953c 402=head2 UTF-8 / utf8
403
404The Unicode consortium defines the UTF-8 standard as a way of encoding
405the entire Unicode repertoire as sequences of octets. This encoding is
406expected to become very widespread. Perl can use this form internally
407to represent strings, so conversions to and from this form are
408particularly efficient (as octets in memory do not have to change,
409just the meta-data that tells Perl how to treat them).
410
411=over 4
412
413=item $octets = encode_utf8($string);
414
415The characters that comprise string are encoded in Perl's superset of UTF-8
416and the resulting octets returned as a sequence of bytes. All possible
417characters have a UTF-8 representation so this function cannot fail.
418
419=item $string = decode_utf8($octets [, CHECK]);
420
421The sequence of octets represented by $octets is decoded from UTF-8
422into a sequence of logical characters. Not all sequences of octets
423form valid UTF-8 encodings, so it is possible for this call to fail.
424For CHECK see L</"Handling Malformed Data">.
425
426=back
427
51ef4e11 428=head2 Listing available encodings
429
5129552c 430 use Encode;
431 @list = Encode->encodings();
432
433Returns a list of the canonical names of the available encodings that
434are loaded. To get a list of all available encodings including the
435ones that are not loaded yet, say
436
437 @all_encodings = Encode->encodings(":all");
438
439Or you can give the name of specific module.
440
c731e18e 441 @with_jp = Encode->encodings("Encode::JP");
442
443When "::" is not in the name, "Encode::" is assumed.
51ef4e11 444
c731e18e 445 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 446
a63c962f 447To find which encodings are supported by this package in details,
5d030b67 448see L<Encode::Supported>.
51ef4e11 449
450=head2 Defining Aliases
451
67d7b5ef 452To add new alias to a given encoding, Use;
453
5129552c 454 use Encode;
455 use Encode::Alias;
a63c962f 456 define_alias(newName => ENCODING);
51ef4e11 457
3ef515df 458After that, newName can be used as an alias for ENCODING.
f2a2953c 459ENCODING may be either the name of an encoding or an
460I<encoding object>
51ef4e11 461
fcb875d4 462But before you do so, make sure the alias is nonexistent with
463C<resolve_alias()>, which returns the canonical name thereof.
464i.e.
465
466 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
467 Encode::resolve_alias("iso-8859-12") # false; nonexistent
468 Encode::resolve_alias($name) eq $name # true if $name is canonical
469
470This resolve_alias() does not need C<use Encode::Alias> and is
471exported via C<use encode qw(resolve_alias)>.
472
5d030b67 473See L<Encode::Alias> on details.
51ef4e11 474
85982a32 475=head1 Encoding via PerlIO
4411f3b6 476
85982a32 477If your perl supports I<PerlIO>, you can use PerlIO layer to directly
478decode and encode via filehandle. The following two examples are
479totally identical by functionality.
4411f3b6 480
85982a32 481 # via PerlIO
482 open my $in, "<:encoding(shiftjis)", $infile or die;
483 open my $out, ">:encoding(euc-jp)", $outfile or die;
484 while(<>){ print; }
8e86646e 485
85982a32 486 # via from_to
487 open my $in, $infile or die;
488 open my $out, $outfile or die;
489 while(<>){
490 from_to($_, "shiftjis", "euc", 1);
491 }
4411f3b6 492
85982a32 493Unfortunately, not all encodings are PerlIO-savvy. You can check if
494your encoding is supported by PerlIO by C<perlio_ok> method.
4411f3b6 495
85982a32 496 Encode::perlio_ok("iso-20220jp"); # false
497 find_encoding("iso-2022-jp")->perlio_ok; # false
498 use Encode qw(perlio_ok); # exported upon request
499 perlio_ok("euc-jp") # true if PerlIO is enabled
4411f3b6 500
85982a32 501For gory details, see L<Encode::PerlIO>;
4411f3b6 502
85982a32 503=head1 Handling Malformed Data
4411f3b6 504
85982a32 505=over 4
47bfe92f 506
85982a32 507THE I<CHECK> argument is used as follows. When you omit it, it is
508identical to I<CHECK> = 0.
47bfe92f 509
85982a32 510=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 511
85982a32 512If I<CHECK> is 0, (en|de)code will put I<substitution character> in
513place of the malformed character. for UCM-based encodings,
514E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
515data is supposed to be UTF-8, an optional lexical warning (category
516utf8) is given.
e9692b5b 517
85982a32 518=item I<CHECK> = Encode::DIE_ON_ERROR (== 1)
e9692b5b 519
85982a32 520If I<CHECK> is 1, methods will die immediately with an error
521message. so when I<CHECK> is set, you should trap the fatal error
522with eval{} unless you really want to let it die on error.
47bfe92f 523
85982a32 524=item I<CHECK> = Encode::FB_QUIET
47bfe92f 525
85982a32 526If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
527return proccessed part on error, with data passed via argument
528overwritten with unproccessed part. This is handy when have to
529repeatedly call because the source data is chopped in the middle for
530some reasons, such as fixed-width buffer. Here is a sample code that
531just does this.
4411f3b6 532
85982a32 533 my $data = '';
534 while(defined(read $fh, $buffer, 256)){
535 # buffer may end in partial character so we append
536 $data .= $buffer;
537 $utf8 .= decode($encoding, $data, ENCODE::FB_QUIET);
538 # $data now contains unprocessed partial character
539 }
1768d7eb 540
85982a32 541=item I<CHECK> = Encode::FB_WARN
67d7b5ef 542
85982a32 543This is the same as above, except it warns on error. Handy when you
544are debugging the mode above.
545
546=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
547
548For encodings that are implemented by Encode::XS, CHECK ==
549Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
550
551When you decode, '\xI<XX>' will be placed where I<XX> is the hex
552representation of the octet that could not be decoded to utf8. And
553when you encode, '\x{I<xxxx>}' will be placed where I<xxxx> is the
554Unicode ID of the character that cannot be found in the character
555repartoire of the encoding.
556
557=item The bitmask
558
559These modes are actually set via bitmask. here is how FB_XX are laid
560out. for FB_XX you can import via C<use Encode qw(:fallbacks)> for
561generic bitmask constants, you can import via
562 C<use Encode qw(:fallback_all)>.
563
b0b300a3 564 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
565 DIE_ON_ERR 0x0001 X
566 WARN_ON_ER 0x0002 X
567 RETURN_ON_ERR 0x0004 X X
568 LEAVE_SRC 0x0008
569 PERLQQ 0x0100 X
67d7b5ef 570
85982a32 571=head2 Unemplemented fallback schemes
67d7b5ef 572
f2a2953c 573In future you will be able to use a code reference to a callback
574function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 575
576=head1 Defining Encodings
577
578To define a new encoding, use:
579
580 use Encode qw(define_alias);
581 define_encoding($object, 'canonicalName' [, alias...]);
582
583I<canonicalName> will be associated with I<$object>. The object
584should provide the interface described in L<Encode::Encoding>
585If more than two arguments are provided then additional
586arguments are taken as aliases for I<$object> as for C<define_alias>.
587
f2a2953c 588See L<Encode::Encoding> for more details.
589
4411f3b6 590=head1 Messing with Perl's Internals
591
47bfe92f 592The following API uses parts of Perl's internals in the current
593implementation. As such they are efficient, but may change.
4411f3b6 594
595=over 4
596
a63c962f 597=item is_utf8(STRING [, CHECK])
4411f3b6 598
599[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 600If CHECK is true, also checks the data in STRING for being well-formed
601UTF-8. Returns true if successful, false otherwise.
4411f3b6 602
a63c962f 603=item _utf8_on(STRING)
4411f3b6 604
605[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
606B<not> checked for being well-formed UTF-8. Do not use unless you
607B<know> that the STRING is well-formed UTF-8. Returns the previous
608state of the UTF-8 flag (so please don't test the return value as
609I<not> success or failure), or C<undef> if STRING is not a string.
610
a63c962f 611=item _utf8_off(STRING)
4411f3b6 612
613[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
614Returns the previous state of the UTF-8 flag (so please don't test the
615return value as I<not> success or failure), or C<undef> if STRING is
616not a string.
617
618=back
619
620=head1 SEE ALSO
621
5d030b67 622L<Encode::Encoding>,
623L<Encode::Supported>,
85982a32 624L<Encode::PerlIO>,
5d030b67 625L<encoding>,
626L<perlebcdic>,
627L<perlfunc/open>,
628L<perlunicode>,
629L<utf8>,
630the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 631
85982a32 632=head1 MAINTAINER
aae85ceb 633
634This project was originated by Nick Ing-Simmons and later maintained
635by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for full list
636of people involved. For any questions, use
637E<lt>perl-unicode@perl.orgE<gt> so others can share.
638
4411f3b6 639=cut