Upgrade to Encode 1.50, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
85982a32 3our $VERSION = do { my @r = (q$Revision: 1.50 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
85982a32 12
13our @EXPORT = qw(
14 decode decode_utf8 encode encode_utf8
15 encodings find_encoding
4411f3b6 16);
17
85982a32 18our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ);
19our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK);
20
51ef4e11 21our @EXPORT_OK =
85982a32 22 (
23 qw(
24 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
25 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
26 ),
27 @FB_FLAGS, @FB_CONSTS,
28 );
29
30our %EXPORT_TAGS =
31 (
32 all => [ @EXPORT, @EXPORT_OK ],
33 fallbacks => [ @FB_CONSTS ],
34 fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
35 );
36
2c674647 37
38bootstrap Encode ();
39
4411f3b6 40# Documentation moved after __END__ for speed - NI-S
2c674647 41
bf230f3d 42use Carp;
43
a63c962f 44our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 45
5d030b67 46use Encode::Alias;
47
5129552c 48# Make a %Encoding package variable to allow a certain amount of cheating
49our %Encoding;
aae85ceb 50our %ExtModule;
51require Encode::Config;
52eval { require Encode::ConfigLocal };
5129552c 53
656753f8 54sub encodings
55{
5129552c 56 my $class = shift;
071db25d 57 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
c731e18e 58 for my $mod (@modules){
59 $mod =~ s,::,/,g or $mod = "Encode/$mod";
60 $mod .= '.pm';
61 $DEBUG and warn "about to require $mod;";
62 eval { require $mod; };
5129552c 63 }
c731e18e 64 my %modules = map {$_ => 1} @modules;
5129552c 65 return
ce912cd4 66 sort { lc $a cmp lc $b }
67 grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
51ef4e11 68}
69
85982a32 70sub perlio_ok{
71 exists $INC{"PerlIO/encoding.pm"} or return 0;
72 my $stash = ref($_[0]);
73 $stash ||= ref(find_encoding($_[0]));
74 return ($stash eq "Encode::XS" || $stash eq "Encode::Unicode");
75}
76
51ef4e11 77sub define_encoding
78{
18586f54 79 my $obj = shift;
80 my $name = shift;
5129552c 81 $Encoding{$name} = $obj;
18586f54 82 my $lc = lc($name);
83 define_alias($lc => $obj) unless $lc eq $name;
84 while (@_)
85 {
86 my $alias = shift;
87 define_alias($alias,$obj);
88 }
89 return $obj;
656753f8 90}
91
656753f8 92sub getEncoding
93{
dd9703c9 94 my ($class,$name,$skip_external) = @_;
18586f54 95 my $enc;
96 if (ref($name) && $name->can('new_sequence'))
97 {
98 return $name;
99 }
100 my $lc = lc $name;
5129552c 101 if (exists $Encoding{$name})
18586f54 102 {
5129552c 103 return $Encoding{$name};
18586f54 104 }
5129552c 105 if (exists $Encoding{$lc})
18586f54 106 {
5129552c 107 return $Encoding{$lc};
18586f54 108 }
c50d192e 109
5129552c 110 my $oc = $class->find_alias($name);
c50d192e 111 return $oc if defined $oc;
112
5129552c 113 $oc = $class->find_alias($lc) if $lc ne $name;
c50d192e 114 return $oc if defined $oc;
115
c731e18e 116 unless ($skip_external)
d1ed7747 117 {
c731e18e 118 if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
119 $mod =~ s,::,/,g ; $mod .= '.pm';
120 eval{ require $mod; };
121 return $Encoding{$name} if exists $Encoding{$name};
122 }
d1ed7747 123 }
18586f54 124 return;
656753f8 125}
126
4411f3b6 127sub find_encoding
128{
dd9703c9 129 my ($name,$skip_external) = @_;
130 return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6 131}
132
fcb875d4 133sub resolve_alias {
134 my $obj = find_encoding(shift);
135 defined $obj and return $obj->name;
136 return;
137}
138
b2704119 139sub encode($$;$)
4411f3b6 140{
18586f54 141 my ($name,$string,$check) = @_;
b2704119 142 $check ||=0;
18586f54 143 my $enc = find_encoding($name);
144 croak("Unknown encoding '$name'") unless defined $enc;
145 my $octets = $enc->encode($string,$check);
146 return undef if ($check && length($string));
147 return $octets;
4411f3b6 148}
149
b2704119 150sub decode($$;$)
4411f3b6 151{
18586f54 152 my ($name,$octets,$check) = @_;
b2704119 153 $check ||=0;
18586f54 154 my $enc = find_encoding($name);
155 croak("Unknown encoding '$name'") unless defined $enc;
156 my $string = $enc->decode($octets,$check);
157 $_[1] = $octets if $check;
158 return $string;
4411f3b6 159}
160
b2704119 161sub from_to($$$;$)
4411f3b6 162{
18586f54 163 my ($string,$from,$to,$check) = @_;
b2704119 164 $check ||=0;
18586f54 165 my $f = find_encoding($from);
166 croak("Unknown encoding '$from'") unless defined $f;
167 my $t = find_encoding($to);
168 croak("Unknown encoding '$to'") unless defined $t;
169 my $uni = $f->decode($string,$check);
170 return undef if ($check && length($string));
a999c27c 171 $string = $t->encode($uni,$check);
18586f54 172 return undef if ($check && length($uni));
3ef515df 173 return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6 174}
175
b2704119 176sub encode_utf8($)
4411f3b6 177{
18586f54 178 my ($str) = @_;
c731e18e 179 utf8::encode($str);
18586f54 180 return $str;
4411f3b6 181}
182
b2704119 183sub decode_utf8($)
4411f3b6 184{
18586f54 185 my ($str) = @_;
186 return undef unless utf8::decode($str);
187 return $str;
5ad8ef52 188}
189
f2a2953c 190predefine_encodings();
191
192#
193# This is to restore %Encoding if really needed;
194#
195sub predefine_encodings{
196 if ($ON_EBCDIC) {
197 # was in Encode::UTF_EBCDIC
198 package Encode::UTF_EBCDIC;
199 *name = sub{ shift->{'Name'} };
200 *new_sequence = sub{ return $_[0] };
201 *decode = sub{
202 my ($obj,$str,$chk) = @_;
203 my $res = '';
204 for (my $i = 0; $i < length($str); $i++) {
205 $res .=
206 chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
207 }
208 $_[1] = '' if $chk;
209 return $res;
210 };
211 *encode = sub{
212 my ($obj,$str,$chk) = @_;
213 my $res = '';
214 for (my $i = 0; $i < length($str); $i++) {
215 $res .=
216 chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
217 }
218 $_[1] = '' if $chk;
219 return $res;
220 };
77ea6967 221 $Encode::Encoding{Unicode} =
c731e18e 222 bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
f2a2953c 223 } else {
224 # was in Encode::UTF_EBCDIC
225 package Encode::Internal;
226 *name = sub{ shift->{'Name'} };
227 *new_sequence = sub{ return $_[0] };
228 *decode = sub{
229 my ($obj,$str,$chk) = @_;
230 utf8::upgrade($str);
231 $_[1] = '' if $chk;
232 return $str;
233 };
234 *encode = \&decode;
235 $Encode::Encoding{Unicode} =
c731e18e 236 bless {Name => "Internal"} => "Encode::Internal";
f2a2953c 237 }
238
239 {
240 # was in Encode::utf8
241 package Encode::utf8;
242 *name = sub{ shift->{'Name'} };
243 *new_sequence = sub{ return $_[0] };
244 *decode = sub{
245 my ($obj,$octets,$chk) = @_;
246 my $str = Encode::decode_utf8($octets);
247 if (defined $str) {
248 $_[1] = '' if $chk;
249 return $str;
250 }
251 return undef;
252 };
253 *encode = sub {
254 my ($obj,$string,$chk) = @_;
255 my $octets = Encode::encode_utf8($string);
256 $_[1] = '' if $chk;
257 return $octets;
258 };
259 $Encode::Encoding{utf8} =
c731e18e 260 bless {Name => "utf8"} => "Encode::utf8";
f2a2953c 261 }
f2a2953c 262}
263
18586f54 264require Encode::Encoding;
b2704119 265
85982a32 266eval qq{ use PerlIO::encoding 0.02 };
267# warn $@ if $@;
4411f3b6 268
656753f8 2691;
270
2a936312 271__END__
272
4411f3b6 273=head1 NAME
274
275Encode - character encodings
276
277=head1 SYNOPSIS
278
279 use Encode;
280
67d7b5ef 281
282=head2 Table of Contents
283
284Encode consists of a collection of modules which details are too big
285to fit in one document. This POD itself explains the top-level APIs
286and general topics at a glance. For other topics and more details,
287see the PODs below;
288
289 Name Description
290 --------------------------------------------------------
291 Encode::Alias Alias defintions to encodings
292 Encode::Encoding Encode Implementation Base Class
293 Encode::Supported List of Supported Encodings
294 Encode::CN Simplified Chinese Encodings
295 Encode::JP Japanese Encodings
296 Encode::KR Korean Encodings
297 Encode::TW Traditional Chinese Encodings
298 --------------------------------------------------------
299
4411f3b6 300=head1 DESCRIPTION
301
47bfe92f 302The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 303and the rest of the system. Perl strings are sequences of
304B<characters>.
305
306The repertoire of characters that Perl can represent is at least that
307defined by the Unicode Consortium. On most platforms the ordinal
308values of the characters (as returned by C<ord(ch)>) is the "Unicode
309codepoint" for the character (the exceptions are those platforms where
310the legacy encoding is some variant of EBCDIC rather than a super-set
311of ASCII - see L<perlebcdic>).
312
313Traditionally computer data has been moved around in 8-bit chunks
314often called "bytes". These chunks are also known as "octets" in
315networking standards. Perl is widely used to manipulate data of many
316types - not only strings of characters representing human or computer
317languages but also "binary" data being the machines representation of
318numbers, pixels in an image - or just about anything.
319
320When Perl is processing "binary data" the programmer wants Perl to
321process "sequences of bytes". This is not a problem for Perl - as a
322byte has 256 possible values it easily fits in Perl's much larger
323"logical character".
324
325=head2 TERMINOLOGY
4411f3b6 326
67d7b5ef 327=over 4
21938dfa 328
67d7b5ef 329=item *
330
331I<character>: a character in the range 0..(2**32-1) (or more).
332(What Perl's strings are made of.)
333
334=item *
335
336I<byte>: a character in the range 0..255
337(A special case of a Perl character.)
338
339=item *
340
341I<octet>: 8 bits of data, with ordinal values 0..255
342(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
343
344=back
4411f3b6 345
67d7b5ef 346The marker [INTERNAL] marks Internal Implementation Details, in
347general meant only for those who think they know what they are doing,
348and such details may change in future releases.
349
350=head1 PERL ENCODING API
4411f3b6 351
352=over 4
353
f2a2953c 354=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6 355
47bfe92f 356Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 357a sequence of octets. ENCODING can be either a canonical name or
358alias. For encoding names and aliases, see L</"Defining Aliases">.
359For CHECK see L</"Handling Malformed Data">.
4411f3b6 360
67d7b5ef 361For example to convert (internally UTF-8 encoded) Unicode string to
362iso-8859-1 (also known as Latin1),
681a7c68 363
67d7b5ef 364 $octets = encode("iso-8859-1", $unicode);
681a7c68 365
f2a2953c 366=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6 367
47bfe92f 368Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef 369internal form and returns the resulting string. as in encode(),
370ENCODING can be either a canonical name or alias. For encoding names
371and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f 372L</"Handling Malformed Data">.
373
1b2c56c8 374For example to convert ISO-8859-1 data to UTF-8:
681a7c68 375
67d7b5ef 376 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 377
f2a2953c 378=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f 379
85982a32 380Convert B<in-place> the data between two encodings.
1b2c56c8 381For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 382
383 from_to($data, "iso-8859-1", "utf-8");
384
385and to convert it back:
386
387 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 388
ab97ca19 389Note that because the conversion happens in place, the data to be
390converted cannot be a string constant, it must be a scalar variable.
391
3ef515df 392from_to() return the length of the converted string on success, undef
393otherwise.
394
4411f3b6 395=back
396
f2a2953c 397=head2 UTF-8 / utf8
398
399The Unicode consortium defines the UTF-8 standard as a way of encoding
400the entire Unicode repertoire as sequences of octets. This encoding is
401expected to become very widespread. Perl can use this form internally
402to represent strings, so conversions to and from this form are
403particularly efficient (as octets in memory do not have to change,
404just the meta-data that tells Perl how to treat them).
405
406=over 4
407
408=item $octets = encode_utf8($string);
409
410The characters that comprise string are encoded in Perl's superset of UTF-8
411and the resulting octets returned as a sequence of bytes. All possible
412characters have a UTF-8 representation so this function cannot fail.
413
414=item $string = decode_utf8($octets [, CHECK]);
415
416The sequence of octets represented by $octets is decoded from UTF-8
417into a sequence of logical characters. Not all sequences of octets
418form valid UTF-8 encodings, so it is possible for this call to fail.
419For CHECK see L</"Handling Malformed Data">.
420
421=back
422
51ef4e11 423=head2 Listing available encodings
424
5129552c 425 use Encode;
426 @list = Encode->encodings();
427
428Returns a list of the canonical names of the available encodings that
429are loaded. To get a list of all available encodings including the
430ones that are not loaded yet, say
431
432 @all_encodings = Encode->encodings(":all");
433
434Or you can give the name of specific module.
435
c731e18e 436 @with_jp = Encode->encodings("Encode::JP");
437
438When "::" is not in the name, "Encode::" is assumed.
51ef4e11 439
c731e18e 440 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 441
a63c962f 442To find which encodings are supported by this package in details,
5d030b67 443see L<Encode::Supported>.
51ef4e11 444
445=head2 Defining Aliases
446
67d7b5ef 447To add new alias to a given encoding, Use;
448
5129552c 449 use Encode;
450 use Encode::Alias;
a63c962f 451 define_alias(newName => ENCODING);
51ef4e11 452
3ef515df 453After that, newName can be used as an alias for ENCODING.
f2a2953c 454ENCODING may be either the name of an encoding or an
455I<encoding object>
51ef4e11 456
fcb875d4 457But before you do so, make sure the alias is nonexistent with
458C<resolve_alias()>, which returns the canonical name thereof.
459i.e.
460
461 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
462 Encode::resolve_alias("iso-8859-12") # false; nonexistent
463 Encode::resolve_alias($name) eq $name # true if $name is canonical
464
465This resolve_alias() does not need C<use Encode::Alias> and is
466exported via C<use encode qw(resolve_alias)>.
467
5d030b67 468See L<Encode::Alias> on details.
51ef4e11 469
85982a32 470=head1 Encoding via PerlIO
4411f3b6 471
85982a32 472If your perl supports I<PerlIO>, you can use PerlIO layer to directly
473decode and encode via filehandle. The following two examples are
474totally identical by functionality.
4411f3b6 475
85982a32 476 # via PerlIO
477 open my $in, "<:encoding(shiftjis)", $infile or die;
478 open my $out, ">:encoding(euc-jp)", $outfile or die;
479 while(<>){ print; }
8e86646e 480
85982a32 481 # via from_to
482 open my $in, $infile or die;
483 open my $out, $outfile or die;
484 while(<>){
485 from_to($_, "shiftjis", "euc", 1);
486 }
4411f3b6 487
85982a32 488Unfortunately, not all encodings are PerlIO-savvy. You can check if
489your encoding is supported by PerlIO by C<perlio_ok> method.
4411f3b6 490
85982a32 491 Encode::perlio_ok("iso-20220jp"); # false
492 find_encoding("iso-2022-jp")->perlio_ok; # false
493 use Encode qw(perlio_ok); # exported upon request
494 perlio_ok("euc-jp") # true if PerlIO is enabled
4411f3b6 495
85982a32 496For gory details, see L<Encode::PerlIO>;
4411f3b6 497
85982a32 498=head1 Handling Malformed Data
4411f3b6 499
85982a32 500=over 4
47bfe92f 501
85982a32 502THE I<CHECK> argument is used as follows. When you omit it, it is
503identical to I<CHECK> = 0.
47bfe92f 504
85982a32 505=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 506
85982a32 507If I<CHECK> is 0, (en|de)code will put I<substitution character> in
508place of the malformed character. for UCM-based encodings,
509E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
510data is supposed to be UTF-8, an optional lexical warning (category
511utf8) is given.
e9692b5b 512
85982a32 513=item I<CHECK> = Encode::DIE_ON_ERROR (== 1)
e9692b5b 514
85982a32 515If I<CHECK> is 1, methods will die immediately with an error
516message. so when I<CHECK> is set, you should trap the fatal error
517with eval{} unless you really want to let it die on error.
47bfe92f 518
85982a32 519=item I<CHECK> = Encode::FB_QUIET
47bfe92f 520
85982a32 521If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
522return proccessed part on error, with data passed via argument
523overwritten with unproccessed part. This is handy when have to
524repeatedly call because the source data is chopped in the middle for
525some reasons, such as fixed-width buffer. Here is a sample code that
526just does this.
4411f3b6 527
85982a32 528 my $data = '';
529 while(defined(read $fh, $buffer, 256)){
530 # buffer may end in partial character so we append
531 $data .= $buffer;
532 $utf8 .= decode($encoding, $data, ENCODE::FB_QUIET);
533 # $data now contains unprocessed partial character
534 }
1768d7eb 535
85982a32 536=item I<CHECK> = Encode::FB_WARN
67d7b5ef 537
85982a32 538This is the same as above, except it warns on error. Handy when you
539are debugging the mode above.
540
541=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
542
543For encodings that are implemented by Encode::XS, CHECK ==
544Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
545
546When you decode, '\xI<XX>' will be placed where I<XX> is the hex
547representation of the octet that could not be decoded to utf8. And
548when you encode, '\x{I<xxxx>}' will be placed where I<xxxx> is the
549Unicode ID of the character that cannot be found in the character
550repartoire of the encoding.
551
552=item The bitmask
553
554These modes are actually set via bitmask. here is how FB_XX are laid
555out. for FB_XX you can import via C<use Encode qw(:fallbacks)> for
556generic bitmask constants, you can import via
557 C<use Encode qw(:fallback_all)>.
558
559 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
560 DIE_ON_ERR 0x0001 X
561 WARN_ON_ERR 0x0002 X
562 RETURN_ON_ERR 0x0004 X X
563 LEAVE_SRC 0x0008
564 PERLQQ 0x0100 X
67d7b5ef 565
85982a32 566=head2 Unemplemented fallback schemes
67d7b5ef 567
f2a2953c 568In future you will be able to use a code reference to a callback
569function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 570
571=head1 Defining Encodings
572
573To define a new encoding, use:
574
575 use Encode qw(define_alias);
576 define_encoding($object, 'canonicalName' [, alias...]);
577
578I<canonicalName> will be associated with I<$object>. The object
579should provide the interface described in L<Encode::Encoding>
580If more than two arguments are provided then additional
581arguments are taken as aliases for I<$object> as for C<define_alias>.
582
f2a2953c 583See L<Encode::Encoding> for more details.
584
4411f3b6 585=head1 Messing with Perl's Internals
586
47bfe92f 587The following API uses parts of Perl's internals in the current
588implementation. As such they are efficient, but may change.
4411f3b6 589
590=over 4
591
a63c962f 592=item is_utf8(STRING [, CHECK])
4411f3b6 593
594[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 595If CHECK is true, also checks the data in STRING for being well-formed
596UTF-8. Returns true if successful, false otherwise.
4411f3b6 597
a63c962f 598=item _utf8_on(STRING)
4411f3b6 599
600[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
601B<not> checked for being well-formed UTF-8. Do not use unless you
602B<know> that the STRING is well-formed UTF-8. Returns the previous
603state of the UTF-8 flag (so please don't test the return value as
604I<not> success or failure), or C<undef> if STRING is not a string.
605
a63c962f 606=item _utf8_off(STRING)
4411f3b6 607
608[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
609Returns the previous state of the UTF-8 flag (so please don't test the
610return value as I<not> success or failure), or C<undef> if STRING is
611not a string.
612
613=back
614
615=head1 SEE ALSO
616
5d030b67 617L<Encode::Encoding>,
618L<Encode::Supported>,
85982a32 619L<Encode::PerlIO>,
5d030b67 620L<encoding>,
621L<perlebcdic>,
622L<perlfunc/open>,
623L<perlunicode>,
624L<utf8>,
625the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 626
85982a32 627=head1 MAINTAINER
aae85ceb 628
629This project was originated by Nick Ing-Simmons and later maintained
630by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for full list
631of people involved. For any questions, use
632E<lt>perl-unicode@perl.orgE<gt> so others can share.
633
4411f3b6 634=cut