[Encode] 1.40 released!
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
aae85ceb 3our $VERSION = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 decode
4411f3b6 14 decode_utf8
fcb875d4 15 encode
16 encode_utf8
51ef4e11 17 encodings
fcb875d4 18 find_encoding
4411f3b6 19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
fcb875d4 23 _utf8_off
24 _utf8_on
51ef4e11 25 define_encoding
2c674647 26 from_to
4411f3b6 27 is_16bit
fcb875d4 28 is_8bit
29 is_utf8
30 resolve_alias
a12c0f56 31 utf8_downgrade
fcb875d4 32 utf8_upgrade
2c674647 33 );
34
35bootstrap Encode ();
36
4411f3b6 37# Documentation moved after __END__ for speed - NI-S
2c674647 38
bf230f3d 39use Carp;
40
a63c962f 41our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 42
5d030b67 43use Encode::Alias;
44
5129552c 45# Make a %Encoding package variable to allow a certain amount of cheating
46our %Encoding;
aae85ceb 47our %ExtModule;
48require Encode::Config;
49eval { require Encode::ConfigLocal };
5129552c 50
# Encode->encodings([":all" | MODULE, ...])
#
# Class method: the first argument (the invocant) is discarded.
#
# Optionally loads encoding modules, then lists what is registered:
#   * ":all" as first argument - try to load every module listed in
#     %ExtModule;
#   * otherwise each argument is taken as a module name; a name without
#     "::" gets "Encode/" prepended (so "JP" means Encode::JP).
# Loading is best-effort: a module that fails to load is skipped (the
# failure is reported only when $DEBUG is true).
#
# Returns the keys of %Encoding, sorted case-insensitively, leaving out
# the entries named "Internal" and "Unicode".
sub encodings
{
    my $class = shift;
    my @requested = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;

    # Several encodings can map to the same module; try each file once.
    my %attempted;
    for my $module (@requested){
        my $file = $module;     # work on a copy, leave @requested intact
        $file =~ s,::,/,g or $file = "Encode/$file";
        $file .= '.pm';
        next if $attempted{$file}++;
        $DEBUG and warn "about to require $file;";
        unless (eval { require $file; 1 }) {
            $DEBUG and warn "could not load $file: $@";
        }
    }
    return
        sort { lc $a cmp lc $b }
             grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
}
66
# define_encoding(OBJECT, NAME [, ALIAS, ...])
#
# Registers OBJECT in %Encoding under NAME.  When NAME is not already
# all lower case, its lower-cased form is additionally registered as an
# alias via define_alias(), as is every extra ALIAS argument.
# Returns OBJECT.
sub define_encoding
{
    my ($encoding, $canonical, @aliases) = @_;

    $Encoding{$canonical} = $encoding;

    my $lowered = lc($canonical);
    if ($lowered ne $canonical) {
        define_alias($lowered, $encoding);
    }

    foreach my $alias (@aliases) {
        define_alias($alias, $encoding);
    }

    return $encoding;
}
81
# CLASS->getEncoding(NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  Lookup order:
#   1. NAME itself, when it is an object that can('new_sequence');
#   2. %Encoding under NAME, then under lc(NAME);
#   3. CLASS->find_alias() for NAME, then for lc(NAME)
#      (find_alias is not defined in this file - presumably supplied by
#      Encode::Alias; confirm);
#   4. unless SKIP_EXTERNAL is true, the module registered in %ExtModule
#      for NAME or lc(NAME) is loaded and %Encoding is consulted again.
#
# Returns the encoding object, or an empty list (undef in scalar
# context) when NAME cannot be resolved.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    # An undefined NAME cannot identify an encoding; stop before it is
    # used as a hash key or handed to the alias lookup.
    return unless defined $name;

    if (ref($name))
    {
        # can() is wrapped in eval so that an unblessed reference counts
        # as "not an encoding object" instead of raising a fatal error.
        my $is_encoding = do { local $@; eval { $name->can('new_sequence') } };
        return $name if $is_encoding;
    }

    my $lc = lc $name;
    return $Encoding{$name} if exists $Encoding{$name};
    return $Encoding{$lc}   if exists $Encoding{$lc};

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            # Best effort: a module that fails to load simply leaves
            # %Encoding unchanged.
            eval{ require $mod; };
            # The module may have been found through the lower-cased
            # name, so look the result up under both spellings.
            return $Encoding{$name} if exists $Encoding{$name};
            return $Encoding{$lc}   if exists $Encoding{$lc};
        }
    }
    return;
}
116
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Function-call front end for getEncoding(): forwards both arguments,
# using this package as the invocant, and returns whatever it returns.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}
122
# resolve_alias(NAME)
#
# Looks NAME up with find_encoding() and returns the result of calling
# ->name on the encoding found; returns nothing when the lookup fails.
sub resolve_alias {
    my $encoding = find_encoding(shift);
    return unless defined $encoding;
    return $encoding->name;
}
128
# encode(ENCODING, STRING [, CHECK])
#
# Looks ENCODING up with find_encoding() and calls its ->encode method
# on a copy of STRING.  Croaks when ENCODING is unknown.  When CHECK is
# true and the copy is still non-empty after the call, undef is
# returned; otherwise the value produced by ->encode is returned.
sub encode
{
    my ($name,$string,$check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string,$check);

    # Anything left in $string under CHECK means the call did not
    # consume all of the input.
    return ($check && length($string)) ? undef : $octets;
}
138
# decode(ENCODING, OCTETS [, CHECK])
#
# Looks ENCODING up with find_encoding() and calls its ->decode method
# on a copy of OCTETS.  Croaks when ENCODING is unknown.  When CHECK is
# true, the caller's OCTETS argument is overwritten with whatever the
# copy holds after the call.  Returns the value produced by ->decode.
sub decode
{
    my ($name,$octets,$check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets,$check);
    if ($check) {
        # Write the (possibly shortened) copy back to the caller.
        $_[1] = $octets;
    }
    return $string;
}
148
# from_to(STRING, FROM_ENCODING, TO_ENCODING [, CHECK])
#
# Converts STRING in place: it is decoded with FROM_ENCODING, the result
# is encoded with TO_ENCODING, and the caller's variable is overwritten
# with the outcome.  Croaks when either encoding is unknown.
# Returns the length of the converted data, or undef when the result is
# undefined or when, under a true CHECK, either step leaves input
# behind (in which case the caller's variable is left untouched).
sub from_to
{
    my ($data,$from,$to,$check) = @_;

    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $chars = $source->decode($data,$check);
    if ($check && length($data)) {
        return undef;
    }

    $data = $target->encode($chars,$check);
    if ($check && length($chars)) {
        return undef;
    }

    # Assigning through $_[0] is what makes the conversion "in place".
    return defined($_[0] = $data) ? length($data) : undef ;
}
162
# encode_utf8(STRING)
#
# Returns a copy of STRING processed by utf8::encode(); the caller's
# variable is not modified.
sub encode_utf8
{
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}
169
# decode_utf8(OCTETS)
#
# Returns a copy of OCTETS processed by utf8::decode(), or undef when
# utf8::decode() reports failure.  Only the first argument is used.
sub decode_utf8
{
    my $chars = shift;
    utf8::decode($chars) or return undef;
    return $chars;
}
176
f2a2953c 177predefine_encodings();
178
#
# predefine_encodings()
#
# This is to restore %Encoding if really needed.
#
# Installs the encodings that are implemented directly in this file and
# stores one object for each in %Encode::Encoding:
#   * key "Unicode": an Encode::UTF_EBCDIC object when $ON_EBCDIC is
#     true, otherwise an Encode::Internal object;
#   * key "utf8": an Encode::utf8 object.
# Every such class gets name, new_sequence, decode and encode methods.
# decode/encode take (OBJECT, DATA, CHECK); where they succeed with a
# true CHECK they empty the caller's DATA argument.
#
sub predefine_encodings{
    if ($ON_EBCDIC) {
        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        *name         = sub { my $self = shift; $self->{'Name'} };
        *new_sequence = sub { $_[0] };
        # decode: map every character through utf8::unicode_to_native().
        *decode = sub {
            my ($self,$text,$check) = @_;
            my $out = join '',
                map { chr(utf8::unicode_to_native(ord($_))) }
                split //, $text;
            $_[1] = '' if $check;
            return $out;
        };
        # encode: the reverse mapping, through utf8::native_to_unicode().
        *encode = sub {
            my ($self,$text,$check) = @_;
            my $out = join '',
                map { chr(utf8::native_to_unicode(ord($_))) }
                split //, $text;
            $_[1] = '' if $check;
            return $out;
        };
        $Encode::Encoding{Unicode} =
            bless({ Name => "UTF_EBCDIC" }, "Encode::UTF_EBCDIC");
    }
    else {
        # was in Encode::Internal
        package Encode::Internal;
        *name         = sub { my $self = shift; $self->{'Name'} };
        *new_sequence = sub { $_[0] };
        # One routine serves both directions: it hands back a copy of the
        # data after utf8::upgrade().
        my $upgrade = sub {
            my ($self,$text,$check) = @_;
            utf8::upgrade($text);
            $_[1] = '' if $check;
            return $text;
        };
        *decode = $upgrade;
        *encode = $upgrade;
        $Encode::Encoding{Unicode} =
            bless({ Name => "Internal" }, "Encode::Internal");
    }

    {
        # was in Encode::utf8
        package Encode::utf8;
        *name         = sub { my $self = shift; $self->{'Name'} };
        *new_sequence = sub { $_[0] };
        *decode = sub {
            my ($self,$octets,$check) = @_;
            my $chars = Encode::decode_utf8($octets);
            # On failure the caller's argument is left as it was.
            return undef unless defined $chars;
            $_[1] = '' if $check;
            return $chars;
        };
        *encode = sub {
            my ($self,$chars,$check) = @_;
            my $octets = Encode::encode_utf8($chars);
            $_[1] = '' if $check;
            return $octets;
        };
        $Encode::Encoding{utf8} =
            bless({ Name => "utf8" }, "Encode::utf8");
    }
}
250
18586f54 251require Encode::Encoding;
252require Encode::XS;
4411f3b6 253
656753f8 2541;
255
2a936312 256__END__
257
4411f3b6 258=head1 NAME
259
260Encode - character encodings
261
262=head1 SYNOPSIS
263
264 use Encode;
265
67d7b5ef 266
267=head2 Table of Contents
268
Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the PODs below:
273
  Name                        Description
  --------------------------------------------------------
  Encode::Alias               Alias definitions to encodings
  Encode::Encoding            Encode Implementation Base Class
  Encode::Supported           List of Supported Encodings
  Encode::CN                  Simplified Chinese Encodings
  Encode::JP                  Japanese Encodings
  Encode::KR                  Korean Encodings
  Encode::TW                  Traditional Chinese Encodings
  --------------------------------------------------------
284
4411f3b6 285=head1 DESCRIPTION
286
47bfe92f 287The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 288and the rest of the system. Perl strings are sequences of
289B<characters>.
290
The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium.  On most platforms the ordinal
values of the characters (as returned by C<ord(ch)>) are the "Unicode
codepoints" for the characters (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).
297
Traditionally computer data has been moved around in 8-bit chunks
often called "bytes".  These chunks are also known as "octets" in
networking standards.  Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.
304
305When Perl is processing "binary data" the programmer wants Perl to
306process "sequences of bytes". This is not a problem for Perl - as a
307byte has 256 possible values it easily fits in Perl's much larger
308"logical character".
309
310=head2 TERMINOLOGY
4411f3b6 311
67d7b5ef 312=over 4
21938dfa 313
67d7b5ef 314=item *
315
316I<character>: a character in the range 0..(2**32-1) (or more).
317(What Perl's strings are made of.)
318
319=item *
320
321I<byte>: a character in the range 0..255
322(A special case of a Perl character.)
323
324=item *
325
326I<octet>: 8 bits of data, with ordinal values 0..255
327(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
328
329=back
4411f3b6 330
67d7b5ef 331The marker [INTERNAL] marks Internal Implementation Details, in
332general meant only for those who think they know what they are doing,
333and such details may change in future releases.
334
335=head1 PERL ENCODING API
4411f3b6 336
337=over 4
338
f2a2953c 339=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6 340
47bfe92f 341Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 342a sequence of octets. ENCODING can be either a canonical name or
343alias. For encoding names and aliases, see L</"Defining Aliases">.
344For CHECK see L</"Handling Malformed Data">.
4411f3b6 345
67d7b5ef 346For example to convert (internally UTF-8 encoded) Unicode string to
347iso-8859-1 (also known as Latin1),
681a7c68 348
67d7b5ef 349 $octets = encode("iso-8859-1", $unicode);
681a7c68 350
f2a2953c 351=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6 352
Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
ENCODING can be either a canonical name or alias.  For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK see
L</"Handling Malformed Data">.
358
1b2c56c8 359For example to convert ISO-8859-1 data to UTF-8:
681a7c68 360
67d7b5ef 361 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 362
f2a2953c 363=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f 364
2b106fbe 365Convert B<in-place> the data between two encodings. How did the data
366in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef 367encode() or through PerlIO: See L</"Encoding and IO">.
368For encoding names and aliases, see L</"Defining Aliases">.
369For CHECK see L</"Handling Malformed Data">.
2b106fbe 370
1b2c56c8 371For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 372
373 from_to($data, "iso-8859-1", "utf-8");
374
375and to convert it back:
376
377 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 378
ab97ca19 379Note that because the conversion happens in place, the data to be
380converted cannot be a string constant, it must be a scalar variable.
381
from_to() returns the length of the converted string on success, undef
otherwise.
384
4411f3b6 385=back
386
f2a2953c 387=head2 UTF-8 / utf8
388
389The Unicode consortium defines the UTF-8 standard as a way of encoding
390the entire Unicode repertoire as sequences of octets. This encoding is
391expected to become very widespread. Perl can use this form internally
392to represent strings, so conversions to and from this form are
393particularly efficient (as octets in memory do not have to change,
394just the meta-data that tells Perl how to treat them).
395
396=over 4
397
398=item $octets = encode_utf8($string);
399
400The characters that comprise string are encoded in Perl's superset of UTF-8
401and the resulting octets returned as a sequence of bytes. All possible
402characters have a UTF-8 representation so this function cannot fail.
403
404=item $string = decode_utf8($octets [, CHECK]);
405
406The sequence of octets represented by $octets is decoded from UTF-8
407into a sequence of logical characters. Not all sequences of octets
408form valid UTF-8 encodings, so it is possible for this call to fail.
409For CHECK see L</"Handling Malformed Data">.
410
411=back
412
51ef4e11 413=head2 Listing available encodings
414
5129552c 415 use Encode;
416 @list = Encode->encodings();
417
418Returns a list of the canonical names of the available encodings that
419are loaded. To get a list of all available encodings including the
420ones that are not loaded yet, say
421
422 @all_encodings = Encode->encodings(":all");
423
Or you can give the name of a specific module.
425
c731e18e 426 @with_jp = Encode->encodings("Encode::JP");
427
428When "::" is not in the name, "Encode::" is assumed.
51ef4e11 429
c731e18e 430 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 431
To find which encodings are supported by this package in detail,
see L<Encode::Supported>.
51ef4e11 434
435=head2 Defining Aliases
436
To add a new alias to a given encoding, use:
438
5129552c 439 use Encode;
440 use Encode::Alias;
a63c962f 441 define_alias(newName => ENCODING);
51ef4e11 442
After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>.
51ef4e11 446
fcb875d4 447But before you do so, make sure the alias is nonexistent with
448C<resolve_alias()>, which returns the canonical name thereof.
449i.e.
450
451 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
452 Encode::resolve_alias("iso-8859-12") # false; nonexistent
453 Encode::resolve_alias($name) eq $name # true if $name is canonical
454
This resolve_alias() does not need C<use Encode::Alias> and is
exported via C<use Encode qw(resolve_alias)>.
457
See L<Encode::Alias> for details.
51ef4e11 459
4411f3b6 460=head1 Encoding and IO
461
462It is very common to want to do encoding transformations when
463reading or writing files, network connections, pipes etc.
47bfe92f 464If Perl is configured to use the new 'perlio' IO system then
4411f3b6 465C<Encode> provides a "layer" (See L<perliol>) which can transform
466data as it is read or written.
467
8e86646e 468Here is how the blind poet would modernise the encoding:
469
    use Encode;
    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
    open(my $utf8,'>:utf8','iliad.utf8');
    my @epic = <$iliad>;
    print $utf8 @epic;
    close($utf8);
    close($iliad);
4411f3b6 477
478In addition the new IO system can also be configured to read/write
479UTF-8 encoded characters (as noted above this is efficient):
480
e9692b5b 481 open(my $fh,'>:utf8','anything');
482 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 483
484Either of the above forms of "layer" specifications can be made the default
485for a lexical scope with the C<use open ...> pragma. See L<open>.
486
Once a handle is open, its layers can be altered using C<binmode>.
488
47bfe92f 489Without any such configuration, or if Perl itself is built using
4411f3b6 490system's own IO, then write operations assume that file handle accepts
491only I<bytes> and will C<die> if a character larger than 255 is
492written to the handle. When reading, each octet from the handle
493becomes a byte-in-a-character. Note that this default is the same
47bfe92f 494behaviour as bytes-only languages (including Perl before v5.6) would
495have, and is sufficient to handle native 8-bit encodings
496e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
497other encodings and binary data.
498
In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).
503
47bfe92f 504You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 505want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 506(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
507
e9692b5b 508 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
509 open(G, ">:utf8", "data.utf") or die $!;
510 while (<F>) { print G }
511
512 # Could also do "print G <F>" but that would pull
513 # the whole file into memory just to write it out again.
514
515More examples:
47bfe92f 516
e9692b5b 517 open(my $f, "<:encoding(cp1252)")
518 open(my $g, ">:encoding(iso-8859-2)")
519 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 520
521See L<PerlIO> for more information.
4411f3b6 522
1768d7eb 523See also L<encoding> for how to change the default encoding of the
d521382b 524data in your script.
1768d7eb 525
67d7b5ef 526=head1 Handling Malformed Data
527
If I<CHECK> is not set, (en|de)code will put a I<substitution character> in
place of the malformed character.  For UCM-based encodings,
E<lt>subcharE<gt> will be used.  For Unicode, \xFFFD is used.  If the
data is supposed to be UTF-8, an optional lexical warning (category
utf8) is given.
67d7b5ef 533
f2a2953c 534If I<CHECK> is true but not a code reference, dies with an error message.
67d7b5ef 535
f2a2953c 536In future you will be able to use a code reference to a callback
537function for the value of I<CHECK> but its API is still undecided.
67d7b5ef 538
539=head1 Defining Encodings
540
541To define a new encoding, use:
542
    use Encode qw(define_encoding);
    define_encoding($object, 'canonicalName' [, alias...]);
545
I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object> as for C<define_alias>.
550
f2a2953c 551See L<Encode::Encoding> for more details.
552
4411f3b6 553=head1 Messing with Perl's Internals
554
47bfe92f 555The following API uses parts of Perl's internals in the current
556implementation. As such they are efficient, but may change.
4411f3b6 557
558=over 4
559
a63c962f 560=item is_utf8(STRING [, CHECK])
4411f3b6 561
562[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 563If CHECK is true, also checks the data in STRING for being well-formed
564UTF-8. Returns true if successful, false otherwise.
4411f3b6 565
a63c962f 566=item _utf8_on(STRING)
4411f3b6 567
568[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
569B<not> checked for being well-formed UTF-8. Do not use unless you
570B<know> that the STRING is well-formed UTF-8. Returns the previous
571state of the UTF-8 flag (so please don't test the return value as
572I<not> success or failure), or C<undef> if STRING is not a string.
573
a63c962f 574=item _utf8_off(STRING)
4411f3b6 575
576[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
577Returns the previous state of the UTF-8 flag (so please don't test the
578return value as I<not> success or failure), or C<undef> if STRING is
579not a string.
580
581=back
582
583=head1 SEE ALSO
584
5d030b67 585L<Encode::Encoding>,
586L<Encode::Supported>,
587L<PerlIO>,
588L<encoding>,
589L<perlebcdic>,
590L<perlfunc/open>,
591L<perlunicode>,
592L<utf8>,
593the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 594
=head2 MAINTAINER
596
597This project was originated by Nick Ing-Simmons and later maintained
598by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for full list
599of people involved. For any questions, use
600E<lt>perl-unicode@perl.orgE<gt> so others can share.
601
4411f3b6 602=cut