Integrate mainline (Win2k/MinGW all ok except threads/t/end.t)
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
d6b7ef86 3our $VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
2c674647 4
5require DynaLoader;
6require Exporter;
7
51ef4e11 8our @ISA = qw(Exporter DynaLoader);
2c674647 9
4411f3b6 10# Public, encouraged API is exported by default
51ef4e11 11our @EXPORT = qw (
4411f3b6 12 encode
13 decode
14 encode_utf8
15 decode_utf8
16 find_encoding
51ef4e11 17 encodings
4411f3b6 18);
19
51ef4e11 20our @EXPORT_OK =
2c674647 21 qw(
51ef4e11 22 define_encoding
23 define_alias
2c674647 24 from_to
25 is_utf8
4411f3b6 26 is_8bit
27 is_16bit
a12c0f56 28 utf8_upgrade
29 utf8_downgrade
4411f3b6 30 _utf8_on
31 _utf8_off
2c674647 32 );
33
34bootstrap Encode ();
35
4411f3b6 36# Documentation moved after __END__ for speed - NI-S
2c674647 37
bf230f3d 38use Carp;
39
51ef4e11 40# Make a %encoding package variable to allow a certain amount of cheating
41our %encoding;
42my @alias; # ordered matching list
43my %alias; # cached known aliases
f7ac3676 44
6d6a7c8d 45 # 0 1 2 3 4 5 6 7 8 9 10
46our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
47
f7ac3676 48our %winlatin2cp = (
49 'Latin1' => 1252,
50 'Latin2' => 1250,
51 'Cyrillic' => 1251,
f7ac3676 52 'Greek' => 1253,
53 'Turkish' => 1254,
54 'Hebrew' => 1255,
55 'Arabic' => 1256,
56 'Baltic' => 1257,
57 'Vietnamese' => 1258,
58 );
5345d506 59
5463e635 60our %external_tables =
2b217bf7 61 (
62 'euc-cn' => 'Encode/CN.pm',
63 gb2312 => 'Encode/CN.pm',
64 gb12345 => 'Encode/CN.pm',
65 gbk => 'Encode/CN.pm',
66 cp936 => 'Encode/CN.pm',
67 'iso-ir-165' => 'Encode/CN.pm',
68 'euc-jp' => 'Encode/JP.pm',
d6b7ef86 69 'iso-2022-jp' => 'Encode/JP.pm',
70 '7bit-jis' => 'Encode/JP.pm',
2b217bf7 71 shiftjis => 'Encode/JP.pm',
72 macjapan => 'Encode/JP.pm',
73 cp932 => 'Encode/JP.pm',
74 'euc-kr' => 'Encode/KR.pm',
75 ksc5601 => 'Encode/KR.pm',
76 cp949 => 'Encode/KR.pm',
77 big5 => 'Encode/TW.pm',
78 'big5-hkscs' => 'Encode/TW.pm',
79 cp950 => 'Encode/TW.pm',
80 gb18030 => 'Encode/HanExtra.pm',
81 big5plus => 'Encode/HanExtra.pm',
82 'euc-tw' => 'Encode/HanExtra.pm',
83 );
d1ed7747 84
656753f8 85sub encodings
86{
87 my ($class) = @_;
40a073c6 88 return
89 map { $_->[0] }
90 sort { $a->[1] cmp $b->[1] }
91 map { [$_, lc $_] }
92 grep { $_ ne 'Internal' }
93 keys %encoding;
51ef4e11 94}
95
96sub findAlias
97{
18586f54 98 my $class = shift;
99 local $_ = shift;
100 # print "# findAlias $_\n";
101 unless (exists $alias{$_})
656753f8 102 {
18586f54 103 for (my $i=0; $i < @alias; $i += 2)
104 {
105 my $alias = $alias[$i];
106 my $val = $alias[$i+1];
107 my $new;
108 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
109 {
110 $new = eval $val;
111 }
112 elsif (ref($alias) eq 'CODE')
113 {
114 $new = &{$alias}($val)
115 }
116 elsif (lc($_) eq lc($alias))
117 {
118 $new = $val;
119 }
120 if (defined($new))
121 {
122 next if $new eq $_; # avoid (direct) recursion on bugs
123 my $enc = (ref($new)) ? $new : find_encoding($new);
124 if ($enc)
125 {
126 $alias{$_} = $enc;
127 last;
128 }
129 }
130 }
656753f8 131 }
18586f54 132 return $alias{$_};
5345d506 133}
134
51ef4e11 135sub define_alias
5345d506 136{
18586f54 137 while (@_)
138 {
139 my ($alias,$name) = splice(@_,0,2);
140 push(@alias, $alias => $name);
141 }
51ef4e11 142}
143
016cb72c 144# Allow variants of iso-8859-1 etc.
d6089a2a 145define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
016cb72c 146
7faf300d 147# At least HP-UX has these.
148define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
149
f7ac3676 150# More HP stuff.
151define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
152
0b3236bb 153# The Official name of ASCII.
8a361256 154define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
155
58d53262 156# This is a font issue, not an encoding issue.
157# (The currency symbol of the Latin 1 upper half
158# has been redefined as the euro symbol.)
159define_alias( qr/^(.+)\@euro$/i => '"$1"' );
160
016cb72c 161# Allow latin-1 style names as well
7faf300d 162define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
016cb72c 163
f7ac3676 164# Allow winlatin1 style names as well
cf91068f 165define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|baltic|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );
f7ac3676 166
016cb72c 167# Common names for non-latin prefered MIME names
168define_alias( 'ascii' => 'US-ascii',
169 'cyrillic' => 'iso-8859-5',
170 'arabic' => 'iso-8859-6',
171 'greek' => 'iso-8859-7',
f7ac3676 172 'hebrew' => 'iso-8859-8',
173 'thai' => 'iso-8859-11',
174 'tis620' => 'iso-8859-11',
175 );
016cb72c 176
7faf300d 177# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
1853dd5f 178# And Microsoft has their own naming (again, surprisingly).
179define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
180
181# Sometimes seen with a leading zero.
182define_alias( qr/^cp037$/i => '"cp37"');
183
184# Ououououou.
185define_alias( qr/^macRomanian$/i => '"macRumanian"');
7faf300d 186
58d53262 187# Standardize on the dashed versions.
188define_alias( qr/^utf8$/i => 'utf-8' );
7faf300d 189define_alias( qr/^koi8r$/i => 'koi8-r' );
f7ac3676 190define_alias( qr/^koi8u$/i => 'koi8-u' );
191
1853dd5f 192# Seen in some Linuxes.
193define_alias( qr/^ujis$/i => 'euc-jp' );
194
b2729934 195# CP936 doesn't have vendor-addon for GBK, so they're identical.
196define_alias( qr/^gbk$/i => '"cp936"');
197
f7ac3676 198# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
199# TODO: HP-UX '15' encodings japanese15 korean15 roi15
200# TODO: Cyrillic encoding ISO-IR-111 (useful?)
f7ac3676 201# TODO: Armenian encoding ARMSCII-8
202# TODO: Hebrew encoding ISO-8859-8-1
203# TODO: Thai encoding TCVN
204# TODO: Korean encoding Johab
56a543c5 205# TODO: Vietnamese encodings VPS
f7ac3676 206# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
207# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
208# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
209# Kannada Khmer Korean Laotian Malayalam Mongolian
210# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
18586f54 211
1853dd5f 212# Map white space and _ to '-'
016cb72c 213define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
214
51ef4e11 215sub define_encoding
216{
18586f54 217 my $obj = shift;
218 my $name = shift;
219 $encoding{$name} = $obj;
220 my $lc = lc($name);
221 define_alias($lc => $obj) unless $lc eq $name;
222 while (@_)
223 {
224 my $alias = shift;
225 define_alias($alias,$obj);
226 }
227 return $obj;
656753f8 228}
229
656753f8 230sub getEncoding
231{
dd9703c9 232 my ($class,$name,$skip_external) = @_;
18586f54 233 my $enc;
234 if (ref($name) && $name->can('new_sequence'))
235 {
236 return $name;
237 }
238 my $lc = lc $name;
239 if (exists $encoding{$name})
240 {
241 return $encoding{$name};
242 }
243 if (exists $encoding{$lc})
244 {
245 return $encoding{$lc};
246 }
c50d192e 247
248 my $oc = $class->findAlias($name);
249 return $oc if defined $oc;
250
251 $oc = $class->findAlias($lc) if $lc ne $name;
252 return $oc if defined $oc;
253
dd9703c9 254 if (!$skip_external and exists $external_tables{$lc})
d1ed7747 255 {
256 require $external_tables{$lc};
257 return $encoding{$name} if exists $encoding{$name};
258 }
18586f54 259
18586f54 260 return;
656753f8 261}
262
4411f3b6 263sub find_encoding
264{
dd9703c9 265 my ($name,$skip_external) = @_;
266 return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6 267}
268
269sub encode
270{
18586f54 271 my ($name,$string,$check) = @_;
272 my $enc = find_encoding($name);
273 croak("Unknown encoding '$name'") unless defined $enc;
274 my $octets = $enc->encode($string,$check);
275 return undef if ($check && length($string));
276 return $octets;
4411f3b6 277}
278
279sub decode
280{
18586f54 281 my ($name,$octets,$check) = @_;
282 my $enc = find_encoding($name);
283 croak("Unknown encoding '$name'") unless defined $enc;
284 my $string = $enc->decode($octets,$check);
285 $_[1] = $octets if $check;
286 return $string;
4411f3b6 287}
288
289sub from_to
290{
18586f54 291 my ($string,$from,$to,$check) = @_;
292 my $f = find_encoding($from);
293 croak("Unknown encoding '$from'") unless defined $f;
294 my $t = find_encoding($to);
295 croak("Unknown encoding '$to'") unless defined $t;
296 my $uni = $f->decode($string,$check);
297 return undef if ($check && length($string));
298 $string = $t->encode($uni,$check);
299 return undef if ($check && length($uni));
300 return length($_[0] = $string);
4411f3b6 301}
302
303sub encode_utf8
304{
18586f54 305 my ($str) = @_;
306 utf8::encode($str);
307 return $str;
4411f3b6 308}
309
310sub decode_utf8
311{
18586f54 312 my ($str) = @_;
313 return undef unless utf8::decode($str);
314 return $str;
5ad8ef52 315}
316
18586f54 317require Encode::Encoding;
318require Encode::XS;
319require Encode::Internal;
320require Encode::Unicode;
321require Encode::utf8;
322require Encode::iso10646_1;
323require Encode::ucs2_le;
4411f3b6 324
656753f8 3251;
326
2a936312 327__END__
328
4411f3b6 329=head1 NAME
330
331Encode - character encodings
332
333=head1 SYNOPSIS
334
335 use Encode;
336
337=head1 DESCRIPTION
338
47bfe92f 339The C<Encode> module provides the interfaces between Perl's strings
340and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6 341
d6b7ef86 342To find more about character encodings, please consult
343L<Encode::Description> . This document focuses on programming references.
21938dfa 344
4411f3b6 345=head1 PERL ENCODING API
346
347=head2 Generic Encoding Interface
348
349=over 4
350
351=item *
352
353 $bytes = encode(ENCODING, $string[, CHECK])
354
47bfe92f 355Encodes string from Perl's internal form into I<ENCODING> and returns
356a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6 357
681a7c68 358For example to convert (internally UTF-8 encoded) Unicode data
359to octets:
360
361 $octets = encode("utf8", $unicode);
362
4411f3b6 363=item *
364
365 $string = decode(ENCODING, $bytes[, CHECK])
366
47bfe92f 367Decode sequence of octets assumed to be in I<ENCODING> into Perl's
368internal form and returns the resulting string. For CHECK see
369L</"Handling Malformed Data">.
370
d6b7ef86 371For example to convert ISO-8859-1 data to UTF-8:
681a7c68 372
373 $utf8 = decode("latin1", $latin1);
374
47bfe92f 375=item *
376
377 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
378
2b106fbe 379Convert B<in-place> the data between two encodings. How did the data
380in $string originally get to be in FROM_ENCODING? Either using
e9692b5b 381encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe 382see L</"Handling Malformed Data">.
383
d6b7ef86 384For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 385
386 from_to($data, "iso-8859-1", "utf-8");
387
388and to convert it back:
389
390 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 391
ab97ca19 392Note that because the conversion happens in place, the data to be
393converted cannot be a string constant, it must be a scalar variable.
394
4411f3b6 395=back
396
397=head2 Handling Malformed Data
398
399If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f 400be UTF-8, an optional lexical warning (category utf8) is given. If
401CHECK is true but not a code reference, dies.
4411f3b6 402
47bfe92f 403It would desirable to have a way to indicate that transform should use
404the encodings "replacement character" - no such mechanism is defined yet.
4411f3b6 405
406It is also planned to allow I<CHECK> to be a code reference.
407
47bfe92f 408This is not yet implemented as there are design issues with what its
409arguments should be and how it returns its results.
4411f3b6 410
411=over 4
412
413=item Scheme 1
414
415Passed remaining fragment of string being processed.
416Modifies it in place to remove bytes/characters it can understand
417and returns a string used to represent them.
418e.g.
419
420 sub fixup {
421 my $ch = substr($_[0],0,1,'');
422 return sprintf("\x{%02X}",ord($ch);
423 }
424
425This scheme is close to how underlying C code for Encode works, but gives
426the fixup routine very little context.
427
428=item Scheme 2
429
47bfe92f 430Passed original string, and an index into it of the problem area, and
431output string so far. Appends what it will to output string and
432returns new index into original string. For example:
4411f3b6 433
434 sub fixup {
435 # my ($s,$i,$d) = @_;
436 my $ch = substr($_[0],$_[1],1);
437 $_[2] .= sprintf("\x{%02X}",ord($ch);
438 return $_[1]+1;
439 }
440
47bfe92f 441This scheme gives maximal control to the fixup routine but is more
442complicated to code, and may need internals of Encode to be tweaked to
443keep original string intact.
4411f3b6 444
445=item Other Schemes
446
447Hybrids of above.
448
449Multiple return values rather than in-place modifications.
450
451Index into the string could be pos($str) allowing s/\G...//.
452
453=back
454
455=head2 UTF-8 / utf8
456
457The Unicode consortium defines the UTF-8 standard as a way of encoding
47bfe92f 458the entire Unicode repertiore as sequences of octets. This encoding is
459expected to become very widespread. Perl can use this form internaly
460to represent strings, so conversions to and from this form are
461particularly efficient (as octets in memory do not have to change,
462just the meta-data that tells Perl how to treat them).
4411f3b6 463
464=over 4
465
466=item *
467
468 $bytes = encode_utf8($string);
469
47bfe92f 470The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6 471and the resulting octets returned as a sequence of bytes. All possible
472characters have a UTF-8 representation so this function cannot fail.
473
474=item *
475
476 $string = decode_utf8($bytes [,CHECK]);
477
47bfe92f 478The sequence of octets represented by $bytes is decoded from UTF-8
479into a sequence of logical characters. Not all sequences of octets
480form valid UTF-8 encodings, so it is possible for this call to fail.
481For CHECK see L</"Handling Malformed Data">.
4411f3b6 482
483=back
484
51ef4e11 485=head2 Listing available encodings
486
487 use Encode qw(encodings);
488 @list = encodings();
489
490Returns a list of the canonical names of the available encodings.
491
492=head2 Defining Aliases
493
494 use Encode qw(define_alias);
495 define_alias( newName => ENCODING);
496
47bfe92f 497Allows newName to be used as am alias for ENCODING. ENCODING may be
498either the name of an encoding or and encoding object (as above).
51ef4e11 499
500Currently I<newName> can be specified in the following ways:
501
502=over 4
503
504=item As a simple string.
505
506=item As a qr// compiled regular expression, e.g.:
507
508 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
509
47bfe92f 510In this case if I<ENCODING> is not a reference it is C<eval>-ed to
511allow C<$1> etc. to be subsituted. The example is one way to names as
512used in X11 font names to alias the MIME names for the iso-8859-*
d6b7ef86 513family. Note the double quote inside the single quote. If you are
514using regex here, you have to do so or it won't work in this case.
51ef4e11 515
516=item As a code reference, e.g.:
517
518 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
519
520In this case C<$_> will be set to the name that is being looked up and
47bfe92f 521I<ENCODING> is passed to the sub as its first argument. The example
522is another way to names as used in X11 font names to alias the MIME
523names for the iso-8859-* family.
51ef4e11 524
525=back
526
d6b7ef86 527=head1 Defining Encodings
51ef4e11 528
e9692b5b 529 use Encode qw(define_alias);
530 define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11 531
47bfe92f 532Causes I<canonicalName> to be associated with I<$object>. The object
d6b7ef86 533should provide the interface described in L<Encode::Encoding>
47bfe92f 534below. If more than two arguments are provided then additional
535arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11 536
4411f3b6 537=head1 Encoding and IO
538
539It is very common to want to do encoding transformations when
540reading or writing files, network connections, pipes etc.
47bfe92f 541If Perl is configured to use the new 'perlio' IO system then
4411f3b6 542C<Encode> provides a "layer" (See L<perliol>) which can transform
543data as it is read or written.
544
8e86646e 545Here is how the blind poet would modernise the encoding:
546
42234700 547 use Encode;
8e86646e 548 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
549 open(my $utf8,'>:utf8','iliad.utf8');
550 my @epic = <$iliad>;
551 print $utf8 @epic;
552 close($utf8);
553 close($illiad);
4411f3b6 554
555In addition the new IO system can also be configured to read/write
556UTF-8 encoded characters (as noted above this is efficient):
557
e9692b5b 558 open(my $fh,'>:utf8','anything');
559 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 560
561Either of the above forms of "layer" specifications can be made the default
562for a lexical scope with the C<use open ...> pragma. See L<open>.
563
564Once a handle is open is layers can be altered using C<binmode>.
565
47bfe92f 566Without any such configuration, or if Perl itself is built using
4411f3b6 567system's own IO, then write operations assume that file handle accepts
568only I<bytes> and will C<die> if a character larger than 255 is
569written to the handle. When reading, each octet from the handle
570becomes a byte-in-a-character. Note that this default is the same
47bfe92f 571behaviour as bytes-only languages (including Perl before v5.6) would
572have, and is sufficient to handle native 8-bit encodings
573e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
574other encodings and binary data.
575
576In other cases it is the programs responsibility to transform
577characters into bytes using the API above before doing writes, and to
578transform the bytes read from a handle into characters before doing
579"character operations" (e.g. C<lc>, C</\W+/>, ...).
580
47bfe92f 581You can also use PerlIO to convert larger amounts of data you don't
d6b7ef86 582want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 583(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
584
e9692b5b 585 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
586 open(G, ">:utf8", "data.utf") or die $!;
587 while (<F>) { print G }
588
589 # Could also do "print G <F>" but that would pull
590 # the whole file into memory just to write it out again.
591
592More examples:
47bfe92f 593
e9692b5b 594 open(my $f, "<:encoding(cp1252)")
595 open(my $g, ">:encoding(iso-8859-2)")
596 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 597
598See L<PerlIO> for more information.
4411f3b6 599
1768d7eb 600See also L<encoding> for how to change the default encoding of the
d521382b 601data in your script.
1768d7eb 602
4411f3b6 603=head1 Messing with Perl's Internals
604
47bfe92f 605The following API uses parts of Perl's internals in the current
606implementation. As such they are efficient, but may change.
4411f3b6 607
608=over 4
609
4411f3b6 610=item * is_utf8(STRING [, CHECK])
611
612[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 613If CHECK is true, also checks the data in STRING for being well-formed
614UTF-8. Returns true if successful, false otherwise.
4411f3b6 615
4411f3b6 616=item *
617
618 _utf8_on(STRING)
619
620[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
621B<not> checked for being well-formed UTF-8. Do not use unless you
622B<know> that the STRING is well-formed UTF-8. Returns the previous
623state of the UTF-8 flag (so please don't test the return value as
624I<not> success or failure), or C<undef> if STRING is not a string.
625
626=item *
627
628 _utf8_off(STRING)
629
630[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
631Returns the previous state of the UTF-8 flag (so please don't test the
632return value as I<not> success or failure), or C<undef> if STRING is
633not a string.
634
635=back
636
637=head1 SEE ALSO
638
5463e635 639L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>,
d6b7ef86 640L<utf8>, L<Encode::Description>, L<Encode::Encoding> the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
5463e635 641
4411f3b6 642
643=cut
644