left-over "use 5.7.2" in threads.pm
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
3ef515df 3our $VERSION = do { my @r = (q$Revision: 1.11 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 encode
14 decode
15 encode_utf8
16 decode_utf8
17 find_encoding
51ef4e11 18 encodings
4411f3b6 19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
51ef4e11 23 define_encoding
2c674647 24 from_to
25 is_utf8
4411f3b6 26 is_8bit
27 is_16bit
a12c0f56 28 utf8_upgrade
29 utf8_downgrade
4411f3b6 30 _utf8_on
31 _utf8_off
2c674647 32 );
33
34bootstrap Encode ();
35
4411f3b6 36# Documentation moved after __END__ for speed - NI-S
2c674647 37
bf230f3d 38use Carp;
39
a63c962f 40our $ON_EBCDIC = (ord("A") == 193);
5d030b67 41use Encode::Alias;
42
5129552c 43# Make a %Encoding package variable to allow a certain amount of cheating
44our %Encoding;
5345d506 45
# %ExtModule maps an encoding name to the module file that has to be
# loaded (via require) before that encoding becomes available.
# getEncoding() looks names up here in lower-cased form.
our %ExtModule = ();

# Encodings served by Encode/Byte.pm: two irregular names, the ISO-8859
# parts 2-11 and 13-16, Windows code pages 1250-1258 and the Mac sets.
$ExtModule{$_} = 'Encode/Byte.pm'
    for 'viscii',
        'koi8-r',
        map("iso-8859-$_", 2 .. 11, 13 .. 16),
        map("cp$_", 1250 .. 1258),
        map("mac$_", qw(CentralEurRoman Croatian Cyrillic Greek
                        Iceland Roman Rumanian Sami
                        Thai Turkish Ukrainian));

$ExtModule{$_} = 'Encode/EBCDIC.pm' for qw(cp1047 cp37 posix-bc);
$ExtModule{$_} = 'Encode/Symbol.pm' for qw(symbol dingbats);

# The CJK encodings are registered for autoloading only when not
# running in an EBCDIC environment.
unless ($ON_EBCDIC) {
    my %cjk_names_for = (
        'Encode/CN.pm'       => [qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165)],
        'Encode/JP.pm'       => [qw(euc-jp iso-2022-jp iso-2022-jp-1 7bit-jis
                                    shiftjis macjapan cp932)],
        'Encode/KR.pm'       => [qw(euc-kr ksc5601 cp949)],
        'Encode/TW.pm'       => [qw(big5 big5-hkscs cp950)],
        'Encode/HanExtra.pm' => [qw(gb18030 big5plus euc-tw)],
    );
    while (my ($module, $names) = each %cjk_names_for) {
        @ExtModule{@$names} = ($module) x @$names;
    }
}
100
# encodings(CLASS [, ":all" | MODULE_FILE, ...])
#
# Returns the names currently registered in %Encoding, sorted
# case-insensitively, with the 'Internal' entry left out.
#
# Before the list is built, each MODULE_FILE (a path such as
# "Encode/JP.pm") is loaded with require.  If the first argument is
# ":all", every module file listed in %ExtModule is loaded instead and
# any further arguments are ignored.  Load failures are deliberately
# ignored: a module that cannot be loaded simply contributes no names.
sub encodings
{
    my $class = shift;
    my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;

    # %ExtModule maps many encoding names onto the same file, so the list
    # is full of repeats.  Attempt each file once only; otherwise a file
    # that is not installed is searched for (and, with $DEBUG, reported)
    # once per encoding name that refers to it.
    my %seen;
    for my $m (grep { !$seen{$_}++ } @modules)
    {
        $DEBUG and warn "about to require $m;";
        eval { require $m; };
    }

    # Sort on the lower-cased name, but return the names as registered.
    return
        map  { $_->[0] }
        sort { $a->[1] cmp $b->[1] }
        map  { [$_, lc $_] }
        grep { $_ ne 'Internal' } keys %Encoding;
}
116
# define_encoding(OBJ, NAME [, ALIAS, ...])
#
# Registers OBJ in %Encoding under NAME.  When NAME is not already all
# lower case, its lower-cased form is defined as an alias for OBJ, and
# so is every additional ALIAS argument.  Returns OBJ.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc($name);
    if ($lower ne $name)
    {
        define_alias($lower => $obj);
    }

    for my $alias (@aliases)
    {
        define_alias($alias, $obj);
    }

    return $obj;
}
131
# getEncoding(CLASS, NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  The steps, in order:
#   1. NAME is already an object that can('new_sequence'): returned as is.
#   2. %Encoding lookup by NAME, then by lower-cased NAME.
#   3. CLASS->find_alias() on NAME, then on lower-cased NAME.
#   4. Unless SKIP_EXTERNAL is true, the module file listed in %ExtModule
#      for the lower-cased NAME is loaded and %Encoding is consulted again.
# Returns nothing (undef / empty list) when no encoding is found or NAME
# is undefined.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    # An undefined name can never match anything; bail out before lc()
    # and the hash lookups are fed undef.
    return unless defined $name;

    if (ref($name) && $name->can('new_sequence'))
    {
        return $name;
    }

    my $lc = lc $name;
    for my $key ($name, $lc)
    {
        return $Encoding{$key} if exists $Encoding{$key};
    }

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    if (!$skip_external and exists $ExtModule{$lc})
    {
        # A failed load is not an error here; it just means "not found".
        eval{ require $ExtModule{$lc}; };

        # The module was selected through the lower-cased name, so the
        # freshly registered encoding has to be looked up under that
        # spelling as well, otherwise a request such as "EUC-JP" would
        # load the module and still come back empty.
        for my $key ($name, $lc)
        {
            return $Encoding{$key} if exists $Encoding{$key};
        }
    }

    return;
}
164
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Function-style front end to getEncoding(): performs the lookup as a
# class method call on this package and hands back whatever it returns.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}
170
# encode(ENCODING, STRING [, CHECK])
#
# Looks up ENCODING and asks the encoding object to encode STRING.
# Croaks when ENCODING is unknown.  Returns the encoded octets, or undef
# when CHECK is true and the local copy of STRING is non-empty after the
# call (presumably the encoder removes what it has consumed, so leftover
# text means incomplete conversion -- confirm against Encode::Encoding).
sub encode
{
    my ($name,$string,$check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string,$check);
    return ($check && length($string)) ? undef : $octets;
}
180
# decode(ENCODING, OCTETS [, CHECK])
#
# Looks up ENCODING and asks the encoding object to decode OCTETS.
# Croaks when ENCODING is unknown.  Returns the decoded string.  When
# CHECK is true, the caller's second argument is overwritten with the
# local copy of OCTETS as it stands after the decoder has run.
sub decode
{
    my ($name,$octets,$check) = @_;

    my $enc = find_encoding($name);
    croak("Unknown encoding '$name'") if !defined $enc;

    my $string = $enc->decode($octets,$check);
    if ($check)
    {
        # $_[1] aliases the caller's variable.
        $_[1] = $octets;
    }
    return $string;
}
190
# from_to(STRING, FROM_ENCODING, TO_ENCODING [, CHECK])
#
# Converts STRING in place: decodes it with FROM_ENCODING, encodes the
# result with TO_ENCODING and stores that back into the caller's variable.
# Croaks when either encoding is unknown.  Returns the length of the
# converted data, or undef when the result is undefined or when CHECK is
# true and either step leaves its input non-empty.
sub from_to
{
    my ($string,$from,$to,$check) = @_;

    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $chars = $source->decode($string,$check);
    return undef if $check and length($string);

    $string = $target->encode($chars,$check);
    return undef if $check and length($chars);

    # $_[0] aliases the caller's variable; this is the in-place update.
    $_[0] = $string;
    return defined($_[0]) ? length($string) : undef;
}
204
# encode_utf8(STRING)
#
# Returns a copy of STRING converted to UTF-8 octets by utf8::encode().
# The caller's variable is not modified.
sub encode_utf8
{
    my $octets = $_[0];     # work on a copy, not on the caller's scalar
    utf8::encode($octets);
    return $octets;
}
211
# decode_utf8(OCTETS)
#
# Returns a copy of OCTETS decoded from UTF-8 by utf8::decode(), or
# undef when utf8::decode() reports failure.  Only the first argument
# is used.  The caller's variable is not modified.
sub decode_utf8
{
    my $chars = $_[0];      # work on a copy, not on the caller's scalar
    utf8::decode($chars) or return undef;
    return $chars;
}
218
18586f54 219require Encode::Encoding;
220require Encode::XS;
221require Encode::Internal;
222require Encode::Unicode;
223require Encode::utf8;
64ffdd5e 224require Encode::10646_1;
18586f54 225require Encode::ucs2_le;
4411f3b6 226
656753f8 2271;
228
2a936312 229__END__
230
4411f3b6 231=head1 NAME
232
233Encode - character encodings
234
235=head1 SYNOPSIS
236
237 use Encode;
238
67d7b5ef 239
240=head2 Table of Contents
241
Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the PODs below:
246
247 Name Description
248 --------------------------------------------------------
  Encode::Alias      Alias definitions to encodings
250 Encode::Encoding Encode Implementation Base Class
251 Encode::Supported List of Supported Encodings
252 Encode::CN Simplified Chinese Encodings
253 Encode::JP Japanese Encodings
254 Encode::KR Korean Encodings
255 Encode::TW Traditional Chinese Encodings
256 --------------------------------------------------------
257
4411f3b6 258=head1 DESCRIPTION
259
47bfe92f 260The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef 261and the rest of the system. Perl strings are sequences of
262B<characters>.
263
264The repertoire of characters that Perl can represent is at least that
265defined by the Unicode Consortium. On most platforms the ordinal
266values of the characters (as returned by C<ord(ch)>) is the "Unicode
267codepoint" for the character (the exceptions are those platforms where
268the legacy encoding is some variant of EBCDIC rather than a super-set
269of ASCII - see L<perlebcdic>).
270
271Traditionally computer data has been moved around in 8-bit chunks
272often called "bytes". These chunks are also known as "octets" in
273networking standards. Perl is widely used to manipulate data of many
274types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
276numbers, pixels in an image - or just about anything.
277
278When Perl is processing "binary data" the programmer wants Perl to
279process "sequences of bytes". This is not a problem for Perl - as a
280byte has 256 possible values it easily fits in Perl's much larger
281"logical character".
282
283=head2 TERMINOLOGY
4411f3b6 284
67d7b5ef 285=over 4
21938dfa 286
67d7b5ef 287=item *
288
289I<character>: a character in the range 0..(2**32-1) (or more).
290(What Perl's strings are made of.)
291
292=item *
293
294I<byte>: a character in the range 0..255
295(A special case of a Perl character.)
296
297=item *
298
299I<octet>: 8 bits of data, with ordinal values 0..255
300(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
301
302=back
4411f3b6 303
67d7b5ef 304The marker [INTERNAL] marks Internal Implementation Details, in
305general meant only for those who think they know what they are doing,
306and such details may change in future releases.
307
308=head1 PERL ENCODING API
4411f3b6 309
310=over 4
311
a63c962f 312=item $bytes = encode(ENCODING, $string[, CHECK])
4411f3b6 313
47bfe92f 314Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 315a sequence of octets. ENCODING can be either a canonical name or
316alias. For encoding names and aliases, see L</"Defining Aliases">.
317For CHECK see L</"Handling Malformed Data">.
4411f3b6 318
For example, to convert an (internally UTF-8 encoded) Unicode string to
320iso-8859-1 (also known as Latin1),
681a7c68 321
67d7b5ef 322 $octets = encode("iso-8859-1", $unicode);
681a7c68 323
a63c962f 324=item $string = decode(ENCODING, $bytes[, CHECK])
4411f3b6 325
Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
328ENCODING can be either a canonical name or alias. For encoding names
329and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f 330L</"Handling Malformed Data">.
331
1b2c56c8 332For example to convert ISO-8859-1 data to UTF-8:
681a7c68 333
67d7b5ef 334 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 335
3ef515df 336=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
47bfe92f 337
2b106fbe 338Convert B<in-place> the data between two encodings. How did the data
339in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef 340encode() or through PerlIO: See L</"Encoding and IO">.
341For encoding names and aliases, see L</"Defining Aliases">.
342For CHECK see L</"Handling Malformed Data">.
2b106fbe 343
1b2c56c8 344For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 345
346 from_to($data, "iso-8859-1", "utf-8");
347
348and to convert it back:
349
350 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 351
ab97ca19 352Note that because the conversion happens in place, the data to be
353converted cannot be a string constant, it must be a scalar variable.
354
from_to() returns the length of the converted string on success, undef
356otherwise.
357
4411f3b6 358=back
359
51ef4e11 360=head2 Listing available encodings
361
5129552c 362 use Encode;
363 @list = Encode->encodings();
364
365Returns a list of the canonical names of the available encodings that
366are loaded. To get a list of all available encodings including the
367ones that are not loaded yet, say
368
369 @all_encodings = Encode->encodings(":all");
370
Or you can give the name of a specific module.
372
373 @with_jp = Encode->encodings("Encode/JP.pm");
51ef4e11 374
a63c962f 375Note in this case you have to say C<"Encode/JP.pm"> instead of
376C<"Encode::JP">.
5d030b67 377
To find which encodings are supported by this package in detail,
5d030b67 379see L<Encode::Supported>.
51ef4e11 380
67d7b5ef 381
51ef4e11 382=head2 Defining Aliases
383
To add a new alias to a given encoding, use:
385
5129552c 386 use Encode;
387 use Encode::Alias;
a63c962f 388 define_alias(newName => ENCODING);
51ef4e11 389
3ef515df 390After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>.
51ef4e11 393
See L<Encode::Alias> for details.
51ef4e11 395
4411f3b6 396=head1 Encoding and IO
397
398It is very common to want to do encoding transformations when
399reading or writing files, network connections, pipes etc.
47bfe92f 400If Perl is configured to use the new 'perlio' IO system then
4411f3b6 401C<Encode> provides a "layer" (See L<perliol>) which can transform
402data as it is read or written.
403
8e86646e 404Here is how the blind poet would modernise the encoding:
405
42234700 406 use Encode;
8e86646e 407 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
408 open(my $utf8,'>:utf8','iliad.utf8');
409 my @epic = <$iliad>;
410 print $utf8 @epic;
411 close($utf8);
    close($iliad);
4411f3b6 413
414In addition the new IO system can also be configured to read/write
415UTF-8 encoded characters (as noted above this is efficient):
416
e9692b5b 417 open(my $fh,'>:utf8','anything');
418 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 419
420Either of the above forms of "layer" specifications can be made the default
421for a lexical scope with the C<use open ...> pragma. See L<open>.
422
Once a handle is open, its layers can be altered using C<binmode>.
424
47bfe92f 425Without any such configuration, or if Perl itself is built using
4411f3b6 426system's own IO, then write operations assume that file handle accepts
427only I<bytes> and will C<die> if a character larger than 255 is
428written to the handle. When reading, each octet from the handle
429becomes a byte-in-a-character. Note that this default is the same
47bfe92f 430behaviour as bytes-only languages (including Perl before v5.6) would
431have, and is sufficient to handle native 8-bit encodings
432e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
433other encodings and binary data.
434
In other cases it is the program's responsibility to transform
436characters into bytes using the API above before doing writes, and to
437transform the bytes read from a handle into characters before doing
438"character operations" (e.g. C<lc>, C</\W+/>, ...).
439
47bfe92f 440You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 441want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 442(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
443
e9692b5b 444 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
445 open(G, ">:utf8", "data.utf") or die $!;
446 while (<F>) { print G }
447
448 # Could also do "print G <F>" but that would pull
449 # the whole file into memory just to write it out again.
450
451More examples:
47bfe92f 452
e9692b5b 453 open(my $f, "<:encoding(cp1252)")
454 open(my $g, ">:encoding(iso-8859-2)")
455 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 456
457See L<PerlIO> for more information.
4411f3b6 458
1768d7eb 459See also L<encoding> for how to change the default encoding of the
d521382b 460data in your script.
1768d7eb 461
67d7b5ef 462=head1 Handling Malformed Data
463
464If CHECK is not set, C<undef> is returned. If the data is supposed to
465be UTF-8, an optional lexical warning (category utf8) is given. If
466CHECK is true but not a code reference, dies.
467
It would be desirable to have a way to indicate that transform should use
the encoding's "replacement character" - no such mechanism is defined yet.
470
471It is also planned to allow I<CHECK> to be a code reference.
472
473This is not yet implemented as there are design issues with what its
474arguments should be and how it returns its results.
475
476=over 4
477
478=item Scheme 1
479
480Passed remaining fragment of string being processed.
481Modifies it in place to remove bytes/characters it can understand
482and returns a string used to represent them.
483e.g.
484
485 sub fixup {
486 my $ch = substr($_[0],0,1,'');
        return sprintf("\x{%02X}",ord($ch));
488 }
489
490This scheme is close to how underlying C code for Encode works, but gives
491the fixup routine very little context.
492
493=item Scheme 2
494
495Passed original string, and an index into it of the problem area, and
496output string so far. Appends what it will to output string and
497returns new index into original string. For example:
498
499 sub fixup {
500 # my ($s,$i,$d) = @_;
501 my $ch = substr($_[0],$_[1],1);
        $_[2] .= sprintf("\x{%02X}",ord($ch));
503 return $_[1]+1;
504 }
505
506This scheme gives maximal control to the fixup routine but is more
507complicated to code, and may need internals of Encode to be tweaked to
508keep original string intact.
509
510=item Other Schemes
511
512Hybrids of above.
513
514Multiple return values rather than in-place modifications.
515
516Index into the string could be C<pos($str)> allowing C<s/\G...//>.
517
518=back
519
520=head2 UTF-8 / utf8
521
522The Unicode consortium defines the UTF-8 standard as a way of encoding
523the entire Unicode repertoire as sequences of octets. This encoding is
524expected to become very widespread. Perl can use this form internally
525to represent strings, so conversions to and from this form are
526particularly efficient (as octets in memory do not have to change,
527just the meta-data that tells Perl how to treat them).
528
529=over 4
530
531=item $bytes = encode_utf8($string);
532
533The characters that comprise string are encoded in Perl's superset of UTF-8
534and the resulting octets returned as a sequence of bytes. All possible
535characters have a UTF-8 representation so this function cannot fail.
536
537=item $string = decode_utf8($bytes [, CHECK]);
538
539The sequence of octets represented by $bytes is decoded from UTF-8
540into a sequence of logical characters. Not all sequences of octets
541form valid UTF-8 encodings, so it is possible for this call to fail.
542For CHECK see L</"Handling Malformed Data">.
543
544=back
545
546=head1 Defining Encodings
547
548To define a new encoding, use:
549
    use Encode qw(define_encoding);
551 define_encoding($object, 'canonicalName' [, alias...]);
552
553I<canonicalName> will be associated with I<$object>. The object
should provide the interface described in L<Encode::Encoding>.
555If more than two arguments are provided then additional
556arguments are taken as aliases for I<$object> as for C<define_alias>.
557
4411f3b6 558=head1 Messing with Perl's Internals
559
47bfe92f 560The following API uses parts of Perl's internals in the current
561implementation. As such they are efficient, but may change.
4411f3b6 562
563=over 4
564
a63c962f 565=item is_utf8(STRING [, CHECK])
4411f3b6 566
567[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 568If CHECK is true, also checks the data in STRING for being well-formed
569UTF-8. Returns true if successful, false otherwise.
4411f3b6 570
a63c962f 571=item _utf8_on(STRING)
4411f3b6 572
573[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
574B<not> checked for being well-formed UTF-8. Do not use unless you
575B<know> that the STRING is well-formed UTF-8. Returns the previous
576state of the UTF-8 flag (so please don't test the return value as
577I<not> success or failure), or C<undef> if STRING is not a string.
578
a63c962f 579=item _utf8_off(STRING)
4411f3b6 580
581[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
582Returns the previous state of the UTF-8 flag (so please don't test the
583return value as I<not> success or failure), or C<undef> if STRING is
584not a string.
585
586=back
587
588=head1 SEE ALSO
589
5d030b67 590L<Encode::Encoding>,
591L<Encode::Supported>,
592L<PerlIO>,
593L<encoding>,
594L<perlebcdic>,
595L<perlfunc/open>,
596L<perlunicode>,
597L<utf8>,
598the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 599
600=cut