Integrate perlio:
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
071db25d 3our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 encode
14 decode
15 encode_utf8
16 decode_utf8
17 find_encoding
51ef4e11 18 encodings
4411f3b6 19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
51ef4e11 23 define_encoding
2c674647 24 from_to
25 is_utf8
4411f3b6 26 is_8bit
27 is_16bit
a12c0f56 28 utf8_upgrade
29 utf8_downgrade
4411f3b6 30 _utf8_on
31 _utf8_off
2c674647 32 );
33
34bootstrap Encode ();
35
4411f3b6 36# Documentation moved after __END__ for speed - NI-S
2c674647 37
bf230f3d 38use Carp;
39
5d030b67 40use Encode::Alias;
41
5129552c 42# Make a %Encoding package variable to allow a certain amount of cheating
43our %Encoding;
5345d506 44
5129552c 45our %ExtModule =
2b217bf7 46 (
5129552c 47 viscii => 'Encode/Byte.pm',
48 'koi8-r' => 'Encode/Byte.pm',
49 cp1047 => 'Encode/EBCDIC.pm',
50 cp37 => 'Encode/EBCDIC.pm',
51 'posix-bc' => 'Encode/EBCDIC.pm',
52 symbol => 'Encode/Symbol.pm',
53 dingbats => 'Encode/Symbol.pm',
54 'euc-cn' => 'Encode/CN.pm',
55 gb2312 => 'Encode/CN.pm',
56 gb12345 => 'Encode/CN.pm',
57 gbk => 'Encode/CN.pm',
58 cp936 => 'Encode/CN.pm',
59 'iso-ir-165' => 'Encode/CN.pm',
60 'euc-jp' => 'Encode/JP.pm',
61 'iso-2022-jp' => 'Encode/JP.pm',
62 '7bit-jis' => 'Encode/JP.pm',
63 shiftjis => 'Encode/JP.pm',
64 macjapan => 'Encode/JP.pm',
65 cp932 => 'Encode/JP.pm',
66 'euc-kr' => 'Encode/KR.pm',
67 ksc5601 => 'Encode/KR.pm',
68 cp949 => 'Encode/KR.pm',
69 big5 => 'Encode/TW.pm',
70 'big5-hkscs' => 'Encode/TW.pm',
71 cp950 => 'Encode/TW.pm',
72 gb18030 => 'Encode/HanExtra.pm',
73 big5plus => 'Encode/HanExtra.pm',
74 'euc-tw' => 'Encode/HanExtra.pm',
2b217bf7 75 );
d1ed7747 76
5129552c 77for my $k (2..11,13..16){
78 $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
79}
80
81for my $k (1250..1258){
82 $ExtModule{"cp$k"} = 'Encode/Byte.pm';
83}
84
85for my $k (qw(centeuro croatian cyrillic dingbats greek
86 iceland roman rumanian sami
87 thai turkish ukraine))
88{
89 $ExtModule{"mac$k"} = 'Encode/Byte.pm';
90}
91
92
656753f8 93sub encodings
94{
5129552c 95 my $class = shift;
071db25d 96 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
5129552c 97 for my $m (@modules)
98 {
99 $DEBUG and warn "about to require $m;";
100 eval { require $m; };
101 }
102 return
103 map({$_->[0]}
104 sort({$a->[1] cmp $b->[1]}
105 map({[$_, lc $_]}
106 grep({ $_ ne 'Internal' } keys %Encoding))));
51ef4e11 107}
108
51ef4e11 109sub define_encoding
110{
18586f54 111 my $obj = shift;
112 my $name = shift;
5129552c 113 $Encoding{$name} = $obj;
18586f54 114 my $lc = lc($name);
115 define_alias($lc => $obj) unless $lc eq $name;
116 while (@_)
117 {
118 my $alias = shift;
119 define_alias($alias,$obj);
120 }
121 return $obj;
656753f8 122}
123
656753f8 124sub getEncoding
125{
dd9703c9 126 my ($class,$name,$skip_external) = @_;
18586f54 127 my $enc;
128 if (ref($name) && $name->can('new_sequence'))
129 {
130 return $name;
131 }
132 my $lc = lc $name;
5129552c 133 if (exists $Encoding{$name})
18586f54 134 {
5129552c 135 return $Encoding{$name};
18586f54 136 }
5129552c 137 if (exists $Encoding{$lc})
18586f54 138 {
5129552c 139 return $Encoding{$lc};
18586f54 140 }
c50d192e 141
5129552c 142 my $oc = $class->find_alias($name);
c50d192e 143 return $oc if defined $oc;
144
5129552c 145 $oc = $class->find_alias($lc) if $lc ne $name;
c50d192e 146 return $oc if defined $oc;
147
5129552c 148 if (!$skip_external and exists $ExtModule{$lc})
d1ed7747 149 {
5129552c 150 eval{ require $ExtModule{$lc}; };
151 return $Encoding{$name} if exists $Encoding{$name};
d1ed7747 152 }
18586f54 153
18586f54 154 return;
656753f8 155}
156
4411f3b6 157sub find_encoding
158{
dd9703c9 159 my ($name,$skip_external) = @_;
160 return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6 161}
162
163sub encode
164{
18586f54 165 my ($name,$string,$check) = @_;
166 my $enc = find_encoding($name);
167 croak("Unknown encoding '$name'") unless defined $enc;
168 my $octets = $enc->encode($string,$check);
169 return undef if ($check && length($string));
170 return $octets;
4411f3b6 171}
172
173sub decode
174{
18586f54 175 my ($name,$octets,$check) = @_;
176 my $enc = find_encoding($name);
177 croak("Unknown encoding '$name'") unless defined $enc;
178 my $string = $enc->decode($octets,$check);
179 $_[1] = $octets if $check;
180 return $string;
4411f3b6 181}
182
183sub from_to
184{
18586f54 185 my ($string,$from,$to,$check) = @_;
186 my $f = find_encoding($from);
187 croak("Unknown encoding '$from'") unless defined $f;
188 my $t = find_encoding($to);
189 croak("Unknown encoding '$to'") unless defined $t;
190 my $uni = $f->decode($string,$check);
191 return undef if ($check && length($string));
192 $string = $t->encode($uni,$check);
193 return undef if ($check && length($uni));
194 return length($_[0] = $string);
4411f3b6 195}
196
197sub encode_utf8
198{
18586f54 199 my ($str) = @_;
200 utf8::encode($str);
201 return $str;
4411f3b6 202}
203
204sub decode_utf8
205{
18586f54 206 my ($str) = @_;
207 return undef unless utf8::decode($str);
208 return $str;
5ad8ef52 209}
210
18586f54 211require Encode::Encoding;
212require Encode::XS;
213require Encode::Internal;
214require Encode::Unicode;
215require Encode::utf8;
216require Encode::iso10646_1;
217require Encode::ucs2_le;
4411f3b6 218
656753f8 2191;
220
2a936312 221__END__
222
4411f3b6 223=head1 NAME
224
225Encode - character encodings
226
227=head1 SYNOPSIS
228
229 use Encode;
230
231=head1 DESCRIPTION
232
47bfe92f 233The C<Encode> module provides the interfaces between Perl's strings
234and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6 235
1b2c56c8 236To find more about character encodings, please consult
5d030b67 237L<Encode::Details> . This document focuses on programming references.
21938dfa 238
4411f3b6 239=head1 PERL ENCODING API
240
241=head2 Generic Encoding Interface
242
243=over 4
244
245=item *
246
247 $bytes = encode(ENCODING, $string[, CHECK])
248
47bfe92f 249Encodes string from Perl's internal form into I<ENCODING> and returns
250a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6 251
681a7c68 252For example to convert (internally UTF-8 encoded) Unicode data
253to octets:
254
255 $octets = encode("utf8", $unicode);
256
4411f3b6 257=item *
258
259 $string = decode(ENCODING, $bytes[, CHECK])
260
47bfe92f 261Decode sequence of octets assumed to be in I<ENCODING> into Perl's
262internal form and returns the resulting string. For CHECK see
263L</"Handling Malformed Data">.
264
1b2c56c8 265For example to convert ISO-8859-1 data to UTF-8:
681a7c68 266
267 $utf8 = decode("latin1", $latin1);
268
47bfe92f 269=item *
270
271 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
272
2b106fbe 273Convert B<in-place> the data between two encodings. How did the data
274in $string originally get to be in FROM_ENCODING? Either using
e9692b5b 275encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe 276see L</"Handling Malformed Data">.
277
1b2c56c8 278For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 279
280 from_to($data, "iso-8859-1", "utf-8");
281
282and to convert it back:
283
284 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 285
ab97ca19 286Note that because the conversion happens in place, the data to be
287converted cannot be a string constant, it must be a scalar variable.
288
4411f3b6 289=back
290
291=head2 Handling Malformed Data
292
293If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f 294be UTF-8, an optional lexical warning (category utf8) is given. If
295CHECK is true but not a code reference, dies.
4411f3b6 296
47bfe92f 297It would desirable to have a way to indicate that transform should use
298the encodings "replacement character" - no such mechanism is defined yet.
4411f3b6 299
300It is also planned to allow I<CHECK> to be a code reference.
301
47bfe92f 302This is not yet implemented as there are design issues with what its
303arguments should be and how it returns its results.
4411f3b6 304
305=over 4
306
307=item Scheme 1
308
309Passed remaining fragment of string being processed.
310Modifies it in place to remove bytes/characters it can understand
311and returns a string used to represent them.
312e.g.
313
314 sub fixup {
315 my $ch = substr($_[0],0,1,'');
316 return sprintf("\x{%02X}",ord($ch);
317 }
318
319This scheme is close to how underlying C code for Encode works, but gives
320the fixup routine very little context.
321
322=item Scheme 2
323
47bfe92f 324Passed original string, and an index into it of the problem area, and
325output string so far. Appends what it will to output string and
326returns new index into original string. For example:
4411f3b6 327
328 sub fixup {
329 # my ($s,$i,$d) = @_;
330 my $ch = substr($_[0],$_[1],1);
331 $_[2] .= sprintf("\x{%02X}",ord($ch);
332 return $_[1]+1;
333 }
334
47bfe92f 335This scheme gives maximal control to the fixup routine but is more
336complicated to code, and may need internals of Encode to be tweaked to
337keep original string intact.
4411f3b6 338
339=item Other Schemes
340
341Hybrids of above.
342
343Multiple return values rather than in-place modifications.
344
345Index into the string could be pos($str) allowing s/\G...//.
346
347=back
348
349=head2 UTF-8 / utf8
350
351The Unicode consortium defines the UTF-8 standard as a way of encoding
47bfe92f 352the entire Unicode repertiore as sequences of octets. This encoding is
353expected to become very widespread. Perl can use this form internaly
354to represent strings, so conversions to and from this form are
355particularly efficient (as octets in memory do not have to change,
356just the meta-data that tells Perl how to treat them).
4411f3b6 357
358=over 4
359
360=item *
361
362 $bytes = encode_utf8($string);
363
47bfe92f 364The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6 365and the resulting octets returned as a sequence of bytes. All possible
366characters have a UTF-8 representation so this function cannot fail.
367
368=item *
369
370 $string = decode_utf8($bytes [,CHECK]);
371
47bfe92f 372The sequence of octets represented by $bytes is decoded from UTF-8
373into a sequence of logical characters. Not all sequences of octets
374form valid UTF-8 encodings, so it is possible for this call to fail.
375For CHECK see L</"Handling Malformed Data">.
4411f3b6 376
377=back
378
51ef4e11 379=head2 Listing available encodings
380
5129552c 381 use Encode;
382 @list = Encode->encodings();
383
384Returns a list of the canonical names of the available encodings that
385are loaded. To get a list of all available encodings including the
386ones that are not loaded yet, say
387
388 @all_encodings = Encode->encodings(":all");
389
390Or you can give the name of specific module.
391
392 @with_jp = Encode->encodings("Encode/JP.pm");
51ef4e11 393
5129552c 394Note in this case you have to say "Encode/JP.pm instead of Encode::JP.
5d030b67 395
396To find which encodings are suppoted by this package in details,
397see L<Encode::Supported>.
51ef4e11 398
399=head2 Defining Aliases
400
5129552c 401 use Encode;
402 use Encode::Alias;
51ef4e11 403 define_alias( newName => ENCODING);
404
47bfe92f 405Allows newName to be used as am alias for ENCODING. ENCODING may be
406either the name of an encoding or and encoding object (as above).
51ef4e11 407
5d030b67 408See L<Encode::Alias> on details.
51ef4e11 409
1b2c56c8 410=head1 Defining Encodings
51ef4e11 411
e9692b5b 412 use Encode qw(define_alias);
413 define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11 414
47bfe92f 415Causes I<canonicalName> to be associated with I<$object>. The object
1b2c56c8 416should provide the interface described in L<Encode::Encoding>
47bfe92f 417below. If more than two arguments are provided then additional
418arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11 419
4411f3b6 420=head1 Encoding and IO
421
422It is very common to want to do encoding transformations when
423reading or writing files, network connections, pipes etc.
47bfe92f 424If Perl is configured to use the new 'perlio' IO system then
4411f3b6 425C<Encode> provides a "layer" (See L<perliol>) which can transform
426data as it is read or written.
427
8e86646e 428Here is how the blind poet would modernise the encoding:
429
42234700 430 use Encode;
8e86646e 431 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
432 open(my $utf8,'>:utf8','iliad.utf8');
433 my @epic = <$iliad>;
434 print $utf8 @epic;
435 close($utf8);
436 close($illiad);
4411f3b6 437
438In addition the new IO system can also be configured to read/write
439UTF-8 encoded characters (as noted above this is efficient):
440
e9692b5b 441 open(my $fh,'>:utf8','anything');
442 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 443
444Either of the above forms of "layer" specifications can be made the default
445for a lexical scope with the C<use open ...> pragma. See L<open>.
446
447Once a handle is open is layers can be altered using C<binmode>.
448
47bfe92f 449Without any such configuration, or if Perl itself is built using
4411f3b6 450system's own IO, then write operations assume that file handle accepts
451only I<bytes> and will C<die> if a character larger than 255 is
452written to the handle. When reading, each octet from the handle
453becomes a byte-in-a-character. Note that this default is the same
47bfe92f 454behaviour as bytes-only languages (including Perl before v5.6) would
455have, and is sufficient to handle native 8-bit encodings
456e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
457other encodings and binary data.
458
459In other cases it is the programs responsibility to transform
460characters into bytes using the API above before doing writes, and to
461transform the bytes read from a handle into characters before doing
462"character operations" (e.g. C<lc>, C</\W+/>, ...).
463
47bfe92f 464You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 465want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 466(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
467
e9692b5b 468 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
469 open(G, ">:utf8", "data.utf") or die $!;
470 while (<F>) { print G }
471
472 # Could also do "print G <F>" but that would pull
473 # the whole file into memory just to write it out again.
474
475More examples:
47bfe92f 476
e9692b5b 477 open(my $f, "<:encoding(cp1252)")
478 open(my $g, ">:encoding(iso-8859-2)")
479 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 480
481See L<PerlIO> for more information.
4411f3b6 482
1768d7eb 483See also L<encoding> for how to change the default encoding of the
d521382b 484data in your script.
1768d7eb 485
4411f3b6 486=head1 Messing with Perl's Internals
487
47bfe92f 488The following API uses parts of Perl's internals in the current
489implementation. As such they are efficient, but may change.
4411f3b6 490
491=over 4
492
4411f3b6 493=item * is_utf8(STRING [, CHECK])
494
495[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 496If CHECK is true, also checks the data in STRING for being well-formed
497UTF-8. Returns true if successful, false otherwise.
4411f3b6 498
4411f3b6 499=item *
500
501 _utf8_on(STRING)
502
503[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
504B<not> checked for being well-formed UTF-8. Do not use unless you
505B<know> that the STRING is well-formed UTF-8. Returns the previous
506state of the UTF-8 flag (so please don't test the return value as
507I<not> success or failure), or C<undef> if STRING is not a string.
508
509=item *
510
511 _utf8_off(STRING)
512
513[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
514Returns the previous state of the UTF-8 flag (so please don't test the
515return value as I<not> success or failure), or C<undef> if STRING is
516not a string.
517
518=back
519
520=head1 SEE ALSO
521
5d030b67 522L<Encode::Details>,
523L<Encode::Encoding>,
524L<Encode::Supported>,
525L<PerlIO>,
526L<encoding>,
527L<perlebcdic>,
528L<perlfunc/open>,
529L<perlunicode>,
530L<utf8>,
531the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 532
533=cut