Upgrade to Encode 0.99, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
64ffdd5e 3our $VERSION = do { my @r = (q$Revision: 0.99 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 encode
14 decode
15 encode_utf8
16 decode_utf8
17 find_encoding
51ef4e11 18 encodings
4411f3b6 19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
51ef4e11 23 define_encoding
2c674647 24 from_to
25 is_utf8
4411f3b6 26 is_8bit
27 is_16bit
a12c0f56 28 utf8_upgrade
29 utf8_downgrade
4411f3b6 30 _utf8_on
31 _utf8_off
2c674647 32 );
33
34bootstrap Encode ();
35
4411f3b6 36# Documentation moved after __END__ for speed - NI-S
2c674647 37
bf230f3d 38use Carp;
39
a63c962f 40our $ON_EBCDIC = (ord("A") == 193);
5d030b67 41use Encode::Alias;
42
5129552c 43# Make a %Encoding package variable to allow a certain amount of cheating
44our %Encoding;
5345d506 45
5129552c 46our %ExtModule =
2b217bf7 47 (
5129552c 48 viscii => 'Encode/Byte.pm',
49 'koi8-r' => 'Encode/Byte.pm',
50 cp1047 => 'Encode/EBCDIC.pm',
51 cp37 => 'Encode/EBCDIC.pm',
52 'posix-bc' => 'Encode/EBCDIC.pm',
53 symbol => 'Encode/Symbol.pm',
54 dingbats => 'Encode/Symbol.pm',
2b217bf7 55 );
d1ed7747 56
5129552c 57for my $k (2..11,13..16){
58 $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
59}
60
61for my $k (1250..1258){
62 $ExtModule{"cp$k"} = 'Encode/Byte.pm';
63}
64
a63c962f 65unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
66%ExtModule =(
67 %ExtModule,
68 'euc-cn' => 'Encode/CN.pm',
69 gb2312 => 'Encode/CN.pm',
70 gb12345 => 'Encode/CN.pm',
71 gbk => 'Encode/CN.pm',
72 cp936 => 'Encode/CN.pm',
73 'iso-ir-165' => 'Encode/CN.pm',
74 'euc-jp' => 'Encode/JP.pm',
75 'iso-2022-jp' => 'Encode/JP.pm',
76 'iso-2022-jp-1' => 'Encode/JP.pm',
77 '7bit-jis' => 'Encode/JP.pm',
78 shiftjis => 'Encode/JP.pm',
79 macjapan => 'Encode/JP.pm',
80 cp932 => 'Encode/JP.pm',
81 'euc-kr' => 'Encode/KR.pm',
82 ksc5601 => 'Encode/KR.pm',
83 cp949 => 'Encode/KR.pm',
84 big5 => 'Encode/TW.pm',
85 'big5-hkscs' => 'Encode/TW.pm',
86 cp950 => 'Encode/TW.pm',
87 gb18030 => 'Encode/HanExtra.pm',
88 big5plus => 'Encode/HanExtra.pm',
89 'euc-tw' => 'Encode/HanExtra.pm',
90 );
91}
92
5129552c 93for my $k (qw(centeuro croatian cyrillic dingbats greek
94 iceland roman rumanian sami
95 thai turkish ukraine))
96{
97 $ExtModule{"mac$k"} = 'Encode/Byte.pm';
98}
99
100
656753f8 101sub encodings
102{
5129552c 103 my $class = shift;
071db25d 104 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
5129552c 105 for my $m (@modules)
106 {
107 $DEBUG and warn "about to require $m;";
108 eval { require $m; };
109 }
110 return
111 map({$_->[0]}
112 sort({$a->[1] cmp $b->[1]}
113 map({[$_, lc $_]}
114 grep({ $_ ne 'Internal' } keys %Encoding))));
51ef4e11 115}
116
51ef4e11 117sub define_encoding
118{
18586f54 119 my $obj = shift;
120 my $name = shift;
5129552c 121 $Encoding{$name} = $obj;
18586f54 122 my $lc = lc($name);
123 define_alias($lc => $obj) unless $lc eq $name;
124 while (@_)
125 {
126 my $alias = shift;
127 define_alias($alias,$obj);
128 }
129 return $obj;
656753f8 130}
131
656753f8 132sub getEncoding
133{
dd9703c9 134 my ($class,$name,$skip_external) = @_;
18586f54 135 my $enc;
136 if (ref($name) && $name->can('new_sequence'))
137 {
138 return $name;
139 }
140 my $lc = lc $name;
5129552c 141 if (exists $Encoding{$name})
18586f54 142 {
5129552c 143 return $Encoding{$name};
18586f54 144 }
5129552c 145 if (exists $Encoding{$lc})
18586f54 146 {
5129552c 147 return $Encoding{$lc};
18586f54 148 }
c50d192e 149
5129552c 150 my $oc = $class->find_alias($name);
c50d192e 151 return $oc if defined $oc;
152
5129552c 153 $oc = $class->find_alias($lc) if $lc ne $name;
c50d192e 154 return $oc if defined $oc;
155
5129552c 156 if (!$skip_external and exists $ExtModule{$lc})
d1ed7747 157 {
5129552c 158 eval{ require $ExtModule{$lc}; };
159 return $Encoding{$name} if exists $Encoding{$name};
d1ed7747 160 }
18586f54 161
18586f54 162 return;
656753f8 163}
164
4411f3b6 165sub find_encoding
166{
dd9703c9 167 my ($name,$skip_external) = @_;
168 return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6 169}
170
171sub encode
172{
18586f54 173 my ($name,$string,$check) = @_;
174 my $enc = find_encoding($name);
175 croak("Unknown encoding '$name'") unless defined $enc;
176 my $octets = $enc->encode($string,$check);
177 return undef if ($check && length($string));
178 return $octets;
4411f3b6 179}
180
181sub decode
182{
18586f54 183 my ($name,$octets,$check) = @_;
184 my $enc = find_encoding($name);
185 croak("Unknown encoding '$name'") unless defined $enc;
186 my $string = $enc->decode($octets,$check);
187 $_[1] = $octets if $check;
188 return $string;
4411f3b6 189}
190
191sub from_to
192{
18586f54 193 my ($string,$from,$to,$check) = @_;
194 my $f = find_encoding($from);
195 croak("Unknown encoding '$from'") unless defined $f;
196 my $t = find_encoding($to);
197 croak("Unknown encoding '$to'") unless defined $t;
198 my $uni = $f->decode($string,$check);
199 return undef if ($check && length($string));
200 $string = $t->encode($uni,$check);
201 return undef if ($check && length($uni));
202 return length($_[0] = $string);
4411f3b6 203}
204
205sub encode_utf8
206{
18586f54 207 my ($str) = @_;
208 utf8::encode($str);
209 return $str;
4411f3b6 210}
211
212sub decode_utf8
213{
18586f54 214 my ($str) = @_;
215 return undef unless utf8::decode($str);
216 return $str;
5ad8ef52 217}
218
18586f54 219require Encode::Encoding;
220require Encode::XS;
221require Encode::Internal;
222require Encode::Unicode;
223require Encode::utf8;
64ffdd5e 224require Encode::10646_1;
18586f54 225require Encode::ucs2_le;
4411f3b6 226
656753f8 2271;
228
2a936312 229__END__
230
4411f3b6 231=head1 NAME
232
233Encode - character encodings
234
235=head1 SYNOPSIS
236
237 use Encode;
238
239=head1 DESCRIPTION
240
47bfe92f 241The C<Encode> module provides the interfaces between Perl's strings
242and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6 243
1b2c56c8 244To find more about character encodings, please consult
a63c962f 245L<Encode::Details>. This document focuses on programming references.
21938dfa 246
4411f3b6 247=head1 PERL ENCODING API
248
249=head2 Generic Encoding Interface
250
251=over 4
252
a63c962f 253=item $bytes = encode(ENCODING, $string[, CHECK])
4411f3b6 254
47bfe92f 255Encodes string from Perl's internal form into I<ENCODING> and returns
256a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6 257
681a7c68 258For example to convert (internally UTF-8 encoded) Unicode data
259to octets:
260
261 $octets = encode("utf8", $unicode);
262
a63c962f 263=item $string = decode(ENCODING, $bytes[, CHECK])
4411f3b6 264
47bfe92f 265Decode sequence of octets assumed to be in I<ENCODING> into Perl's
266internal form and returns the resulting string. For CHECK see
267L</"Handling Malformed Data">.
268
1b2c56c8 269For example to convert ISO-8859-1 data to UTF-8:
681a7c68 270
271 $utf8 = decode("latin1", $latin1);
272
a63c962f 273=item from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
47bfe92f 274
2b106fbe 275Convert B<in-place> the data between two encodings. How did the data
276in $string originally get to be in FROM_ENCODING? Either using
e9692b5b 277encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe 278see L</"Handling Malformed Data">.
279
1b2c56c8 280For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 281
282 from_to($data, "iso-8859-1", "utf-8");
283
284and to convert it back:
285
286 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 287
ab97ca19 288Note that because the conversion happens in place, the data to be
289converted cannot be a string constant, it must be a scalar variable.
290
4411f3b6 291=back
292
293=head2 Handling Malformed Data
294
295If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f 296be UTF-8, an optional lexical warning (category utf8) is given. If
297CHECK is true but not a code reference, dies.
4411f3b6 298
47bfe92f 299It would desirable to have a way to indicate that transform should use
300the encodings "replacement character" - no such mechanism is defined yet.
4411f3b6 301
302It is also planned to allow I<CHECK> to be a code reference.
303
47bfe92f 304This is not yet implemented as there are design issues with what its
305arguments should be and how it returns its results.
4411f3b6 306
307=over 4
308
309=item Scheme 1
310
311Passed remaining fragment of string being processed.
312Modifies it in place to remove bytes/characters it can understand
313and returns a string used to represent them.
314e.g.
315
316 sub fixup {
317 my $ch = substr($_[0],0,1,'');
318 return sprintf("\x{%02X}",ord($ch);
319 }
320
321This scheme is close to how underlying C code for Encode works, but gives
322the fixup routine very little context.
323
324=item Scheme 2
325
47bfe92f 326Passed original string, and an index into it of the problem area, and
327output string so far. Appends what it will to output string and
328returns new index into original string. For example:
4411f3b6 329
330 sub fixup {
331 # my ($s,$i,$d) = @_;
332 my $ch = substr($_[0],$_[1],1);
333 $_[2] .= sprintf("\x{%02X}",ord($ch);
334 return $_[1]+1;
335 }
336
47bfe92f 337This scheme gives maximal control to the fixup routine but is more
338complicated to code, and may need internals of Encode to be tweaked to
339keep original string intact.
4411f3b6 340
341=item Other Schemes
342
343Hybrids of above.
344
345Multiple return values rather than in-place modifications.
346
a63c962f 347Index into the string could be C<pos($str)> allowing C<s/\G...//>.
4411f3b6 348
349=back
350
351=head2 UTF-8 / utf8
352
353The Unicode consortium defines the UTF-8 standard as a way of encoding
a63c962f 354the entire Unicode repertoire as sequences of octets. This encoding is
355expected to become very widespread. Perl can use this form internally
47bfe92f 356to represent strings, so conversions to and from this form are
357particularly efficient (as octets in memory do not have to change,
358just the meta-data that tells Perl how to treat them).
4411f3b6 359
360=over 4
361
a63c962f 362=item $bytes = encode_utf8($string);
4411f3b6 363
47bfe92f 364The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6 365and the resulting octets returned as a sequence of bytes. All possible
366characters have a UTF-8 representation so this function cannot fail.
367
a63c962f 368=item $string = decode_utf8($bytes [, CHECK]);
4411f3b6 369
47bfe92f 370The sequence of octets represented by $bytes is decoded from UTF-8
371into a sequence of logical characters. Not all sequences of octets
372form valid UTF-8 encodings, so it is possible for this call to fail.
373For CHECK see L</"Handling Malformed Data">.
4411f3b6 374
375=back
376
51ef4e11 377=head2 Listing available encodings
378
5129552c 379 use Encode;
380 @list = Encode->encodings();
381
382Returns a list of the canonical names of the available encodings that
383are loaded. To get a list of all available encodings including the
384ones that are not loaded yet, say
385
386 @all_encodings = Encode->encodings(":all");
387
388Or you can give the name of specific module.
389
390 @with_jp = Encode->encodings("Encode/JP.pm");
51ef4e11 391
a63c962f 392Note in this case you have to say C<"Encode/JP.pm"> instead of
393C<"Encode::JP">.
5d030b67 394
a63c962f 395To find which encodings are supported by this package in details,
5d030b67 396see L<Encode::Supported>.
51ef4e11 397
398=head2 Defining Aliases
399
5129552c 400 use Encode;
401 use Encode::Alias;
a63c962f 402 define_alias(newName => ENCODING);
51ef4e11 403
47bfe92f 404Allows newName to be used as am alias for ENCODING. ENCODING may be
405either the name of an encoding or and encoding object (as above).
51ef4e11 406
5d030b67 407See L<Encode::Alias> on details.
51ef4e11 408
1b2c56c8 409=head1 Defining Encodings
51ef4e11 410
e9692b5b 411 use Encode qw(define_alias);
a63c962f 412 define_encoding($object, 'canonicalName' [, alias...]);
51ef4e11 413
47bfe92f 414Causes I<canonicalName> to be associated with I<$object>. The object
1b2c56c8 415should provide the interface described in L<Encode::Encoding>
47bfe92f 416below. If more than two arguments are provided then additional
417arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11 418
4411f3b6 419=head1 Encoding and IO
420
421It is very common to want to do encoding transformations when
422reading or writing files, network connections, pipes etc.
47bfe92f 423If Perl is configured to use the new 'perlio' IO system then
4411f3b6 424C<Encode> provides a "layer" (See L<perliol>) which can transform
425data as it is read or written.
426
8e86646e 427Here is how the blind poet would modernise the encoding:
428
42234700 429 use Encode;
8e86646e 430 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
431 open(my $utf8,'>:utf8','iliad.utf8');
432 my @epic = <$iliad>;
433 print $utf8 @epic;
434 close($utf8);
435 close($illiad);
4411f3b6 436
437In addition the new IO system can also be configured to read/write
438UTF-8 encoded characters (as noted above this is efficient):
439
e9692b5b 440 open(my $fh,'>:utf8','anything');
441 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 442
443Either of the above forms of "layer" specifications can be made the default
444for a lexical scope with the C<use open ...> pragma. See L<open>.
445
446Once a handle is open is layers can be altered using C<binmode>.
447
47bfe92f 448Without any such configuration, or if Perl itself is built using
4411f3b6 449system's own IO, then write operations assume that file handle accepts
450only I<bytes> and will C<die> if a character larger than 255 is
451written to the handle. When reading, each octet from the handle
452becomes a byte-in-a-character. Note that this default is the same
47bfe92f 453behaviour as bytes-only languages (including Perl before v5.6) would
454have, and is sufficient to handle native 8-bit encodings
455e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
456other encodings and binary data.
457
458In other cases it is the programs responsibility to transform
459characters into bytes using the API above before doing writes, and to
460transform the bytes read from a handle into characters before doing
461"character operations" (e.g. C<lc>, C</\W+/>, ...).
462
47bfe92f 463You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 464want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 465(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
466
e9692b5b 467 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
468 open(G, ">:utf8", "data.utf") or die $!;
469 while (<F>) { print G }
470
471 # Could also do "print G <F>" but that would pull
472 # the whole file into memory just to write it out again.
473
474More examples:
47bfe92f 475
e9692b5b 476 open(my $f, "<:encoding(cp1252)")
477 open(my $g, ">:encoding(iso-8859-2)")
478 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 479
480See L<PerlIO> for more information.
4411f3b6 481
1768d7eb 482See also L<encoding> for how to change the default encoding of the
d521382b 483data in your script.
1768d7eb 484
4411f3b6 485=head1 Messing with Perl's Internals
486
47bfe92f 487The following API uses parts of Perl's internals in the current
488implementation. As such they are efficient, but may change.
4411f3b6 489
490=over 4
491
a63c962f 492=item is_utf8(STRING [, CHECK])
4411f3b6 493
494[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 495If CHECK is true, also checks the data in STRING for being well-formed
496UTF-8. Returns true if successful, false otherwise.
4411f3b6 497
a63c962f 498=item _utf8_on(STRING)
4411f3b6 499
500[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
501B<not> checked for being well-formed UTF-8. Do not use unless you
502B<know> that the STRING is well-formed UTF-8. Returns the previous
503state of the UTF-8 flag (so please don't test the return value as
504I<not> success or failure), or C<undef> if STRING is not a string.
505
a63c962f 506=item _utf8_off(STRING)
4411f3b6 507
508[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
509Returns the previous state of the UTF-8 flag (so please don't test the
510return value as I<not> success or failure), or C<undef> if STRING is
511not a string.
512
513=back
514
515=head1 SEE ALSO
516
5d030b67 517L<Encode::Details>,
518L<Encode::Encoding>,
519L<Encode::Supported>,
520L<PerlIO>,
521L<encoding>,
522L<perlebcdic>,
523L<perlfunc/open>,
524L<perlunicode>,
525L<utf8>,
526the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 527
528=cut