If Unicode keys are entered to a hash, a bit is turned on.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
5d030b67 3our $VERSION = do { my @r = (q$Revision: 0.95 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
2c674647 4
5require DynaLoader;
6require Exporter;
7
51ef4e11 8our @ISA = qw(Exporter DynaLoader);
2c674647 9
4411f3b6 10# Public, encouraged API is exported by default
51ef4e11 11our @EXPORT = qw (
4411f3b6 12 encode
13 decode
14 encode_utf8
15 decode_utf8
16 find_encoding
51ef4e11 17 encodings
4411f3b6 18);
19
51ef4e11 20our @EXPORT_OK =
2c674647 21 qw(
51ef4e11 22 define_encoding
23 define_alias
2c674647 24 from_to
25 is_utf8
4411f3b6 26 is_8bit
27 is_16bit
a12c0f56 28 utf8_upgrade
29 utf8_downgrade
4411f3b6 30 _utf8_on
31 _utf8_off
2c674647 32 );
33
34bootstrap Encode ();
35
4411f3b6 36# Documentation moved after __END__ for speed - NI-S
2c674647 37
bf230f3d 38use Carp;
39
5d030b67 40use Encode::Alias;
41
51ef4e11 42# Make a %encoding package variable to allow a certain amount of cheating
43our %encoding;
5345d506 44
# Maps a lower-cased encoding name to the module file that getEncoding()
# require()s on demand when that name is not already registered.
our %external_tables =
    (
     # Encode::CN
     'euc-cn'      => 'Encode/CN.pm',
     gb2312        => 'Encode/CN.pm',
     gb12345       => 'Encode/CN.pm',
     gbk           => 'Encode/CN.pm',
     cp936         => 'Encode/CN.pm',
     'iso-ir-165'  => 'Encode/CN.pm',
     # Encode::JP
     'euc-jp'      => 'Encode/JP.pm',
     'iso-2022-jp' => 'Encode/JP.pm',
     '7bit-jis'    => 'Encode/JP.pm',
     shiftjis      => 'Encode/JP.pm',
     macjapan      => 'Encode/JP.pm',
     cp932         => 'Encode/JP.pm',
     # Encode::KR
     'euc-kr'      => 'Encode/KR.pm',
     ksc5601       => 'Encode/KR.pm',
     cp949         => 'Encode/KR.pm',
     # Encode::TW
     big5          => 'Encode/TW.pm',
     'big5-hkscs'  => 'Encode/TW.pm',
     cp950         => 'Encode/TW.pm',
     # Encode::HanExtra
     gb18030       => 'Encode/HanExtra.pm',
     big5plus      => 'Encode/HanExtra.pm',
     'euc-tw'      => 'Encode/HanExtra.pm',
    );
d1ed7747 69
# Return the names of all registered encodings (the keys of %encoding),
# omitting the 'Internal' pseudo-encoding.  Names are ordered
# case-insensitively; names that differ only in case are ordered by
# their exact spelling so the result does not depend on hash order.
# Any arguments (e.g. a class name when called as a method) are ignored.
sub encodings
{
    return
        map  { $_->[0] }
        sort { $a->[1] cmp $b->[1] or $a->[0] cmp $b->[0] }
        map  { [ $_, lc $_ ] }
        grep { $_ ne 'Internal' }
        keys %encoding;
}
80
# Register an encoding object under its canonical name.
#
#   define_encoding($obj, $name [, @aliases])
#
# $obj is stored in %encoding under $name.  If the lower-cased form of
# $name differs from $name, it is registered as an alias, as is every
# additional argument.  Returns $obj.
sub define_encoding
{
    my $obj  = shift;
    my $name = shift;

    $encoding{$name} = $obj;

    # Make the lower-cased spelling resolve to the same object.
    my $lc = lc($name);
    define_alias($lc => $obj) unless $lc eq $name;

    # Whatever is left in @_ is a list of extra aliases.
    while (@_)
    {
        my $alias = shift;
        define_alias($alias, $obj);
    }

    return $obj;
}
95
# Resolve NAME to an encoding object.
#
#   $class         - invocant; used to call findAlias().
#   $name          - an encoding name, or a reference that
#                    can('new_sequence'), which is returned unchanged.
#   $skip_external - when true, never require() a module listed in
#                    %external_tables.
#
# Lookup order: exact name in %encoding, lower-cased name in %encoding,
# alias for the exact name, alias for the lower-cased name, and finally
# on-demand loading of the module named in %external_tables.
#
# Returns the encoding object, or nothing (undef in scalar context, the
# empty list in list context) when the name cannot be resolved.
sub getEncoding
{
    my ($class, $name, $skip_external) = @_;

    # Already an encoding object: nothing to look up.
    return $name if ref($name) && $name->can('new_sequence');

    my $lc = lc $name;
    return $encoding{$name} if exists $encoding{$name};
    return $encoding{$lc}   if exists $encoding{$lc};

    my $oc = $class->findAlias($name);
    return $oc if defined $oc;

    $oc = $class->findAlias($lc) if $lc ne $name;
    return $oc if defined $oc;

    if (!$skip_external and exists $external_tables{$lc})
    {
        require $external_tables{$lc};

        # %external_tables is keyed by the lower-cased name, so the
        # module just loaded may have registered itself under $lc rather
        # than under the spelling the caller used.  Check both, in the
        # same order as the lookups above; otherwise a first call such
        # as find_encoding('EUC-JP') fails while a second one succeeds.
        return $encoding{$name} if exists $encoding{$name};
        return $encoding{$lc}   if exists $encoding{$lc};
    }

    return;
}
128
# Functional front end to getEncoding().
#
#   find_encoding($name [, $skip_external])
#
# Both arguments are passed straight through, with this package as the
# invocant; the return value is whatever getEncoding() returns.
sub find_encoding
{
    my ($name, $skip_external) = @_;
    return __PACKAGE__->getEncoding($name, $skip_external);
}
134
# Encode a string into the named encoding.
#
#   $octets = encode($name, $string [, $check])
#
# Croaks with "Unknown encoding '...'" if $name cannot be resolved by
# find_encoding().  Otherwise the work is delegated to the encoding
# object's encode() method.
#
# When $check is true and the local copy of $string is still non-empty
# after that call, undef is returned instead of the octets.
# NOTE(review): this presumes the encoder strips what it has consumed
# from its first argument - confirm against the encoding classes.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    croak("Unknown encoding '$name'") unless defined $enc;

    my $octets = $enc->encode($string, $check);
    return undef if ($check && length($string));
    return $octets;
}
144
# Decode octets from the named encoding into a Perl string.
#
#   $string = decode($name, $octets [, $check])
#
# Croaks with "Unknown encoding '...'" if $name cannot be resolved by
# find_encoding().  Otherwise the work is delegated to the encoding
# object's decode() method and its result is returned.
#
# When $check is true, the local copy of $octets - as left by the
# decoder - is written back to the caller's second argument through
# the @_ alias.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    croak("Unknown encoding '$name'") unless defined $enc;

    my $string = $enc->decode($octets, $check);
    $_[1] = $octets if $check;    # hand the remainder back to the caller
    return $string;
}
154
# Convert a scalar from one encoding to another, in place.
#
#   $length = from_to($string, $from, $to [, $check])
#
# Both encoding names are resolved with find_encoding(); an unknown
# name croaks with "Unknown encoding '...'".  The data is decoded with
# the source encoding and then encoded with the target encoding.
#
# When $check is true and either step leaves its input non-empty,
# undef is returned and the caller's variable is left untouched.
# Otherwise the converted data is stored in the caller's first
# argument (through the @_ alias) and its length is returned.
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $f = find_encoding($from);
    croak("Unknown encoding '$from'") unless defined $f;
    my $t = find_encoding($to);
    croak("Unknown encoding '$to'") unless defined $t;

    my $uni = $f->decode($string, $check);
    return undef if ($check && length($string));

    $string = $t->encode($uni, $check);
    return undef if ($check && length($uni));

    # Assigning to $_[0] is what makes the conversion in-place.
    return length($_[0] = $string);
}
168
# Return the UTF-8 encoded octets of a string.
#
#   $octets = encode_utf8($string)
#
# Works on a copy of the argument, so the caller's variable is not
# modified.
sub encode_utf8
{
    my ($str) = @_;
    utf8::encode($str);
    return $str;
}
175
# Decode UTF-8 octets into a character string.
#
#   $string = decode_utf8($octets)
#
# Works on a copy of the argument, so the caller's variable is not
# modified.  Returns undef when utf8::decode() reports failure.
# Only the first argument is read; a CHECK argument, if passed, is
# ignored by this implementation.
sub decode_utf8
{
    my ($str) = @_;
    return undef unless utf8::decode($str);
    return $str;
}
182
18586f54 183require Encode::Encoding;
184require Encode::XS;
185require Encode::Internal;
186require Encode::Unicode;
187require Encode::utf8;
188require Encode::iso10646_1;
189require Encode::ucs2_le;
4411f3b6 190
656753f8 1911;
192
2a936312 193__END__
194
4411f3b6 195=head1 NAME
196
197Encode - character encodings
198
199=head1 SYNOPSIS
200
201 use Encode;
202
203=head1 DESCRIPTION
204
47bfe92f 205The C<Encode> module provides the interfaces between Perl's strings
206and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6 207
1b2c56c8 208To find more about character encodings, please consult
5d030b67 209L<Encode::Details> . This document focuses on programming references.
21938dfa 210
4411f3b6 211=head1 PERL ENCODING API
212
213=head2 Generic Encoding Interface
214
215=over 4
216
217=item *
218
219 $bytes = encode(ENCODING, $string[, CHECK])
220
47bfe92f 221Encodes string from Perl's internal form into I<ENCODING> and returns
222a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6 223
681a7c68 224For example to convert (internally UTF-8 encoded) Unicode data
225to octets:
226
227 $octets = encode("utf8", $unicode);
228
4411f3b6 229=item *
230
231 $string = decode(ENCODING, $bytes[, CHECK])
232
47bfe92f 233Decode sequence of octets assumed to be in I<ENCODING> into Perl's
234internal form and returns the resulting string. For CHECK see
235L</"Handling Malformed Data">.
236
1b2c56c8 237For example to convert ISO-8859-1 data to UTF-8:
681a7c68 238
239 $utf8 = decode("latin1", $latin1);
240
47bfe92f 241=item *
242
243 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
244
2b106fbe 245Convert B<in-place> the data between two encodings. How did the data
246in $string originally get to be in FROM_ENCODING? Either using
e9692b5b 247encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe 248see L</"Handling Malformed Data">.
249
1b2c56c8 250For example to convert ISO-8859-1 data to UTF-8:
2b106fbe 251
252 from_to($data, "iso-8859-1", "utf-8");
253
254and to convert it back:
255
256 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 257
ab97ca19 258Note that because the conversion happens in place, the data to be
259converted cannot be a string constant, it must be a scalar variable.
260
4411f3b6 261=back
262
263=head2 Handling Malformed Data
264
265If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f 266be UTF-8, an optional lexical warning (category utf8) is given. If
267CHECK is true but not a code reference, dies.
4411f3b6 268
47bfe92f 269It would be desirable to have a way to indicate that the transform should use
270the encoding's "replacement character" - no such mechanism is defined yet.
4411f3b6 271
272It is also planned to allow I<CHECK> to be a code reference.
273
47bfe92f 274This is not yet implemented as there are design issues with what its
275arguments should be and how it returns its results.
4411f3b6 276
277=over 4
278
279=item Scheme 1
280
281Passed remaining fragment of string being processed.
282Modifies it in place to remove bytes/characters it can understand
283and returns a string used to represent them.
284e.g.
285
286 sub fixup {
287 my $ch = substr($_[0],0,1,'');
288 return sprintf('\x{%02X}',ord($ch));
289 }
290
291This scheme is close to how underlying C code for Encode works, but gives
292the fixup routine very little context.
293
294=item Scheme 2
295
47bfe92f 296Passed original string, and an index into it of the problem area, and
297output string so far. Appends what it will to output string and
298returns new index into original string. For example:
4411f3b6 299
300 sub fixup {
301 # my ($s,$i,$d) = @_;
302 my $ch = substr($_[0],$_[1],1);
303 $_[2] .= sprintf('\x{%02X}',ord($ch));
304 return $_[1]+1;
305 }
306
47bfe92f 307This scheme gives maximal control to the fixup routine but is more
308complicated to code, and may need internals of Encode to be tweaked to
309keep original string intact.
4411f3b6 310
311=item Other Schemes
312
313Hybrids of above.
314
315Multiple return values rather than in-place modifications.
316
317Index into the string could be pos($str) allowing s/\G...//.
318
319=back
320
321=head2 UTF-8 / utf8
322
323The Unicode consortium defines the UTF-8 standard as a way of encoding
47bfe92f 324the entire Unicode repertoire as sequences of octets. This encoding is
325expected to become very widespread. Perl can use this form internally
326to represent strings, so conversions to and from this form are
327particularly efficient (as octets in memory do not have to change,
328just the meta-data that tells Perl how to treat them).
4411f3b6 329
330=over 4
331
332=item *
333
334 $bytes = encode_utf8($string);
335
47bfe92f 336The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6 337and the resulting octets returned as a sequence of bytes. All possible
338characters have a UTF-8 representation so this function cannot fail.
339
340=item *
341
342 $string = decode_utf8($bytes [,CHECK]);
343
47bfe92f 344The sequence of octets represented by $bytes is decoded from UTF-8
345into a sequence of logical characters. Not all sequences of octets
346form valid UTF-8 encodings, so it is possible for this call to fail.
347For CHECK see L</"Handling Malformed Data">.
4411f3b6 348
349=back
350
51ef4e11 351=head2 Listing available encodings
352
353 use Encode qw(encodings);
354 @list = encodings();
355
5d030b67 356Returns a list of the canonical names of the available encodings.
357
358To find which encodings are supported by this package in detail,
359see L<Encode::Supported>.
51ef4e11 360
361=head2 Defining Aliases
362
363 use Encode qw(define_alias);
364 define_alias( newName => ENCODING);
365
47bfe92f 366Allows newName to be used as an alias for ENCODING. ENCODING may be
367either the name of an encoding or an encoding object (as above).
51ef4e11 368
5d030b67 369See L<Encode::Alias> for details.
51ef4e11 370
1b2c56c8 371=head1 Defining Encodings
51ef4e11 372
e9692b5b 373 use Encode qw(define_encoding);
374 define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11 375
47bfe92f 376Causes I<canonicalName> to be associated with I<$object>. The object
1b2c56c8 377should provide the interface described in L<Encode::Encoding>
47bfe92f 378below. If more than two arguments are provided then additional
379arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11 380
4411f3b6 381=head1 Encoding and IO
382
383It is very common to want to do encoding transformations when
384reading or writing files, network connections, pipes etc.
47bfe92f 385If Perl is configured to use the new 'perlio' IO system then
4411f3b6 386C<Encode> provides a "layer" (See L<perliol>) which can transform
387data as it is read or written.
388
8e86646e 389Here is how the blind poet would modernise the encoding:
390
42234700 391 use Encode;
8e86646e 392 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
393 open(my $utf8,'>:utf8','iliad.utf8');
394 my @epic = <$iliad>;
395 print $utf8 @epic;
396 close($utf8);
397 close($iliad);
4411f3b6 398
399In addition the new IO system can also be configured to read/write
400UTF-8 encoded characters (as noted above this is efficient):
401
e9692b5b 402 open(my $fh,'>:utf8','anything');
403 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 404
405Either of the above forms of "layer" specifications can be made the default
406for a lexical scope with the C<use open ...> pragma. See L<open>.
407
408Once a handle is open, its layers can be altered using C<binmode>.
409
47bfe92f 410Without any such configuration, or if Perl itself is built using
4411f3b6 411system's own IO, then write operations assume that file handle accepts
412only I<bytes> and will C<die> if a character larger than 255 is
413written to the handle. When reading, each octet from the handle
414becomes a byte-in-a-character. Note that this default is the same
47bfe92f 415behaviour as bytes-only languages (including Perl before v5.6) would
416have, and is sufficient to handle native 8-bit encodings
417e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
418other encodings and binary data.
419
420In other cases it is the program's responsibility to transform
421characters into bytes using the API above before doing writes, and to
422transform the bytes read from a handle into characters before doing
423"character operations" (e.g. C<lc>, C</\W+/>, ...).
424
47bfe92f 425You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 426want to bring into memory. For example to convert between ISO-8859-1
47bfe92f 427(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
428
e9692b5b 429 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
430 open(G, ">:utf8", "data.utf") or die $!;
431 while (<F>) { print G }
432
433 # Could also do "print G <F>" but that would pull
434 # the whole file into memory just to write it out again.
435
436More examples:
47bfe92f 437
e9692b5b 438 open(my $f, "<:encoding(cp1252)")
439 open(my $g, ">:encoding(iso-8859-2)")
440 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 441
442See L<PerlIO> for more information.
4411f3b6 443
1768d7eb 444See also L<encoding> for how to change the default encoding of the
d521382b 445data in your script.
1768d7eb 446
4411f3b6 447=head1 Messing with Perl's Internals
448
47bfe92f 449The following API uses parts of Perl's internals in the current
450implementation. As such they are efficient, but may change.
4411f3b6 451
452=over 4
453
4411f3b6 454=item * is_utf8(STRING [, CHECK])
455
456[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 457If CHECK is true, also checks the data in STRING for being well-formed
458UTF-8. Returns true if successful, false otherwise.
4411f3b6 459
4411f3b6 460=item *
461
462 _utf8_on(STRING)
463
464[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
465B<not> checked for being well-formed UTF-8. Do not use unless you
466B<know> that the STRING is well-formed UTF-8. Returns the previous
467state of the UTF-8 flag (so please don't test the return value as
468I<not> success or failure), or C<undef> if STRING is not a string.
469
470=item *
471
472 _utf8_off(STRING)
473
474[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
475Returns the previous state of the UTF-8 flag (so please don't test the
476return value as I<not> success or failure), or C<undef> if STRING is
477not a string.
478
479=back
480
481=head1 SEE ALSO
482
5d030b67 483L<Encode::Details>,
484L<Encode::Encoding>,
485L<Encode::Supported>,
486L<PerlIO>,
487L<encoding>,
488L<perlebcdic>,
489L<perlfunc/open>,
490L<perlunicode>,
491L<utf8>,
492the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 493
494=cut