Add the binmode()s to make JP.t pass on Win32
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
package Encode;
use strict;
our $VERSION = '0.40';

require DynaLoader;
require Exporter;

our @ISA = qw(Exporter DynaLoader);

# Public, encouraged API is exported by default
our @EXPORT = qw(
    encode decode
    encode_utf8 decode_utf8
    find_encoding
    encodings
);

# Everything else has to be asked for by name.
our @EXPORT_OK = qw(
    define_encoding define_alias
    from_to
    is_utf8 is_8bit is_16bit
    utf8_upgrade utf8_downgrade
    _utf8_on _utf8_off
);

# Load the compiled half of the module (found through DynaLoader in @ISA).
__PACKAGE__->bootstrap();

# Documentation moved after __END__ for speed - NI-S

use Carp;

# Make a %encoding package variable to allow a certain amount of cheating
our %encoding;
my @alias;    # ordered matching list: flat (matcher, target) pairs
my %alias;    # cached known aliases

# Indexed by the "Latin-N" number; the value is the ISO 8859 part number
# that the latinN alias expands to.  Index 0 is only a placeholder.
our @latin2iso_num = (
    0,                  # 0
    1, 2, 3, 4,         # 1 .. 4
    9, 10,              # 5, 6
    13, 14, 15, 16,     # 7 .. 10
);

# Windows "WinXxx" style names (first letter capitalised) and the code page
# number each one stands for.
our %winlatin2cp = (
    'Latin2'     => 1250,
    'Cyrillic'   => 1251,
    'Latin1'     => 1252,
    'Greek'      => 1253,
    'Turkish'    => 1254,
    'Hebrew'     => 1255,
    'Arabic'     => 1256,
    'Baltic'     => 1257,
    'Vietnamese' => 1258,
);

# Lower-case encoding name => file that getEncoding() require()s on demand
# when the name is not registered yet.
our %external_tables = (
    ( map { ( $_ => 'Encode/CN.pm' ) }
          qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165) ),
    ( map { ( $_ => 'Encode/JP.pm' ) }
          qw(euc-jp shiftjis macjapan cp932) ),
    ( map { ( $_ => 'Encode/KR.pm' ) }
          qw(euc-kr ksc5601 cp949) ),
    ( map { ( $_ => 'Encode/TW.pm' ) }
          qw(big5 big5-hkscs cp950) ),
    ( map { ( $_ => 'Encode/HanExtra.pm' ) }
          qw(gb18030 big5plus euc-tw) ),
);
d1ed7747 82
# Return the names currently registered in %encoding, leaving out the
# 'Internal' pseudo-encoding, ordered case-insensitively.  In scalar
# context the caller gets the number of names.
sub encodings
{
    my ($class) = @_;    # accepted so this can be called as a method; unused

    my @visible = grep { $_ ne 'Internal' } keys %encoding;
    my @ordered = sort { lc($a) cmp lc($b) } @visible;
    return @ordered;
}
93
# Resolve a name through the alias table.
#
#   $class - invocant (not used here)
#   name   - the name to resolve; it is placed in $_ for the duration of
#            the call so that CODE matchers and eval'ed targets can see it
#
# Returns the encoding object the name resolves to, or undef when no alias
# leads to a known encoding.  Successful resolutions are cached in %alias.
sub findAlias
{
    my $class = shift;
    local $_ = shift;
    # print "# findAlias $_\n";

    # Answered before: hand back the cached object.
    return $alias{$_} if exists $alias{$_};

    # @alias is a flat list of (matcher, target) pairs tried in definition
    # order.  The bound is re-read on every pass, so pairs appended while
    # we are iterating (presumably possible when resolving a target loads
    # a module that registers aliases) are tried as well.
    my $idx = 0;
    while ($idx < @alias) {
        my ($matcher, $target) = @alias[ $idx, $idx + 1 ];
        $idx += 2;

        my $candidate;
        if (ref($matcher) eq 'Regexp' && $_ =~ $matcher) {
            # The target is Perl source; it is evaluated right here so the
            # captures of the match above ($1, $2, ...) are visible to it.
            $candidate = eval $target;
        }
        elsif (ref($matcher) eq 'CODE') {
            $candidate = $matcher->($target);
        }
        elsif (lc($_) eq lc($matcher)) {
            $candidate = $target;
        }

        next unless defined $candidate;
        next if $candidate eq $_;    # avoid (direct) recursion on bugs

        # A reference is taken to be an encoding object already; anything
        # else is another name to look up.
        my $enc = ref($candidate) ? $candidate : find_encoding($candidate);
        next unless $enc;

        $alias{$_} = $enc;
        last;
    }

    return $alias{$_};
}
132
# Append one or more (matcher => target) pairs to the ordered alias list.
# Arguments are consumed two at a time; a trailing unpaired matcher is
# stored with an undefined target.
sub define_alias
{
    my @pending = @_;
    while (@pending) {
        my $matcher = shift @pending;
        my $target  = shift @pending;
        push @alias, $matcher, $target;
    }
}
141
# Built-in aliases.
#
# Aliases are tried in the order they are defined here and the first one
# that resolves to a known encoding wins, so the order of the calls below
# is significant.  For qr// matchers the target is a string of Perl source
# that findAlias() evaluates after a successful match; that is why those
# targets are double-quoted strings wrapped in single quotes.

# Allow variants of iso-8859-1 etc.
define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );

# At least HP-UX has these.
define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );

# More HP stuff.
define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );

# The Official name of ASCII.
define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );

# This is a font issue, not an encoding issue.
# (The currency symbol of the Latin 1 upper half
# has been redefined as the euro symbol.)
define_alias( qr/^(.+)\@euro$/i => '"$1"' );

# Allow latin-1 style names as well
define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );

# Allow winlatin1 style names as well.
# The pattern is case-insensitive but the keys of %winlatin2cp are spelled
# with only the first letter capitalised, so the captured name is folded
# to that form explicitly before the lookup (e.g. "WINLATIN1" -> "Latin1").
define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|vietnamese)$/i
                  => '"cp" . $Encode::winlatin2cp{ucfirst(lc($1))}' );

# Common names for non-latin prefered MIME names
define_alias( 'ascii'    => 'US-ascii',
              'cyrillic' => 'iso-8859-5',
              'arabic'   => 'iso-8859-6',
              'greek'    => 'iso-8859-7',
              'hebrew'   => 'iso-8859-8',
              'thai'     => 'iso-8859-11',
              'tis620'   => 'iso-8859-11',
            );

# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
# And Microsoft has their own naming (again, surprisingly).
define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');

# Sometimes seen with a leading zero.
define_alias( qr/^cp037$/i => '"cp37"');

# Ououououou.
define_alias( qr/^macRomanian$/i => '"macRumanian"');

# Standardize on the dashed versions.
define_alias( qr/^utf8$/i  => 'utf-8' );
define_alias( qr/^koi8r$/i => 'koi8-r' );
define_alias( qr/^koi8u$/i => 'koi8-u' );

# Seen in some Linuxes.
define_alias( qr/^ujis$/i => 'euc-jp' );

# CP936 doesn't have vendor-addon for GBK, so they're identical.
define_alias( qr/^gbk$/i => '"cp936"');

# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
# TODO: HP-UX '15' encodings japanese15 korean15 roi15
# TODO: Cyrillic encoding ISO-IR-111 (useful?)
# TODO: Armenian encoding ARMSCII-8
# TODO: Hebrew encoding ISO-8859-8-1
# TODO: Thai encoding TCVN
# TODO: Korean encoding Johab
# TODO: Vietnamese encodings VPS
# TODO: Japanese encoding JIS (not the same as SJIS)
# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
#       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
#       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
#       Kannada Khmer Korean Laotian Malayalam Mongolian
#       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese

# Map white space and _ to '-'
# (kept last so that every more specific alias above is tried first)
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
213
# Register an encoding object.
#
#   $obj   - the object implementing the encoding
#   $name  - canonical name; becomes the key in %encoding
#   rest   - any further arguments are extra alias names for $obj
#
# A lower-cased alias is added automatically when the canonical name is
# not already all lower case.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @extra_aliases) = @_;

    $encoding{$name} = $obj;

    my $lowered = lc($name);
    define_alias($lowered => $obj) if $lowered ne $name;

    foreach my $extra (@extra_aliases) {
        define_alias($extra, $obj);
    }

    return $obj;
}
228
# Find the encoding object for a name.
#
#   $class         - invocant; its findAlias method is used for aliases
#   $name          - an encoding name, or an encoding object (anything
#                    that can('new_sequence')), which is returned as is
#   $skip_external - when true, do not load any of the modules listed in
#                    %external_tables
#
# Lookup order: exact name in %encoding, lower-cased name in %encoding,
# alias for the name, alias for the lower-cased name, and finally an
# on-demand load of an external table keyed by the lower-cased name.
#
# Returns the encoding object, or nothing (undef / empty list) when the
# name is unknown or undefined.  Dies only if require() of an external
# table fails.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;

    # Nothing to look up; this also keeps lc() below from seeing undef.
    return unless defined $name;

    if (ref($name))
    {
        # can() is fatal when called on an unblessed reference, so probe
        # inside eval; such a reference simply is not an encoding object.
        local $@;
        return $name if eval { $name->can('new_sequence') };
    }

    my $lc = lc $name;
    return $encoding{$name} if exists $encoding{$name};
    return $encoding{$lc}   if exists $encoding{$lc};

    my $oc = $class->findAlias($name);
    return $oc if defined $oc;

    $oc = $class->findAlias($lc) if $lc ne $name;
    return $oc if defined $oc;

    if (!$skip_external and exists $external_tables{$lc})
    {
        require $external_tables{$lc};

        # The table was selected by the lower-cased name, so look under
        # that spelling too; checking only $name made the first lookup of
        # e.g. "EUC-JP" fail even though the module had just been loaded.
        foreach my $key ($name, $lc)
        {
            return $encoding{$key} if exists $encoding{$key};
        }
    }

    return;
}
261
# Function-style front end to getEncoding(): look up an encoding by name
# (or pass an encoding object through).  A true second argument asks for
# external table modules not to be loaded.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}
267
# Encode $string into the named encoding and return the resulting octets.
#
#   $name   - encoding name or encoding object
#   $string - the characters to encode
#   $check  - passed through to the encoding's encode method
#
# Croaks when the encoding is unknown.  When $check is true and the
# encoding object left part of the (local copy of the) string unconsumed,
# undef is returned instead of the octets.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc
        or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string, $check);
    if ($check && length($string)) {
        return undef;
    }
    return $octets;
}
277
# Decode $octets from the named encoding and return the resulting string.
#
#   $name   - encoding name or encoding object
#   $octets - the octet sequence to decode
#   $check  - passed through to the encoding's decode method
#
# Croaks when the encoding is unknown.  When $check is true, whatever is
# left of the octets after the encoding object has processed them is
# written back into the caller's second argument.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc
        or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);
    if ($check) {
        $_[1] = $octets;    # $_[1] aliases the caller's variable
    }
    return $string;
}
287
# Convert the caller's first argument, in place, from one encoding to
# another.
#
#   $_[0]  - the data to convert; overwritten with the converted octets
#   $from  - name (or object) of the encoding the data is currently in
#   $to    - name (or object) of the encoding wanted
#   $check - passed through to both the decode and the encode step
#
# Croaks when either encoding is unknown.  With a true $check, returns
# undef (leaving the caller's data untouched) if either step left input
# unconsumed.  Otherwise returns the length of the converted data.
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    defined $source
        or croak("Unknown encoding '$from'");

    my $target = find_encoding($to);
    defined $target
        or croak("Unknown encoding '$to'");

    # Step 1: octets in the source encoding -> characters.
    my $uni = $source->decode($string, $check);
    if ($check && length($string)) {
        return undef;
    }

    # Step 2: characters -> octets in the target encoding.
    $string = $target->encode($uni, $check);
    if ($check && length($uni)) {
        return undef;
    }

    # $_[0] aliases the caller's variable: this is the in-place update.
    return length($_[0] = $string);
}
301
# Return a copy of the argument converted to UTF-8 encoded octets.
# The caller's own variable is left untouched.
sub encode_utf8
{
    my $octets = shift;
    utf8::encode($octets);    # converts the copy in place
    return $octets;
}
308
# Return a copy of the argument decoded from UTF-8 octets into characters,
# or undef when utf8::decode() reports failure.  The caller's own variable
# is left untouched.
sub decode_utf8
{
    my $chars = shift;
    my $ok = utf8::decode($chars);    # converts the copy in place
    return $ok ? $chars : undef;
}
315
18586f54 316require Encode::Encoding;
317require Encode::XS;
318require Encode::Internal;
319require Encode::Unicode;
320require Encode::utf8;
321require Encode::iso10646_1;
322require Encode::ucs2_le;
4411f3b6 323
656753f8 3241;
325
2a936312 326__END__
327
4411f3b6 328=head1 NAME
329
330Encode - character encodings
331
332=head1 SYNOPSIS
333
334 use Encode;
335
336=head1 DESCRIPTION
337
47bfe92f 338The C<Encode> module provides the interfaces between Perl's strings
339and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6 340
341The repertoire of characters that Perl can represent is at least that
47bfe92f 342defined by the Unicode Consortium. On most platforms the ordinal
343values of the characters (as returned by C<ord(ch)>) is the "Unicode
344codepoint" for the character (the exceptions are those platforms where
345the legacy encoding is some variant of EBCDIC rather than a super-set
346of ASCII - see L<perlebcdic>).
4411f3b6 347
348Traditionaly computer data has been moved around in 8-bit chunks
349often called "bytes". These chunks are also known as "octets" in
350networking standards. Perl is widely used to manipulate data of
351many types - not only strings of characters representing human or
352computer languages but also "binary" data being the machines representation
353of numbers, pixels in an image - or just about anything.
354
47bfe92f 355When Perl is processing "binary data" the programmer wants Perl to process
356"sequences of bytes". This is not a problem for Perl - as a byte has 256
357possible values it easily fits in Perl's much larger "logical character".
4411f3b6 358
Due to size concerns, the B<CJK> (Chinese, Japanese & Korean) modules
are not loaded into memory until the first time they're used. Although you
don't have to C<use> the corresponding B<Encode::>(B<TW>|B<CN>|B<JP>|B<KR>)
modules first, be aware that those encodings will not be in C<%encoding>
until their module is loaded (either implicitly through using encodings
contained in the same module, or via an explicit C<use>).
a67efb5b 365
4411f3b6 366=head2 TERMINOLOGY
367
4ac9195f 368=over 4
4411f3b6 369
370=item *
371
372I<character>: a character in the range 0..(2**32-1) (or more).
47bfe92f 373(What Perl's strings are made of.)
4411f3b6 374
375=item *
376
377I<byte>: a character in the range 0..255
47bfe92f 378(A special case of a Perl character.)
4411f3b6 379
380=item *
381
382I<octet>: 8 bits of data, with ordinal values 0..255
47bfe92f 383(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
4411f3b6 384
385=back
386
387The marker [INTERNAL] marks Internal Implementation Details, in
388general meant only for those who think they know what they are doing,
389and such details may change in future releases.
390
391=head1 ENCODINGS
392
393=head2 Characteristics of an Encoding
394
395An encoding has a "repertoire" of characters that it can represent,
396and for each representable character there is at least one sequence of
397octets that represents it.
398
399=head2 Types of Encodings
400
401Encodings can be divided into the following types:
402
403=over 4
404
405=item * Fixed length 8-bit (or less) encodings.
406
407Each character is a single octet so may have a repertoire of up to
408256 characters. ASCII and iso-8859-* are typical examples.
409
410=item * Fixed length 16-bit encodings
411
412Each character is two octets so may have a repertoire of up to
47bfe92f 41365 536 characters. Unicode's UCS-2 is an example. Also used for
4411f3b6 414encodings for East Asian languages.
415
416=item * Fixed length 32-bit encodings.
417
Not really very "encoded" encodings. The Unicode code points
are just represented as 4-octet integers. None the less because
different architectures use different representations of integers
(so called "endian") there are at least two distinct encodings.
422
423=item * Multi-byte encodings
424
425The number of octets needed to represent a character varies.
426UTF-8 is a particularly complex but regular case of a multi-byte
427encoding. Several East Asian countries use a multi-byte encoding
428where 1-octet is used to cover western roman characters and Asian
429characters get 2-octets.
430(UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
431to represent a Unicode code point.)
432
433=item * "Escape" encodings.
434
435These encodings embed "escape sequences" into the octet sequence
436which describe how the following octets are to be interpreted.
437The iso-2022-* family is typical. Following the escape sequence
438octets are encoded by an "embedded" encoding (which will be one
439of the above types) until another escape sequence switches to
440a different "embedded" encoding.
441
442These schemes are very flexible and can handle mixed languages but are
47bfe92f 443very complex to process (and have state). No escape encodings are
444implemented for Perl yet.
4411f3b6 445
446=back
447
448=head2 Specifying Encodings
449
450Encodings can be specified to the API described below in two ways:
451
452=over 4
453
454=item 1. By name
455
47bfe92f 456Encoding names are strings with characters taken from a restricted
457repertoire. See L</"Encoding Names">.
4411f3b6 458
459=item 2. As an object
460
2659725b 461Encoding objects are returned by C<find_encoding($name, [$skip_external])>.
462If the second parameter is true, Encode will refrain from loading external
463modules for CJK encodings.
4411f3b6 464
465=back
466
467=head2 Encoding Names
468
Encoding names are case insensitive. White space and underscores in names are treated as '-'.
47bfe92f 470In addition an encoding may have aliases. Each encoding has one
471"canonical" name. The "canonical" name is chosen from the names of
472the encoding by picking the first in the following sequence:
4411f3b6 473
474=over 4
475
78255929 476=item * The MIME name as defined in IETF RFCs.
4411f3b6 477
478=item * The name in the IANA registry.
479
d1be9408 480=item * The name used by the organization that defined it.
4411f3b6 481
482=back
483
484Because of all the alias issues, and because in the general case
485encodings have state C<Encode> uses the encoding object internally
486once an operation is in progress.
487
21938dfa 488As of Perl 5.8.0, at least the following encodings are recognized
489(the => marks aliases):
490
491 ASCII
492
493 US-ASCII => ASCII
494
495The Unicode:
496
0b3236bb 497 UTF-8
21938dfa 498 UTF-16
499 UCS-2
500
501 ISO 10646-1 => UCS-2
502
503The ISO 8859 and KOI:
504
505 ISO 8859-1 ISO 8859-6 ISO 8859-11 KOI8-F
506 ISO 8859-2 ISO 8859-7 (12 doesn't exist) KOI8-R
56a543c5 507 ISO 8859-3 ISO 8859-8 ISO 8859-13 KOI8-U
21938dfa 508 ISO 8859-4 ISO 8859-9 ISO 8859-14
509 ISO 8859-5 ISO 8859-10 ISO 8859-15
510 ISO 8859-16
511
512 Latin1 => 8859-1 Latin6 => 8859-10
513 Latin2 => 8859-2 Latin7 => 8859-13
0b3236bb 514 Latin3 => 8859-3 Latin8 => 8859-14
21938dfa 515 Latin4 => 8859-4 Latin9 => 8859-15
516 Latin5 => 8859-9 Latin10 => 8859-16
517
518 Cyrillic => 8859-5
519 Arabic => 8859-6
520 Greek => 8859-7
521 Hebrew => 8859-8
522 Thai => 8859-11
0b3236bb 523 TIS620 => 8859-11
21938dfa 524
525The CJKV: Chinese, Japanese, Korean, Vietnamese:
526
0b3236bb 527 ISO 2022 ISO 2022 JP-1 JIS 0201 GB 1988 Big5 EUC-CN
528 ISO 2022 CN ISO 2022 JP-2 JIS 0208 GB 2312 HZ EUC-JP
21938dfa 529 ISO 2022 JP ISO 2022 KR JIS 0210 GB 12345 CNS 11643 EUC-JP-0212
cb448690 530 Shift-JIS GBK Big5-HKSCS EUC-KR
531 VISCII ISO-IR-165
532
a67efb5b 533(Due to size concerns, additional Chinese encodings including C<GB 18030>,
534C<EUC-TW> and C<BIG5PLUS> are distributed separately on CPAN, under the name
535L<Encode::HanExtra>.)
21938dfa 536
537The PC codepages:
538
539 CP37 CP852 CP861 CP866 CP949 CP1251 CP1256
540 CP424 CP855 CP862 CP869 CP950 CP1252 CP1257
541 CP737 CP856 CP863 CP874 CP1006 CP1253 CP1258
542 CP775 CP857 CP864 CP932 CP1047 CP1254
543 CP850 CP860 CP865 CP936 CP1250 CP1255
544
545 WinLatin1 => CP1252
546 WinLatin2 => CP1250
547 WinCyrillic => CP1251
548 WinGreek => CP1253
    WinTurkish    => CP1254
550 WinHebrew => CP1255
551 WinArabic => CP1256
552 WinBaltic => CP1257
553 WinVietnamese => CP1258
554
4a42e14c 555(All the CPI<NNN...> are available also as IBMI<NNN...>.)
21938dfa 556
557The Mac codepages:
558
0b3236bb 559 MacCentralEuropean MacJapanese
560 MacCroatian MacRoman
1853dd5f 561 MacCyrillic MacRomanian
0b3236bb 562 MacDingbats MacSami
563 MacGreek MacThai
564 MacIcelandic MacTurkish
565 MacUkraine
21938dfa 566
567Miscellaneous:
568
569 7bit-greek IR-197
570 7bit-kana NeXTstep
571 7bit-latin1 POSIX-BC
572 DingBats Roman8
573 GSM 0338 Symbol
574
4411f3b6 575=head1 PERL ENCODING API
576
577=head2 Generic Encoding Interface
578
579=over 4
580
581=item *
582
583 $bytes = encode(ENCODING, $string[, CHECK])
584
47bfe92f 585Encodes string from Perl's internal form into I<ENCODING> and returns
586a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6 587
681a7c68 588For example to convert (internally UTF-8 encoded) Unicode data
589to octets:
590
591 $octets = encode("utf8", $unicode);
592
4411f3b6 593=item *
594
595 $string = decode(ENCODING, $bytes[, CHECK])
596
47bfe92f 597Decode sequence of octets assumed to be in I<ENCODING> into Perl's
598internal form and returns the resulting string. For CHECK see
599L</"Handling Malformed Data">.
600
681a7c68 601For example to convert ISO 8859-1 data to UTF-8:
602
603 $utf8 = decode("latin1", $latin1);
604
47bfe92f 605=item *
606
607 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
608
2b106fbe 609Convert B<in-place> the data between two encodings. How did the data
610in $string originally get to be in FROM_ENCODING? Either using
e9692b5b 611encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe 612see L</"Handling Malformed Data">.
613
614For example to convert ISO 8859-1 data to UTF-8:
615
616 from_to($data, "iso-8859-1", "utf-8");
617
618and to convert it back:
619
620 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 621
ab97ca19 622Note that because the conversion happens in place, the data to be
623converted cannot be a string constant, it must be a scalar variable.
624
4411f3b6 625=back
626
627=head2 Handling Malformed Data
628
629If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f 630be UTF-8, an optional lexical warning (category utf8) is given. If
631CHECK is true but not a code reference, dies.
4411f3b6 632
It would be desirable to have a way to indicate that the transform should use
the encoding's "replacement character" - no such mechanism is defined yet.
4411f3b6 635
636It is also planned to allow I<CHECK> to be a code reference.
637
47bfe92f 638This is not yet implemented as there are design issues with what its
639arguments should be and how it returns its results.
4411f3b6 640
641=over 4
642
643=item Scheme 1
644
645Passed remaining fragment of string being processed.
646Modifies it in place to remove bytes/characters it can understand
647and returns a string used to represent them.
648e.g.
649
    sub fixup {
        my $ch = substr($_[0],0,1,'');
        return sprintf('\x{%02X}',ord($ch));
    }
654
655This scheme is close to how underlying C code for Encode works, but gives
656the fixup routine very little context.
657
658=item Scheme 2
659
47bfe92f 660Passed original string, and an index into it of the problem area, and
661output string so far. Appends what it will to output string and
662returns new index into original string. For example:
4411f3b6 663
664 sub fixup {
665 # my ($s,$i,$d) = @_;
666 my $ch = substr($_[0],$_[1],1);
667 $_[2] .= sprintf("\x{%02X}",ord($ch);
668 return $_[1]+1;
669 }
670
47bfe92f 671This scheme gives maximal control to the fixup routine but is more
672complicated to code, and may need internals of Encode to be tweaked to
673keep original string intact.
4411f3b6 674
675=item Other Schemes
676
677Hybrids of above.
678
679Multiple return values rather than in-place modifications.
680
681Index into the string could be pos($str) allowing s/\G...//.
682
683=back
684
685=head2 UTF-8 / utf8
686
The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets. This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).
4411f3b6 693
694=over 4
695
696=item *
697
698 $bytes = encode_utf8($string);
699
47bfe92f 700The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6 701and the resulting octets returned as a sequence of bytes. All possible
702characters have a UTF-8 representation so this function cannot fail.
703
704=item *
705
706 $string = decode_utf8($bytes [,CHECK]);
707
47bfe92f 708The sequence of octets represented by $bytes is decoded from UTF-8
709into a sequence of logical characters. Not all sequences of octets
710form valid UTF-8 encodings, so it is possible for this call to fail.
711For CHECK see L</"Handling Malformed Data">.
4411f3b6 712
713=back
714
715=head2 Other Encodings of Unicode
716
47bfe92f 717UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. UCS-2 can only
7a4efbb2 718represent 0..0xFFFF, while UTF-16 has a I<surrogate pair> scheme which
47bfe92f 719allows it to cover the whole Unicode range.
4411f3b6 720
Surrogates are code points set aside to encode the 0x10000..0x10FFFF
range of Unicode code points in pairs of 16-bit units. The I<high
surrogates> are the range 0xD800..0xDBFF, and the I<low surrogates>
are the range 0xDC00..0xDFFF. The surrogate encoding is
725
726 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
727 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
728
729and the decoding is
730
    $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
732
8040349a 733Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
47bfe92f 734happens to be the name used by that representation when used with X11
735fonts.
4411f3b6 736
737UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
738can be considered as being in this form without encoding. An encoding
47bfe92f 739to transfer strings in this form (e.g. to write them to a file) would
740need to
4411f3b6 741
c079d275 742 pack('L*', unpack('U*', $string)); # native
4411f3b6 743 or
c079d275 744 pack('V*', unpack('U*', $string)); # little-endian
4411f3b6 745 or
c079d275 746 pack('N*', unpack('U*', $string)); # big-endian
4411f3b6 747
c079d275 748depending on the endianness required.
4411f3b6 749
51ef4e11 750No UTF-32 encodings are implemented yet.
4411f3b6 751
Both UCS-2 and UCS-4 style encodings can have "byte order marks" by
representing the code point 0xFEFF as the very first thing in a file.
4411f3b6 754
51ef4e11 755=head2 Listing available encodings
756
757 use Encode qw(encodings);
758 @list = encodings();
759
760Returns a list of the canonical names of the available encodings.
761
762=head2 Defining Aliases
763
764 use Encode qw(define_alias);
765 define_alias( newName => ENCODING);
766
Allows newName to be used as an alias for ENCODING. ENCODING may be
either the name of an encoding or an encoding object (as above).
51ef4e11 769
770Currently I<newName> can be specified in the following ways:
771
772=over 4
773
774=item As a simple string.
775
776=item As a qr// compiled regular expression, e.g.:
777
778 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
779
In this case if I<ENCODING> is not a reference it is C<eval>-ed to
allow C<$1> etc. to be substituted. The example is one way to alias
names as used in X11 fonts to the MIME names for the iso-8859-*
family.
51ef4e11 784
785=item As a code reference, e.g.:
786
787 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
788
In this case C<$_> will be set to the name that is being looked up and
I<ENCODING> is passed to the sub as its first argument. The example
is another way to alias names as used in X11 fonts to the MIME
names for the iso-8859-* family.
51ef4e11 793
794=back
795
796=head2 Defining Encodings
797
    use Encode qw(define_encoding);
    define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11 800
47bfe92f 801Causes I<canonicalName> to be associated with I<$object>. The object
802should provide the interface described in L</"IMPLEMENTATION CLASSES">
803below. If more than two arguments are provided then additional
804arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11 805
4411f3b6 806=head1 Encoding and IO
807
808It is very common to want to do encoding transformations when
809reading or writing files, network connections, pipes etc.
47bfe92f 810If Perl is configured to use the new 'perlio' IO system then
4411f3b6 811C<Encode> provides a "layer" (See L<perliol>) which can transform
812data as it is read or written.
813
8e86646e 814Here is how the blind poet would modernise the encoding:
815
42234700 816 use Encode;
8e86646e 817 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
818 open(my $utf8,'>:utf8','iliad.utf8');
819 my @epic = <$iliad>;
820 print $utf8 @epic;
821 close($utf8);
    close($iliad);
4411f3b6 823
824In addition the new IO system can also be configured to read/write
825UTF-8 encoded characters (as noted above this is efficient):
826
e9692b5b 827 open(my $fh,'>:utf8','anything');
828 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 829
830Either of the above forms of "layer" specifications can be made the default
831for a lexical scope with the C<use open ...> pragma. See L<open>.
832
Once a handle is open, its layers can be altered using C<binmode>.
834
47bfe92f 835Without any such configuration, or if Perl itself is built using
4411f3b6 836system's own IO, then write operations assume that file handle accepts
837only I<bytes> and will C<die> if a character larger than 255 is
838written to the handle. When reading, each octet from the handle
839becomes a byte-in-a-character. Note that this default is the same
47bfe92f 840behaviour as bytes-only languages (including Perl before v5.6) would
841have, and is sufficient to handle native 8-bit encodings
842e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
843other encodings and binary data.
844
In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).
849
47bfe92f 850You can also use PerlIO to convert larger amounts of data you don't
851want to bring into memory. For example to convert between ISO 8859-1
852(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
853
e9692b5b 854 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
855 open(G, ">:utf8", "data.utf") or die $!;
856 while (<F>) { print G }
857
858 # Could also do "print G <F>" but that would pull
859 # the whole file into memory just to write it out again.
860
861More examples:
47bfe92f 862
e9692b5b 863 open(my $f, "<:encoding(cp1252)")
864 open(my $g, ">:encoding(iso-8859-2)")
865 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 866
867See L<PerlIO> for more information.
4411f3b6 868
1768d7eb 869See also L<encoding> for how to change the default encoding of the
d521382b 870data in your script.
1768d7eb 871
4411f3b6 872=head1 Encoding How to ...
873
874To do:
875
876=over 4
877
=item * IO with mixed content (faking iso-2022-*)
879
880=item * MIME's Content-Length:
881
882=item * UTF-8 strings in binary data.
883
47bfe92f 884=item * Perl/Encode wrappers on non-Unicode XS modules.
4411f3b6 885
886=back
887
888=head1 Messing with Perl's Internals
889
47bfe92f 890The following API uses parts of Perl's internals in the current
891implementation. As such they are efficient, but may change.
4411f3b6 892
893=over 4
894
4411f3b6 895=item * is_utf8(STRING [, CHECK])
896
897[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 898If CHECK is true, also checks the data in STRING for being well-formed
899UTF-8. Returns true if successful, false otherwise.
4411f3b6 900
901=item * valid_utf8(STRING)
902
47bfe92f 903[INTERNAL] Test whether STRING is in a consistent state. Will return
904true if string is held as bytes, or is well-formed UTF-8 and has the
905UTF-8 flag on. Main reason for this routine is to allow Perl's
906testsuite to check that operations have left strings in a consistent
907state.
4411f3b6 908
909=item *
910
911 _utf8_on(STRING)
912
913[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
914B<not> checked for being well-formed UTF-8. Do not use unless you
915B<know> that the STRING is well-formed UTF-8. Returns the previous
916state of the UTF-8 flag (so please don't test the return value as
917I<not> success or failure), or C<undef> if STRING is not a string.
918
919=item *
920
921 _utf8_off(STRING)
922
923[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
924Returns the previous state of the UTF-8 flag (so please don't test the
925return value as I<not> success or failure), or C<undef> if STRING is
926not a string.
927
928=back
929
4edaa979 930=head1 IMPLEMENTATION CLASSES
931
As mentioned above encodings are (in the current implementation at least)
defined by objects. The mapping of encoding name to object is via the
C<%encoding> hash.
4edaa979 935
936The values of the hash can currently be either strings or objects.
937The string form may go away in the future. The string form occurs
938when C<encodings()> has scanned C<@INC> for loadable encodings but has
939not actually loaded the encoding in question. This is because the
47bfe92f 940current "loading" process is all Perl and a bit slow.
4edaa979 941
Once an encoding is loaded then the value of the hash is the object which
implements the encoding. The object should provide the following
interface:
4edaa979 945
946=over 4
947
948=item -E<gt>name
949
950Should return the string representing the canonical name of the encoding.
951
952=item -E<gt>new_sequence
953
47bfe92f 954This is a placeholder for encodings with state. It should return an
955object which implements this interface, all current implementations
956return the original object.
4edaa979 957
958=item -E<gt>encode($string,$check)
959
47bfe92f 960Should return the octet sequence representing I<$string>. If I<$check>
961is true it should modify I<$string> in place to remove the converted
962part (i.e. the whole string unless there is an error). If an error
963occurs it should return the octet sequence for the fragment of string
964that has been converted, and modify $string in-place to remove the
965converted part leaving it starting with the problem fragment.
4edaa979 966
If check is false then C<encode> should make a "best effort" to
convert the string - for example by using a replacement character.
4edaa979 969
970=item -E<gt>decode($octets,$check)
971
47bfe92f 972Should return the string that I<$octets> represents. If I<$check> is
973true it should modify I<$octets> in place to remove the converted part
974(i.e. the whole sequence unless there is an error). If an error
975occurs it should return the fragment of string that has been
976converted, and modify $octets in-place to remove the converted part
4edaa979 977leaving it starting with the problem fragment.
978
47bfe92f 979If I<$check> is false then C<decode> should make a "best effort" to
980convert the string - for example by using Unicode's "\x{FFFD}" as a
981replacement character.
4edaa979 982
983=back
984
47bfe92f 985It should be noted that the check behaviour is different from the
986outer public API. The logic is that the "unchecked" case is useful
987when encoding is part of a stream which may be reporting errors
988(e.g. STDERR). In such cases it is desirable to get everything
989through somehow without causing additional errors which obscure the
990original one. Also the encoding is best placed to know what the
991correct replacement character is, so if that is the desired behaviour
992then letting low level code do it is the most efficient.
993
994In contrast if check is true, the scheme above allows the encoding to
995do as much as it can and tell the layer above how much that was. What is
996lacking at present is a mechanism to report what went wrong. The most
997likely interface will be an additional method call to the object, or
998perhaps (to avoid forcing per-stream objects on otherwise stateless
999encodings) an additional parameter.
1000
1001It is also highly desirable that encoding classes inherit from
1002C<Encode::Encoding> as a base class. This allows that class to define
1003additional behaviour for all encoding objects. For example, the built-in
1004Unicode, UCS-2 and UTF-8 classes use:
51ef4e11 1005
1006 package Encode::MyEncoding;
1007 use base qw(Encode::Encoding);
1008
1009 __PACKAGE__->Define(qw(myCanonical myAlias));
1010
47bfe92f 1011This creates an object with C<bless {Name =E<gt> ...}, $class> and calls
1012C<define_encoding>. These classes inherit their C<name> method from
1013C<Encode::Encoding>.
4edaa979 1014
1015=head2 Compiled Encodings
1016
47bfe92f 1017F<Encode.xs> provides a class C<Encode::XS> which provides the
1018interface described above. It calls a generic octet-sequence to
1019octet-sequence "engine" that is driven by tables (defined in
1020F<encengine.c>). The same engine is used for both encode and
1021decode. C<Encode::XS>'s C<encode> forces Perl's characters to their
1022UTF-8 form and then treats them as just another multibyte
1023encoding. C<Encode::XS>'s C<decode> transforms the sequence and then
1024turns on the UTF-8-ness flag, as that is the form that the tables are
1025defined to produce. For details of the engine see the comments in
1026F<encengine.c>.
1027
1028The tables are produced by the Perl script F<compile> (the name needs
1029to change so we can eventually install it somewhere). F<compile> can
1030currently read two formats:
4edaa979 1031
1032=over 4
1033
1034=item *.enc
1035
47bfe92f 1036This is a coined format used by Tcl. It is documented in
1037Encode/EncodeFormat.pod.
4edaa979 1038
1039=item *.ucm
1040
1041This is the semi-standard format used by IBM's ICU package.
1042
1043=back
1044
1045F<compile> can write the following forms:
1046
1047=over 4
1048
1049=item *.ucm
1050
1051See above - the F<Encode/*.ucm> files provided with the distribution have
1052been created from the original Tcl .enc files using this approach.
1053
1054=item *.c
1055
1056Produces tables as C data structures - this is used to build encodings
1057into F<Encode.so>/F<Encode.dll>.
1058
1059=item *.xs
1060
47bfe92f 1061In theory this allows encodings to be stand-alone loadable Perl
1062extensions. The process has not yet been tested. The plan is to use
1063this approach for large East Asian encodings.
4edaa979 1064
1065=back
1066
47bfe92f 1067The set of encodings built-in to F<Encode.so>/F<Encode.dll> is
1068determined by F<Makefile.PL>. The current set is as follows:
4edaa979 1069
1070=over 4
1071
1072=item ascii and iso-8859-*
1073
1074That is all the common 8-bit "western" encodings.
1075
1076=item IBM-1047 and two other variants of EBCDIC.
1077
47bfe92f 1078These are the same variants that are supported by EBCDIC Perl as
1079"native" encodings. They are included to prove "reversibility" of
1080some constructs in EBCDIC Perl.
4edaa979 1081
1082=item symbol and dingbats as used by Tk on X11.
1083
47bfe92f 1084(The reason Encode got started was to support Perl/Tk.)
4edaa979 1085
1086=back
1087
47bfe92f 1088That set is rather ad hoc and has been driven by the needs of the
1089tests rather than the needs of typical applications. It is likely
1090to be rationalized.
4edaa979 1091
4411f3b6 1092=head1 SEE ALSO
1093
1768d7eb 1094L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>
4411f3b6 1095
1096=cut
1097