Re: [ID 20020227.024] unexpected warning getc() on unopened filehandle STDIN
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
0e567a6c 3our $VERSION = '0.40';
2c674647 4
5require DynaLoader;
6require Exporter;
7
51ef4e11 8our @ISA = qw(Exporter DynaLoader);
2c674647 9
4411f3b6 10# Public, encouraged API is exported by default
51ef4e11 11our @EXPORT = qw (
4411f3b6 12 encode
13 decode
14 encode_utf8
15 decode_utf8
16 find_encoding
51ef4e11 17 encodings
4411f3b6 18);
19
51ef4e11 20our @EXPORT_OK =
2c674647 21 qw(
51ef4e11 22 define_encoding
23 define_alias
2c674647 24 from_to
25 is_utf8
4411f3b6 26 is_8bit
27 is_16bit
a12c0f56 28 utf8_upgrade
29 utf8_downgrade
4411f3b6 30 _utf8_on
31 _utf8_off
2c674647 32 );
33
34bootstrap Encode ();
35
4411f3b6 36# Documentation moved after __END__ for speed - NI-S
2c674647 37
bf230f3d 38use Carp;
39
# %encoding is a package variable (rather than a lexical) on purpose, so
# that a certain amount of cheating from outside this file is possible.
# It maps an encoding name to the object registered for it.
our %encoding;

my @alias;    # flat, ordered list of (alias => target) pairs; searched in order
my %alias;    # cache: alias names that have already been resolved

# Translates a "LatinN" number (used as the array index) into the ISO 8859
# part number that carries that name.
#          index:       0  1  2  3  4  5   6   7   8   9  10
our @latin2iso_num = (  0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );

# Translates the script part of a "WinXxx" name into a Windows code page
# number.  Keys are spelled with a leading capital.
our %winlatin2cp = (
    Latin1     => 1252,
    Latin2     => 1250,
    Cyrillic   => 1251,
    Greek      => 1253,
    Turkish    => 1254,
    Hebrew     => 1255,
    Arabic     => 1256,
    Baltic     => 1257,
    Vietnamese => 1258,
);
5345d506 59
# encodings()
#
# Returns the names currently present as keys of %encoding, sorted
# case-insensitively, with the entry named 'Internal' left out.
# Any arguments (for example a class name when called as a method) are
# ignored.
sub encodings
{
    my @visible = grep { $_ ne 'Internal' } keys %encoding;

    # Pair every name with its lower-cased form once, sort on that form,
    # then strip the sort key again.
    my @keyed   = map  { [ $_, lc $_ ] } @visible;
    my @ordered = sort { $a->[1] cmp $b->[1] } @keyed;

    return map { $_->[0] } @ordered;
}
70
# findAlias($class, $name)
#
# Resolves $name through the alias list built by define_alias() and
# returns the encoding it stands for, or undef when no alias leads to a
# known encoding.  Successful lookups are remembered in %alias.
#
# Each alias entry is a (pattern, target) pair; the pattern may be
#   - a qr// object: on a match the target string is eval-ed, so it can
#     refer to $1 etc. from that match;
#   - a code reference: called with the target as its only argument while
#     $_ holds the name being looked up;
#   - anything else: compared with the name case-insensitively, and the
#     target is used as it is.
# The resulting value is either a reference (taken to be the encoding
# itself) or another name, which is looked up with find_encoding().
sub findAlias
{
    my $class = shift;
    local $_ = shift;

    # Already resolved earlier.
    return $alias{$_} if exists $alias{$_};

    PAIR:
    for (my $idx = 0; $idx < @alias; $idx += 2)
    {
        my ($pattern, $target) = @alias[$idx, $idx + 1];
        my $kind = ref($pattern);
        my $candidate;

        # The eval has to stay inside the block guarded by the match so
        # that $1 etc. still refer to it.
        if ($kind eq 'Regexp' && $_ =~ $pattern)
        {
            $candidate = eval $target;
        }
        elsif ($kind eq 'CODE')
        {
            $candidate = $pattern->($target);
        }
        elsif (lc($_) eq lc($pattern))
        {
            $candidate = $target;
        }

        next PAIR unless defined $candidate;
        next PAIR if $candidate eq $_;   # avoid (direct) recursion on bugs

        my $enc = ref($candidate) ? $candidate : find_encoding($candidate);
        next PAIR unless $enc;

        $alias{$_} = $enc;
        last PAIR;
    }

    return $alias{$_};
}
109
# define_alias(ALIAS => TARGET, ...)
#
# Appends one or more (alias, target) pairs to the ordered alias list that
# findAlias() searches.  Arguments are consumed two at a time; a trailing
# alias without a target is stored with an undefined target.
sub define_alias
{
    while (@_)
    {
        my $pattern = shift;
        my $target  = shift;
        push @alias, $pattern, $target;
    }
}
118
# Built-in aliases.
#
# The list is searched in the order given here.  For a qr// alias the
# right-hand side is not a name but a piece of Perl source: findAlias()
# string-evals it after the match, which is why every such target carries
# its own inner quotes (and may use $1, $2, ...).  Because this file runs
# under "use strict", a target without the inner quotes (e.g. 'utf-8')
# would fail to compile inside that eval and the alias would silently
# never resolve.  Plain string aliases are used as they are, without eval.

# Allow variants of iso-8859-1 etc.
define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );

# At least HP-UX has these.
define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );

# More HP stuff.
define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );

# The Official name of ASCII.
define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );

# This is a font issue, not an encoding issue.
# (The currency symbol of the Latin 1 upper half
#  has been redefined as the euro symbol.)
define_alias( qr/^(.+)\@euro$/i => '"$1"' );

# Allow latin-1 style names as well
define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );

# Allow winlatin1 style names as well.
# The match is case-insensitive, so $1 can arrive in any case; it is
# normalised to the "Latin1" spelling used by the keys of %winlatin2cp.
# The lookup is written as an ordinary expression because a hash subscript
# inside an interpolated string is parsed as code, where \u is not a
# string escape.
define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|vietnamese)$/i => '"cp" . $winlatin2cp{ucfirst(lc($1))}' );

# Common names for non-latin prefered MIME names
define_alias( 'ascii'    => 'US-ascii',
              'cyrillic' => 'iso-8859-5',
              'arabic'   => 'iso-8859-6',
              'greek'    => 'iso-8859-7',
              'hebrew'   => 'iso-8859-8',
              'thai'     => 'iso-8859-11',
              'tis620'   => 'iso-8859-11',
            );

# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
# And Microsoft has their own naming (again, surprisingly).
define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');

# Sometimes seen with a leading zero.
define_alias( qr/^cp037$/i => '"cp37"');

# Ououououou.
define_alias( qr/^macRomanian$/i => '"macRumanian"');

# Standardize on the dashed versions.
define_alias( qr/^utf8$/i  => '"utf-8"' );
define_alias( qr/^koi8r$/i => '"koi8-r"' );
define_alias( qr/^koi8u$/i => '"koi8-u"' );

# Seen in some Linuxes.
define_alias( qr/^ujis$/i => '"euc-jp"' );

# CP936 doesn't have vendor-addon for GBK, so they're identical.
define_alias( qr/^gbk$/i => '"cp936"');

# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
# TODO: HP-UX '15' encodings japanese15 korean15 roi15
# TODO: Cyrillic encoding ISO-IR-111 (useful?)
# TODO: Armenian encoding ARMSCII-8
# TODO: Hebrew encoding ISO-8859-8-1
# TODO: Thai encoding TCVN
# TODO: Korean encoding Johab
# TODO: Vietnamese encodings VPS
# TODO: Japanese encoding JIS (not the same as SJIS)
# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
#       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
#       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
#       Kannada Khmer Korean Laotian Malayalam Mongolian
#       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese

# Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
190
# define_encoding($obj, $name [, @aliases])
#
# Registers $obj in %encoding under $name.  When $name contains upper-case
# characters its lower-cased form is added as an alias for the object, and
# every further argument is added as an alias for it as well.
# Returns $obj.
sub define_encoding
{
    my ($obj, $name, @extra) = @_;

    $encoding{$name} = $obj;

    my $lowered = lc($name);
    if ($lowered ne $name)
    {
        define_alias($lowered => $obj);
    }

    foreach my $other (@extra)
    {
        define_alias($other, $obj);
    }

    return $obj;
}
205
# getEncoding($class, $name)
#
# Turns $name into an encoding object.
#
#   $class - the class on which findAlias() is called for alias lookups
#   $name  - an encoding name, or an object
#
# Returns
#   - $name itself when it is an object that can('new_sequence');
#   - the %encoding entry for $name, or failing that for lc($name);
#   - otherwise whatever $class->findAlias() finds for $name, and then for
#     lc($name) when that differs;
#   - nothing (undef in scalar context, the empty list in list context)
#     when all of the above fail.
#
# A reference that is not a usable object (e.g. a plain hash or array
# reference) is treated as an unknown name rather than causing a
# "Can't call method on unblessed reference" failure.
sub getEncoding
{
    my ($class, $name) = @_;

    if (ref($name))
    {
        # ->can() dies on an unblessed reference; the eval turns that
        # into "not an encoding object".  $@ is localised so the caller's
        # error state is left alone.
        my $is_encoding = do { local $@; eval { $name->can('new_sequence') } };
        return $name if $is_encoding;
    }

    return $encoding{$name} if exists $encoding{$name};

    my $lc = lc $name;
    return $encoding{$lc} if exists $encoding{$lc};

    my $oc = $class->findAlias($name);
    return $oc if defined $oc;
    return $class->findAlias($lc) if $lc ne $name;

    return;
}
230
# find_encoding($name)
#
# Function-style front end to getEncoding(): looks $name up on behalf of
# this package and returns whatever getEncoding() returns.
sub find_encoding
{
    my $wanted = shift;
    return __PACKAGE__->getEncoding($wanted);
}
236
# encode($name, $string [, $check])
#
# Looks up the encoding called $name and hands a copy of $string, together
# with $check, to its encode() method.
#
# Croaks with "Unknown encoding '...'" when $name cannot be resolved.
# Returns undef when $check is true and the encoding object left part of
# the (copied) string unconsumed; otherwise returns the value produced by
# the object's encode() method.  The caller's string is not modified.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string, $check);

    # With $check set, anything still left in $string was not converted.
    if ($check && length($string))
    {
        return undef;
    }
    return $octets;
}
246
# decode($name, $octets [, $check])
#
# Looks up the encoding called $name and hands a copy of $octets, together
# with $check, to its decode() method; returns that method's result.
#
# Croaks with "Unknown encoding '...'" when $name cannot be resolved.
# When $check is true, whatever the encoding object left in its copy of
# the octets is written back into the caller's second argument.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);

    if ($check)
    {
        $_[1] = $octets;    # report the remainder through the caller's variable
    }
    return $string;
}
256
# from_to($string, $from, $to [, $check])
#
# Converts the caller's $string in place: it is decoded with the encoding
# named $from and the result is encoded with the encoding named $to.
#
# Croaks with "Unknown encoding '...'" when either name cannot be
# resolved ($from is checked first).
# Returns undef - leaving the caller's string untouched - when $check is
# true and either step left input unconsumed.  Otherwise stores the
# converted data in the caller's first argument and returns its length.
sub from_to
{
    my ($data, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $chars = $source->decode($data, $check);
    if ($check && length($data))
    {
        return undef;
    }

    $data = $target->encode($chars, $check);
    if ($check && length($chars))
    {
        return undef;
    }

    $_[0] = $data;
    return length($_[0]);
}
270
# encode_utf8($string)
#
# Returns a copy of $string run through utf8::encode(), i.e. its
# characters turned into the corresponding UTF-8 octets.  The caller's
# variable is not modified.
sub encode_utf8
{
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}
277
# decode_utf8($octets)
#
# Returns a copy of $octets run through utf8::decode(), or undef when
# utf8::decode() reports failure.  The caller's variable is not modified.
sub decode_utf8
{
    my $chars = $_[0];
    if (utf8::decode($chars))
    {
        return $chars;
    }
    return undef;
}
284
18586f54 285require Encode::Encoding;
286require Encode::XS;
287require Encode::Internal;
288require Encode::Unicode;
289require Encode::utf8;
290require Encode::iso10646_1;
291require Encode::ucs2_le;
4411f3b6 292
656753f8 2931;
294
2a936312 295__END__
296
4411f3b6 297=head1 NAME
298
299Encode - character encodings
300
301=head1 SYNOPSIS
302
303 use Encode;
304
a67efb5b 305 use Encode::TW; # for Taiwan-based Chinese encodings
306 use Encode::CN; # for China-based Chinese encodings
307 use Encode::JP; # for Japanese encodings
308 use Encode::KR; # for Korean encodings
309
4411f3b6 310=head1 DESCRIPTION
311
47bfe92f 312The C<Encode> module provides the interfaces between Perl's strings
313and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6 314
315The repertoire of characters that Perl can represent is at least that
47bfe92f 316defined by the Unicode Consortium. On most platforms the ordinal
317values of the characters (as returned by C<ord(ch)>) is the "Unicode
318codepoint" for the character (the exceptions are those platforms where
319the legacy encoding is some variant of EBCDIC rather than a super-set
320of ASCII - see L<perlebcdic>).
4411f3b6 321
322Traditionaly computer data has been moved around in 8-bit chunks
323often called "bytes". These chunks are also known as "octets" in
324networking standards. Perl is widely used to manipulate data of
325many types - not only strings of characters representing human or
326computer languages but also "binary" data being the machines representation
327of numbers, pixels in an image - or just about anything.
328
47bfe92f 329When Perl is processing "binary data" the programmer wants Perl to process
330"sequences of bytes". This is not a problem for Perl - as a byte has 256
331possible values it easily fits in Perl's much larger "logical character".
4411f3b6 332
a67efb5b 333Due to size concerns, before using B<CJK> (Chinese, Japanese & Korean)
334encodings, you have to C<use> the corresponding
335B<Encode::>(B<TW>|B<CN>|B<JP>|B<KR>) modules first.
336
4411f3b6 337=head2 TERMINOLOGY
338
4ac9195f 339=over 4
4411f3b6 340
341=item *
342
343I<character>: a character in the range 0..(2**32-1) (or more).
47bfe92f 344(What Perl's strings are made of.)
4411f3b6 345
346=item *
347
348I<byte>: a character in the range 0..255
47bfe92f 349(A special case of a Perl character.)
4411f3b6 350
351=item *
352
353I<octet>: 8 bits of data, with ordinal values 0..255
47bfe92f 354(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
4411f3b6 355
356=back
357
358The marker [INTERNAL] marks Internal Implementation Details, in
359general meant only for those who think they know what they are doing,
360and such details may change in future releases.
361
362=head1 ENCODINGS
363
364=head2 Characteristics of an Encoding
365
366An encoding has a "repertoire" of characters that it can represent,
367and for each representable character there is at least one sequence of
368octets that represents it.
369
370=head2 Types of Encodings
371
372Encodings can be divided into the following types:
373
374=over 4
375
376=item * Fixed length 8-bit (or less) encodings.
377
378Each character is a single octet so may have a repertoire of up to
379256 characters. ASCII and iso-8859-* are typical examples.
380
381=item * Fixed length 16-bit encodings
382
383Each character is two octets so may have a repertoire of up to
47bfe92f 38465 536 characters. Unicode's UCS-2 is an example. Also used for
4411f3b6 385encodings for East Asian languages.
386
387=item * Fixed length 32-bit encodings.
388
389Not really very "encoded" encodings. The Unicode code points
390are just represented as 4-octet integers. None the less because
391different architectures use different representations of integers
(so called "endian") there are at least two distinct encodings.
393
394=item * Multi-byte encodings
395
396The number of octets needed to represent a character varies.
397UTF-8 is a particularly complex but regular case of a multi-byte
398encoding. Several East Asian countries use a multi-byte encoding
399where 1-octet is used to cover western roman characters and Asian
400characters get 2-octets.
401(UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
402to represent a Unicode code point.)
403
404=item * "Escape" encodings.
405
406These encodings embed "escape sequences" into the octet sequence
407which describe how the following octets are to be interpreted.
408The iso-2022-* family is typical. Following the escape sequence
409octets are encoded by an "embedded" encoding (which will be one
410of the above types) until another escape sequence switches to
411a different "embedded" encoding.
412
413These schemes are very flexible and can handle mixed languages but are
47bfe92f 414very complex to process (and have state). No escape encodings are
415implemented for Perl yet.
4411f3b6 416
417=back
418
419=head2 Specifying Encodings
420
421Encodings can be specified to the API described below in two ways:
422
423=over 4
424
425=item 1. By name
426
47bfe92f 427Encoding names are strings with characters taken from a restricted
428repertoire. See L</"Encoding Names">.
4411f3b6 429
430=item 2. As an object
431
432Encoding objects are returned by C<find_encoding($name)>.
433
434=back
435
436=head2 Encoding Names
437
438Encoding names are case insensitive. White space in names is ignored.
47bfe92f 439In addition an encoding may have aliases. Each encoding has one
440"canonical" name. The "canonical" name is chosen from the names of
441the encoding by picking the first in the following sequence:
4411f3b6 442
443=over 4
444
78255929 445=item * The MIME name as defined in IETF RFCs.
4411f3b6 446
447=item * The name in the IANA registry.
448
d1be9408 449=item * The name used by the organization that defined it.
4411f3b6 450
451=back
452
453Because of all the alias issues, and because in the general case
454encodings have state C<Encode> uses the encoding object internally
455once an operation is in progress.
456
21938dfa 457As of Perl 5.8.0, at least the following encodings are recognized
458(the => marks aliases):
459
460 ASCII
461
462 US-ASCII => ASCII
463
464The Unicode:
465
0b3236bb 466 UTF-8
21938dfa 467 UTF-16
468 UCS-2
469
470 ISO 10646-1 => UCS-2
471
472The ISO 8859 and KOI:
473
474 ISO 8859-1 ISO 8859-6 ISO 8859-11 KOI8-F
475 ISO 8859-2 ISO 8859-7 (12 doesn't exist) KOI8-R
56a543c5 476 ISO 8859-3 ISO 8859-8 ISO 8859-13 KOI8-U
21938dfa 477 ISO 8859-4 ISO 8859-9 ISO 8859-14
478 ISO 8859-5 ISO 8859-10 ISO 8859-15
479 ISO 8859-16
480
481 Latin1 => 8859-1 Latin6 => 8859-10
482 Latin2 => 8859-2 Latin7 => 8859-13
0b3236bb 483 Latin3 => 8859-3 Latin8 => 8859-14
21938dfa 484 Latin4 => 8859-4 Latin9 => 8859-15
485 Latin5 => 8859-9 Latin10 => 8859-16
486
487 Cyrillic => 8859-5
488 Arabic => 8859-6
489 Greek => 8859-7
490 Hebrew => 8859-8
491 Thai => 8859-11
0b3236bb 492 TIS620 => 8859-11
21938dfa 493
494The CJKV: Chinese, Japanese, Korean, Vietnamese:
495
0b3236bb 496 ISO 2022 ISO 2022 JP-1 JIS 0201 GB 1988 Big5 EUC-CN
497 ISO 2022 CN ISO 2022 JP-2 JIS 0208 GB 2312 HZ EUC-JP
21938dfa 498 ISO 2022 JP ISO 2022 KR JIS 0210 GB 12345 CNS 11643 EUC-JP-0212
cb448690 499 Shift-JIS GBK Big5-HKSCS EUC-KR
500 VISCII ISO-IR-165
501
a67efb5b 502(Due to size concerns, additional Chinese encodings including C<GB 18030>,
503C<EUC-TW> and C<BIG5PLUS> are distributed separately on CPAN, under the name
504L<Encode::HanExtra>.)
21938dfa 505
506The PC codepages:
507
508 CP37 CP852 CP861 CP866 CP949 CP1251 CP1256
509 CP424 CP855 CP862 CP869 CP950 CP1252 CP1257
510 CP737 CP856 CP863 CP874 CP1006 CP1253 CP1258
511 CP775 CP857 CP864 CP932 CP1047 CP1254
512 CP850 CP860 CP865 CP936 CP1250 CP1255
513
514 WinLatin1 => CP1252
515 WinLatin2 => CP1250
516 WinCyrillic => CP1251
517 WinGreek => CP1253
    WinTurkish    => CP1254
519 WinHebrew => CP1255
520 WinArabic => CP1256
521 WinBaltic => CP1257
522 WinVietnamese => CP1258
523
4a42e14c 524(All the CPI<NNN...> are available also as IBMI<NNN...>.)
21938dfa 525
526The Mac codepages:
527
0b3236bb 528 MacCentralEuropean MacJapanese
529 MacCroatian MacRoman
1853dd5f 530 MacCyrillic MacRomanian
0b3236bb 531 MacDingbats MacSami
532 MacGreek MacThai
533 MacIcelandic MacTurkish
534 MacUkraine
21938dfa 535
536Miscellaneous:
537
538 7bit-greek IR-197
539 7bit-kana NeXTstep
540 7bit-latin1 POSIX-BC
541 DingBats Roman8
542 GSM 0338 Symbol
543
4411f3b6 544=head1 PERL ENCODING API
545
546=head2 Generic Encoding Interface
547
548=over 4
549
550=item *
551
552 $bytes = encode(ENCODING, $string[, CHECK])
553
47bfe92f 554Encodes string from Perl's internal form into I<ENCODING> and returns
555a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6 556
681a7c68 557For example to convert (internally UTF-8 encoded) Unicode data
558to octets:
559
560 $octets = encode("utf8", $unicode);
561
4411f3b6 562=item *
563
564 $string = decode(ENCODING, $bytes[, CHECK])
565
47bfe92f 566Decode sequence of octets assumed to be in I<ENCODING> into Perl's
567internal form and returns the resulting string. For CHECK see
568L</"Handling Malformed Data">.
569
681a7c68 570For example to convert ISO 8859-1 data to UTF-8:
571
572 $utf8 = decode("latin1", $latin1);
573
47bfe92f 574=item *
575
576 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
577
2b106fbe 578Convert B<in-place> the data between two encodings. How did the data
579in $string originally get to be in FROM_ENCODING? Either using
e9692b5b 580encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe 581see L</"Handling Malformed Data">.
582
583For example to convert ISO 8859-1 data to UTF-8:
584
585 from_to($data, "iso-8859-1", "utf-8");
586
587and to convert it back:
588
589 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 590
ab97ca19 591Note that because the conversion happens in place, the data to be
592converted cannot be a string constant, it must be a scalar variable.
593
4411f3b6 594=back
595
596=head2 Handling Malformed Data
597
598If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f 599be UTF-8, an optional lexical warning (category utf8) is given. If
600CHECK is true but not a code reference, dies.
4411f3b6 601
It would be desirable to have a way to indicate that transform should use
the encoding's "replacement character" - no such mechanism is defined yet.
4411f3b6 604
605It is also planned to allow I<CHECK> to be a code reference.
606
47bfe92f 607This is not yet implemented as there are design issues with what its
608arguments should be and how it returns its results.
4411f3b6 609
610=over 4
611
612=item Scheme 1
613
614Passed remaining fragment of string being processed.
615Modifies it in place to remove bytes/characters it can understand
616and returns a string used to represent them.
617e.g.
618
    sub fixup {
        my $ch = substr($_[0],0,1,'');
        return sprintf("\x{%02X}",ord($ch));
    }
623
624This scheme is close to how underlying C code for Encode works, but gives
625the fixup routine very little context.
626
627=item Scheme 2
628
47bfe92f 629Passed original string, and an index into it of the problem area, and
630output string so far. Appends what it will to output string and
631returns new index into original string. For example:
4411f3b6 632
633 sub fixup {
634 # my ($s,$i,$d) = @_;
635 my $ch = substr($_[0],$_[1],1);
636 $_[2] .= sprintf("\x{%02X}",ord($ch);
637 return $_[1]+1;
638 }
639
47bfe92f 640This scheme gives maximal control to the fixup routine but is more
641complicated to code, and may need internals of Encode to be tweaked to
642keep original string intact.
4411f3b6 643
644=item Other Schemes
645
646Hybrids of above.
647
648Multiple return values rather than in-place modifications.
649
650Index into the string could be pos($str) allowing s/\G...//.
651
652=back
653
654=head2 UTF-8 / utf8
655
The Unicode consortium defines the UTF-8 standard as a way of encoding
the entire Unicode repertoire as sequences of octets. This encoding is
expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).
4411f3b6 662
663=over 4
664
665=item *
666
667 $bytes = encode_utf8($string);
668
47bfe92f 669The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6 670and the resulting octets returned as a sequence of bytes. All possible
671characters have a UTF-8 representation so this function cannot fail.
672
673=item *
674
675 $string = decode_utf8($bytes [,CHECK]);
676
47bfe92f 677The sequence of octets represented by $bytes is decoded from UTF-8
678into a sequence of logical characters. Not all sequences of octets
679form valid UTF-8 encodings, so it is possible for this call to fail.
680For CHECK see L</"Handling Malformed Data">.
4411f3b6 681
682=back
683
684=head2 Other Encodings of Unicode
685
47bfe92f 686UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. UCS-2 can only
7a4efbb2 687represent 0..0xFFFF, while UTF-16 has a I<surrogate pair> scheme which
47bfe92f 688allows it to cover the whole Unicode range.
4411f3b6 689
Surrogates are code points set aside to encode the 0x10000..0x10FFFF
range of Unicode code points in pairs of 16-bit units. The I<high
surrogates> are the range 0xD800..0xDBFF, and the I<low surrogates>
are the range 0xDC00..0xDFFF. The surrogate encoding is

    $hi = ($uni - 0x10000) / 0x400 + 0xD800;
    $lo = ($uni - 0x10000) % 0x400 + 0xDC00;

and the decoding is

    $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
701
8040349a 702Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
47bfe92f 703happens to be the name used by that representation when used with X11
704fonts.
4411f3b6 705
706UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
707can be considered as being in this form without encoding. An encoding
47bfe92f 708to transfer strings in this form (e.g. to write them to a file) would
709need to
4411f3b6 710
c079d275 711 pack('L*', unpack('U*', $string)); # native
4411f3b6 712 or
c079d275 713 pack('V*', unpack('U*', $string)); # little-endian
4411f3b6 714 or
c079d275 715 pack('N*', unpack('U*', $string)); # big-endian
4411f3b6 716
c079d275 717depending on the endianness required.
4411f3b6 718
51ef4e11 719No UTF-32 encodings are implemented yet.
4411f3b6 720
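Both UCS-2 and UCS-4 style encodings can have "byte order marks" by
representing the code point 0xFEFF as the very first thing in a file.
(Read with the wrong byte order it shows up as 0xFFFE, which is not a
valid character, and that is how the byte order is detected.)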
4411f3b6 723
51ef4e11 724=head2 Listing available encodings
725
726 use Encode qw(encodings);
727 @list = encodings();
728
729Returns a list of the canonical names of the available encodings.
730
731=head2 Defining Aliases
732
733 use Encode qw(define_alias);
734 define_alias( newName => ENCODING);
735
Allows newName to be used as an alias for ENCODING. ENCODING may be
either the name of an encoding or an encoding object (as above).
51ef4e11 738
739Currently I<newName> can be specified in the following ways:
740
741=over 4
742
743=item As a simple string.
744
745=item As a qr// compiled regular expression, e.g.:
746
747 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
748
In this case if I<ENCODING> is not a reference it is C<eval>-ed to
allow C<$1> etc. to be substituted. The example is one way to alias
names as used in X11 font names to the MIME names for the iso-8859-*
family.
51ef4e11 753
754=item As a code reference, e.g.:
755
756 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
757
In this case C<$_> will be set to the name that is being looked up and
I<ENCODING> is passed to the sub as its first argument. The example
is another way to alias names as used in X11 font names to the MIME
names for the iso-8859-* family.
51ef4e11 762
763=back
764
765=head2 Defining Encodings
766
    use Encode qw(define_encoding);
768 define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11 769
47bfe92f 770Causes I<canonicalName> to be associated with I<$object>. The object
771should provide the interface described in L</"IMPLEMENTATION CLASSES">
772below. If more than two arguments are provided then additional
773arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11 774
4411f3b6 775=head1 Encoding and IO
776
777It is very common to want to do encoding transformations when
778reading or writing files, network connections, pipes etc.
47bfe92f 779If Perl is configured to use the new 'perlio' IO system then
4411f3b6 780C<Encode> provides a "layer" (See L<perliol>) which can transform
781data as it is read or written.
782
8e86646e 783Here is how the blind poet would modernise the encoding:
784
42234700 785 use Encode;
8e86646e 786 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
787 open(my $utf8,'>:utf8','iliad.utf8');
788 my @epic = <$iliad>;
789 print $utf8 @epic;
790 close($utf8);
    close($iliad);
4411f3b6 792
793In addition the new IO system can also be configured to read/write
794UTF-8 encoded characters (as noted above this is efficient):
795
e9692b5b 796 open(my $fh,'>:utf8','anything');
797 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 798
799Either of the above forms of "layer" specifications can be made the default
800for a lexical scope with the C<use open ...> pragma. See L<open>.
801
Once a handle is open its layers can be altered using C<binmode>.
803
47bfe92f 804Without any such configuration, or if Perl itself is built using
4411f3b6 805system's own IO, then write operations assume that file handle accepts
806only I<bytes> and will C<die> if a character larger than 255 is
807written to the handle. When reading, each octet from the handle
808becomes a byte-in-a-character. Note that this default is the same
47bfe92f 809behaviour as bytes-only languages (including Perl before v5.6) would
810have, and is sufficient to handle native 8-bit encodings
811e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
812other encodings and binary data.
813
In other cases it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).
818
47bfe92f 819You can also use PerlIO to convert larger amounts of data you don't
820want to bring into memory. For example to convert between ISO 8859-1
821(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
822
e9692b5b 823 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
824 open(G, ">:utf8", "data.utf") or die $!;
825 while (<F>) { print G }
826
827 # Could also do "print G <F>" but that would pull
828 # the whole file into memory just to write it out again.
829
830More examples:
47bfe92f 831
e9692b5b 832 open(my $f, "<:encoding(cp1252)")
833 open(my $g, ">:encoding(iso-8859-2)")
834 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 835
836See L<PerlIO> for more information.
4411f3b6 837
1768d7eb 838See also L<encoding> for how to change the default encoding of the
d521382b 839data in your script.
1768d7eb 840
4411f3b6 841=head1 Encoding How to ...
842
843To do:
844
845=over 4
846
=item * IO with mixed content (faking iso-2022-*)
848
849=item * MIME's Content-Length:
850
851=item * UTF-8 strings in binary data.
852
47bfe92f 853=item * Perl/Encode wrappers on non-Unicode XS modules.
4411f3b6 854
855=back
856
857=head1 Messing with Perl's Internals
858
47bfe92f 859The following API uses parts of Perl's internals in the current
860implementation. As such they are efficient, but may change.
4411f3b6 861
862=over 4
863
4411f3b6 864=item * is_utf8(STRING [, CHECK])
865
866[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 867If CHECK is true, also checks the data in STRING for being well-formed
868UTF-8. Returns true if successful, false otherwise.
4411f3b6 869
870=item * valid_utf8(STRING)
871
47bfe92f 872[INTERNAL] Test whether STRING is in a consistent state. Will return
873true if string is held as bytes, or is well-formed UTF-8 and has the
874UTF-8 flag on. Main reason for this routine is to allow Perl's
875testsuite to check that operations have left strings in a consistent
876state.
4411f3b6 877
878=item *
879
880 _utf8_on(STRING)
881
882[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
883B<not> checked for being well-formed UTF-8. Do not use unless you
884B<know> that the STRING is well-formed UTF-8. Returns the previous
885state of the UTF-8 flag (so please don't test the return value as
886I<not> success or failure), or C<undef> if STRING is not a string.
887
888=item *
889
890 _utf8_off(STRING)
891
892[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
893Returns the previous state of the UTF-8 flag (so please don't test the
894return value as I<not> success or failure), or C<undef> if STRING is
895not a string.
896
897=back
898
4edaa979 899=head1 IMPLEMENTATION CLASSES
900
As mentioned above encodings are (in the current implementation at least)
defined by objects. The mapping of encoding name to object is via the
C<%encoding> hash.
4edaa979 904
905The values of the hash can currently be either strings or objects.
906The string form may go away in the future. The string form occurs
907when C<encodings()> has scanned C<@INC> for loadable encodings but has
908not actually loaded the encoding in question. This is because the
47bfe92f 909current "loading" process is all Perl and a bit slow.
4edaa979 910
Once an encoding is loaded the value of the hash is the object which
implements the encoding. The object should provide the following
interface:
4edaa979 914
915=over 4
916
917=item -E<gt>name
918
919Should return the string representing the canonical name of the encoding.
920
921=item -E<gt>new_sequence
922
47bfe92f 923This is a placeholder for encodings with state. It should return an
924object which implements this interface, all current implementations
925return the original object.
4edaa979 926
927=item -E<gt>encode($string,$check)
928
47bfe92f 929Should return the octet sequence representing I<$string>. If I<$check>
930is true it should modify I<$string> in place to remove the converted
931part (i.e. the whole string unless there is an error). If an error
932occurs it should return the octet sequence for the fragment of string
933that has been converted, and modify $string in-place to remove the
934converted part leaving it starting with the problem fragment.
4edaa979 935
If check is false then C<encode> should make a "best effort" to
convert the string - for example by using a replacement character.
4edaa979 938
939=item -E<gt>decode($octets,$check)
940
47bfe92f 941Should return the string that I<$octets> represents. If I<$check> is
942true it should modify I<$octets> in place to remove the converted part
943(i.e. the whole sequence unless there is an error). If an error
944occurs it should return the fragment of string that has been
945converted, and modify $octets in-place to remove the converted part
4edaa979 946leaving it starting with the problem fragment.
947
If check is false then C<decode> should make a "best effort" to
convert the string - for example by using Unicode's "\x{FFFD}" as a
replacement character.
4edaa979 951
952=back
953
47bfe92f 954It should be noted that the check behaviour is different from the
955outer public API. The logic is that the "unchecked" case is useful
956when encoding is part of a stream which may be reporting errors
957(e.g. STDERR). In such cases it is desirable to get everything
958through somehow without causing additional errors which obscure the
959original one. Also the encoding is best placed to know what the
960correct replacement character is, so if that is the desired behaviour
961then letting low level code do it is the most efficient.
962
In contrast if check is true, the scheme above allows the encoding to
do as much as it can and tell the layer above how much that was. What is
lacking at present is a mechanism to report what went wrong. The most
likely interface will be an additional method call to the object, or
perhaps (to avoid forcing per-stream objects on otherwise stateless
encodings) an additional parameter.
969
970It is also highly desirable that encoding classes inherit from
971C<Encode::Encoding> as a base class. This allows that class to define
972additional behaviour for all encoding objects. For example built in
973Unicode, UCS-2 and UTF-8 classes use :
51ef4e11 974
975 package Encode::MyEncoding;
976 use base qw(Encode::Encoding);
977
978 __PACKAGE__->Define(qw(myCanonical myAlias));
979
47bfe92f 980To create an object with bless {Name => ...},$class, and call
981define_encoding. They inherit their C<name> method from
982C<Encode::Encoding>.
4edaa979 983
984=head2 Compiled Encodings
985
F<Encode.xs> provides a class C<Encode::XS> which provides the
interface described above. It calls a generic octet-sequence to
octet-sequence "engine" that is driven by tables (defined in
F<encengine.c>). The same engine is used for both encode and
decode. C<Encode::XS>'s C<encode> forces Perl's characters to their
UTF-8 form and then treats them as just another multibyte
encoding. C<Encode::XS>'s C<decode> transforms the sequence and then
turns the UTF-8-ness flag on, as that is the form that the tables are
defined to produce. For details of the engine see the comments in
F<encengine.c>.
996
997The tables are produced by the Perl script F<compile> (the name needs
998to change so we can eventually install it somewhere). F<compile> can
999currently read two formats:
4edaa979 1000
1001=over 4
1002
1003=item *.enc
1004
47bfe92f 1005This is a coined format used by Tcl. It is documented in
1006Encode/EncodeFormat.pod.
4edaa979 1007
1008=item *.ucm
1009
1010This is the semi-standard format used by IBM's ICU package.
1011
1012=back
1013
1014F<compile> can write the following forms:
1015
1016=over 4
1017
1018=item *.ucm
1019
1020See above - the F<Encode/*.ucm> files provided with the distribution have
1021been created from the original Tcl .enc files using this approach.
1022
1023=item *.c
1024
1025Produces tables as C data structures - this is used to build in encodings
1026into F<Encode.so>/F<Encode.dll>.
1027
1028=item *.xs
1029
47bfe92f 1030In theory this allows encodings to be stand-alone loadable Perl
1031extensions. The process has not yet been tested. The plan is to use
1032this approach for large East Asian encodings.
4edaa979 1033
1034=back
1035
47bfe92f 1036The set of encodings built-in to F<Encode.so>/F<Encode.dll> is
1037determined by F<Makefile.PL>. The current set is as follows:
4edaa979 1038
1039=over 4
1040
1041=item ascii and iso-8859-*
1042
1043That is all the common 8-bit "western" encodings.
1044
1045=item IBM-1047 and two other variants of EBCDIC.
1046
47bfe92f 1047These are the same variants that are supported by EBCDIC Perl as
1048"native" encodings. They are included to prove "reversibility" of
1049some constructs in EBCDIC Perl.
4edaa979 1050
1051=item symbol and dingbats as used by Tk on X11.
1052
47bfe92f 1053(The reason Encode got started was to support Perl/Tk.)
4edaa979 1054
1055=back
1056
47bfe92f 1057That set is rather ad hoc and has been driven by the needs of the
1058tests rather than the needs of typical applications. It is likely
1059to be rationalized.
4edaa979 1060
4411f3b6 1061=head1 SEE ALSO
1062
1768d7eb 1063L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>
4411f3b6 1064
1065=cut
1066