Re: For 5.7.3, known failure for rel2abs2rel.t on FreeBSD 4.5
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
0e567a6c 3our $VERSION = '0.40';
2c674647 4
5require DynaLoader;
6require Exporter;
7
51ef4e11 8our @ISA = qw(Exporter DynaLoader);
2c674647 9
4411f3b6 10# Public, encouraged API is exported by default
51ef4e11 11our @EXPORT = qw (
4411f3b6 12 encode
13 decode
14 encode_utf8
15 decode_utf8
16 find_encoding
51ef4e11 17 encodings
4411f3b6 18);
19
51ef4e11 20our @EXPORT_OK =
2c674647 21 qw(
51ef4e11 22 define_encoding
23 define_alias
2c674647 24 from_to
25 is_utf8
4411f3b6 26 is_8bit
27 is_16bit
a12c0f56 28 utf8_upgrade
29 utf8_downgrade
4411f3b6 30 _utf8_on
31 _utf8_off
2c674647 32 );
33
34bootstrap Encode ();
35
4411f3b6 36# Documentation moved after __END__ for speed - NI-S
2c674647 37
bf230f3d 38use Carp;
39
# %encoding maps an encoding name to its encoding object.  It is kept
# as a package variable (rather than a lexical) to allow a certain
# amount of cheating.
our %encoding;

# Alias bookkeeping used by define_alias() and findAlias().
my @alias;    # flat (matcher => target) list, searched in definition order
my %alias;    # names already resolved through @alias

# Element N is the ISO 8859 part number that "Latin N" refers to.
#   N:                   0  1  2  3  4  5   6   7   8   9  10
our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );

# Code page number behind each "Win<Name>" style encoding name.
our %winlatin2cp = (
    Latin1     => 1252,
    Latin2     => 1250,
    Cyrillic   => 1251,
    Greek      => 1253,
    Turkish    => 1254,
    Hebrew     => 1255,
    Arabic     => 1256,
    Baltic     => 1257,
    Vietnamese => 1258,
);
5345d506 59
# Return the names of all defined encodings, sorted case-insensitively.
# The entry named 'Internal' is left out of the list.
sub encodings
{
    my ($class) = @_;    # accepted for method-style calls; not used

    my @names = grep { $_ ne 'Internal' } keys %encoding;

    # Pre-compute the case-folded sort key once per name.
    my %sort_key;
    @sort_key{@names} = map { lc $_ } @names;

    my @sorted = sort { $sort_key{$a} cmp $sort_key{$b} } @names;
    return @sorted;
}
70
# Resolve a name through the alias table.
#
# Called as CLASS->findAlias(NAME).  If NAME has been resolved before,
# the cached encoding is returned.  Otherwise the (matcher => target)
# pairs in @alias are tried in definition order:
#
#   * qr// matcher that matches NAME : target is eval-ed as Perl source
#   * CODE matcher                   : called with target as its argument
#   * anything else                  : compared to NAME ignoring case
#
# The first pair that leads to an encoding is cached in %alias and
# returned.  Returns undef if no pair does.
#
# $_ is localised to NAME for the whole call: CODE matchers and eval-ed
# targets read it (and $1, $2 ... from the match).
sub findAlias
{
    my $class = shift;
    local $_ = shift;

    return $alias{$_} if exists $alias{$_};

    my $idx = 0;
    while ($idx < @alias)
    {
        my ($matcher, $target) = @alias[ $idx, $idx + 1 ];
        $idx += 2;

        my $candidate;
        if (ref($matcher) eq 'Regexp' && $_ =~ $matcher)
        {
            # Evaluated here so the source in $target sees the captures
            # of the match just performed.
            $candidate = eval $target;
        }
        elsif (ref($matcher) eq 'CODE')
        {
            $candidate = $matcher->($target);
        }
        elsif (lc($_) eq lc($matcher))
        {
            $candidate = $target;
        }

        next unless defined $candidate;
        next if $candidate eq $_;    # avoid (direct) recursion on bugs

        # A reference is taken to be the encoding itself; a string is
        # looked up as an encoding name.
        my $enc = ref($candidate) ? $candidate : find_encoding($candidate);
        next unless $enc;

        $alias{$_} = $enc;
        last;
    }
    return $alias{$_};
}
109
# Append (alias => name) pairs to the ordered alias list.
#
# Takes a flat list of pairs; any number of pairs may be given in one
# call.  The arguments are consumed from @_ two at a time.
sub define_alias
{
    while (@_)
    {
        my $matcher = shift;
        my $target  = shift;
        push @alias, $matcher, $target;
    }
}
118
# Built-in aliases.
#
# When the alias is a qr// pattern, findAlias() passes the target string
# to eval after a successful match, so the target must be a valid Perl
# expression - normally a double-quoted string in which $1, $2 ... are
# interpolated.  When the alias is a plain string it is compared case
# insensitively and the target is used as it stands.

# Allow variants of iso-8859-1 etc.
define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );

# At least HP-UX has these.
define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );

# More HP stuff.
define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );

# The Official name of ASCII.
define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );

# This is a font issue, not an encoding issue.
# (The currency symbol of the Latin 1 upper half
#  has been redefined as the euro symbol.)
define_alias( qr/^(.+)\@euro$/i => '"$1"' );

# Allow latin-1 style names as well
define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );

# Allow winlatin1 style names as well.
# The pattern matches in any case but the keys of %winlatin2cp are
# capitalised ("Latin1", "Cyrillic", ...), so the capture is normalised
# explicitly before the lookup.
define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|vietnamese)$/i => '"cp" . $winlatin2cp{ucfirst(lc($1))}' );

# Common names for non-latin preferred MIME names
define_alias( 'ascii'    => 'US-ascii',
              'cyrillic' => 'iso-8859-5',
              'arabic'   => 'iso-8859-6',
              'greek'    => 'iso-8859-7',
              'hebrew'   => 'iso-8859-8',
              'thai'     => 'iso-8859-11',
              'tis620'   => 'iso-8859-11',
            );

# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
# And Microsoft has their own naming (again, surprisingly).
define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');

# Sometimes seen with a leading zero.
define_alias( qr/^cp037$/i => '"cp37"');

# Ououououou.
define_alias( qr/^macRomanian$/i => '"macRumanian"');

# Standardize on the dashed versions.
# The targets are quoted because they are eval-ed: an unquoted utf-8
# would be parsed as the bareword "utf" minus 8 and fail to compile.
define_alias( qr/^utf8$/i  => '"utf-8"' );
define_alias( qr/^koi8r$/i => '"koi8-r"' );
define_alias( qr/^koi8u$/i => '"koi8-u"' );

# Seen in some Linuxes.
define_alias( qr/^ujis$/i => '"euc-jp"' );

# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
# TODO: HP-UX '15' encodings japanese15 korean15 roi15
# TODO: Cyrillic encoding ISO-IR-111 (useful?)
# TODO: Chinese encodings GB18030 EUC-TW HZ
# TODO: Armenian encoding ARMSCII-8
# TODO: Hebrew encoding ISO-8859-8-1
# TODO: Thai encoding TCVN
# TODO: Korean encoding Johab
# TODO: Vietnamese encodings VPS
# TODO: Japanese encoding JIS (not the same as SJIS)
# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
#       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
#       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
#       Kannada Khmer Korean Laotian Malayalam Mongolian
#       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese

# Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
188
# Register an encoding object.
#
#   define_encoding($obj, $name [, @aliases])
#
# Stores $obj in %encoding under $name.  If $name contains upper-case
# characters its lower-cased form is added as an alias, and every
# further argument is added as an alias for $obj too.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @extra_names) = @_;

    $encoding{$name} = $obj;

    my $folded = lc $name;
    if ($folded ne $name)
    {
        define_alias($folded, $obj);
    }

    foreach my $extra (@extra_names)
    {
        define_alias($extra, $obj);
    }

    return $obj;
}
203
# Look up an encoding.
#
#   CLASS->getEncoding($name)
#
# $name may be an encoding object (anything that can('new_sequence')),
# which is returned unchanged, or an encoding name.  A name is tried
# against %encoding exactly as given, then lower-cased, and finally
# resolved through CLASS->findAlias, again first as given and then
# lower-cased.
#
# Returns the encoding, or nothing (undef in scalar context, the empty
# list in list context) when the name is unknown.
sub getEncoding
{
    my ($class,$name) = @_;

    if (ref($name))
    {
        # Calling ->can on an unblessed reference dies, so probe inside
        # an eval.  A reference that is not an encoding object falls
        # through and is treated like any other unknown name.
        my $is_encoding = do {
            local $@;
            eval { $name->can('new_sequence') };
        };
        return $name if $is_encoding;
    }

    my $lc = lc $name;
    return $encoding{$name} if exists $encoding{$name};
    return $encoding{$lc}   if exists $encoding{$lc};

    my $oc = $class->findAlias($name);
    return $oc if defined $oc;
    return $class->findAlias($lc) if $lc ne $name;

    return;
}
228
# Functional front end to getEncoding(): look up the encoding given as
# the first argument (a name or an encoding object) on behalf of this
# package.  Any further arguments are ignored.
sub find_encoding
{
    my $name = shift;
    return __PACKAGE__->getEncoding($name);
}
234
# $octets = encode(ENCODING, $string [, CHECK])
#
# Encode $string with the named encoding (or encoding object).
# Croaks if the encoding is unknown.  With a true CHECK, returns undef
# if the encoding object left any part of the string unconverted.
sub encode
{
    my $name   = shift;
    my $string = shift;
    my $check  = shift;

    my $enc = find_encoding($name);
    defined $enc
        or croak("Unknown encoding '$name'");

    # $string is passed as a variable so that the encoding object can
    # shorten it in place.
    my $octets = $enc->encode($string,$check);

    if ($check && length($string))
    {
        return undef;
    }
    return $octets;
}
244
# $string = decode(ENCODING, $octets [, CHECK])
#
# Decode $octets with the named encoding (or encoding object) and
# return the resulting string.  Croaks if the encoding is unknown.
# With a true CHECK, the caller's second argument is overwritten with
# whatever the encoding object left in its copy of the octets.
sub decode
{
    my $name   = $_[0];
    my $octets = $_[1];
    my $check  = $_[2];

    my $enc = find_encoding($name);
    defined $enc
        or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets,$check);

    if ($check)
    {
        $_[1] = $octets;    # write back through the alias to the caller
    }
    return $string;
}
254
# from_to($string, FROM_ENCODING, TO_ENCODING [, CHECK])
#
# Convert $string in place from one encoding to another by decoding it
# with FROM_ENCODING and encoding the result with TO_ENCODING.  Croaks
# if either encoding is unknown (FROM_ENCODING is looked up first).
# Returns the length of the converted data.  With a true CHECK, returns
# undef and leaves the caller's variable untouched if either step left
# input unconverted.
sub from_to
{
    my ($string,$from,$to,$check) = @_;

    my $f = find_encoding($from);
    defined $f
        or croak("Unknown encoding '$from'");

    my $t = find_encoding($to);
    defined $t
        or croak("Unknown encoding '$to'");

    # Step 1: source octets -> characters.
    my $uni = $f->decode($string,$check);
    if ($check && length($string))
    {
        return undef;
    }

    # Step 2: characters -> target octets.
    $string = $t->encode($uni,$check);
    if ($check && length($uni))
    {
        return undef;
    }

    # Store the result in the caller's variable via the @_ alias.
    $_[0] = $string;
    return length($_[0]);
}
268
# $octets = encode_utf8($string)
#
# Return a copy of $string converted by utf8::encode.  The caller's
# variable is not modified.
sub encode_utf8
{
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}
275
# $string = decode_utf8($octets)
#
# Return a copy of $octets converted by utf8::decode, or undef if
# utf8::decode reports failure.  The caller's variable is not modified.
sub decode_utf8
{
    my $chars = $_[0];
    if (utf8::decode($chars))
    {
        return $chars;
    }
    return undef;
}
282
18586f54 283require Encode::Encoding;
284require Encode::XS;
285require Encode::Internal;
286require Encode::Unicode;
287require Encode::utf8;
288require Encode::iso10646_1;
289require Encode::ucs2_le;
4411f3b6 290
656753f8 2911;
292
2a936312 293__END__
294
4411f3b6 295=head1 NAME
296
297Encode - character encodings
298
299=head1 SYNOPSIS
300
301 use Encode;
302
a67efb5b 303 use Encode::TW; # for Taiwan-based Chinese encodings
304 use Encode::CN; # for China-based Chinese encodings
305 use Encode::JP; # for Japanese encodings
306 use Encode::KR; # for Korean encodings
307
4411f3b6 308=head1 DESCRIPTION
309
47bfe92f 310The C<Encode> module provides the interfaces between Perl's strings
311and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6 312
313The repertoire of characters that Perl can represent is at least that
47bfe92f 314defined by the Unicode Consortium. On most platforms the ordinal
315values of the characters (as returned by C<ord(ch)>) is the "Unicode
316codepoint" for the character (the exceptions are those platforms where
317the legacy encoding is some variant of EBCDIC rather than a super-set
318of ASCII - see L<perlebcdic>).
4411f3b6 319
320Traditionaly computer data has been moved around in 8-bit chunks
321often called "bytes". These chunks are also known as "octets" in
322networking standards. Perl is widely used to manipulate data of
323many types - not only strings of characters representing human or
324computer languages but also "binary" data being the machine's representation
325of numbers, pixels in an image - or just about anything.
326
47bfe92f 327When Perl is processing "binary data" the programmer wants Perl to process
328"sequences of bytes". This is not a problem for Perl - as a byte has 256
329possible values it easily fits in Perl's much larger "logical character".
4411f3b6 330
a67efb5b 331Due to size concerns, before using B<CJK> (Chinese, Japanese & Korean)
332encodings, you have to C<use> the corresponding
333B<Encode::>(B<TW>|B<CN>|B<JP>|B<KR>) modules first.
334
4411f3b6 335=head2 TERMINOLOGY
336
4ac9195f 337=over 4
4411f3b6 338
339=item *
340
341I<character>: a character in the range 0..(2**32-1) (or more).
47bfe92f 342(What Perl's strings are made of.)
4411f3b6 343
344=item *
345
346I<byte>: a character in the range 0..255
47bfe92f 347(A special case of a Perl character.)
4411f3b6 348
349=item *
350
351I<octet>: 8 bits of data, with ordinal values 0..255
47bfe92f 352(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
4411f3b6 353
354=back
355
356The marker [INTERNAL] marks Internal Implementation Details, in
357general meant only for those who think they know what they are doing,
358and such details may change in future releases.
359
360=head1 ENCODINGS
361
362=head2 Characteristics of an Encoding
363
364An encoding has a "repertoire" of characters that it can represent,
365and for each representable character there is at least one sequence of
366octets that represents it.
367
368=head2 Types of Encodings
369
370Encodings can be divided into the following types:
371
372=over 4
373
374=item * Fixed length 8-bit (or less) encodings.
375
376Each character is a single octet so may have a repertoire of up to
377256 characters. ASCII and iso-8859-* are typical examples.
378
379=item * Fixed length 16-bit encodings
380
381Each character is two octets so may have a repertoire of up to
47bfe92f 38265 536 characters. Unicode's UCS-2 is an example. Also used for
4411f3b6 383encodings for East Asian languages.
384
385=item * Fixed length 32-bit encodings.
386
387Not really very "encoded" encodings. The Unicode code points
388are just represented as 4-octet integers. None the less because
389different architectures use different representations of integers
390(so called "endian") there are at least two distinct encodings.
391
392=item * Multi-byte encodings
393
394The number of octets needed to represent a character varies.
395UTF-8 is a particularly complex but regular case of a multi-byte
396encoding. Several East Asian countries use a multi-byte encoding
397where 1-octet is used to cover western roman characters and Asian
398characters get 2-octets.
399(UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
400to represent a Unicode code point.)
401
402=item * "Escape" encodings.
403
404These encodings embed "escape sequences" into the octet sequence
405which describe how the following octets are to be interpreted.
406The iso-2022-* family is typical. Following the escape sequence
407octets are encoded by an "embedded" encoding (which will be one
408of the above types) until another escape sequence switches to
409a different "embedded" encoding.
410
411These schemes are very flexible and can handle mixed languages but are
47bfe92f 412very complex to process (and have state). No escape encodings are
413implemented for Perl yet.
4411f3b6 414
415=back
416
417=head2 Specifying Encodings
418
419Encodings can be specified to the API described below in two ways:
420
421=over 4
422
423=item 1. By name
424
47bfe92f 425Encoding names are strings with characters taken from a restricted
426repertoire. See L</"Encoding Names">.
4411f3b6 427
428=item 2. As an object
429
430Encoding objects are returned by C<find_encoding($name)>.
431
432=back
433
434=head2 Encoding Names
435
436Encoding names are case insensitive. White space in names is ignored.
47bfe92f 437In addition an encoding may have aliases. Each encoding has one
438"canonical" name. The "canonical" name is chosen from the names of
439the encoding by picking the first in the following sequence:
4411f3b6 440
441=over 4
442
78255929 443=item * The MIME name as defined in IETF RFCs.
4411f3b6 444
445=item * The name in the IANA registry.
446
d1be9408 447=item * The name used by the organization that defined it.
4411f3b6 448
449=back
450
451Because of all the alias issues, and because in the general case
452encodings have state C<Encode> uses the encoding object internally
453once an operation is in progress.
454
21938dfa 455As of Perl 5.8.0, at least the following encodings are recognized
456(the => marks aliases):
457
458 ASCII
459
460 US-ASCII => ASCII
461
462The Unicode:
463
0b3236bb 464 UTF-8
21938dfa 465 UTF-16
466 UCS-2
467
468 ISO 10646-1 => UCS-2
469
470The ISO 8859 and KOI:
471
472 ISO 8859-1 ISO 8859-6 ISO 8859-11 KOI8-F
473 ISO 8859-2 ISO 8859-7 (12 doesn't exist) KOI8-R
56a543c5 474 ISO 8859-3 ISO 8859-8 ISO 8859-13 KOI8-U
21938dfa 475 ISO 8859-4 ISO 8859-9 ISO 8859-14
476 ISO 8859-5 ISO 8859-10 ISO 8859-15
477 ISO 8859-16
478
479 Latin1 => 8859-1 Latin6 => 8859-10
480 Latin2 => 8859-2 Latin7 => 8859-13
0b3236bb 481 Latin3 => 8859-3 Latin8 => 8859-14
21938dfa 482 Latin4 => 8859-4 Latin9 => 8859-15
483 Latin5 => 8859-9 Latin10 => 8859-16
484
485 Cyrillic => 8859-5
486 Arabic => 8859-6
487 Greek => 8859-7
488 Hebrew => 8859-8
489 Thai => 8859-11
0b3236bb 490 TIS620 => 8859-11
21938dfa 491
492The CJKV: Chinese, Japanese, Korean, Vietnamese:
493
0b3236bb 494 ISO 2022 ISO 2022 JP-1 JIS 0201 GB 1988 Big5 EUC-CN
495 ISO 2022 CN ISO 2022 JP-2 JIS 0208 GB 2312 HZ EUC-JP
21938dfa 496 ISO 2022 JP ISO 2022 KR JIS 0210 GB 12345 CNS 11643 EUC-JP-0212
cb448690 497 Shift-JIS GBK Big5-HKSCS EUC-KR
498 VISCII ISO-IR-165
499
a67efb5b 500(Due to size concerns, additional Chinese encodings including C<GB 18030>,
501C<EUC-TW> and C<BIG5PLUS> are distributed separately on CPAN, under the name
502L<Encode::HanExtra>.)
21938dfa 503
504The PC codepages:
505
506 CP37 CP852 CP861 CP866 CP949 CP1251 CP1256
507 CP424 CP855 CP862 CP869 CP950 CP1252 CP1257
508 CP737 CP856 CP863 CP874 CP1006 CP1253 CP1258
509 CP775 CP857 CP864 CP932 CP1047 CP1254
510 CP850 CP860 CP865 CP936 CP1250 CP1255
511
512 WinLatin1 => CP1252
513 WinLatin2 => CP1250
514 WinCyrillic => CP1251
515 WinGreek => CP1253
516 WinTurkish => CP1254
517 WinHebrew => CP1255
518 WinArabic => CP1256
519 WinBaltic => CP1257
520 WinVietnamese => CP1258
521
4a42e14c 522(All the CPI<NNN...> are available also as IBMI<NNN...>.)
21938dfa 523
524The Mac codepages:
525
0b3236bb 526 MacCentralEuropean MacJapanese
527 MacCroatian MacRoman
1853dd5f 528 MacCyrillic MacRomanian
0b3236bb 529 MacDingbats MacSami
530 MacGreek MacThai
531 MacIcelandic MacTurkish
532 MacUkraine
21938dfa 533
534Miscellaneous:
535
536 7bit-greek IR-197
537 7bit-kana NeXTstep
538 7bit-latin1 POSIX-BC
539 DingBats Roman8
540 GSM 0338 Symbol
541
4411f3b6 542=head1 PERL ENCODING API
543
544=head2 Generic Encoding Interface
545
546=over 4
547
548=item *
549
550 $bytes = encode(ENCODING, $string[, CHECK])
551
47bfe92f 552Encodes string from Perl's internal form into I<ENCODING> and returns
553a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6 554
681a7c68 555For example to convert (internally UTF-8 encoded) Unicode data
556to octets:
557
558 $octets = encode("utf8", $unicode);
559
4411f3b6 560=item *
561
562 $string = decode(ENCODING, $bytes[, CHECK])
563
47bfe92f 564Decode sequence of octets assumed to be in I<ENCODING> into Perl's
565internal form and returns the resulting string. For CHECK see
566L</"Handling Malformed Data">.
567
681a7c68 568For example to convert ISO 8859-1 data to UTF-8:
569
570 $utf8 = decode("latin1", $latin1);
571
47bfe92f 572=item *
573
574 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
575
2b106fbe 576Convert B<in-place> the data between two encodings. How did the data
577in $string originally get to be in FROM_ENCODING? Either using
e9692b5b 578encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe 579see L</"Handling Malformed Data">.
580
581For example to convert ISO 8859-1 data to UTF-8:
582
583 from_to($data, "iso-8859-1", "utf-8");
584
585and to convert it back:
586
587 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 588
ab97ca19 589Note that because the conversion happens in place, the data to be
590converted cannot be a string constant, it must be a scalar variable.
591
4411f3b6 592=back
593
594=head2 Handling Malformed Data
595
596If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f 597be UTF-8, an optional lexical warning (category utf8) is given. If
598CHECK is true but not a code reference, dies.
4411f3b6 599
47bfe92f 600It would be desirable to have a way to indicate that the transform should use
601the encoding's "replacement character" - no such mechanism is defined yet.
4411f3b6 602
603It is also planned to allow I<CHECK> to be a code reference.
604
47bfe92f 605This is not yet implemented as there are design issues with what its
606arguments should be and how it returns its results.
4411f3b6 607
608=over 4
609
610=item Scheme 1
611
612Passed remaining fragment of string being processed.
613Modifies it in place to remove bytes/characters it can understand
614and returns a string used to represent them.
615e.g.
616
617 sub fixup {
618 my $ch = substr($_[0],0,1,'');
619 return sprintf("\x{%02X}",ord($ch));
620 }
621
622This scheme is close to how underlying C code for Encode works, but gives
623the fixup routine very little context.
624
625=item Scheme 2
626
47bfe92f 627Passed original string, and an index into it of the problem area, and
628output string so far. Appends what it will to output string and
629returns new index into original string. For example:
4411f3b6 630
631 sub fixup {
632 # my ($s,$i,$d) = @_;
633 my $ch = substr($_[0],$_[1],1);
634 $_[2] .= sprintf("\x{%02X}",ord($ch));
635 return $_[1]+1;
636 }
637
47bfe92f 638This scheme gives maximal control to the fixup routine but is more
639complicated to code, and may need internals of Encode to be tweaked to
640keep original string intact.
4411f3b6 641
642=item Other Schemes
643
644Hybrids of above.
645
646Multiple return values rather than in-place modifications.
647
648Index into the string could be pos($str) allowing s/\G...//.
649
650=back
651
652=head2 UTF-8 / utf8
653
654The Unicode consortium defines the UTF-8 standard as a way of encoding
47bfe92f 655the entire Unicode repertoire as sequences of octets. This encoding is
656expected to become very widespread. Perl can use this form internally
657to represent strings, so conversions to and from this form are
658particularly efficient (as octets in memory do not have to change,
659just the meta-data that tells Perl how to treat them).
4411f3b6 660
661=over 4
662
663=item *
664
665 $bytes = encode_utf8($string);
666
47bfe92f 667The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6 668and the resulting octets returned as a sequence of bytes. All possible
669characters have a UTF-8 representation so this function cannot fail.
670
671=item *
672
673 $string = decode_utf8($bytes [,CHECK]);
674
47bfe92f 675The sequence of octets represented by $bytes is decoded from UTF-8
676into a sequence of logical characters. Not all sequences of octets
677form valid UTF-8 encodings, so it is possible for this call to fail.
678For CHECK see L</"Handling Malformed Data">.
4411f3b6 679
680=back
681
682=head2 Other Encodings of Unicode
683
47bfe92f 684UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. UCS-2 can only
7a4efbb2 685represent 0..0xFFFF, while UTF-16 has a I<surrogate pair> scheme which
47bfe92f 686allows it to cover the whole Unicode range.
4411f3b6 687
7a4efbb2 688Surrogates are code points set aside to encode the 0x10000..0x10FFFF
689range of Unicode code points in pairs of 16-bit units. The I<high
690surrogates> are the range 0xD800..0xDBFF, and the I<low surrogates>
691are the range 0xDC00..0xDFFF. The surrogate encoding is
692
693 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
694 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
695
696and the decoding is
697
698 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
699
8040349a 700Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
47bfe92f 701happens to be the name used by that representation when used with X11
702fonts.
4411f3b6 703
704UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
705can be considered as being in this form without encoding. An encoding
47bfe92f 706to transfer strings in this form (e.g. to write them to a file) would
707need to
4411f3b6 708
c079d275 709 pack('L*', unpack('U*', $string)); # native
4411f3b6 710 or
c079d275 711 pack('V*', unpack('U*', $string)); # little-endian
4411f3b6 712 or
c079d275 713 pack('N*', unpack('U*', $string)); # big-endian
4411f3b6 714
c079d275 715depending on the endianness required.
4411f3b6 716
51ef4e11 717No UTF-32 encodings are implemented yet.
4411f3b6 718
47bfe92f 719Both UCS-2 and UCS-4 style encodings can have "byte order marks" by
720representing the code point 0xFEFF as the very first thing in a file.
4411f3b6 721
51ef4e11 722=head2 Listing available encodings
723
724 use Encode qw(encodings);
725 @list = encodings();
726
727Returns a list of the canonical names of the available encodings.
728
729=head2 Defining Aliases
730
731 use Encode qw(define_alias);
732 define_alias( newName => ENCODING);
733
47bfe92f 734Allows newName to be used as an alias for ENCODING. ENCODING may be
735either the name of an encoding or an encoding object (as above).
51ef4e11 736
737Currently I<newName> can be specified in the following ways:
738
739=over 4
740
741=item As a simple string.
742
743=item As a qr// compiled regular expression, e.g.:
744
745 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
746
47bfe92f 747In this case if I<ENCODING> is not a reference it is C<eval>-ed to
748allow C<$1> etc. to be substituted. The example is one way to alias
749names as used in X11 fonts to the MIME names for the iso-8859-*
750family.
51ef4e11 751
752=item As a code reference, e.g.:
753
754 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
755
756In this case C<$_> will be set to the name that is being looked up and
47bfe92f 757I<ENCODING> is passed to the sub as its first argument. The example
758is another way to alias names as used in X11 fonts to the MIME
759names for the iso-8859-* family.
51ef4e11 760
761=back
762
763=head2 Defining Encodings
764
e9692b5b 765 use Encode qw(define_encoding);
766 define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11 767
47bfe92f 768Causes I<canonicalName> to be associated with I<$object>. The object
769should provide the interface described in L</"IMPLEMENTATION CLASSES">
770below. If more than two arguments are provided then additional
771arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11 772
4411f3b6 773=head1 Encoding and IO
774
775It is very common to want to do encoding transformations when
776reading or writing files, network connections, pipes etc.
47bfe92f 777If Perl is configured to use the new 'perlio' IO system then
4411f3b6 778C<Encode> provides a "layer" (See L<perliol>) which can transform
779data as it is read or written.
780
8e86646e 781Here is how the blind poet would modernise the encoding:
782
42234700 783 use Encode;
8e86646e 784 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
785 open(my $utf8,'>:utf8','iliad.utf8');
786 my @epic = <$iliad>;
787 print $utf8 @epic;
788 close($utf8);
789 close($iliad);
4411f3b6 790
791In addition the new IO system can also be configured to read/write
792UTF-8 encoded characters (as noted above this is efficient):
793
e9692b5b 794 open(my $fh,'>:utf8','anything');
795 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6 796
797Either of the above forms of "layer" specifications can be made the default
798for a lexical scope with the C<use open ...> pragma. See L<open>.
799
800Once a handle is open its layers can be altered using C<binmode>.
801
47bfe92f 802Without any such configuration, or if Perl itself is built using
4411f3b6 803system's own IO, then write operations assume that file handle accepts
804only I<bytes> and will C<die> if a character larger than 255 is
805written to the handle. When reading, each octet from the handle
806becomes a byte-in-a-character. Note that this default is the same
47bfe92f 807behaviour as bytes-only languages (including Perl before v5.6) would
808have, and is sufficient to handle native 8-bit encodings
809e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
810other encodings and binary data.
811
812In other cases it is the program's responsibility to transform
813characters into bytes using the API above before doing writes, and to
814transform the bytes read from a handle into characters before doing
815"character operations" (e.g. C<lc>, C</\W+/>, ...).
816
47bfe92f 817You can also use PerlIO to convert larger amounts of data you don't
818want to bring into memory. For example to convert between ISO 8859-1
819(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
820
e9692b5b 821 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
822 open(G, ">:utf8", "data.utf") or die $!;
823 while (<F>) { print G }
824
825 # Could also do "print G <F>" but that would pull
826 # the whole file into memory just to write it out again.
827
828More examples:
47bfe92f 829
e9692b5b 830 open(my $f, "<:encoding(cp1252)")
831 open(my $g, ">:encoding(iso-8859-2)")
832 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f 833
834See L<PerlIO> for more information.
4411f3b6 835
1768d7eb 836See also L<encoding> for how to change the default encoding of the
d521382b 837data in your script.
1768d7eb 838
4411f3b6 839=head1 Encoding How to ...
840
841To do:
842
843=over 4
844
845=item * IO with mixed content (faking iso-2022-*)
846
847=item * MIME's Content-Length:
848
849=item * UTF-8 strings in binary data.
850
47bfe92f 851=item * Perl/Encode wrappers on non-Unicode XS modules.
4411f3b6 852
853=back
854
855=head1 Messing with Perl's Internals
856
47bfe92f 857The following API uses parts of Perl's internals in the current
858implementation. As such they are efficient, but may change.
4411f3b6 859
860=over 4
861
4411f3b6 862=item * is_utf8(STRING [, CHECK])
863
864[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 865If CHECK is true, also checks the data in STRING for being well-formed
866UTF-8. Returns true if successful, false otherwise.
4411f3b6 867
868=item * valid_utf8(STRING)
869
47bfe92f 870[INTERNAL] Test whether STRING is in a consistent state. Will return
871true if string is held as bytes, or is well-formed UTF-8 and has the
872UTF-8 flag on. Main reason for this routine is to allow Perl's
873testsuite to check that operations have left strings in a consistent
874state.
4411f3b6 875
876=item *
877
878 _utf8_on(STRING)
879
880[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
881B<not> checked for being well-formed UTF-8. Do not use unless you
882B<know> that the STRING is well-formed UTF-8. Returns the previous
883state of the UTF-8 flag (so please don't test the return value as
884I<not> success or failure), or C<undef> if STRING is not a string.
885
886=item *
887
888 _utf8_off(STRING)
889
890[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
891Returns the previous state of the UTF-8 flag (so please don't test the
892return value as I<not> success or failure), or C<undef> if STRING is
893not a string.
894
895=back
896
4edaa979 897=head1 IMPLEMENTATION CLASSES
898
899As mentioned above encodings are (in the current implementation at least)
900defined by objects. The mapping of encoding name to object is via the
51ef4e11 901C<%encoding> hash.
4edaa979 902
903The values of the hash can currently be either strings or objects.
904The string form may go away in the future. The string form occurs
905when C<encodings()> has scanned C<@INC> for loadable encodings but has
906not actually loaded the encoding in question. This is because the
47bfe92f 907current "loading" process is all Perl and a bit slow.
4edaa979 908
47bfe92f 909Once an encoding is loaded the value of the hash is the object which
910implements the encoding. The object should provide the following
911interface:
4edaa979 912
913=over 4
914
915=item -E<gt>name
916
917Should return the string representing the canonical name of the encoding.
918
919=item -E<gt>new_sequence
920
47bfe92f 921This is a placeholder for encodings with state. It should return an
922object which implements this interface, all current implementations
923return the original object.
4edaa979 924
925=item -E<gt>encode($string,$check)
926
47bfe92f 927Should return the octet sequence representing I<$string>. If I<$check>
928is true it should modify I<$string> in place to remove the converted
929part (i.e. the whole string unless there is an error). If an error
930occurs it should return the octet sequence for the fragment of string
931that has been converted, and modify $string in-place to remove the
932converted part leaving it starting with the problem fragment.
4edaa979 933
47bfe92f 934If check is false then C<encode> should make a "best effort" to
935convert the string - for example by using a replacement character.
4edaa979 936
937=item -E<gt>decode($octets,$check)
938
47bfe92f 939Should return the string that I<$octets> represents. If I<$check> is
940true it should modify I<$octets> in place to remove the converted part
941(i.e. the whole sequence unless there is an error). If an error
942occurs it should return the fragment of string that has been
943converted, and modify $octets in-place to remove the converted part
4edaa979 944leaving it starting with the problem fragment.
945
47bfe92f 946If check is false then C<decode> should make a "best effort" to
947convert the string - for example by using Unicode's "\x{FFFD}" as a
948replacement character.
4edaa979 949
950=back
951
47bfe92f 952It should be noted that the check behaviour is different from the
953outer public API. The logic is that the "unchecked" case is useful
954when encoding is part of a stream which may be reporting errors
955(e.g. STDERR). In such cases it is desirable to get everything
956through somehow without causing additional errors which obscure the
957original one. Also the encoding is best placed to know what the
958correct replacement character is, so if that is the desired behaviour
959then letting low level code do it is the most efficient.
960
961In contrast if check is true, the scheme above allows the encoding to
962do as much as it can and tell layer above how much that was. What is
963lacking at present is a mechanism to report what went wrong. The most
964likely interface will be an additional method call to the object, or
965perhaps (to avoid forcing per-stream objects on otherwise stateless
966encodings) an additional parameter.
967
968It is also highly desirable that encoding classes inherit from
969C<Encode::Encoding> as a base class. This allows that class to define
970additional behaviour for all encoding objects. For example built in
971Unicode, UCS-2 and UTF-8 classes use :
51ef4e11 972
973 package Encode::MyEncoding;
974 use base qw(Encode::Encoding);
975
976 __PACKAGE__->Define(qw(myCanonical myAlias));
977
47bfe92f 978To create an object with bless {Name => ...},$class, and call
979define_encoding. They inherit their C<name> method from
980C<Encode::Encoding>.
4edaa979 981
982=head2 Compiled Encodings
983
47bfe92f 984F<Encode.xs> provides a class C<Encode::XS> which provides the
985interface described above. It calls a generic octet-sequence to
986octet-sequence "engine" that is driven by tables (defined in
987F<encengine.c>). The same engine is used for both encode and
988decode. C<Encode::XS>'s C<encode> forces Perl's characters to their
989UTF-8 form and then treats them as just another multibyte
990encoding. C<Encode::XS>'s C<decode> transforms the sequence and then
991turns the UTF-8-ness flag on, as that is the form that the tables are
992defined to produce. For details of the engine see the comments in
993F<encengine.c>.
994
995The tables are produced by the Perl script F<compile> (the name needs
996to change so we can eventually install it somewhere). F<compile> can
997currently read two formats:
4edaa979 998
999=over 4
1000
1001=item *.enc
1002
47bfe92f 1003This is a coined format used by Tcl. It is documented in
1004Encode/EncodeFormat.pod.
4edaa979 1005
1006=item *.ucm
1007
1008This is the semi-standard format used by IBM's ICU package.
1009
1010=back
1011
1012F<compile> can write the following forms:
1013
1014=over 4
1015
1016=item *.ucm
1017
1018See above - the F<Encode/*.ucm> files provided with the distribution have
1019been created from the original Tcl .enc files using this approach.
1020
1021=item *.c
1022
1023Produces tables as C data structures - this is used to build in encodings
1024into F<Encode.so>/F<Encode.dll>.
1025
1026=item *.xs
1027
47bfe92f 1028In theory this allows encodings to be stand-alone loadable Perl
1029extensions. The process has not yet been tested. The plan is to use
1030this approach for large East Asian encodings.
4edaa979 1031
1032=back
1033
47bfe92f 1034The set of encodings built-in to F<Encode.so>/F<Encode.dll> is
1035determined by F<Makefile.PL>. The current set is as follows:
4edaa979 1036
1037=over 4
1038
1039=item ascii and iso-8859-*
1040
1041That is all the common 8-bit "western" encodings.
1042
1043=item IBM-1047 and two other variants of EBCDIC.
1044
47bfe92f 1045These are the same variants that are supported by EBCDIC Perl as
1046"native" encodings. They are included to prove "reversibility" of
1047some constructs in EBCDIC Perl.
4edaa979 1048
1049=item symbol and dingbats as used by Tk on X11.
1050
47bfe92f 1051(The reason Encode got started was to support Perl/Tk.)
4edaa979 1052
1053=back
1054
47bfe92f 1055That set is rather ad hoc and has been driven by the needs of the
1056tests rather than the needs of typical applications. It is likely
1057to be rationalized.
4edaa979 1058
4411f3b6 1059=head1 SEE ALSO
1060
1768d7eb 1061L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>
4411f3b6 1062
1063=cut
1064