Integrate mainline.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
2c674647 3
b8a524e9 4our $VERSION = '0.02';
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 encode
14 decode
15 encode_utf8
16 decode_utf8
17 find_encoding
51ef4e11 18 encodings
4411f3b6 19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
51ef4e11 23 define_encoding
24 define_alias
2c674647 25 from_to
26 is_utf8
4411f3b6 27 is_8bit
28 is_16bit
a12c0f56 29 utf8_upgrade
30 utf8_downgrade
4411f3b6 31 _utf8_on
32 _utf8_off
2c674647 33 );
34
35bootstrap Encode ();
36
4411f3b6 37# Documentation moved after __END__ for speed - NI-S
2c674647 38
bf230f3d 39use Carp;
40
51ef4e11 41# Make a %encoding package variable to allow a certain amount of cheating
42our %encoding;
43my @alias; # ordered matching list
44my %alias; # cached known aliases
5345d506 45
# Return the names of every encoding currently registered in %encoding.
# May be called as a plain function or as a class method; any arguments
# are ignored.  The order is whatever order the hash yields.
sub encodings
{
    return keys(%encoding);
}
51
# Class method: resolve NAME through the alias table.
#
# @alias is an ordered flat list of (matcher, target) pairs as pushed by
# define_alias(); %alias caches names that have already been resolved.
# A matcher may be
#   - a qr// object: when NAME matches, the target string is eval'ed so
#     that $1, $2 ... from the match can be interpolated;
#   - a code reference: called with the target as its only argument while
#     $_ holds NAME;
#   - anything else: compared with NAME case-insensitively.
# The first pair that yields a usable encoding wins and is cached.
# Returns the cached value for NAME, or undef when nothing matched.
sub findAlias
{
    my $class = shift;
    local $_ = shift;

    # Fast path: this name has been resolved before.
    return $alias{$_} if exists $alias{$_};

    for (my $idx = 0; $idx < @alias; $idx += 2)
    {
        my ($matcher, $target) = @alias[$idx, $idx + 1];
        my $kind = ref($matcher);
        my $candidate;

        if ($kind eq 'Regexp' && $_ =~ $matcher)
        {
            # Must stay inside this block: the eval relies on the capture
            # variables set by the match in the condition above.
            $candidate = eval $target;
        }
        elsif ($kind eq 'CODE')
        {
            $candidate = $matcher->($target);
        }
        elsif (lc($_) eq lc($matcher))
        {
            $candidate = $target;
        }

        next unless defined($candidate);
        next if $candidate eq $_;    # avoid (direct) recursion on bugs

        # A reference is taken to be an encoding object already; a string
        # is looked up (possibly through further aliases).
        my $enc = ref($candidate) ? $candidate : find_encoding($candidate);
        next unless $enc;

        $alias{$_} = $enc;
        last;
    }

    return $alias{$_};
}
90
# Append (matcher => target) pairs to the ordered alias list consulted by
# findAlias().  Arguments are consumed two at a time; a trailing unpaired
# matcher is stored with an undef target.
sub define_alias
{
    my @pairs = @_;
    for (my $at = 0; $at < @pairs; $at += 2)
    {
        push(@alias, $pairs[$at], $pairs[$at + 1]);
    }
}
99
# Default alias table.  Order matters: findAlias() tries the pairs in the
# order they are registered here.

# Allow variants of iso-8859-1 etc.
define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );

# Allow latin-1 style names as well
#                     0  1  2  3  4  5   6   7   8   9  10
my @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );

# This alias is a closure rather than a qr// with an eval'ed string.
# The string form is evaluated inside findAlias(), which is compiled
# before @latin2iso_num is declared, so the lexical would not be visible
# to the eval and the alias could never resolve.  The closure captures the
# table directly and also declines latin numbers that are out of range.
define_alias( sub {
                  return undef unless /^latin[-_]?(\d+)$/i;
                  my $part = $latin2iso_num[$1];
                  return defined($part) ? "iso-8859-$part" : undef;
              } => '' );

# Common names for non-latin preferred MIME names
define_alias( 'ascii'    => 'US-ascii',
              'cyrillic' => 'iso-8859-5',
              'arabic'   => 'iso-8859-6',
              'greek'    => 'iso-8859-7',
              'hebrew'   => 'iso-8859-8');

define_alias( 'ibm-1047' => 'cp1047');

# Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
119
# Register the encoding object OBJ under its canonical NAME in %encoding.
# If NAME is not already all lower case, its lower-cased form is added as
# an alias; every further argument is added as an alias as well.
# Returns OBJ.
sub define_encoding
{
    my ($obj, $name, @more_aliases) = @_;

    $encoding{$name} = $obj;

    my $lower = lc($name);
    if ($lower ne $name)
    {
        define_alias($lower => $obj);
    }

    foreach my $extra (@more_aliases)
    {
        define_alias($extra, $obj);
    }

    return $obj;
}
134
# Class method: look NAME up among the canonical names in %encoding first
# and fall back to the alias table (via $class->findAlias) otherwise.
sub getEncoding
{
    my ($class, $name) = @_;
    return exists $encoding{$name}
         ? $encoding{$name}
         : $class->findAlias($name);
}
148
# Functional front end to getEncoding(): returns whatever
# Encode->getEncoding(NAME) returns for the given name.
sub find_encoding
{
    my $name = shift;
    return __PACKAGE__->getEncoding($name);
}
154
# $octets = encode(NAME, STRING [, CHECK])
#
# Looks up the encoding NAME and asks it to encode STRING.  Croaks when
# the encoding is unknown.  When CHECK is true and the encoding object
# left part of its input unconsumed, undef is returned instead of the
# partial result.  The caller's STRING is never modified; only a local
# copy is handed to the encoding object.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined($enc) or croak("Unknown encoding '$name'");

    my $octets = $enc->encode($string, $check);
    if ($check && length($string))
    {
        return undef;
    }
    return $octets;
}
164
# $string = decode(NAME, OCTETS [, CHECK])
#
# Looks up the encoding NAME and asks it to decode OCTETS.  Croaks when
# the encoding is unknown.  When CHECK is true and the encoding object
# left part of its input unconsumed, undef is returned instead of the
# partial result.  The caller's OCTETS is never modified; only a local
# copy is handed to the encoding object.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined($enc) or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);
    if ($check && length($octets))
    {
        return undef;
    }
    return $string;
}
174
# from_to($data, FROM, TO [, CHECK])
#
# Converts $data in place from encoding FROM to encoding TO by decoding
# with FROM and re-encoding with TO.  Croaks when either encoding is
# unknown.  With a true CHECK, returns undef (leaving $data untouched) if
# either step left input unconsumed.  On success the caller's variable is
# overwritten and the length of the new value is returned.
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    croak("Unknown encoding '$from'") unless defined $source;

    my $target = find_encoding($to);
    croak("Unknown encoding '$to'") unless defined $target;

    my $uni = $source->decode($string, $check);
    if ($check && length($string))
    {
        return undef;
    }

    $string = $target->encode($uni, $check);
    if ($check && length($uni))
    {
        return undef;
    }

    # $_[0] aliases the caller's variable: this is the in-place update.
    $_[0] = $string;
    return length($_[0]);
}
188
# Return a copy of the argument converted from characters to the octets
# of its UTF-8 encoding (via utf8::encode).  The caller's value is not
# modified.
sub encode_utf8
{
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}
195
# Return a copy of the argument with its UTF-8 octets decoded into
# characters (via utf8::decode), or undef when utf8::decode reports
# failure.  The caller's value is not modified.
sub decode_utf8
{
    my $chars = shift;
    return utf8::decode($chars) ? $chars : undef;
}
202
package Encode::Encoding;
# Base class for classes which implement encodings

# Class or instance method: register an encoding under CANONICAL plus any
# further alias names.  When invoked on a class name, a new object
# { Name => CANONICAL } is blessed into that class first.  Returns what
# Encode::define_encoding returns.
sub Define
{
    my ($self, $canonical, @aliases) = @_;
    if (!ref $self)
    {
        $self = bless { Name => $canonical }, $self;
    }
    # warn "$canonical => $self\n";
    return Encode::define_encoding($self, $canonical, @aliases);
}

# Canonical name stored by Define().
sub name
{
    my ($self) = @_;
    return $self->{'Name'};
}

# Temporary legacy methods.  @_ is forwarded untouched so that the
# in-place modification of the data argument done by decode/encode still
# reaches the original caller.
sub toUnicode
{
    my $self = shift;
    return $self->decode(@_);
}

sub fromUnicode
{
    my $self = shift;
    return $self->encode(@_);
}

# Simply hands back the invocant.
sub new_sequence
{
    my ($self) = @_;
    return $self;
}
50d26985 222
223package Encode::XS;
224use base 'Encode::Encoding';
225
5ad8ef52 226package Encode::Internal;
50d26985 227use base 'Encode::Encoding';
656753f8 228
9b37254d 229# Dummy package that provides the encode interface but leaves data
1b026014 230# as UTF-X encoded. It is here so that from_to() works.
656753f8 231
5ad8ef52 232__PACKAGE__->Define('Internal');
233
234Encode::define_alias( 'Unicode' => 'Internal' ) if ord('A') == 65;
656753f8 235
# Pass-through conversion: returns a copy of the data after
# utf8::upgrade.  When the check argument is true the caller's data
# argument is emptied to signal that everything was consumed.
sub decode
{
    my $copy = $_[1];
    utf8::upgrade($copy);
    if ($_[2])
    {
        $_[1] = '';
    }
    return $copy;
}
656753f8 243
50d26985 244*encode = \&decode;
656753f8 245
# Namespace corrected from "Encoding::Unicode": every other encoding
# class in this file lives under Encode::.
package Encode::Unicode;
use base 'Encode::Encoding';

# Registered only where the native character set is not ASCII based
# (ord('A') != 65); on ASCII platforms 'Unicode' is an alias for
# 'Internal' instead.
__PACKAGE__->Define('Unicode') unless ord('A') == 65;
250
# Map every character of the input from its Unicode code point to the
# native code point (utf8::unicode_to_native) and return the result.
# When the check argument is true the caller's data argument is emptied.
sub decode
{
    my ($obj, $str, $chk) = @_;
    my $res = join('',
                   map { chr(utf8::unicode_to_native(ord($_))) }
                   split(//, $str));
    $_[1] = '' if $chk;
    return $res;
}
262
# Map every character of the input from its native code point to the
# Unicode code point (utf8::native_to_unicode) and return the result.
# When the check argument is true the caller's data argument is emptied.
sub encode
{
    my ($obj, $str, $chk) = @_;
    my $res = join('',
                   map { chr(utf8::native_to_unicode(ord($_))) }
                   split(//, $str));
    $_[1] = '' if $chk;
    return $res;
}
274
275
4411f3b6 276package Encode::utf8;
50d26985 277use base 'Encode::Encoding';
4411f3b6 278# package to allow long-hand
279# $octets = encode( utf8 => $string );
280#
281
51ef4e11 282__PACKAGE__->Define(qw(UTF-8 utf8));
4411f3b6 283
# Decode UTF-8 octets through Encode::decode_utf8.  Returns undef when
# that call yields undef (the caller's data is then left alone);
# otherwise returns the decoded string and, when the check argument is
# true, empties the caller's data argument.
sub decode
{
    my ($obj, $octets, $chk) = @_;
    my $str = Encode::decode_utf8($octets);
    return undef unless defined $str;
    if ($chk)
    {
        $_[1] = '';
    }
    return $str;
}
295
# Encode characters to UTF-8 octets through Encode::encode_utf8.  When
# the check argument is true the caller's data argument is emptied
# afterwards.
sub encode
{
    my ($obj, $string, $chk) = @_;
    my $octets = Encode::encode_utf8($string);
    if ($chk)
    {
        $_[1] = '';
    }
    return $octets;
}
303
9b37254d 304package Encode::iso10646_1;
50d26985 305use base 'Encode::Encoding';
51ef4e11 306# Encoding is 16-bit network order Unicode (no surrogates)
9b37254d 307# Used for X font encodings
87714904 308
8040349a 309__PACKAGE__->Define(qw(UCS-2 iso-10646-1));
87714904 310
# Decode big-endian 16-bit code units into characters.
#
#   $obj  - encoding object (unused)
#   $str  - octets to decode
#   $chk  - when true, the caller's data argument is replaced by the
#           octets that could not be converted
#
# Only complete 2-octet units are converted.  A dangling final octet used
# to be unpacked as if it were a full unit, which silently produced a NUL
# character and reported the input as fully consumed.  Now, in check mode,
# the odd octet is left in the caller's data so the failure is visible;
# without check it is represented by U+FFFD as a best effort.
# Returns the decoded (utf8-upgraded) string.
sub decode
{
    my ($obj, $str, $chk) = @_;
    my $uni = '';
    while (length($str) >= 2)
    {
        # 'n' is an unsigned 16-bit big-endian value, so no mask is needed.
        $uni .= chr(unpack('n', substr($str, 0, 2, '')));
    }
    if ($chk)
    {
        $_[1] = $str;    # empty on success, the odd trailing octet otherwise
    }
    elsif (length($str))
    {
        $uni .= "\x{FFFD}";
    }
    utf8::upgrade($uni);
    return $uni;
}
324
# Encode characters as big-endian 16-bit code units.
#
#   $obj  - encoding object (unused)
#   $uni  - characters to encode
#   $chk  - when true, stop at the first character that does not fit in
#           16 bits and replace the caller's data argument with the
#           unconverted remainder, starting at that character
#
# Two defects are fixed here:
#   - the limit was 32768, so U+8000..U+FFFF were rejected although
#     pack('n') holds the full unsigned 16-bit range;
#   - the character was removed from the input before it was tested, so
#     in check mode the remainder did not start with the offending
#     character, and a failure on the final character was undetectable.
# Without check, an unrepresentable character is still encoded as 0.
# Returns the octets produced so far.
sub encode
{
    my ($obj, $uni, $chk) = @_;
    my $str = '';
    while (length($uni))
    {
        # Peek first; consume only once we know the character is emitted.
        my $x = ord(substr($uni, 0, 1));
        if ($x > 0xFFFF)
        {
            last if ($chk);
            $x = 0;
        }
        substr($uni, 0, 1, '');
        $str .= pack('n', $x);
    }
    $_[1] = $uni if $chk;
    return $str;
}
343
4411f3b6 344# switch back to Encode package in case we ever add AutoLoader
345package Encode;
346
656753f8 3471;
348
2a936312 349__END__
350
4411f3b6 351=head1 NAME
352
353Encode - character encodings
354
355=head1 SYNOPSIS
356
357 use Encode;
358
359=head1 DESCRIPTION
360
47bfe92f 361The C<Encode> module provides the interfaces between Perl's strings
362and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6 363
364The repertoire of characters that Perl can represent is at least that
47bfe92f 365defined by the Unicode Consortium. On most platforms the ordinal
366values of the characters (as returned by C<ord(ch)>) is the "Unicode
367codepoint" for the character (the exceptions are those platforms where
368the legacy encoding is some variant of EBCDIC rather than a super-set
369of ASCII - see L<perlebcdic>).
4411f3b6 370
371Traditionaly computer data has been moved around in 8-bit chunks
372often called "bytes". These chunks are also known as "octets" in
373networking standards. Perl is widely used to manipulate data of
374many types - not only strings of characters representing human or
375computer languages but also "binary" data being the machines representation
376of numbers, pixels in an image - or just about anything.
377
47bfe92f 378When Perl is processing "binary data" the programmer wants Perl to process
379"sequences of bytes". This is not a problem for Perl - as a byte has 256
380possible values it easily fits in Perl's much larger "logical character".
4411f3b6 381
382=head2 TERMINOLOGY
383
4ac9195f 384=over 4
4411f3b6 385
386=item *
387
388I<character>: a character in the range 0..(2**32-1) (or more).
47bfe92f 389(What Perl's strings are made of.)
4411f3b6 390
391=item *
392
393I<byte>: a character in the range 0..255
47bfe92f 394(A special case of a Perl character.)
4411f3b6 395
396=item *
397
398I<octet>: 8 bits of data, with ordinal values 0..255
47bfe92f 399(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
4411f3b6 400
401=back
402
403The marker [INTERNAL] marks Internal Implementation Details, in
404general meant only for those who think they know what they are doing,
405and such details may change in future releases.
406
407=head1 ENCODINGS
408
409=head2 Characteristics of an Encoding
410
411An encoding has a "repertoire" of characters that it can represent,
412and for each representable character there is at least one sequence of
413octets that represents it.
414
415=head2 Types of Encodings
416
417Encodings can be divided into the following types:
418
419=over 4
420
421=item * Fixed length 8-bit (or less) encodings.
422
423Each character is a single octet so may have a repertoire of up to
424256 characters. ASCII and iso-8859-* are typical examples.
425
426=item * Fixed length 16-bit encodings
427
428Each character is two octets so may have a repertoire of up to
47bfe92f 42965 536 characters. Unicode's UCS-2 is an example. Also used for
4411f3b6 430encodings for East Asian languages.
431
432=item * Fixed length 32-bit encodings.
433
434Not really very "encoded" encodings. The Unicode code points
435are just represented as 4-octet integers. None the less because
436different architectures use different representations of integers
437(so called "endian") there are at least two distinct encodings.
438
439=item * Multi-byte encodings
440
441The number of octets needed to represent a character varies.
442UTF-8 is a particularly complex but regular case of a multi-byte
443encoding. Several East Asian countries use a multi-byte encoding
444where 1-octet is used to cover western roman characters and Asian
445characters get 2-octets.
446(UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
447to represent a Unicode code point.)
448
449=item * "Escape" encodings.
450
451These encodings embed "escape sequences" into the octet sequence
452which describe how the following octets are to be interpreted.
453The iso-2022-* family is typical. Following the escape sequence
454octets are encoded by an "embedded" encoding (which will be one
455of the above types) until another escape sequence switches to
456a different "embedded" encoding.
457
458These schemes are very flexible and can handle mixed languages but are
47bfe92f 459very complex to process (and have state). No escape encodings are
460implemented for Perl yet.
4411f3b6 461
462=back
463
464=head2 Specifying Encodings
465
466Encodings can be specified to the API described below in two ways:
467
468=over 4
469
470=item 1. By name
471
47bfe92f 472Encoding names are strings with characters taken from a restricted
473repertoire. See L</"Encoding Names">.
4411f3b6 474
475=item 2. As an object
476
477Encoding objects are returned by C<find_encoding($name)>.
478
479=back
480
481=head2 Encoding Names
482
483Encoding names are case insensitive. White space in names is ignored.
47bfe92f 484In addition an encoding may have aliases. Each encoding has one
485"canonical" name. The "canonical" name is chosen from the names of
486the encoding by picking the first in the following sequence:
4411f3b6 487
488=over 4
489
490=item * The MIME name as defined in IETF RFC-XXXX.
491
492=item * The name in the IANA registry.
493
494=item * The name used by the organization that defined it.
495
496=back
497
498Because of all the alias issues, and because in the general case
499encodings have state C<Encode> uses the encoding object internally
500once an operation is in progress.
501
4411f3b6 502=head1 PERL ENCODING API
503
504=head2 Generic Encoding Interface
505
506=over 4
507
508=item *
509
510 $bytes = encode(ENCODING, $string[, CHECK])
511
47bfe92f 512Encodes string from Perl's internal form into I<ENCODING> and returns
513a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6 514
515=item *
516
517 $string = decode(ENCODING, $bytes[, CHECK])
518
47bfe92f 519Decode sequence of octets assumed to be in I<ENCODING> into Perl's
520internal form and returns the resulting string. For CHECK see
521L</"Handling Malformed Data">.
522
523=item *
524
525 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
526
2b106fbe 527Convert B<in-place> the data between two encodings. How did the data
528in $string originally get to be in FROM_ENCODING? Either using
529encode() or through PerlIO: See L</"Encode and PerlIO">. For CHECK
530see L</"Handling Malformed Data">.
531
532For example to convert ISO 8859-1 data to UTF-8:
533
534 from_to($data, "iso-8859-1", "utf-8");
535
536and to convert it back:
537
538 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 539
ab97ca19 540Note that because the conversion happens in place, the data to be
541converted cannot be a string constant, it must be a scalar variable.
542
4411f3b6 543=back
544
545=head2 Handling Malformed Data
546
547If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f 548be UTF-8, an optional lexical warning (category utf8) is given. If
549CHECK is true but not a code reference, dies.
4411f3b6 550
47bfe92f 551It would be desirable to have a way to indicate that the transform should use
552the encoding's "replacement character" - no such mechanism is defined yet.
4411f3b6 553
554It is also planned to allow I<CHECK> to be a code reference.
555
47bfe92f 556This is not yet implemented as there are design issues with what its
557arguments should be and how it returns its results.
4411f3b6 558
559=over 4
560
561=item Scheme 1
562
563Passed remaining fragment of string being processed.
564Modifies it in place to remove bytes/characters it can understand
565and returns a string used to represent them.
566e.g.
567
568 sub fixup {
569 my $ch = substr($_[0],0,1,'');
570 return sprintf("\\x{%02X}",ord($ch));
571 }
572
573This scheme is close to how underlying C code for Encode works, but gives
574the fixup routine very little context.
575
576=item Scheme 2
577
47bfe92f 578Passed original string, and an index into it of the problem area, and
579output string so far. Appends what it will to output string and
580returns new index into original string. For example:
4411f3b6 581
582 sub fixup {
583 # my ($s,$i,$d) = @_;
584 my $ch = substr($_[0],$_[1],1);
585 $_[2] .= sprintf("\\x{%02X}",ord($ch));
586 return $_[1]+1;
587 }
588
47bfe92f 589This scheme gives maximal control to the fixup routine but is more
590complicated to code, and may need internals of Encode to be tweaked to
591keep original string intact.
4411f3b6 592
593=item Other Schemes
594
595Hybrids of above.
596
597Multiple return values rather than in-place modifications.
598
599Index into the string could be pos($str) allowing s/\G...//.
600
601=back
602
603=head2 UTF-8 / utf8
604
605The Unicode consortium defines the UTF-8 standard as a way of encoding
47bfe92f 606the entire Unicode repertoire as sequences of octets. This encoding is
607expected to become very widespread. Perl can use this form internally
608to represent strings, so conversions to and from this form are
609particularly efficient (as octets in memory do not have to change,
610just the meta-data that tells Perl how to treat them).
4411f3b6 611
612=over 4
613
614=item *
615
616 $bytes = encode_utf8($string);
617
47bfe92f 618The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6 619and the resulting octets returned as a sequence of bytes. All possible
620characters have a UTF-8 representation so this function cannot fail.
621
622=item *
623
624 $string = decode_utf8($bytes [,CHECK]);
625
47bfe92f 626The sequence of octets represented by $bytes is decoded from UTF-8
627into a sequence of logical characters. Not all sequences of octets
628form valid UTF-8 encodings, so it is possible for this call to fail.
629For CHECK see L</"Handling Malformed Data">.
4411f3b6 630
631=back
632
633=head2 Other Encodings of Unicode
634
47bfe92f 635UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. UCS-2 can only
636represent 0..0xFFFF, while UTF-16 has a "surrogate pair" scheme which
637allows it to cover the whole Unicode range.
4411f3b6 638
8040349a 639Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
47bfe92f 640happens to be the name used by that representation when used with X11
641fonts.
4411f3b6 642
643UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
644can be considered as being in this form without encoding. An encoding
47bfe92f 645to transfer strings in this form (e.g. to write them to a file) would
646need to
4411f3b6 647
648 pack('L*',map(ord($_),split(//,$string))); # native
649 or
650 pack('V*',map(ord($_),split(//,$string))); # little-endian
651 or
652 pack('N*',map(ord($_),split(//,$string))); # big-endian
653
654depending on the endian required.
655
51ef4e11 656No UTF-32 encodings are implemented yet.
4411f3b6 657
47bfe92f 658Both UCS-2 and UCS-4 style encodings can have "byte order marks" by
659representing the code point 0xFEFF as the very first thing in a file.
4411f3b6 660
51ef4e11 661=head2 Listing available encodings
662
663 use Encode qw(encodings);
664 @list = encodings();
665
666Returns a list of the canonical names of the available encodings.
667
668=head2 Defining Aliases
669
670 use Encode qw(define_alias);
671 define_alias( newName => ENCODING);
672
47bfe92f 673Allows newName to be used as an alias for ENCODING. ENCODING may be
674either the name of an encoding or an encoding object (as above).
51ef4e11 675
676Currently I<newName> can be specified in the following ways:
677
678=over 4
679
680=item As a simple string.
681
682=item As a qr// compiled regular expression, e.g.:
683
684 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
685
47bfe92f 686In this case if I<ENCODING> is not a reference it is C<eval>-ed to
687allow C<$1> etc. to be substituted. The example is one way to alias names
688as used in X11 font names to the MIME names for the iso-8859-*
689family.
51ef4e11 690
691=item As a code reference, e.g.:
692
693 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
694
695In this case C<$_> will be set to the name that is being looked up and
47bfe92f 696I<ENCODING> is passed to the sub as its first argument. The example
697is another way to alias names as used in X11 font names to the MIME
698names for the iso-8859-* family.
51ef4e11 699
700=back
701
702=head2 Defining Encodings
703
704 use Encode qw(define_encoding);
705 define_encoding( $object, 'canonicalName' [,alias...]);
706
47bfe92f 707Causes I<canonicalName> to be associated with I<$object>. The object
708should provide the interface described in L</"IMPLEMENTATION CLASSES">
709below. If more than two arguments are provided then additional
710arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11 711
4411f3b6 712=head1 Encoding and IO
713
714It is very common to want to do encoding transformations when
715reading or writing files, network connections, pipes etc.
47bfe92f 716If Perl is configured to use the new 'perlio' IO system then
4411f3b6 717C<Encode> provides a "layer" (See L<perliol>) which can transform
718data as it is read or written.
719
51ef4e11 720 open(my $ilyad,'>:encoding(iso-8859-7)','ilyad.greek');
4411f3b6 721 print $ilyad @epic;
722
723In addition the new IO system can also be configured to read/write
724UTF-8 encoded characters (as noted above this is efficient):
725
726 open(my $fh,'>:utf8','anything');
727 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
728
729Either of the above forms of "layer" specifications can be made the default
730for a lexical scope with the C<use open ...> pragma. See L<open>.
731
732Once a handle is open its layers can be altered using C<binmode>.
733
47bfe92f 734Without any such configuration, or if Perl itself is built using
4411f3b6 735system's own IO, then write operations assume that file handle accepts
736only I<bytes> and will C<die> if a character larger than 255 is
737written to the handle. When reading, each octet from the handle
738becomes a byte-in-a-character. Note that this default is the same
47bfe92f 739behaviour as bytes-only languages (including Perl before v5.6) would
740have, and is sufficient to handle native 8-bit encodings
741e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
742other encodings and binary data.
743
744In other cases it is the program's responsibility to transform
745characters into bytes using the API above before doing writes, and to
746transform the bytes read from a handle into characters before doing
747"character operations" (e.g. C<lc>, C</\W+/>, ...).
748
749=head1 Encode and PerlIO
750
751The PerlIO layer (new since Perl 5.7) can be used to automatically
752convert the data being read in or written out, either from
753some encoding into Perl's internal encoding or from Perl's internal
754encoding into some other encoding.
755
756Examples:
4411f3b6 757
47bfe92f 758 open(my $f, "<:encoding(cp1252)")
759
760 open(my $g, ">:encoding(iso-8859-1)")
761
762You can also use PerlIO to convert larger amounts of data you don't
763want to bring into memory. For example to convert between ISO 8859-1
764(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
765
766 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
767 open(G, ">:utf8", "data.utf") or die $!;
768 while (<F>) { print G }
769
2b106fbe 770 # Could also do "print G <F>" but that would pull
47bfe92f 771 # the whole file into memory just to write it out again.
772
773See L<PerlIO> for more information.
4411f3b6 774
775=head1 Encoding How to ...
776
777To do:
778
779=over 4
780
781=item * IO with mixed content (faking iso-2022-*)
782
783=item * MIME's Content-Length:
784
785=item * UTF-8 strings in binary data.
786
47bfe92f 787=item * Perl/Encode wrappers on non-Unicode XS modules.
4411f3b6 788
789=back
790
791=head1 Messing with Perl's Internals
792
47bfe92f 793The following API uses parts of Perl's internals in the current
794implementation. As such they are efficient, but may change.
4411f3b6 795
796=over 4
797
4411f3b6 798=item * is_utf8(STRING [, CHECK])
799
800[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f 801If CHECK is true, also checks the data in STRING for being well-formed
802UTF-8. Returns true if successful, false otherwise.
4411f3b6 803
804=item * valid_utf8(STRING)
805
47bfe92f 806[INTERNAL] Test whether STRING is in a consistent state. Will return
807true if string is held as bytes, or is well-formed UTF-8 and has the
808UTF-8 flag on. Main reason for this routine is to allow Perl's
809testsuite to check that operations have left strings in a consistent
810state.
4411f3b6 811
812=item *
813
814 _utf8_on(STRING)
815
816[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
817B<not> checked for being well-formed UTF-8. Do not use unless you
818B<know> that the STRING is well-formed UTF-8. Returns the previous
819state of the UTF-8 flag (so please don't test the return value as
820I<not> success or failure), or C<undef> if STRING is not a string.
821
822=item *
823
824 _utf8_off(STRING)
825
826[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
827Returns the previous state of the UTF-8 flag (so please don't test the
828return value as I<not> success or failure), or C<undef> if STRING is
829not a string.
830
831=back
832
4edaa979 833=head1 IMPLEMENTATION CLASSES
834
835As mentioned above encodings are (in the current implementation at least)
836defined by objects. The mapping of encoding name to object is via the
51ef4e11 837C<%encoding> hash.
4edaa979 838
839The values of the hash can currently be either strings or objects.
840The string form may go away in the future. The string form occurs
841when C<encodings()> has scanned C<@INC> for loadable encodings but has
842not actually loaded the encoding in question. This is because the
47bfe92f 843current "loading" process is all Perl and a bit slow.
4edaa979 844
47bfe92f 845Once an encoding is loaded the value in the hash is the object which
846implements the encoding. The object should provide the following
847interface:
4edaa979 848
849=over 4
850
851=item -E<gt>name
852
853Should return the string representing the canonical name of the encoding.
854
855=item -E<gt>new_sequence
856
47bfe92f 857This is a placeholder for encodings with state. It should return an
858object which implements this interface, all current implementations
859return the original object.
4edaa979 860
861=item -E<gt>encode($string,$check)
862
47bfe92f 863Should return the octet sequence representing I<$string>. If I<$check>
864is true it should modify I<$string> in place to remove the converted
865part (i.e. the whole string unless there is an error). If an error
866occurs it should return the octet sequence for the fragment of string
867that has been converted, and modify $string in-place to remove the
868converted part leaving it starting with the problem fragment.
4edaa979 869
47bfe92f 870If check is false then C<encode> should make a "best effort" to
871convert the string - for example by using a replacement character.
4edaa979 872
873=item -E<gt>decode($octets,$check)
874
47bfe92f 875Should return the string that I<$octets> represents. If I<$check> is
876true it should modify I<$octets> in place to remove the converted part
877(i.e. the whole sequence unless there is an error). If an error
878occurs it should return the fragment of string that has been
879converted, and modify $octets in-place to remove the converted part
4edaa979 880leaving it starting with the problem fragment.
881
47bfe92f 882If check is false then C<decode> should make a "best effort" to
883convert the string - for example by using Unicode's "\x{FFFD}" as a
884replacement character.
4edaa979 885
886=back
887
47bfe92f 888It should be noted that the check behaviour is different from the
889outer public API. The logic is that the "unchecked" case is useful
890when encoding is part of a stream which may be reporting errors
891(e.g. STDERR). In such cases it is desirable to get everything
892through somehow without causing additional errors which obscure the
893original one. Also the encoding is best placed to know what the
894correct replacement character is, so if that is the desired behaviour
895then letting low level code do it is the most efficient.
896
897In contrast if check is true, the scheme above allows the encoding to
898do as much as it can and tell layer above how much that was. What is
899lacking at present is a mechanism to report what went wrong. The most
900likely interface will be an additional method call to the object, or
901perhaps (to avoid forcing per-stream objects on otherwise stateless
902encodings) an additional parameter.
903
904It is also highly desirable that encoding classes inherit from
905C<Encode::Encoding> as a base class. This allows that class to define
906additional behaviour for all encoding objects. For example built in
907Unicode, UCS-2 and UTF-8 classes use :
51ef4e11 908
909 package Encode::MyEncoding;
910 use base qw(Encode::Encoding);
911
912 __PACKAGE__->Define(qw(myCanonical myAlias));
913
47bfe92f 914To create an object with bless {Name => ...},$class, and call
915define_encoding. They inherit their C<name> method from
916C<Encode::Encoding>.
4edaa979 917
918=head2 Compiled Encodings
919
47bfe92f 920F<Encode.xs> provides a class C<Encode::XS> which provides the
921interface described above. It calls a generic octet-sequence to
922octet-sequence "engine" that is driven by tables (defined in
923F<encengine.c>). The same engine is used for both encode and
924decode. C<Encode::XS>'s C<encode> forces Perl's characters to their
925UTF-8 form and then treats them as just another multibyte
926encoding. C<Encode::XS>'s C<decode> transforms the sequence and then
927turns on the UTF-8 flag as that is the form that the tables are
928defined to produce. For details of the engine see the comments in
929F<encengine.c>.
930
931The tables are produced by the Perl script F<compile> (the name needs
932to change so we can eventually install it somewhere). F<compile> can
933currently read two formats:
4edaa979 934
935=over 4
936
937=item *.enc
938
47bfe92f 939This is a coined format used by Tcl. It is documented in
940Encode/EncodeFormat.pod.
4edaa979 941
942=item *.ucm
943
944This is the semi-standard format used by IBM's ICU package.
945
946=back
947
948F<compile> can write the following forms:
949
950=over 4
951
952=item *.ucm
953
954See above - the F<Encode/*.ucm> files provided with the distribution have
955been created from the original Tcl .enc files using this approach.
956
957=item *.c
958
959Produces tables as C data structures - this is used to build in encodings
960into F<Encode.so>/F<Encode.dll>.
961
962=item *.xs
963
47bfe92f 964In theory this allows encodings to be stand-alone loadable Perl
965extensions. The process has not yet been tested. The plan is to use
966this approach for large East Asian encodings.
4edaa979 967
968=back
969
47bfe92f 970The set of encodings built-in to F<Encode.so>/F<Encode.dll> is
971determined by F<Makefile.PL>. The current set is as follows:
4edaa979 972
973=over 4
974
975=item ascii and iso-8859-*
976
977That is all the common 8-bit "western" encodings.
978
979=item IBM-1047 and two other variants of EBCDIC.
980
47bfe92f 981These are the same variants that are supported by EBCDIC Perl as
982"native" encodings. They are included to prove "reversibility" of
983some constructs in EBCDIC Perl.
4edaa979 984
985=item symbol and dingbats as used by Tk on X11.
986
47bfe92f 987(The reason Encode got started was to support Perl/Tk.)
4edaa979 988
989=back
990
47bfe92f 991That set is rather ad hoc and has been driven by the needs of the
992tests rather than the needs of typical applications. It is likely
993to be rationalized.
4edaa979 994
4411f3b6 995=head1 SEE ALSO
996
47bfe92f 997L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>
4411f3b6 998
999=cut
1000