8 @ISA = qw(Exporter DynaLoader);
10 # Public, encouraged API is exported by default
34 # Documentation moved after __END__ for speed - NI-S
38 # The global hash is declared in XS code
39 $encoding{Unicode} = bless({},'Encode::Unicode');
40 $encoding{utf8} = bless({},'Encode::utf8');
41 $encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
46 foreach my $dir (@INC)
48 if (opendir(my $dh,"$dir/Encode"))
50 while (defined(my $name = readdir($dh)))
52 if ($name =~ /^(.*)\.enc$/)
54 next if exists $encoding{$1};
55 $encoding{$1} = "$dir/$name";
61 return keys %encoding;
66 my ($class,$name,$file) = @_;
67 if (open(my $fh,$file))
73 $type = substr($line,0,1);
74 last unless $type eq '#';
76 $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
77 #warn "Loading $file";
78 return $class->read($fh,$name,$type);
88 my ($class,$name) = @_;
90 unless (ref($enc = $encoding{$name}))
92 $enc = $class->loadEncoding($name,$enc) if defined $enc;
95 foreach my $dir (@INC)
97 last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
100 $encoding{$name} = $enc;
108 return __PACKAGE__->getEncoding($name);
113 my ($name,$string,$check) = @_;
114 my $enc = find_encoding($name);
115 croak("Unknown encoding '$name'") unless defined $enc;
116 my $octets = $enc->encode($string,$check);
117 return undef if ($check && length($string));
123 my ($name,$octets,$check) = @_;
124 my $enc = find_encoding($name);
125 croak("Unknown encoding '$name'") unless defined $enc;
126 my $string = $enc->decode($octets,$check);
127 return undef if ($check && length($octets));
133 my ($string,$from,$to,$check) = @_;
134 my $f = find_encoding($from);
135 croak("Unknown encoding '$from'") unless defined $f;
136 my $t = find_encoding($to);
137 croak("Unknown encoding '$to'") unless defined $t;
138 my $uni = $f->decode($string,$check);
139 return undef if ($check && length($string));
140 $string = $t->encode($uni,$check);
141 return undef if ($check && length($uni));
142 return length($_[0] = $string);
155 return undef unless utf8_decode($str);
159 package Encode::Encoding;
160 # Base class for classes which implement encodings
161 # Temporary legacy methods
162 sub toUnicode { shift->decode(@_) } # legacy alias: forwards all remaining arguments to the invocant's decode() method
163 sub fromUnicode { shift->encode(@_) } # legacy alias: forwards all remaining arguments to the invocant's encode() method
166 use base 'Encode::Encoding';
168 package Encode::Unicode;
169 use base 'Encode::Encoding';
171 # Dummy package that provides the encode interface but leaves data
172 # as UTF-8 encoded. It is here so that from_to() works.
174 sub name { 'Unicode' } # constant name of this encoding; ignores its arguments
178 my ($obj,$str,$chk) = @_;
179 Encode::utf8_upgrade($str);
186 package Encode::utf8;
187 use base 'Encode::Encoding';
189 # package to allow long-hand
190 # $octets = encode( utf8 => $string );
197 my ($obj,$octets,$chk) = @_;
198 my $str = Encode::decode_utf8($octets);
209 my ($obj,$string,$chk) = @_;
210 my $octets = Encode::encode_utf8($string);
215 package Encode::Table;
216 use base 'Encode::Encoding';
220 my ($class,$fh,$name,$type) = @_;
221 my $rep = $class->can("rep_$type");
222 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
231 my $page = hex($line);
233 my $ch = $page * 256;
234 for (my $i = 0; $i < 16; $i++)
237 for (my $j = 0; $j < 16; $j++)
239 my $val = hex(substr($line,0,4,''));
254 $touni[$page] = \@page;
257 return bless {Name => $name,
266 sub name { shift->{'Name'} } # returns the 'Name' entry stored in the invocant's hash
272 sub rep_M { ($_[0] > 255) ? 'n' : 'C' } # 'n' when the first argument exceeds 255, otherwise 'C'; presumably the rep_$type callback for type 'M' -- confirm in read()
277 $ch = 0 unless @_ > 1;
283 my ($obj,$str,$chk) = @_;
284 my $rep = $obj->{'Rep'};
285 my $touni = $obj->{'ToUni'};
289 my $ch = ord(substr($str,0,1,''));
291 if (&$rep($ch) eq 'C')
293 $x = $touni->[0][$ch];
297 $x = $touni->[$ch][ord(substr($str,0,1,''))];
302 # What do we do here ?
307 $_[1] = $str if $chk;
313 my ($obj,$uni,$chk) = @_;
314 my $fmuni = $obj->{'FmUni'};
316 my $def = $obj->{'Def'};
317 my $rep = $obj->{'Rep'};
320 my $ch = substr($uni,0,1,'');
321 my $x = $fmuni->{chr(ord($ch))};
327 $str .= pack(&$rep($x),$x);
329 $_[1] = $uni if $chk;
333 package Encode::iso10646_1;
334 use base 'Encode::Encoding';
336 # Encoding is 16-bit network order Unicode
337 # Used for X font encodings
339 sub name { 'iso10646-1' } # constant name of this encoding; ignores its arguments
343 my ($obj,$str,$chk) = @_;
347 my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
350 $_[1] = $str if $chk;
351 Encode::utf8_upgrade($uni);
357 my ($obj,$uni,$chk) = @_;
361 my $ch = substr($uni,0,1,'');
368 $str .= pack('n',$x);
370 $_[1] = $uni if $chk;
375 package Encode::Escape;
376 use base 'Encode::Encoding';
382 my ($class,$fh,$name) = @_;
383 my %self = (Name => $name, Num => 0);
386 my ($key,$val) = /^(\S+)\s+(.*)$/;
387 $val =~ s/^\{(.*?)\}/$1/g;
388 $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
391 return bless \%self,$class;
394 sub name { shift->{'Name'} } # returns the 'Name' entry stored in the invocant's hash
398 croak("Not implemented yet");
403 croak("Not implemented yet");
406 # switch back to Encode package in case we ever add AutoLoader
415 Encode - character encodings
423 The C<Encode> module provides the interfaces between perl's strings
424 and the rest of the system. Perl strings are sequences of B<characters>.
426 The repertoire of characters that Perl can represent is at least that
427 defined by the Unicode Consortium. On most platforms the ordinal value
428 of a character (as returned by C<ord(ch)>) is the "Unicode codepoint" for
429 the character (the exceptions are those platforms where the legacy
430 encoding is some variant of EBCDIC rather than a super-set of ASCII
431 - see L<perlebcdic>).
433 Traditionally, computer data has been moved around in 8-bit chunks
434 often called "bytes". These chunks are also known as "octets" in
435 networking standards. Perl is widely used to manipulate data of
436 many types - not only strings of characters representing human or
437 computer languages but also "binary" data being the machine's representation
438 of numbers, pixels in an image - or just about anything.
440 When perl is processing "binary data" the programmer wants perl to process
441 "sequences of bytes". This is not a problem for perl - as a byte has 256
442 possible values it easily fits in perl's much larger "logical character".
450 I<character>: a character in the range 0..(2**32-1) (or more).
451 (What perl's strings are made of.)
455 I<byte>: a character in the range 0..255
456 (A special case of a perl character.)
460 I<octet>: 8 bits of data, with ordinal values 0..255
461 (Term for bytes passed to or from a non-perl context, e.g. disk file.)
465 The marker [INTERNAL] marks Internal Implementation Details, in
466 general meant only for those who think they know what they are doing,
467 and such details may change in future releases.
471 =head2 Characteristics of an Encoding
473 An encoding has a "repertoire" of characters that it can represent,
474 and for each representable character there is at least one sequence of
475 octets that represents it.
477 =head2 Types of Encodings
479 Encodings can be divided into the following types:
483 =item * Fixed length 8-bit (or less) encodings.
485 Each character is a single octet so may have a repertoire of up to
486 256 characters. ASCII and iso-8859-* are typical examples.
488 =item * Fixed length 16-bit encodings
490 Each character is two octets so may have a repertoire of up to
491 65,536 characters. Unicode's UCS-2 is an example. Also used for
492 encodings for East Asian languages.
494 =item * Fixed length 32-bit encodings.
496 Not really very "encoded" encodings. The Unicode code points
497 are just represented as 4-octet integers. None the less because
498 different architectures use different representations of integers
499 (so called "endian") there are at least two distinct encodings.
501 =item * Multi-byte encodings
503 The number of octets needed to represent a character varies.
504 UTF-8 is a particularly complex but regular case of a multi-byte
505 encoding. Several East Asian countries use a multi-byte encoding
506 where 1-octet is used to cover western roman characters and Asian
507 characters get 2-octets.
508 (UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
509 to represent a Unicode code point.)
511 =item * "Escape" encodings.
513 These encodings embed "escape sequences" into the octet sequence
514 which describe how the following octets are to be interpreted.
515 The iso-2022-* family is typical. Following the escape sequence
516 octets are encoded by an "embedded" encoding (which will be one
517 of the above types) until another escape sequence switches to
518 a different "embedded" encoding.
520 These schemes are very flexible and can handle mixed languages but are
521 very complex to process (and have state).
522 No escape encodings are implemented for perl yet.
526 =head2 Specifying Encodings
528 Encodings can be specified to the API described below in two ways:
534 Encoding names are strings with characters taken from a restricted repertoire.
535 See L</"Encoding Names">.
537 =item 2. As an object
539 Encoding objects are returned by C<find_encoding($name)>.
543 =head2 Encoding Names
545 Encoding names are case insensitive. White space in names is ignored.
546 In addition an encoding may have aliases. Each encoding has one "canonical" name.
547 The "canonical" name is chosen from the names of the encoding by picking
548 the first in the following sequence:
552 =item * The MIME name as defined in IETF RFC-XXXX.
554 =item * The name in the IANA registry.
556 =item * The name used by the organization that defined it.
560 Because of all the alias issues, and because in the general case
561 encodings have state, C<Encode> uses the encoding object internally
562 once an operation is in progress.
564 I<Aliasing is not yet implemented.>
566 =head1 PERL ENCODING API
568 =head2 Generic Encoding Interface
574 $bytes = encode(ENCODING, $string[, CHECK])
576 Encodes string from perl's internal form into I<ENCODING> and returns a
578 See L</"Handling Malformed Data">.
582 $string = decode(ENCODING, $bytes[, CHECK])
584 Decodes a sequence of octets assumed to be in I<ENCODING> into perl's internal
585 form and returns the resulting string.
586 See L</"Handling Malformed Data">.
590 =head2 Handling Malformed Data
592 If CHECK is not set, C<undef> is returned. If the data is supposed to
593 be UTF-8, an optional lexical warning (category utf8) is given.
594 If CHECK is true but not a code reference, dies.
596 It would be desirable to have a way to indicate that the transform should use the
597 encoding's "replacement character" - no such mechanism is defined yet.
599 It is also planned to allow I<CHECK> to be a code reference.
601 This is not yet implemented as there are design issues with what its arguments
602 should be and how it returns its results.
608 Passed remaining fragment of string being processed.
609 Modifies it in place to remove bytes/characters it can understand
610 and returns a string used to represent them.
614 my $ch = substr($_[0],0,1,'');
615 return sprintf("\x{%02X}",ord($ch));
618 This scheme is close to how underlying C code for Encode works, but gives
619 the fixup routine very little context.
623 Passed original string, and an index into it of the problem area,
624 and output string so far.
625 Appends what it will to output string and returns new index into
630 # my ($s,$i,$d) = @_;
631 my $ch = substr($_[0],$_[1],1);
632 $_[2] .= sprintf("\x{%02X}",ord($ch));
636 This scheme gives maximal control to the fixup routine but is more complicated
637 to code, and may need internals of Encode to be tweaked to keep original
644 Multiple return values rather than in-place modifications.
646 Index into the string could be pos($str) allowing s/\G...//.
652 The Unicode consortium defines the UTF-8 standard as a way of encoding
653 the entire Unicode repertoire as sequences of octets. This encoding
654 is expected to become very widespread. Perl can use this form internally
655 to represent strings, so conversions to and from this form are particularly
656 efficient (as octets in memory do not have to change, just the meta-data
657 that tells perl how to treat them).
663 $bytes = encode_utf8($string);
665 The characters that comprise the string are encoded in perl's superset of UTF-8
666 and the resulting octets returned as a sequence of bytes. All possible
667 characters have a UTF-8 representation so this function cannot fail.
671 $string = decode_utf8($bytes [,CHECK]);
673 The sequence of octets represented by $bytes is decoded from UTF-8 into
674 a sequence of logical characters. Not all sequences of octets form valid
675 UTF-8 encodings, so it is possible for this call to fail.
676 See L</"Handling Malformed Data">.
680 =head2 Other Encodings of Unicode
682 UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks.
683 UCS-2 can only represent 0..0xFFFF, while UTF-16 has a "surrogate pair"
684 scheme which allows it to cover the whole Unicode range.
686 Encode implements big-endian UCS-2 as the encoding "iso10646-1" as that
687 happens to be the name used by that representation when used with X11 fonts.
689 UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
690 can be considered as being in this form without encoding. An encoding
691 to transfer strings in this form (e.g. to write them to a file) would need to
693 pack('L*',map(ord($_),split(//,$string))); # native
695 pack('V*',map(ord($_),split(//,$string))); # little-endian
697 pack('N*',map(ord($_),split(//,$string))); # big-endian
699 depending on the endian required.
701 No UTF-32 encodings are implemented yet.
703 Both UCS-2 and UCS-4 style encodings can have "byte order marks" by representing
704 the code point 0xFEFF as the very first thing in a file.
706 =head1 Encoding and IO
708 It is very common to want to do encoding transformations when
709 reading or writing files, network connections, pipes etc.
710 If perl is configured to use the new 'perlio' IO system then
711 C<Encode> provides a "layer" (See L<perliol>) which can transform
712 data as it is read or written.
714 open(my $ilyad,'>:encoding(iso8859-7)','ilyad.greek');
717 In addition the new IO system can also be configured to read/write
718 UTF-8 encoded characters (as noted above this is efficient):
720 open(my $fh,'>:utf8','anything');
721 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
723 Either of the above forms of "layer" specifications can be made the default
724 for a lexical scope with the C<use open ...> pragma. See L<open>.
726 Once a handle is open, its layers can be altered using C<binmode>.
728 Without any such configuration, or if perl itself is built using
729 the system's own IO, then write operations assume that the file handle accepts
730 only I<bytes> and will C<die> if a character larger than 255 is
731 written to the handle. When reading, each octet from the handle
732 becomes a byte-in-a-character. Note that this default is the same
733 behaviour as bytes-only languages (including perl before v5.6) would have,
734 and is sufficient to handle native 8-bit encodings e.g. iso-8859-1,
735 EBCDIC etc. and any legacy mechanisms for handling other encodings
738 In other cases it is the program's responsibility
739 to transform characters into bytes using the API above before
740 doing writes, and to transform the bytes read from a handle into characters
741 before doing "character operations" (e.g. C<lc>, C</\W+/>, ...).
743 =head1 Encoding How to ...
749 =item * IO with mixed content (faking iso-2022-*)
751 =item * MIME's Content-Length:
753 =item * UTF-8 strings in binary data.
755 =item * perl/Encode wrappers on non-Unicode XS modules.
759 =head1 Messing with Perl's Internals
761 The following API uses parts of perl's internals in the current implementation.
762 As such they are efficient, but may change.
768 $num_octets = utf8_upgrade($string);
770 Converts internal representation of string to the UTF-8 form.
771 Returns the number of octets necessary to represent the string as UTF-8.
773 =item * utf8_downgrade($string[, CHECK])
775 Converts internal representation of string to be un-encoded bytes.
777 =item * is_utf8(STRING [, CHECK])
779 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
780 If CHECK is true, also checks the data in STRING for being
781 well-formed UTF-8. Returns true if successful, false otherwise.
783 =item * valid_utf8(STRING)
785 [INTERNAL] Test whether STRING is in a consistent state.
786 Will return true if string is held as bytes, or is well-formed UTF-8
787 and has the UTF-8 flag on.
788 Main reason for this routine is to allow perl's testsuite to check
789 that operations have left strings in a consistent state.
795 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
796 B<not> checked for being well-formed UTF-8. Do not use unless you
797 B<know> that the STRING is well-formed UTF-8. Returns the previous
798 state of the UTF-8 flag (so please don't test the return value as
799 I<not> success or failure), or C<undef> if STRING is not a string.
805 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
806 Returns the previous state of the UTF-8 flag (so please don't test the
807 return value as I<not> success or failure), or C<undef> if STRING is
814 L<perlunicode>, L<perlebcdic>, L<perlfunc/open>