More EBCDIC tweaks:
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
2c674647 3
51ef4e11 4our $VERSION = 0.02;
2c674647 5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 encode
14 decode
15 encode_utf8
16 decode_utf8
17 find_encoding
51ef4e11 18 encodings
4411f3b6 19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
51ef4e11 23 define_encoding
24 define_alias
2c674647 25 from_to
26 is_utf8
4411f3b6 27 is_8bit
28 is_16bit
a12c0f56 29 utf8_upgrade
30 utf8_downgrade
4411f3b6 31 _utf8_on
32 _utf8_off
2c674647 33 );
34
35bootstrap Encode ();
36
4411f3b6 37# Documentation moved after __END__ for speed - NI-S
2c674647 38
bf230f3d 39use Carp;
40
51ef4e11 41# Make a %encoding package variable to allow a certain amount of cheating
42our %encoding;
43my @alias; # ordered matching list
44my %alias; # cached known aliases
5345d506 45
# Return the names under which encodings have been registered, i.e. the
# keys of %encoding.  Works both as a plain function and as a class
# method; any argument passed is ignored.
sub encodings
{
    return keys %encoding;
}
51
# Resolve NAME through the alias table.
#
# Called as a class method: CLASS->findAlias(NAME).  Returns the encoding
# the name maps to, or undef when no alias yields a known encoding.
#
# @alias holds (pattern, target) pairs in definition order; the first
# pair that leads to a usable encoding wins and the answer is cached in
# %alias.  A pattern may be
#   - a qr// object: matched against NAME.  A target that is not a
#     reference is string-eval'ed right after the match so that it can
#     interpolate $1 etc.; a reference (encoding object) is used as is.
#   - a code reference: called with the target as its argument while $_
#     holds NAME.
#   - a plain string: compared case-insensitively with NAME.
sub findAlias
{
    my $class = shift;
    local $_ = shift;
    unless (exists $alias{$_})
    {
        for (my $i = 0; $i < @alias; $i += 2)
        {
            my $alias = $alias[$i];
            my $val   = $alias[$i+1];
            my $new;

            if (ref($alias) eq 'Regexp' && $_ =~ $alias)
            {
                # Only strings are eval'ed: evaluating a stringified
                # object reference could never produce anything usable.
                # The eval must stay directly after the match so that
                # $1, $2 ... still refer to it.
                $new = ref($val) ? $val : eval $val;
            }
            elsif (ref($alias) eq 'CODE')
            {
                $new = $alias->($val);
            }
            elsif (lc($_) eq lc($alias))
            {
                $new = $val;
            }

            if (defined($new))
            {
                next if $new eq $_;    # avoid (direct) recursion on bugs
                my $enc = (ref($new)) ? $new : find_encoding($new);
                if ($enc)
                {
                    $alias{$_} = $enc;
                    last;
                }
            }
        }
    }
    return $alias{$_};
}
90
# Append (alias => target) pairs to the ordered alias list @alias.
# Arguments are consumed two at a time; a missing final target is
# recorded as undef.
sub define_alias
{
    while (@_)
    {
        my $pattern = shift;
        my $target  = shift;
        push @alias, $pattern, $target;
    }
}
99
# Allow variants of iso-8859-1 etc.
define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );

# Allow latin-1 style names as well.
# The replacement string is string-eval'ed inside findAlias(), which is
# compiled before this point, so a file-scoped "my" array declared here
# would not be visible to it (and the eval would fail under strict).
# The table is therefore a package variable referenced by its fully
# qualified name.  Index is the latin-N number, value the iso-8859 part.
#                      0  1  2  3  4  5   6   7   8   9  10
our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
define_alias( qr/^latin[-_]?(\d+)$/i => '"iso-8859-$Encode::latin2iso_num[$1]"' );

# Common names for non-latin preferred MIME names
define_alias( 'ascii'    => 'US-ascii',
              'cyrillic' => 'iso-8859-5',
              'arabic'   => 'iso-8859-6',
              'greek'    => 'iso-8859-7',
              'hebrew'   => 'iso-8859-8');

define_alias( 'ibm-1047' => 'cp1047');

# Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
119
# Register OBJ as the encoding called NAME and return OBJ.
#
#   define_encoding($obj, $name, @aliases)
#
# NAME becomes a key of %encoding.  When NAME contains upper-case
# characters its lower-cased form is added as an alias, followed by one
# alias for every additional argument.
sub define_encoding
{
    my ($obj, $name, @extra) = @_;

    $encoding{$name} = $obj;

    my $lc = lc($name);
    if ($lc ne $name)
    {
        define_alias($lc => $obj);
    }

    foreach my $alias (@extra)
    {
        define_alias($alias, $obj);
    }

    return $obj;
}
134
# Class method: look NAME up among the registered encodings first and
# fall back to the alias table.  Returns whatever is stored for the
# name, or the result of findAlias() (undef when nothing matches).
sub getEncoding
{
    my ($class, $name) = @_;

    return $encoding{$name} if exists $encoding{$name};
    return $class->findAlias($name);
}
148
# Functional front end to getEncoding(): return the encoding registered
# under NAME (directly or via an alias), or undef if there is none.
sub find_encoding
{
    my $name = shift;
    return __PACKAGE__->getEncoding($name);
}
154
# $octets = encode(NAME, STRING [, CHECK])
#
# Convert STRING to octets using the encoding called NAME.  Croaks when
# the encoding is unknown.  With a true CHECK the result is undef if the
# encoding object left part of the string unconverted.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    # The encoding object strips what it converted from $string, so any
    # remainder signals a failure.
    my $octets = $enc->encode($string, $check);
    return ($check && length($string)) ? undef : $octets;
}
164
# $string = decode(NAME, OCTETS [, CHECK])
#
# Convert OCTETS to a string using the encoding called NAME.  Croaks when
# the encoding is unknown.  With a true CHECK the result is undef if the
# encoding object left part of the octets unconverted.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    # The encoding object strips what it converted from $octets, so any
    # remainder signals a failure.
    my $string = $enc->decode($octets, $check);
    return ($check && length($octets)) ? undef : $string;
}
174
# $length = from_to(STRING, FROM, TO [, CHECK])
#
# Convert STRING in place from encoding FROM to encoding TO by decoding
# with the first and encoding with the second.  Croaks when either
# encoding is unknown.  Returns the length of the converted data, or
# undef when CHECK is true and either step left input unconverted (the
# caller's variable is then left untouched).
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $source = find_encoding($from);
    defined $source or croak("Unknown encoding '$from'");
    my $target = find_encoding($to);
    defined $target or croak("Unknown encoding '$to'");

    my $uni = $source->decode($string, $check);
    if ($check && length($string))
    {
        return undef;
    }

    my $converted = $target->encode($uni, $check);
    if ($check && length($uni))
    {
        return undef;
    }

    # Write the result back through the alias to the caller's variable.
    return length($_[0] = $converted);
}
188
# Return a copy of the argument converted from characters to its UTF-8
# octet sequence; the caller's variable is not modified.
sub encode_utf8
{
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}
195
# Return a copy of the argument decoded from UTF-8 octets to characters,
# or undef when utf8::decode() reports failure.  The caller's variable
# is not modified.
sub decode_utf8
{
    my $chars = shift;
    utf8::decode($chars) or return undef;
    return $chars;
}
202
package Encode::Encoding;
# Base class for classes which implement encodings

# CLASS_OR_OBJ->Define(CANONICAL [, ALIAS ...])
# Register an encoding with Encode::define_encoding().  When invoked on
# a class name a new object { Name => CANONICAL } is blessed into that
# class first; an existing object is registered unchanged.
sub Define
{
    my ($obj, $canonical, @aliases) = @_;
    unless (ref $obj)
    {
        $obj = bless { Name => $canonical }, $obj;
    }
    # warn "$canonical => $obj\n";
    return Encode::define_encoding($obj, $canonical, @aliases);
}

# Canonical name stored by Define().
sub name
{
    my $self = shift;
    return $self->{'Name'};
}

# Temporary legacy methods: old names forwarding to decode()/encode().
# @_ is passed on as is so in-place updates of the data still reach the
# caller's variable.
sub toUnicode
{
    my $self = shift;
    return $self->decode(@_);
}

sub fromUnicode
{
    my $self = shift;
    return $self->encode(@_);
}

# Placeholder for encodings with state: hands back the invocant itself.
sub new_sequence
{
    my ($self) = @_;
    return $self;
}
50d26985 222
# Class for table-driven encodings; nothing is defined here in Perl
# beyond the inheritance from the common base class.
package Encode::XS;
use base qw(Encode::Encoding);
225
package Encode::Internal;
use base qw(Encode::Encoding);

# Dummy package that provides the encode interface but leaves data
# as UTF-X encoded. It is here so that from_to() works.

__PACKAGE__->Define('Internal');

# Where ord('A') is 65 the name 'Unicode' is simply another name for
# this pass-through encoding.
if (ord('A') == 65)
{
    Encode::define_alias( 'Unicode' => 'Internal' );
}

# $obj->decode($str [, $chk])
# Return an upgraded copy of $str.  With a true $chk the caller's
# variable is emptied to signal that everything was consumed.
sub decode
{
    my (undef, $text, $chk) = @_;
    utf8::upgrade($text);
    if ($chk)
    {
        $_[1] = '';
    }
    return $text;
}

# Encoding is the very same operation.
*encode = \&decode;
656753f8 245
# NOTE(review): every other class in this file lives under Encode:: -
# the Encoding:: prefix looks like a typo; confirm before renaming.
package Encoding::Unicode;
use base qw(Encode::Encoding);

# Registered as 'Unicode' only where ord('A') is not 65; elsewhere that
# name is not defined by this package.
unless (ord('A') == 65)
{
    __PACKAGE__->Define('Unicode');
}

# $obj->decode($str [, $chk])
# Map each character of $str through utf8::unicode_to_native() and
# return the resulting string.  With a true $chk the caller's variable
# is emptied to signal that everything was consumed.
sub decode
{
    my (undef, $str, $chk) = @_;
    my $res = join('',
                   map { chr(utf8::unicode_to_native(ord($_))) }
                   split(//, $str));
    $_[1] = '' if $chk;
    return $res;
}

# $obj->encode($str [, $chk])
# The inverse mapping: each character goes through
# utf8::native_to_unicode().  $chk is handled as in decode().
sub encode
{
    my (undef, $str, $chk) = @_;
    my $res = join('',
                   map { chr(utf8::native_to_unicode(ord($_))) }
                   split(//, $str));
    $_[1] = '' if $chk;
    return $res;
}
274
275
package Encode::utf8;
use base qw(Encode::Encoding);
# package to allow long-hand
#   $octets = encode( utf8 => $string );
#

__PACKAGE__->Define(qw(UTF-8 utf8));

# $obj->decode($octets [, $chk])
# Decode via Encode::decode_utf8().  Returns undef, leaving the caller's
# variable alone, when that fails; otherwise returns the string and,
# with a true $chk, empties the caller's variable.
sub decode
{
    my (undef, $octets, $chk) = @_;
    my $str = Encode::decode_utf8($octets);
    return undef unless defined $str;
    $_[1] = '' if $chk;
    return $str;
}

# $obj->encode($string [, $chk])
# Encode via Encode::encode_utf8(); with a true $chk the caller's
# variable is emptied.
sub encode
{
    my (undef, $string, $chk) = @_;
    my $octets = Encode::encode_utf8($string);
    if ($chk)
    {
        $_[1] = '';
    }
    return $octets;
}
303
package Encode::iso10646_1;
use base 'Encode::Encoding';
# Encoding is 16-bit network order Unicode (no surrogates)
# Used for X font encodings

__PACKAGE__->Define(qw(UCS-2 iso-10646-1));

# $obj->decode($octets [, $chk])
# Turn pairs of octets (big-endian 16-bit values) into characters and
# return the resulting string.
# With a true $chk the caller's variable is replaced by whatever was not
# converted: empty on success, or a lone trailing octet that does not
# form a complete 16-bit unit.  Without $chk such a stray octet is
# converted to a NUL character (best effort).
sub decode
{
    my ($obj,$str,$chk) = @_;
    my $uni = '';
    while (length($str))
    {
        if (length($str) < 2)
        {
            # Incomplete code unit at the end of the input.
            last if $chk;          # leave it for the caller to inspect
            $str = '';
            $uni .= chr(0);
            last;
        }
        $uni .= chr(unpack('n', substr($str,0,2,'')));
    }
    $_[1] = $str if $chk;
    utf8::upgrade($uni);
    return $uni;
}

# $obj->encode($string [, $chk])
# Turn each character into two octets in network order and return them.
# Characters above 0xFFFF cannot be represented.  With a true $chk
# conversion stops in front of the first such character and the caller's
# variable is replaced by the unconverted rest, starting with that
# character.  Without $chk the character is replaced by a 16-bit zero.
sub encode
{
    my ($obj,$uni,$chk) = @_;
    my $str = '';
    while (length($uni))
    {
        # Look at the first character without consuming it, so that a
        # failure under $chk leaves it at the front of the remainder.
        my $x = ord($uni);
        if ($x > 0xFFFF)
        {
            last if ($chk);
            $x = 0;
        }
        substr($uni,0,1,'');
        $str .= pack('n',$x);
    }
    $_[1] = $uni if $chk;
    return $str;
}
343
4411f3b6 344# switch back to Encode package in case we ever add AutoLoader
345package Encode;
346
656753f8 3471;
348
2a936312 349__END__
350
4411f3b6 351=head1 NAME
352
353Encode - character encodings
354
355=head1 SYNOPSIS
356
357 use Encode;
358
359=head1 DESCRIPTION
360
361The C<Encode> module provides the interfaces between perl's strings
362and the rest of the system. Perl strings are sequences of B<characters>.
363
364The repertoire of characters that Perl can represent is at least that
365defined by the Unicode Consortium. On most platforms the ordinal values
366of the characters (as returned by C<ord(ch)>) is the "Unicode codepoint" for
367the character (the exceptions are those platforms where the legacy
368encoding is some variant of EBCDIC rather than a super-set of ASCII
369- see L<perlebcdic>).
370
371Traditionally computer data has been moved around in 8-bit chunks
372often called "bytes". These chunks are also known as "octets" in
373networking standards. Perl is widely used to manipulate data of
374many types - not only strings of characters representing human or
375computer languages but also "binary" data being the machine's representation
376of numbers, pixels in an image - or just about anything.
377
378When perl is processing "binary data" the programmer wants perl to process
379"sequences of bytes". This is not a problem for perl - as a byte has 256
380possible values it easily fits in perl's much larger "logical character".
381
382=head2 TERMINOLOGY
383
4ac9195f 384=over 4
4411f3b6 385
386=item *
387
388I<character>: a character in the range 0..(2**32-1) (or more).
389(What perl's strings are made of.)
390
391=item *
392
393I<byte>: a character in the range 0..255
394(A special case of a perl character.)
395
396=item *
397
398I<octet>: 8 bits of data, with ordinal values 0..255
399(Term for bytes passed to or from a non-perl context, e.g. disk file.)
400
401=back
402
403The marker [INTERNAL] marks Internal Implementation Details, in
404general meant only for those who think they know what they are doing,
405and such details may change in future releases.
406
407=head1 ENCODINGS
408
409=head2 Characteristics of an Encoding
410
411An encoding has a "repertoire" of characters that it can represent,
412and for each representable character there is at least one sequence of
413octets that represents it.
414
415=head2 Types of Encodings
416
417Encodings can be divided into the following types:
418
419=over 4
420
421=item * Fixed length 8-bit (or less) encodings.
422
423Each character is a single octet so may have a repertoire of up to
424256 characters. ASCII and iso-8859-* are typical examples.
425
426=item * Fixed length 16-bit encodings
427
428Each character is two octets so may have a repertoire of up to
42965,536 characters. Unicode's UCS-2 is an example. Also used for
430encodings for East Asian languages.
431
432=item * Fixed length 32-bit encodings.
433
434Not really very "encoded" encodings. The Unicode code points
435are just represented as 4-octet integers. None the less because
436different architectures use different representations of integers
437(so called "endian") there are at least two distinct encodings.
438
439=item * Multi-byte encodings
440
441The number of octets needed to represent a character varies.
442UTF-8 is a particularly complex but regular case of a multi-byte
443encoding. Several East Asian countries use a multi-byte encoding
444where 1-octet is used to cover western roman characters and Asian
445characters get 2-octets.
446(UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
447to represent a Unicode code point.)
448
449=item * "Escape" encodings.
450
451These encodings embed "escape sequences" into the octet sequence
452which describe how the following octets are to be interpreted.
453The iso-2022-* family is typical. Following the escape sequence
454octets are encoded by an "embedded" encoding (which will be one
455of the above types) until another escape sequence switches to
456a different "embedded" encoding.
457
458These schemes are very flexible and can handle mixed languages but are
459very complex to process (and have state).
460No escape encodings are implemented for perl yet.
461
462=back
463
464=head2 Specifying Encodings
465
466Encodings can be specified to the API described below in two ways:
467
468=over 4
469
470=item 1. By name
471
472Encoding names are strings with characters taken from a restricted repertoire.
473See L</"Encoding Names">.
474
475=item 2. As an object
476
477Encoding objects are returned by C<find_encoding($name)>.
478
479=back
480
481=head2 Encoding Names
482
483Encoding names are case insensitive. White space in names is ignored.
484In addition an encoding may have aliases. Each encoding has one "canonical" name.
485The "canonical" name is chosen from the names of the encoding by picking
486the first in the following sequence:
487
488=over 4
489
490=item * The MIME name as defined in IETF RFC-XXXX.
491
492=item * The name in the IANA registry.
493
494=item * The name used by the organization that defined it.
495
496=back
497
498Because of all the alias issues, and because in the general case
499encodings have state C<Encode> uses the encoding object internally
500once an operation is in progress.
501
4411f3b6 502=head1 PERL ENCODING API
503
504=head2 Generic Encoding Interface
505
506=over 4
507
508=item *
509
510 $bytes = encode(ENCODING, $string[, CHECK])
511
512Encodes string from perl's internal form into I<ENCODING> and returns a
513sequence of octets.
514See L</"Handling Malformed Data">.
515
516=item *
517
518 $string = decode(ENCODING, $bytes[, CHECK])
519
520Decodes a sequence of octets assumed to be in I<ENCODING> into perl's internal
521form and returns the resulting string.
522See L</"Handling Malformed Data">.
523
524=back
525
526=head2 Handling Malformed Data
527
528If CHECK is not set, C<undef> is returned. If the data is supposed to
529be UTF-8, an optional lexical warning (category utf8) is given.
530If CHECK is true but not a code reference, dies.
531
532It would be desirable to have a way to indicate that transform should use the
533encoding's "replacement character" - no such mechanism is defined yet.
534
535It is also planned to allow I<CHECK> to be a code reference.
536
537This is not yet implemented as there are design issues with what its arguments
538should be and how it returns its results.
539
540=over 4
541
542=item Scheme 1
543
544Passed remaining fragment of string being processed.
545Modifies it in place to remove bytes/characters it can understand
546and returns a string used to represent them.
547e.g.
548
549 sub fixup {
550 my $ch = substr($_[0],0,1,'');
551 return sprintf("\x{%02X}",ord($ch));
552 }
553
554This scheme is close to how underlying C code for Encode works, but gives
555the fixup routine very little context.
556
557=item Scheme 2
558
559Passed original string, and an index into it of the problem area,
560and output string so far.
561Appends what it will to output string and returns new index into
562original string.
563e.g.
564
565 sub fixup {
566 # my ($s,$i,$d) = @_;
567 my $ch = substr($_[0],$_[1],1);
568 $_[2] .= sprintf("\x{%02X}",ord($ch));
569 return $_[1]+1;
570 }
571
572This scheme gives maximal control to the fixup routine but is more complicated
573to code, and may need internals of Encode to be tweaked to keep original
574string intact.
575
576=item Other Schemes
577
578Hybrids of above.
579
580Multiple return values rather than in-place modifications.
581
582Index into the string could be pos($str) allowing s/\G...//.
583
584=back
585
586=head2 UTF-8 / utf8
587
588The Unicode consortium defines the UTF-8 standard as a way of encoding
589the entire Unicode repertoire as sequences of octets. This encoding
590is expected to become very widespread. Perl can use this form internally
591to represent strings, so conversions to and from this form are particularly
592efficient (as octets in memory do not have to change, just the meta-data
593that tells perl how to treat them).
594
595=over 4
596
597=item *
598
599 $bytes = encode_utf8($string);
600
601The characters that comprise string are encoded in perl's superset of UTF-8
602and the resulting octets returned as a sequence of bytes. All possible
603characters have a UTF-8 representation so this function cannot fail.
604
605=item *
606
607 $string = decode_utf8($bytes [,CHECK]);
608
609The sequence of octets represented by $bytes is decoded from UTF-8 into
610a sequence of logical characters. Not all sequences of octets form valid
611UTF-8 encodings, so it is possible for this call to fail.
612See L</"Handling Malformed Data">.
613
614=back
615
616=head2 Other Encodings of Unicode
617
618UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks.
619UCS-2 can only represent 0..0xFFFF, while UTF-16 has a "surrogate pair"
620scheme which allows it to cover the whole Unicode range.
621
8040349a 622Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
4411f3b6 623happens to be the name used by that representation when used with X11 fonts.
624
625UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
626can be considered as being in this form without encoding. An encoding
627to transfer strings in this form (e.g. to write them to a file) would need to
628
629 pack('L*',map(ord($_),split(//,$string))); # native
630 or
631 pack('V*',map(ord($_),split(//,$string))); # little-endian
632 or
633 pack('N*',map(ord($_),split(//,$string))); # big-endian
634
635depending on the endian required.
636
51ef4e11 637No UTF-32 encodings are implemented yet.
4411f3b6 638
639Both UCS-2 and UCS-4 style encodings can have "byte order marks" by representing
640the code point 0xFEFF as the very first thing in a file.
641
51ef4e11 642=head2 Listing available encodings
643
644 use Encode qw(encodings);
645 @list = encodings();
646
647Returns a list of the canonical names of the available encodings.
648
649=head2 Defining Aliases
650
651 use Encode qw(define_alias);
652 define_alias( newName => ENCODING);
653
654Allows newName to be used as an alias for ENCODING. ENCODING may be either the
655name of an encoding or an encoding object (as above).
656
657Currently I<newName> can be specified in the following ways:
658
659=over 4
660
661=item As a simple string.
662
663=item As a qr// compiled regular expression, e.g.:
664
665 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
666
667In this case if I<ENCODING> is not a reference it is C<eval>-ed to allow
668C<$1> etc. to be substituted.
669The example is one way to alias names as used in X11 font names to the MIME names for the
670iso-8859-* family.
671
672=item As a code reference, e.g.:
673
674 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
675
676In this case C<$_> will be set to the name that is being looked up and
677I<ENCODING> is passed to the sub as its first argument.
678The example is another way to alias names as used in X11 font names to the MIME names for
679the iso-8859-* family.
680
681=back
682
683=head2 Defining Encodings
684
685 use Encode qw(define_encoding);
686 define_encoding( $object, 'canonicalName' [,alias...]);
687
688Causes I<canonicalName> to be associated with I<$object>.
689The object should provide the interface described in L</"IMPLEMENTATION CLASSES"> below.
690If more than two arguments are provided then additional arguments are taken
691as aliases for I<$object> as for C<define_alias>.
692
4411f3b6 693=head1 Encoding and IO
694
695It is very common to want to do encoding transformations when
696reading or writing files, network connections, pipes etc.
697If perl is configured to use the new 'perlio' IO system then
698C<Encode> provides a "layer" (See L<perliol>) which can transform
699data as it is read or written.
700
51ef4e11 701 open(my $ilyad,'>:encoding(iso-8859-7)','ilyad.greek');
4411f3b6 702 print $ilyad @epic;
703
704In addition the new IO system can also be configured to read/write
705UTF-8 encoded characters (as noted above this is efficient):
706
707 open(my $fh,'>:utf8','anything');
708 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
709
710Either of the above forms of "layer" specifications can be made the default
711for a lexical scope with the C<use open ...> pragma. See L<open>.
712
713Once a handle is open its layers can be altered using C<binmode>.
714
715Without any such configuration, or if perl itself is built using
716system's own IO, then write operations assume that file handle accepts
717only I<bytes> and will C<die> if a character larger than 255 is
718written to the handle. When reading, each octet from the handle
719becomes a byte-in-a-character. Note that this default is the same
720behaviour as bytes-only languages (including perl before v5.6) would have,
721and is sufficient to handle native 8-bit encodings e.g. iso-8859-1,
722EBCDIC etc. and any legacy mechanisms for handling other encodings
723and binary data.
724
725In other cases it is the program's responsibility
726to transform characters into bytes using the API above before
727doing writes, and to transform the bytes read from a handle into characters
728before doing "character operations" (e.g. C<lc>, C</\W+/>, ...).
729
730=head1 Encoding How to ...
731
732To do:
733
734=over 4
735
736=item * IO with mixed content (faking iso-2022-*)
737
738=item * MIME's Content-Length:
739
740=item * UTF-8 strings in binary data.
741
742=item * perl/Encode wrappers on non-Unicode XS modules.
743
744=back
745
746=head1 Messing with Perl's Internals
747
748The following API uses parts of perl's internals in the current implementation.
749As such they are efficient, but may change.
750
751=over 4
752
4411f3b6 753=item * is_utf8(STRING [, CHECK])
754
755[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
756If CHECK is true, also checks the data in STRING for being
757well-formed UTF-8. Returns true if successful, false otherwise.
758
759=item * valid_utf8(STRING)
760
761[INTERNAL] Test whether STRING is in a consistent state.
762Will return true if string is held as bytes, or is well-formed UTF-8
763and has the UTF-8 flag on.
764Main reason for this routine is to allow perl's testsuite to check
765that operations have left strings in a consistent state.
766
767=item *
768
769 _utf8_on(STRING)
770
771[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
772B<not> checked for being well-formed UTF-8. Do not use unless you
773B<know> that the STRING is well-formed UTF-8. Returns the previous
774state of the UTF-8 flag (so please don't test the return value as
775I<not> success or failure), or C<undef> if STRING is not a string.
776
777=item *
778
779 _utf8_off(STRING)
780
781[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
782Returns the previous state of the UTF-8 flag (so please don't test the
783return value as I<not> success or failure), or C<undef> if STRING is
784not a string.
785
786=back
787
4edaa979 788=head1 IMPLEMENTATION CLASSES
789
790As mentioned above encodings are (in the current implementation at least)
791defined by objects. The mapping of encoding name to object is via the
51ef4e11 792C<%encoding> hash.
4edaa979 793
794The values of the hash can currently be either strings or objects.
795The string form may go away in the future. The string form occurs
796when C<encodings()> has scanned C<@INC> for loadable encodings but has
797not actually loaded the encoding in question. This is because the
798current "loading" process is all perl and a bit slow.
799
800Once an encoding is loaded then value of the hash is object which implements
801the encoding. The object should provide the following interface:
802
803=over 4
804
805=item -E<gt>name
806
807Should return the string representing the canonical name of the encoding.
808
809=item -E<gt>new_sequence
810
811This is a placeholder for encodings with state. It should return an object
812which implements this interface, all current implementations return the
813original object.
814
815=item -E<gt>encode($string,$check)
816
817Should return the octet sequence representing I<$string>. If I<$check> is true
818it should modify I<$string> in place to remove the converted part (i.e.
819the whole string unless there is an error).
820If an error occurs it should return the octet sequence for the
821fragment of string that has been converted, and modify $string in-place
822to remove the converted part leaving it starting with the problem fragment.
823
824If check is false then C<encode> should make a "best effort" to convert
825the string - for example by using a replacement character.
826
827=item -E<gt>decode($octets,$check)
828
829Should return the string that I<$octets> represents. If I<$check> is true
830it should modify I<$octets> in place to remove the converted part (i.e.
831the whole sequence unless there is an error).
832If an error occurs it should return the fragment of string
833that has been converted, and modify $octets in-place to remove the converted part
834leaving it starting with the problem fragment.
835
836If check is false then C<decode> should make a "best effort" to convert
837the string - for example by using Unicode's "\x{FFFD}" as a replacement character.
838
839=back
840
841It should be noted that the check behaviour is different from the outer
842public API. The logic is that the "unchecked" case is useful when
843encoding is part of a stream which may be reporting errors (e.g. STDERR).
844In such cases it is desirable to get everything through somehow without
845causing additional errors which obscure the original one. Also the encoding
846is best placed to know what the correct replacement character is, so if that
847is the desired behaviour then letting low level code do it is the most efficient.
848
849In contrast if check is true, the scheme above allows the encoding to do as
850much as it can and tell layer above how much that was. What is lacking
851at present is a mechanism to report what went wrong. The most likely interface
852will be an additional method call to the object, or perhaps
853(to avoid forcing per-stream objects on otherwise stateless encodings)
854an additional parameter.
855
856It is also highly desirable that encoding classes inherit from C<Encode::Encoding>
857as a base class. This allows that class to define additional behaviour for
51ef4e11 858all encoding objects. For example built in Unicode, UCS-2 and UTF-8 classes
859use :
860
861 package Encode::MyEncoding;
862 use base qw(Encode::Encoding);
863
864 __PACKAGE__->Define(qw(myCanonical myAlias));
865
866To create an object with bless {Name => ...},$class, and call define_encoding.
867They inherit their C<name> method from C<Encode::Encoding>.
4edaa979 868
869=head2 Compiled Encodings
870
871F<Encode.xs> provides a class C<Encode::XS> which provides the interface described
872above. It calls a generic octet-sequence to octet-sequence "engine" that is
873driven by tables (defined in F<encengine.c>). The same engine is used for both
874encode and decode. C<Encode::XS>'s C<encode> forces perl's characters to their UTF-8 form
875and then treats them as just another multibyte encoding. C<Encode::XS>'s C<decode> transforms
876the sequence and then turns the UTF-8-ness flag on as that is the form that the tables
877are defined to produce. For details of the engine see the comments in F<encengine.c>.
878
879The tables are produced by the perl script F<compile> (the name needs to change so
880we can eventually install it somewhere). F<compile> can currently read two formats:
881
882=over 4
883
884=item *.enc
885
886This is a coined format used by Tcl. It is documented in Encode/EncodeFormat.pod.
887
888=item *.ucm
889
890This is the semi-standard format used by IBM's ICU package.
891
892=back
893
894F<compile> can write the following forms:
895
896=over 4
897
898=item *.ucm
899
900See above - the F<Encode/*.ucm> files provided with the distribution have
901been created from the original Tcl .enc files using this approach.
902
903=item *.c
904
905Produces tables as C data structures - this is used to build in encodings
906into F<Encode.so>/F<Encode.dll>.
907
908=item *.xs
909
910In theory this allows encodings to be stand-alone loadable perl extensions.
911The process has not yet been tested. The plan is to use this approach
912for large East Asian encodings.
913
914=back
915
916The set of encodings built-in to F<Encode.so>/F<Encode.dll> is determined by
917F<Makefile.PL>. The current set is as follows:
918
919=over 4
920
921=item ascii and iso-8859-*
922
923That is all the common 8-bit "western" encodings.
924
925=item IBM-1047 and two other variants of EBCDIC.
926
927These are the same variants that are supported by EBCDIC perl as "native" encodings.
928They are included to prove "reversibility" of some constructs in EBCDIC perl.
929
930=item symbol and dingbats as used by Tk on X11.
931
932(The reason Encode got started was to support perl/Tk.)
933
934=back
935
936That set is rather ad hoc and has been driven by the needs of the tests rather
937than the needs of typical applications. It is likely to be rationalized.
938
4411f3b6 939=head1 SEE ALSO
940
941L<perlunicode>, L<perlebcdic>, L<perlfunc/open>
942
943=cut
944
945
2a936312 946