9 our @ISA = qw(Exporter DynaLoader);
11 # Public, encouraged API is exported by default
37 # Documentation moved after __END__ for speed - NI-S
41 # Make a %encoding package variable to allow a certain amount of cheating
43 my @alias; # ordered matching list
44 my %alias; # cached known aliases
45 # 0 1 2 3 4 5 6 7 8 9 10
46 our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
52 return keys %encoding;
59 unless (exists $alias{$_})
61 for (my $i=0; $i < @alias; $i += 2)
63 my $alias = $alias[$i];
64 my $val = $alias[$i+1];
66 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
70 elsif (ref($alias) eq 'CODE')
72 $new = &{$alias}($val)
74 elsif (lc($_) eq lc($alias))
80 next if $new eq $_; # avoid (direct) recursion on bugs
81 my $enc = (ref($new)) ? $new : find_encoding($new);
97 my ($alias,$name) = splice(@_,0,2);
98 push(@alias, $alias => $name);
102 # Allow variants of iso-8859-1 etc. ('-' or '_' as separators, any case).
103 define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
105 # This is a font issue, not an encoding issue.
106 # (The currency symbol of the Latin 1 upper half is redefined
107 # as the euro symbol.) So drop a trailing '@euro' and use the base name.
108 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
110 # Solaris has this as a generic Latin-1 encoding.
111 define_alias( qr/^iso_8859_1$/ => 'iso-8859-1' );
113 # At least HP-UX has these (no separator at all, e.g. iso88591).
114 define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
116 # Allow latin-1 style names as well; the number is mapped via @latin2iso_num.
117 define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
119 # Common names for non-latin preferred MIME names
120 define_alias( 'ascii' => 'US-ascii',
121 'cyrillic' => 'iso-8859-5',
122 'arabic' => 'iso-8859-6',
123 'greek' => 'iso-8859-7',
124 'hebrew' => 'iso-8859-8');
126 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
127 define_alias( qr/^ibm[-_]?(\d\d\d\d?)$/i => '"cp$1"');
129 # Standardize on the dashed version.
130 define_alias( qr/^koi8r$/i => 'koi8-r' );
132 # Map white space and _ to '-'
133 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
139 $encoding{$name} = $obj;
141 define_alias($lc => $obj) unless $lc eq $name;
145 define_alias($alias,$obj);
152 my ($class,$name) = @_;
154 if (ref($name) && $name->can('new_sequence'))
158 if (exists $encoding{$name})
160 return $encoding{$name};
164 return $class->findAlias($name);
171 return __PACKAGE__->getEncoding($name);
176 my ($name,$string,$check) = @_;
177 my $enc = find_encoding($name);
178 croak("Unknown encoding '$name'") unless defined $enc;
179 my $octets = $enc->encode($string,$check);
180 return undef if ($check && length($string));
186 my ($name,$octets,$check) = @_;
187 my $enc = find_encoding($name);
188 croak("Unknown encoding '$name'") unless defined $enc;
189 my $string = $enc->decode($octets,$check);
190 return undef if ($check && length($octets));
196 my ($string,$from,$to,$check) = @_;
197 my $f = find_encoding($from);
198 croak("Unknown encoding '$from'") unless defined $f;
199 my $t = find_encoding($to);
200 croak("Unknown encoding '$to'") unless defined $t;
201 my $uni = $f->decode($string,$check);
202 return undef if ($check && length($string));
203 $string = $t->encode($uni,$check);
204 return undef if ($check && length($uni));
205 return length($_[0] = $string);
218 return undef unless utf8::decode($str);
222 package Encode::Encoding;
223 # Base class for classes which implement encodings
228 my $canonical = shift;
229 $obj = bless { Name => $canonical },$obj unless ref $obj;
230 # warn "$canonical => $obj\n";
231 Encode::define_encoding($obj, $canonical, @_);
# Accessor: return the value stored under the object's 'Name' key
# (the canonical encoding name the object was blessed with).
sub name {
    my ($self) = @_;
    return $self->{'Name'};
}
# Temporary legacy methods: old names kept as thin forwarders.
# The remaining @_ is passed straight through (not copied) so the callee
# still sees aliases of the caller's arguments.
sub toUnicode {
    my $self = shift;
    return $self->decode(@_);
}

sub fromUnicode {
    my $self = shift;
    return $self->encode(@_);
}
# Placeholder for encodings with state: hands back the invocant unchanged.
sub new_sequence {
    my ($self) = @_;
    return $self;
}
243 use base 'Encode::Encoding';
245 package Encode::Internal;
246 use base 'Encode::Encoding';
248 # Dummy package that provides the encode interface but leaves data
249 # as UTF-X encoded. It is here so that from_to() works.
251 __PACKAGE__->Define('Internal');
253 Encode::define_alias( 'Unicode' => 'Internal' ) if ord('A') == 65;
257 my ($obj,$str,$chk) = @_;
265 package Encoding::Unicode;
266 use base 'Encode::Encoding';
268 __PACKAGE__->Define('Unicode') unless ord('A') == 65;
272 my ($obj,$str,$chk) = @_;
274 for (my $i = 0; $i < length($str); $i++)
276 $res .= chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
284 my ($obj,$str,$chk) = @_;
286 for (my $i = 0; $i < length($str); $i++)
288 $res .= chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
295 package Encode::utf8;
296 use base 'Encode::Encoding';
297 # package to allow long-hand
298 # $octets = encode( utf8 => $string );
301 __PACKAGE__->Define(qw(UTF-8 utf8));
305 my ($obj,$octets,$chk) = @_;
306 my $str = Encode::decode_utf8($octets);
317 my ($obj,$string,$chk) = @_;
318 my $octets = Encode::encode_utf8($string);
323 package Encode::iso10646_1;
324 use base 'Encode::Encoding';
325 # Encoding is 16-bit network order Unicode (no surrogates)
326 # Used for X font encodings
328 __PACKAGE__->Define(qw(UCS-2 iso-10646-1));
332 my ($obj,$str,$chk) = @_;
336 my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
339 $_[1] = $str if $chk;
346 my ($obj,$uni,$chk) = @_;
350 my $ch = substr($uni,0,1,'');
357 $str .= pack('n',$x);
359 $_[1] = $uni if $chk;
363 # switch back to Encode package in case we ever add AutoLoader
372 Encode - character encodings
380 The C<Encode> module provides the interfaces between Perl's strings
381 and the rest of the system. Perl strings are sequences of B<characters>.
383 The repertoire of characters that Perl can represent is at least that
384 defined by the Unicode Consortium. On most platforms the ordinal
385 value of a character (as returned by C<ord(ch)>) is the "Unicode
386 codepoint" for the character (the exceptions are those platforms where
387 the legacy encoding is some variant of EBCDIC rather than a super-set
388 of ASCII - see L<perlebcdic>).
390 Traditionally, computer data has been moved around in 8-bit chunks
391 often called "bytes". These chunks are also known as "octets" in
392 networking standards. Perl is widely used to manipulate data of
393 many types - not only strings of characters representing human or
394 computer languages but also "binary" data being the machine's representation
395 of numbers, pixels in an image - or just about anything.
397 When Perl is processing "binary data" the programmer wants Perl to process
398 "sequences of bytes". This is not a problem for Perl - as a byte has 256
399 possible values it easily fits in Perl's much larger "logical character".
407 I<character>: a character in the range 0..(2**32-1) (or more).
408 (What Perl's strings are made of.)
412 I<byte>: a character in the range 0..255
413 (A special case of a Perl character.)
417 I<octet>: 8 bits of data, with ordinal values 0..255
418 (Term for bytes passed to or from a non-Perl context, e.g. disk file.)
422 The marker [INTERNAL] marks Internal Implementation Details, in
423 general meant only for those who think they know what they are doing,
424 and such details may change in future releases.
428 =head2 Characteristics of an Encoding
430 An encoding has a "repertoire" of characters that it can represent,
431 and for each representable character there is at least one sequence of
432 octets that represents it.
434 =head2 Types of Encodings
436 Encodings can be divided into the following types:
440 =item * Fixed length 8-bit (or less) encodings.
442 Each character is a single octet so may have a repertoire of up to
443 256 characters. ASCII and iso-8859-* are typical examples.
445 =item * Fixed length 16-bit encodings
447 Each character is two octets so may have a repertoire of up to
448 65 536 characters. Unicode's UCS-2 is an example. Also used for
449 encodings for East Asian languages.
451 =item * Fixed length 32-bit encodings.
453 Not really very "encoded" encodings. The Unicode code points
454 are just represented as 4-octet integers. None the less because
455 different architectures use different representations of integers
456 (so called "endian") there are at least two distinct encodings.
458 =item * Multi-byte encodings
460 The number of octets needed to represent a character varies.
461 UTF-8 is a particularly complex but regular case of a multi-byte
462 encoding. Several East Asian countries use a multi-byte encoding
463 where 1-octet is used to cover western roman characters and Asian
464 characters get 2-octets.
465 (UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
466 to represent a Unicode code point.)
468 =item * "Escape" encodings.
470 These encodings embed "escape sequences" into the octet sequence
471 which describe how the following octets are to be interpreted.
472 The iso-2022-* family is typical. Following the escape sequence
473 octets are encoded by an "embedded" encoding (which will be one
474 of the above types) until another escape sequence switches to
475 a different "embedded" encoding.
477 These schemes are very flexible and can handle mixed languages but are
478 very complex to process (and have state). No escape encodings are
479 implemented for Perl yet.
483 =head2 Specifying Encodings
485 Encodings can be specified to the API described below in two ways:
491 Encoding names are strings with characters taken from a restricted
492 repertoire. See L</"Encoding Names">.
494 =item 2. As an object
496 Encoding objects are returned by C<find_encoding($name)>.
500 =head2 Encoding Names
502 Encoding names are case insensitive. White space in names is ignored.
503 In addition an encoding may have aliases. Each encoding has one
504 "canonical" name. The "canonical" name is chosen from the names of
505 the encoding by picking the first in the following sequence:
509 =item * The MIME name as defined in IETF RFC-XXXX.
511 =item * The name in the IANA registry.
513 =item * The name used by the organization that defined it.
517 Because of all the alias issues, and because in the general case
518 encodings have state C<Encode> uses the encoding object internally
519 once an operation is in progress.
521 =head1 PERL ENCODING API
523 =head2 Generic Encoding Interface
529 $bytes = encode(ENCODING, $string[, CHECK])
531 Encodes a string from Perl's internal form into I<ENCODING> and returns
532 a sequence of octets. For CHECK see L</"Handling Malformed Data">.
536 $string = decode(ENCODING, $bytes[, CHECK])
538 Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
539 internal form and returns the resulting string. For CHECK see
540 L</"Handling Malformed Data">.
544 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
546 Convert B<in-place> the data between two encodings. How did the data
547 in $string originally get to be in FROM_ENCODING? Either using
548 encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
549 see L</"Handling Malformed Data">.
551 For example to convert ISO 8859-1 data to UTF-8:
553 from_to($data, "iso-8859-1", "utf-8");
555 and to convert it back:
557 from_to($data, "utf-8", "iso-8859-1");
559 Note that because the conversion happens in place, the data to be
560 converted cannot be a string constant, it must be a scalar variable.
564 =head2 Handling Malformed Data
566 If CHECK is not set, C<undef> is returned. If the data is supposed to
567 be UTF-8, an optional lexical warning (category utf8) is given. If
568 CHECK is true but not a code reference, dies.
570 It would be desirable to have a way to indicate that the transform should use
571 the encoding's "replacement character" - no such mechanism is defined yet.
573 It is also planned to allow I<CHECK> to be a code reference.
575 This is not yet implemented as there are design issues with what its
576 arguments should be and how it returns its results.
582 Passed remaining fragment of string being processed.
583 Modifies it in place to remove bytes/characters it can understand
584 and returns a string used to represent them.
588 my $ch = substr($_[0],0,1,'');
589 return sprintf("\x{%02X}",ord($ch));
592 This scheme is close to how underlying C code for Encode works, but gives
593 the fixup routine very little context.
597 Passed original string, and an index into it of the problem area, and
598 output string so far. Appends what it will to output string and
599 returns new index into original string. For example:
602 # my ($s,$i,$d) = @_;
603 my $ch = substr($_[0],$_[1],1);
604 $_[2] .= sprintf("\x{%02X}",ord($ch));
608 This scheme gives maximal control to the fixup routine but is more
609 complicated to code, and may need internals of Encode to be tweaked to
610 keep original string intact.
616 Multiple return values rather than in-place modifications.
618 Index into the string could be pos($str) allowing s/\G...//.
624 The Unicode consortium defines the UTF-8 standard as a way of encoding
625 the entire Unicode repertoire as sequences of octets. This encoding is
626 expected to become very widespread. Perl can use this form internally
627 to represent strings, so conversions to and from this form are
628 particularly efficient (as octets in memory do not have to change,
629 just the meta-data that tells Perl how to treat them).
635 $bytes = encode_utf8($string);
637 The characters that comprise the string are encoded in Perl's superset of UTF-8
638 and the resulting octets returned as a sequence of bytes. All possible
639 characters have a UTF-8 representation so this function cannot fail.
643 $string = decode_utf8($bytes [,CHECK]);
645 The sequence of octets represented by $bytes is decoded from UTF-8
646 into a sequence of logical characters. Not all sequences of octets
647 form valid UTF-8 encodings, so it is possible for this call to fail.
648 For CHECK see L</"Handling Malformed Data">.
652 =head2 Other Encodings of Unicode
654 UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. UCS-2 can only
655 represent 0..0xFFFF, while UTF-16 has a "surrogate pair" scheme which
656 allows it to cover the whole Unicode range.
658 Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
659 happens to be the name used by that representation when used with X11
662 UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
663 can be considered as being in this form without encoding. An encoding
664 to transfer strings in this form (e.g. to write them to a file) would
667 pack('L*',map(ord($_),split(//,$string))); # native
669 pack('V*',map(ord($_),split(//,$string))); # little-endian
671 pack('N*',map(ord($_),split(//,$string))); # big-endian
673 depending on the endian required.
675 No UTF-32 encodings are implemented yet.
677 Both UCS-2 and UCS-4 style encodings can have "byte order marks" by
678 representing the code point 0xFEFF as the very first thing in a file.
680 =head2 Listing available encodings
682 use Encode qw(encodings);
685 Returns a list of the canonical names of the available encodings.
687 =head2 Defining Aliases
689 use Encode qw(define_alias);
690 define_alias( newName => ENCODING);
692 Allows newName to be used as an alias for ENCODING. ENCODING may be
693 either the name of an encoding or an encoding object (as above).
695 Currently I<newName> can be specified in the following ways:
699 =item As a simple string.
701 =item As a qr// compiled regular expression, e.g.:
703 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
705 In this case if I<ENCODING> is not a reference it is C<eval>-ed to
706 allow C<$1> etc. to be substituted. The example is one way to alias names as
707 used in X11 fonts to the MIME names for the iso-8859-*
710 =item As a code reference, e.g.:
712 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
714 In this case C<$_> will be set to the name that is being looked up and
715 I<ENCODING> is passed to the sub as its first argument. The example
716 is another way to alias names as used in X11 fonts to the MIME
717 names for the iso-8859-* family.
721 =head2 Defining Encodings
723 use Encode qw(define_encoding);
724 define_encoding( $object, 'canonicalName' [,alias...]);
726 Causes I<canonicalName> to be associated with I<$object>. The object
727 should provide the interface described in L</"IMPLEMENTATION CLASSES">
728 below. If more than two arguments are provided then additional
729 arguments are taken as aliases for I<$object> as for C<define_alias>.
731 =head1 Encoding and IO
733 It is very common to want to do encoding transformations when
734 reading or writing files, network connections, pipes etc.
735 If Perl is configured to use the new 'perlio' IO system then
736 C<Encode> provides a "layer" (See L<perliol>) which can transform
737 data as it is read or written.
740 open(my $ilyad,'>:encoding(iso-8859-7)','ilyad.greek');
743 In addition the new IO system can also be configured to read/write
744 UTF-8 encoded characters (as noted above this is efficient):
746 open(my $fh,'>:utf8','anything');
747 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
749 Either of the above forms of "layer" specifications can be made the default
750 for a lexical scope with the C<use open ...> pragma. See L<open>.
752 Once a handle is open its layers can be altered using C<binmode>.
754 Without any such configuration, or if Perl itself is built using
755 the system's own IO, then write operations assume that the file handle accepts
756 only I<bytes> and will C<die> if a character larger than 255 is
757 written to the handle. When reading, each octet from the handle
758 becomes a byte-in-a-character. Note that this default is the same
759 behaviour as bytes-only languages (including Perl before v5.6) would
760 have, and is sufficient to handle native 8-bit encodings
761 e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
762 other encodings and binary data.
764 In other cases it is the program's responsibility to transform
765 characters into bytes using the API above before doing writes, and to
766 transform the bytes read from a handle into characters before doing
767 "character operations" (e.g. C<lc>, C</\W+/>, ...).
769 You can also use PerlIO to convert larger amounts of data you don't
770 want to bring into memory. For example to convert between ISO 8859-1
771 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
773 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
774 open(G, ">:utf8", "data.utf") or die $!;
775 while (<F>) { print G }
777 # Could also do "print G <F>" but that would pull
778 # the whole file into memory just to write it out again.
782 open(my $f, "<:encoding(cp1252)")
783 open(my $g, ">:encoding(iso-8859-2)")
784 open(my $h, ">:encoding(latin9)") # iso-8859-15
786 See L<PerlIO> for more information.
788 =head1 Encoding How to ...
794 =item * IO with mixed content (faking iso-2022-*)
796 =item * MIME's Content-Length:
798 =item * UTF-8 strings in binary data.
800 =item * Perl/Encode wrappers on non-Unicode XS modules.
804 =head1 Messing with Perl's Internals
806 The following API uses parts of Perl's internals in the current
807 implementation. As such they are efficient, but may change.
811 =item * is_utf8(STRING [, CHECK])
813 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
814 If CHECK is true, also checks the data in STRING for being well-formed
815 UTF-8. Returns true if successful, false otherwise.
817 =item * valid_utf8(STRING)
819 [INTERNAL] Test whether STRING is in a consistent state. Will return
820 true if string is held as bytes, or is well-formed UTF-8 and has the
821 UTF-8 flag on. Main reason for this routine is to allow Perl's
822 testsuite to check that operations have left strings in a consistent
829 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
830 B<not> checked for being well-formed UTF-8. Do not use unless you
831 B<know> that the STRING is well-formed UTF-8. Returns the previous
832 state of the UTF-8 flag (so please don't test the return value as
833 I<not> success or failure), or C<undef> if STRING is not a string.
839 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
840 Returns the previous state of the UTF-8 flag (so please don't test the
841 return value as I<not> success or failure), or C<undef> if STRING is
846 =head1 IMPLEMENTATION CLASSES
848 As mentioned above encodings are (in the current implementation at least)
849 defined by objects. The mapping of encoding name to object is via the
852 The values of the hash can currently be either strings or objects.
853 The string form may go away in the future. The string form occurs
854 when C<encodings()> has scanned C<@INC> for loadable encodings but has
855 not actually loaded the encoding in question. This is because the
856 current "loading" process is all Perl and a bit slow.
858 Once an encoding is loaded, the value of the hash is the object which
859 implements the encoding. The object should provide the following
866 Should return the string representing the canonical name of the encoding.
868 =item -E<gt>new_sequence
870 This is a placeholder for encodings with state. It should return an
871 object which implements this interface, all current implementations
872 return the original object.
874 =item -E<gt>encode($string,$check)
876 Should return the octet sequence representing I<$string>. If I<$check>
877 is true it should modify I<$string> in place to remove the converted
878 part (i.e. the whole string unless there is an error). If an error
879 occurs it should return the octet sequence for the fragment of string
880 that has been converted, and modify $string in-place to remove the
881 converted part leaving it starting with the problem fragment.
883 If check is false then C<encode> should make a "best effort" to
884 convert the string - for example by using a replacement character.
886 =item -E<gt>decode($octets,$check)
888 Should return the string that I<$octets> represents. If I<$check> is
889 true it should modify I<$octets> in place to remove the converted part
890 (i.e. the whole sequence unless there is an error). If an error
891 occurs it should return the fragment of string that has been
892 converted, and modify $octets in-place to remove the converted part
893 leaving it starting with the problem fragment.
895 If check is false then C<decode> should make a "best effort" to
896 convert the string - for example by using Unicode's "\x{FFFD}" as a
897 replacement character.
901 It should be noted that the check behaviour is different from the
902 outer public API. The logic is that the "unchecked" case is useful
903 when encoding is part of a stream which may be reporting errors
904 (e.g. STDERR). In such cases it is desirable to get everything
905 through somehow without causing additional errors which obscure the
906 original one. Also the encoding is best placed to know what the
907 correct replacement character is, so if that is the desired behaviour
908 then letting low level code do it is the most efficient.
910 In contrast if check is true, the scheme above allows the encoding to
911 do as much as it can and tell the layer above how much that was. What is
912 lacking at present is a mechanism to report what went wrong. The most
913 likely interface will be an additional method call to the object, or
914 perhaps (to avoid forcing per-stream objects on otherwise stateless
915 encodings) an additional parameter.
917 It is also highly desirable that encoding classes inherit from
918 C<Encode::Encoding> as a base class. This allows that class to define
919 additional behaviour for all encoding objects. For example built in
920 Unicode, UCS-2 and UTF-8 classes use :
922 package Encode::MyEncoding;
923 use base qw(Encode::Encoding);
925 __PACKAGE__->Define(qw(myCanonical myAlias));
927 To create an object with bless {Name => ...},$class, and call
928 define_encoding. They inherit their C<name> method from
931 =head2 Compiled Encodings
933 F<Encode.xs> provides a class C<Encode::XS> which provides the
934 interface described above. It calls a generic octet-sequence to
935 octet-sequence "engine" that is driven by tables (defined in
936 F<encengine.c>). The same engine is used for both encode and
937 decode. C<Encode::XS>'s C<encode> forces Perl's characters to their
938 UTF-8 form and then treats them as just another multibyte
939 encoding. C<Encode::XS>'s C<decode> transforms the sequence and then
940 turns on the UTF-8-ness flag, as that is the form that the tables are
941 defined to produce. For details of the engine see the comments in
944 The tables are produced by the Perl script F<compile> (the name needs
945 to change so we can eventually install it somewhere). F<compile> can
946 currently read two formats:
952 This is a coined format used by Tcl. It is documented in
953 Encode/EncodeFormat.pod.
957 This is the semi-standard format used by IBM's ICU package.
961 F<compile> can write the following forms:
967 See above - the F<Encode/*.ucm> files provided with the distribution have
968 been created from the original Tcl .enc files using this approach.
972 Produces tables as C data structures - this is used to build in encodings
973 into F<Encode.so>/F<Encode.dll>.
977 In theory this allows encodings to be stand-alone loadable Perl
978 extensions. The process has not yet been tested. The plan is to use
979 this approach for large East Asian encodings.
983 The set of encodings built-in to F<Encode.so>/F<Encode.dll> is
984 determined by F<Makefile.PL>. The current set is as follows:
988 =item ascii and iso-8859-*
990 That is all the common 8-bit "western" encodings.
992 =item IBM-1047 and two other variants of EBCDIC.
994 These are the same variants that are supported by EBCDIC Perl as
995 "native" encodings. They are included to prove "reversibility" of
996 some constructs in EBCDIC Perl.
998 =item symbol and dingbats as used by Tk on X11.
1000 (The reason Encode got started was to support Perl/Tk.)
1004 That set is rather ad hoc and has been driven by the needs of the
1005 tests rather than the needs of typical applications. It is likely
1010 L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>