my @alias; # ordered matching list
my %alias; # cached known aliases
+ # 0 1 2 3 4 5 6 7 8 9 10
+our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
+
+our %winlatin2cp = (
+ 'Latin1' => 1252,
+ 'Latin2' => 1250,
+ 'Cyrillic' => 1251,
+ 'Greek' => 1253,
+ 'Turkish' => 1254,
+ 'Hebrew' => 1255,
+ 'Arabic' => 1256,
+ 'Baltic' => 1257,
+ 'Vietnamese' => 1258,
+ );
+
sub encodings
{
my ($class) = @_;
{
my $class = shift;
local $_ = shift;
+ # print "# findAlias $_\n";
unless (exists $alias{$_})
{
for (my $i=0; $i < @alias; $i += 2)
my $alias = $alias[$i];
my $val = $alias[$i+1];
my $new;
-
if (ref($alias) eq 'Regexp' && $_ =~ $alias)
{
$new = eval $val;
# Allow variants of iso-8859-1 etc.
define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
+# At least HP-UX has these.
+define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
+
+# More HP stuff.
+define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
+
+# The Official name of ASCII.
+define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
+
+# This is a font issue, not an encoding issue.
+# (The currency symbol of the Latin 1 upper half
+# has been redefined as the euro symbol.)
+define_alias( qr/^(.+)\@euro$/i => '"$1"' );
+
# Allow latin-1 style names as well
- # 0 1 2 3 4 5 6 7 8 9 10
-my @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
-define_alias( qr/^latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
+define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
+
+# Allow winlatin1 style names as well
+define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|baltic|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );
# Common names for non-latin prefered MIME names
define_alias( 'ascii' => 'US-ascii',
'cyrillic' => 'iso-8859-5',
'arabic' => 'iso-8859-6',
'greek' => 'iso-8859-7',
- 'hebrew' => 'iso-8859-8');
-
-define_alias( 'ibm-1047' => 'cp1047');
+ 'hebrew' => 'iso-8859-8',
+ 'thai' => 'iso-8859-11',
+ 'tis620' => 'iso-8859-11',
+ );
+
+# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
+define_alias( qr/^ibm[-_]?(\d\d\d\d?)$/i => '"cp$1"');
+
+# Standardize on the dashed versions.
+define_alias( qr/^utf8$/i => 'utf-8' );
+define_alias( qr/^koi8r$/i => 'koi8-r' );
+define_alias( qr/^koi8u$/i => 'koi8-u' );
+
+# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
+# TODO: HP-UX '15' encodings japanese15 korean15 roi15
+# TODO: Cyrillic encoding ISO-IR-111 (useful?)
+# TODO: Chinese encodings GB18030 GBK Big5-HSKCS EUC-TW
+# TODO: Armenian encoding ARMSCII-8
+# TODO: Hebrew encoding ISO-8859-8-1
+# TODO: Thai encoding TCVN
+# TODO: Korean encoding Johab
+# TODO: Vietnamese encodings VPS
+# TODO: Japanese encoding JIS (not the same as SJIS)
+# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
+# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
+# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
+# Kannada Khmer Korean Laotian Malayalam Mongolian
+# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
+# TODO: what is the Japanese 'UJIS' encoding seen in some Linuxes?
# Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
{
my ($class,$name) = @_;
my $enc;
+ if (ref($name) && $name->can('new_sequence'))
+ {
+ return $name;
+ }
+ my $lc = lc $name;
if (exists $encoding{$name})
{
return $encoding{$name};
}
- else
+ if (exists $encoding{$lc})
{
- return $class->findAlias($name);
+ return $encoding{$lc};
}
+
+ my $oc = $class->findAlias($name);
+ return $oc if defined $oc;
+ return $class->findAlias($lc) if $lc ne $name;
+
+ return;
}
sub find_encoding
my $enc = find_encoding($name);
croak("Unknown encoding '$name'") unless defined $enc;
my $string = $enc->decode($octets,$check);
- return undef if ($check && length($octets));
+ $_[1] = $octets if $check;
return $string;
}
return $str;
}
+package Encode::ucs_2le;
+use base 'Encode::Encoding';
+
+__PACKAGE__->Define(qw(UCS-2le UCS-2LE ucs-2le));
+
+sub decode
+{
+ my ($obj,$str,$chk) = @_;
+ my $uni = '';
+ while (length($str))
+ {
+ my $code = unpack('v',substr($str,0,2,'')) & 0xffff;
+ $uni .= chr($code);
+ }
+ $_[1] = $str if $chk;
+ utf8::upgrade($uni);
+ return $uni;
+}
+
+sub encode
+{
+ my ($obj,$uni,$chk) = @_;
+ my $str = '';
+ while (length($uni))
+ {
+ my $ch = substr($uni,0,1,'');
+ my $x = ord($ch);
+ unless ($x < 32768)
+ {
+ last if ($chk);
+ $x = 0;
+ }
+ $str .= pack('v',$x);
+ }
+ $_[1] = $uni if $chk;
+ return $str;
+}
+
# switch back to Encode package in case we ever add AutoLoader
package Encode;
=over 4
-=item * The MIME name as defined in IETF RFC-XXXX.
+=item * The MIME name as defined in IETF RFCs.
=item * The name in the IANA registry.
-=item * The name used by the the organization that defined it.
+=item * The name used by the organization that defined it.
=back
encodings have state C<Encode> uses the encoding object internally
once an operation is in progress.
+As of Perl 5.8.0, at least the following encodings are recognized
+(the => marks aliases):
+
+ ASCII
+
+ US-ASCII => ASCII
+
+The Unicode:
+
+ UTF-8
+ UTF-16
+ UCS-2
+
+ ISO 10646-1 => UCS-2
+
+The ISO 8859 and KOI:
+
+ ISO 8859-1 ISO 8859-6 ISO 8859-11 KOI8-F
+ ISO 8859-2 ISO 8859-7 (12 doesn't exist) KOI8-R
+ ISO 8859-3 ISO 8859-8 ISO 8859-13 KOI8-U
+ ISO 8859-4 ISO 8859-9 ISO 8859-14
+ ISO 8859-5 ISO 8859-10 ISO 8859-15
+ ISO 8859-16
+
+ Latin1 => 8859-1 Latin6 => 8859-10
+ Latin2 => 8859-2 Latin7 => 8859-13
+ Latin3 => 8859-3 Latin8 => 8859-14
+ Latin4 => 8859-4 Latin9 => 8859-15
+ Latin5 => 8859-9 Latin10 => 8859-16
+
+ Cyrillic => 8859-5
+ Arabic => 8859-6
+ Greek => 8859-7
+ Hebrew => 8859-8
+ Thai => 8859-11
+ TIS620 => 8859-11
+
+The CJKV: Chinese, Japanese, Korean, Vietnamese:
+
+ ISO 2022 ISO 2022 JP-1 JIS 0201 GB 1988 Big5 EUC-CN
+ ISO 2022 CN ISO 2022 JP-2 JIS 0208 GB 2312 HZ EUC-JP
+ ISO 2022 JP ISO 2022 KR JIS 0210 GB 12345 CNS 11643 EUC-JP-0212
+ Shift-JIS EUC-KR
+ VISCII
+
+The PC codepages:
+
+ CP37 CP852 CP861 CP866 CP949 CP1251 CP1256
+ CP424 CP855 CP862 CP869 CP950 CP1252 CP1257
+ CP737 CP856 CP863 CP874 CP1006 CP1253 CP1258
+ CP775 CP857 CP864 CP932 CP1047 CP1254
+ CP850 CP860 CP865 CP936 CP1250 CP1255
+
+ WinLatin1 => CP1252
+ WinLatin2 => CP1250
+ WinCyrillic => CP1251
+ WinGreek => CP1253
+ WinTurkiskh => CP1254
+ WinHebrew => CP1255
+ WinArabic => CP1256
+ WinBaltic => CP1257
+ WinVietnamese => CP1258
+
+(All the CPI<NNN...> are available also as IBMI<NNN...>.)
+
+The Mac codepages:
+
+ MacCentralEuropean MacJapanese
+ MacCroatian MacRoman
+ MacCyrillic MacRumanian
+ MacDingbats MacSami
+ MacGreek MacThai
+ MacIcelandic MacTurkish
+ MacUkraine
+
+Miscellaneous:
+
+ 7bit-greek IR-197
+ 7bit-kana NeXTstep
+ 7bit-latin1 POSIX-BC
+ DingBats Roman8
+ GSM 0338 Symbol
+
=head1 PERL ENCODING API
=head2 Generic Encoding Interface
Convert B<in-place> the data between two encodings. How did the data
in $string originally get to be in FROM_ENCODING? Either using
-encode() or through PerlIO: See L</"Encode and PerlIO">. For CHECK
+encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
see L</"Handling Malformed Data">.
For example to convert ISO 8859-1 data to UTF-8:
=head2 Other Encodings of Unicode
UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. UCS-2 can only
-represent 0..0xFFFF, while UTF-16 has a "surrogate pair" scheme which
+represent 0..0xFFFF, while UTF-16 has a I<surrogate pair> scheme which
allows it to cover the whole Unicode range.
+Surrogates are code points set aside to encode the 0x01000..0x10FFFF
+range of Unicode code points in pairs of 16-bit units. The I<high
+surrogates> are the range 0xD800..0xDBFF, and the I<low surrogates>
+are the range 0xDC00..0xDFFFF. The surrogate encoding is
+
+ $hi = ($uni - 0x10000) / 0x400 + 0xD800;
+ $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
+
+and the decoding is
+
+ $uni = 0x10000 + ($hi - 0xD8000) * 0x400 + ($lo - 0xDC00);
+
Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
happens to be the name used by that representation when used with X11
fonts.
to transfer strings in this form (e.g. to write them to a file) would
need to
- pack('L',map(chr($_),split(//,$string))); # native
+ pack('L*', unpack('U*', $string)); # native
or
- pack('V',map(chr($_),split(//,$string))); # little-endian
+ pack('V*', unpack('U*', $string)); # little-endian
or
- pack('N',map(chr($_),split(//,$string))); # big-endian
+ pack('N*', unpack('U*', $string)); # big-endian
-depending on the endian required.
+depending on the endianness required.
No UTF-32 encodings are implemented yet.
=head2 Defining Encodings
- use Encode qw(define_alias);
- define_encoding( $object, 'canonicalName' [,alias...]);
+ use Encode qw(define_alias);
+ define_encoding( $object, 'canonicalName' [,alias...]);
Causes I<canonicalName> to be associated with I<$object>. The object
should provide the interface described in L</"IMPLEMENTATION CLASSES">
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.
- open(my $ilyad,'>:encoding(iso-8859-7)','ilyad.greek');
- print $ilyad @epic;
+Here is how the blind poet would modernise the encoding:
+
+ use Encode;
+ open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
+ open(my $utf8,'>:utf8','iliad.utf8');
+ my @epic = <$iliad>;
+ print $utf8 @epic;
+ close($utf8);
+ close($illiad);
In addition the new IO system can also be configured to read/write
UTF-8 encoded characters (as noted above this is efficient):
- open(my $fh,'>:utf8','anything');
- print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
+ open(my $fh,'>:utf8','anything');
+ print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).
-=head1 Encode and PerlIO
-
-The PerlIO layer (new since Perl 5.7) can be used to automatically
-convert the data being read in or written out to be converted from
-some encoding into Perl's internal encoding or from Perl's internal
-encoding into some other encoding.
-
-Examples:
-
- open(my $f, "<:encoding(cp1252)")
-
- open(my $g, ">:encoding(iso-8859-1)")
-
You can also use PerlIO to convert larger amounts of data you don't
want to bring into memory. For example to convert between ISO 8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
- open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
- open(G, ">:utf8", "data.utf") or die $!;
- while (<F>) { print G }
+ open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
+ open(G, ">:utf8", "data.utf") or die $!;
+ while (<F>) { print G }
+
+ # Could also do "print G <F>" but that would pull
+ # the whole file into memory just to write it out again.
- # Could also do "print G <F>" but that would pull
- # the whole file into memory just to write it out again.
+More examples:
+
+ open(my $f, "<:encoding(cp1252)")
+ open(my $g, ">:encoding(iso-8859-2)")
+ open(my $h, ">:encoding(latin9)") # iso-8859-15
See L<PerlIO> for more information.
+See also L<encoding> for how to change the default encoding of the
+data in your script.
+
=head1 Encoding How to ...
To do:
=head1 SEE ALSO
-L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>
+L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>
=cut