package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.99 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $DEBUG = 0;
require DynaLoader;
require Exporter;
our @EXPORT_OK =
qw(
define_encoding
- define_alias
from_to
is_utf8
is_8bit
use Carp;
-# Make a %encoding package variable to allow a certain amount of cheating
-our %encoding;
-my @alias; # ordered matching list
-my %alias; # cached known aliases
-
- # 0 1 2 3 4 5 6 7 8 9 10
-our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
-
-our %winlatin2cp = (
- 'Latin1' => 1252,
- 'Latin2' => 1250,
- 'Cyrillic' => 1251,
- 'Greek' => 1253,
- 'Turkish' => 1254,
- 'Hebrew' => 1255,
- 'Arabic' => 1256,
- 'Baltic' => 1257,
- 'Vietnamese' => 1258,
- );
-
-our %external_tables =
+our $ON_EBCDIC = (ord("A") == 193);
+use Encode::Alias;
+
+# Make a %Encoding package variable to allow a certain amount of cheating
+our %Encoding;
+
+our %ExtModule =
(
- 'euc-cn' => 'Encode/CN.pm',
- gb2312 => 'Encode/CN.pm',
- gb12345 => 'Encode/CN.pm',
- gbk => 'Encode/CN.pm',
- cp936 => 'Encode/CN.pm',
- 'iso-ir-165' => 'Encode/CN.pm',
- 'euc-jp' => 'Encode/JP.pm',
- 'iso-2022-jp' => 'Encode/JP.pm',
- '7bit-jis' => 'Encode/JP.pm',
- shiftjis => 'Encode/JP.pm',
- macjapan => 'Encode/JP.pm',
- cp932 => 'Encode/JP.pm',
- 'euc-kr' => 'Encode/KR.pm',
- ksc5601 => 'Encode/KR.pm',
- cp949 => 'Encode/KR.pm',
- big5 => 'Encode/TW.pm',
- 'big5-hkscs' => 'Encode/TW.pm',
- cp950 => 'Encode/TW.pm',
- gb18030 => 'Encode/HanExtra.pm',
- big5plus => 'Encode/HanExtra.pm',
- 'euc-tw' => 'Encode/HanExtra.pm',
+ viscii => 'Encode/Byte.pm',
+ 'koi8-r' => 'Encode/Byte.pm',
+ cp1047 => 'Encode/EBCDIC.pm',
+ cp37 => 'Encode/EBCDIC.pm',
+ 'posix-bc' => 'Encode/EBCDIC.pm',
+ symbol => 'Encode/Symbol.pm',
+ dingbats => 'Encode/Symbol.pm',
);
-sub encodings
-{
- my ($class) = @_;
- return
- map { $_->[0] }
- sort { $a->[1] cmp $b->[1] }
- map { [$_, lc $_] }
- grep { $_ ne 'Internal' }
- keys %encoding;
+for my $k (2..11,13..16){
+ $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
+}
+
+for my $k (1250..1258){
+ $ExtModule{"cp$k"} = 'Encode/Byte.pm';
+}
+
+unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
+%ExtModule =(
+ %ExtModule,
+ 'euc-cn' => 'Encode/CN.pm',
+ gb2312 => 'Encode/CN.pm',
+ gb12345 => 'Encode/CN.pm',
+ gbk => 'Encode/CN.pm',
+ cp936 => 'Encode/CN.pm',
+ 'iso-ir-165' => 'Encode/CN.pm',
+ 'euc-jp' => 'Encode/JP.pm',
+ 'iso-2022-jp' => 'Encode/JP.pm',
+ 'iso-2022-jp-1' => 'Encode/JP.pm',
+ '7bit-jis' => 'Encode/JP.pm',
+ shiftjis => 'Encode/JP.pm',
+ macjapan => 'Encode/JP.pm',
+ cp932 => 'Encode/JP.pm',
+ 'euc-kr' => 'Encode/KR.pm',
+ ksc5601 => 'Encode/KR.pm',
+ cp949 => 'Encode/KR.pm',
+ big5 => 'Encode/TW.pm',
+ 'big5-hkscs' => 'Encode/TW.pm',
+ cp950 => 'Encode/TW.pm',
+ gb18030 => 'Encode/HanExtra.pm',
+ big5plus => 'Encode/HanExtra.pm',
+ 'euc-tw' => 'Encode/HanExtra.pm',
+ );
}
-sub findAlias
+for my $k (qw(centeuro croatian cyrillic dingbats greek
+ iceland roman rumanian sami
+ thai turkish ukraine))
{
- my $class = shift;
- local $_ = shift;
- # print "# findAlias $_\n";
- unless (exists $alias{$_})
- {
- for (my $i=0; $i < @alias; $i += 2)
- {
- my $alias = $alias[$i];
- my $val = $alias[$i+1];
- my $new;
- if (ref($alias) eq 'Regexp' && $_ =~ $alias)
- {
- $new = eval $val;
- }
- elsif (ref($alias) eq 'CODE')
- {
- $new = &{$alias}($val)
- }
- elsif (lc($_) eq lc($alias))
- {
- $new = $val;
- }
- if (defined($new))
- {
- next if $new eq $_; # avoid (direct) recursion on bugs
- my $enc = (ref($new)) ? $new : find_encoding($new);
- if ($enc)
- {
- $alias{$_} = $enc;
- last;
- }
- }
- }
- }
- return $alias{$_};
+ $ExtModule{"mac$k"} = 'Encode/Byte.pm';
}
-sub define_alias
+
+sub encodings
{
- while (@_)
+ my $class = shift;
+ my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
+ for my $m (@modules)
{
- my ($alias,$name) = splice(@_,0,2);
- push(@alias, $alias => $name);
+ $DEBUG and warn "about to require $m;";
+ eval { require $m; };
}
+ return
+ map({$_->[0]}
+ sort({$a->[1] cmp $b->[1]}
+ map({[$_, lc $_]}
+ grep({ $_ ne 'Internal' } keys %Encoding))));
}
-# Allow variants of iso-8859-1 etc.
-define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
-
-# At least HP-UX has these.
-define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
-
-# More HP stuff.
-define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
-
-# The Official name of ASCII.
-define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
-
-# This is a font issue, not an encoding issue.
-# (The currency symbol of the Latin 1 upper half
-# has been redefined as the euro symbol.)
-define_alias( qr/^(.+)\@euro$/i => '"$1"' );
-
-# Allow latin-1 style names as well
-define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
-
-# Allow winlatin1 style names as well
-define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|baltic|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );
-
-# Common names for non-latin prefered MIME names
-define_alias( 'ascii' => 'US-ascii',
- 'cyrillic' => 'iso-8859-5',
- 'arabic' => 'iso-8859-6',
- 'greek' => 'iso-8859-7',
- 'hebrew' => 'iso-8859-8',
- 'thai' => 'iso-8859-11',
- 'tis620' => 'iso-8859-11',
- );
-
-# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
-# And Microsoft has their own naming (again, surprisingly).
-define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
-
-# Sometimes seen with a leading zero.
-define_alias( qr/^cp037$/i => '"cp37"');
-
-# Ououououou.
-define_alias( qr/^macRomanian$/i => '"macRumanian"');
-
-# Standardize on the dashed versions.
-define_alias( qr/^utf8$/i => 'utf-8' );
-define_alias( qr/^koi8r$/i => 'koi8-r' );
-define_alias( qr/^koi8u$/i => 'koi8-u' );
-
-# Seen in some Linuxes.
-define_alias( qr/^ujis$/i => 'euc-jp' );
-
-# CP936 doesn't have vendor-addon for GBK, so they're identical.
-define_alias( qr/^gbk$/i => '"cp936"');
-
-# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
-# TODO: HP-UX '15' encodings japanese15 korean15 roi15
-# TODO: Cyrillic encoding ISO-IR-111 (useful?)
-# TODO: Armenian encoding ARMSCII-8
-# TODO: Hebrew encoding ISO-8859-8-1
-# TODO: Thai encoding TCVN
-# TODO: Korean encoding Johab
-# TODO: Vietnamese encodings VPS
-# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
-# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
-# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
-# Kannada Khmer Korean Laotian Malayalam Mongolian
-# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
-
-# Map white space and _ to '-'
-define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
-
sub define_encoding
{
my $obj = shift;
my $name = shift;
- $encoding{$name} = $obj;
+ $Encoding{$name} = $obj;
my $lc = lc($name);
define_alias($lc => $obj) unless $lc eq $name;
while (@_)
return $name;
}
my $lc = lc $name;
- if (exists $encoding{$name})
+ if (exists $Encoding{$name})
{
- return $encoding{$name};
+ return $Encoding{$name};
}
- if (exists $encoding{$lc})
+ if (exists $Encoding{$lc})
{
- return $encoding{$lc};
+ return $Encoding{$lc};
}
- my $oc = $class->findAlias($name);
+ my $oc = $class->find_alias($name);
return $oc if defined $oc;
- $oc = $class->findAlias($lc) if $lc ne $name;
+ $oc = $class->find_alias($lc) if $lc ne $name;
return $oc if defined $oc;
- if (!$skip_external and exists $external_tables{$lc})
+ if (!$skip_external and exists $ExtModule{$lc})
{
- require $external_tables{$lc};
- return $encoding{$name} if exists $encoding{$name};
+ eval{ require $ExtModule{$lc}; };
+ return $Encoding{$name} if exists $Encoding{$name};
}
return;
require Encode::Internal;
require Encode::Unicode;
require Encode::utf8;
-require Encode::iso10646_1;
+require Encode::10646_1;
require Encode::ucs2_le;
1;
and the rest of the system. Perl strings are sequences of B<characters>.
To find more about character encodings, please consult
-L<Encode::Description> . This document focuses on programming references.
+L<Encode::Details>. This document focuses on programming references.
=head1 PERL ENCODING API
=over 4
-=item *
-
- $bytes = encode(ENCODING, $string[, CHECK])
+=item $bytes = encode(ENCODING, $string[, CHECK])
Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets. For CHECK see L</"Handling Malformed Data">.
$octets = encode("utf8", $unicode);
-=item *
-
- $string = decode(ENCODING, $bytes[, CHECK])
+=item $string = decode(ENCODING, $bytes[, CHECK])
Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string. For CHECK see
$utf8 = decode("latin1", $latin1);
-=item *
-
- from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
+=item from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
Convert B<in-place> the data between two encodings. How did the data
in $string originally get to be in FROM_ENCODING? Either using
Multiple return values rather than in-place modifications.
-Index into the string could be pos($str) allowing s/\G...//.
+Index into the string could be C<pos($str)> allowing C<s/\G...//>.
=back
=head2 UTF-8 / utf8
The Unicode consortium defines the UTF-8 standard as a way of encoding
-the entire Unicode repertiore as sequences of octets. This encoding is
-expected to become very widespread. Perl can use this form internaly
+the entire Unicode repertoire as sequences of octets. This encoding is
+expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).
=over 4
-=item *
-
- $bytes = encode_utf8($string);
+=item $bytes = encode_utf8($string);
The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.
-=item *
-
- $string = decode_utf8($bytes [,CHECK]);
+=item $string = decode_utf8($bytes [, CHECK]);
The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
=head2 Listing available encodings
- use Encode qw(encodings);
- @list = encodings();
-
-Returns a list of the canonical names of the available encodings.
-
-=head2 Defining Aliases
-
- use Encode qw(define_alias);
- define_alias( newName => ENCODING);
-
-Allows newName to be used as am alias for ENCODING. ENCODING may be
-either the name of an encoding or and encoding object (as above).
+ use Encode;
+ @list = Encode->encodings();
-Currently I<newName> can be specified in the following ways:
+Returns a list of the canonical names of the available encodings that
+are loaded. To get a list of all available encodings including the
+ones that are not loaded yet, say
-=over 4
+ @all_encodings = Encode->encodings(":all");
-=item As a simple string.
+Or you can give the name of specific module.
-=item As a qr// compiled regular expression, e.g.:
+ @with_jp = Encode->encodings("Encode/JP.pm");
- define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
+Note in this case you have to say C<"Encode/JP.pm"> instead of
+C<"Encode::JP">.
-In this case if I<ENCODING> is not a reference it is C<eval>-ed to
-allow C<$1> etc. to be subsituted. The example is one way to names as
-used in X11 font names to alias the MIME names for the iso-8859-*
-family. Note the double quote inside the single quote. If you are
-using regex here, you have to do so or it won't work in this case.
+To find which encodings are supported by this package in details,
+see L<Encode::Supported>.
-=item As a code reference, e.g.:
+=head2 Defining Aliases
- define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
+ use Encode;
+ use Encode::Alias;
+ define_alias(newName => ENCODING);
-In this case C<$_> will be set to the name that is being looked up and
-I<ENCODING> is passed to the sub as its first argument. The example
-is another way to names as used in X11 font names to alias the MIME
-names for the iso-8859-* family.
+Allows newName to be used as am alias for ENCODING. ENCODING may be
+either the name of an encoding or and encoding object (as above).
-=back
+See L<Encode::Alias> on details.
=head1 Defining Encodings
use Encode qw(define_alias);
- define_encoding( $object, 'canonicalName' [,alias...]);
+ define_encoding($object, 'canonicalName' [, alias...]);
Causes I<canonicalName> to be associated with I<$object>. The object
should provide the interface described in L<Encode::Encoding>
=over 4
-=item * is_utf8(STRING [, CHECK])
+=item is_utf8(STRING [, CHECK])
[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8. Returns true if successful, false otherwise.
-=item *
-
- _utf8_on(STRING)
+=item _utf8_on(STRING)
[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
B<not> checked for being well-formed UTF-8. Do not use unless you
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.
-=item *
-
- _utf8_off(STRING)
+=item _utf8_off(STRING)
[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
=head1 SEE ALSO
-L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>,
-L<utf8>, L<Encode::Description>, L<Encode::Encoding> the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
-
+L<Encode::Details>,
+L<Encode::Encoding>,
+L<Encode::Supported>,
+L<PerlIO>,
+L<encoding>,
+L<perlebcdic>,
+L<perlfunc/open>,
+L<perlunicode>,
+L<utf8>,
+the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
=cut
-