Upgrade to Encode 0.99, from Dan Kogai.

[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm

index 8dc14c3..42bb24e 100644 (file)
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -1,6 +1,7 @@
 package Encode;
 use strict;
-our $VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.99 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $DEBUG = 0;
 
 require DynaLoader;
 require Exporter;
@@ -20,7 +21,6 @@ our @EXPORT = qw (
 our @EXPORT_OK =
     qw(
        define_encoding
-       define_alias
        from_to
        is_utf8
        is_8bit
@@ -37,186 +37,88 @@ bootstrap Encode ();
 
 use Carp;
 
-# Make a %encoding package variable to allow a certain amount of cheating
-our %encoding;
-my @alias;  # ordered matching list
-my %alias;  # cached known aliases
-
-                     # 0  1  2  3  4  5   6   7   8   9  10
-our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
-
-our %winlatin2cp   = (
-                     'Latin1'     => 1252,
-                     'Latin2'     => 1250,
-                     'Cyrillic'   => 1251,
-                     'Greek'      => 1253,
-                     'Turkish'    => 1254,
-                     'Hebrew'     => 1255,
-                     'Arabic'     => 1256,
-                     'Baltic'     => 1257,
-                     'Vietnamese' => 1258,
-                    );
-
-our %external_tables =
+our $ON_EBCDIC = (ord("A") == 193);
+use Encode::Alias;
+
+# Make a %Encoding package variable to allow a certain amount of cheating
+our %Encoding;
+
+our %ExtModule =
     (
-       'euc-cn'        => 'Encode/CN.pm',
-       gb2312          => 'Encode/CN.pm',
-       gb12345         => 'Encode/CN.pm',
-       gbk             => 'Encode/CN.pm',
-       cp936           => 'Encode/CN.pm',
-       'iso-ir-165'    => 'Encode/CN.pm',
-       'euc-jp'        => 'Encode/JP.pm',
-       'iso-2022-jp'   => 'Encode/JP.pm',
-       '7bit-jis'      => 'Encode/JP.pm',
-       shiftjis        => 'Encode/JP.pm',
-       macjapan        => 'Encode/JP.pm',
-       cp932           => 'Encode/JP.pm',
-       'euc-kr'        => 'Encode/KR.pm',
-       ksc5601         => 'Encode/KR.pm',
-       cp949           => 'Encode/KR.pm',
-       big5            => 'Encode/TW.pm',
-       'big5-hkscs'    => 'Encode/TW.pm',
-       cp950           => 'Encode/TW.pm',
-       gb18030         => 'Encode/HanExtra.pm',
-       big5plus        => 'Encode/HanExtra.pm',
-       'euc-tw'        => 'Encode/HanExtra.pm',
+     viscii             => 'Encode/Byte.pm',
+     'koi8-r'           => 'Encode/Byte.pm',
+     cp1047             => 'Encode/EBCDIC.pm',
+     cp37               => 'Encode/EBCDIC.pm',
+     'posix-bc'         => 'Encode/EBCDIC.pm',
+     symbol             => 'Encode/Symbol.pm',
+     dingbats           => 'Encode/Symbol.pm',
     );
 
-sub encodings
-{
- my ($class) = @_;
- return
-     map { $_->[0] }
-         sort { $a->[1] cmp $b->[1] }
-               map { [$_, lc $_] }
-                   grep { $_ ne 'Internal' }
-                        keys %encoding;
+for my $k (2..11,13..16){
+    $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm';
+}
+
+for my $k (1250..1258){
+    $ExtModule{"cp$k"} = 'Encode/Byte.pm';
+}
+
+unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
+%ExtModule =(
+            %ExtModule,
+            'euc-cn'           => 'Encode/CN.pm',
+            gb2312             => 'Encode/CN.pm',
+            gb12345            => 'Encode/CN.pm',
+            gbk                => 'Encode/CN.pm',
+            cp936              => 'Encode/CN.pm',
+            'iso-ir-165'       => 'Encode/CN.pm',
+            'euc-jp'           => 'Encode/JP.pm',
+            'iso-2022-jp'      => 'Encode/JP.pm',
+            'iso-2022-jp-1'    => 'Encode/JP.pm',
+            '7bit-jis'         => 'Encode/JP.pm',
+            shiftjis           => 'Encode/JP.pm',
+            macjapan           => 'Encode/JP.pm',
+            cp932              => 'Encode/JP.pm',
+            'euc-kr'           => 'Encode/KR.pm',
+            ksc5601            => 'Encode/KR.pm',
+            cp949              => 'Encode/KR.pm',
+            big5               => 'Encode/TW.pm',
+            'big5-hkscs'       => 'Encode/TW.pm',
+            cp950              => 'Encode/TW.pm',
+            gb18030            => 'Encode/HanExtra.pm',
+            big5plus           => 'Encode/HanExtra.pm',
+            'euc-tw'           => 'Encode/HanExtra.pm',
+            );
 }
 
-sub findAlias
+for my $k (qw(centeuro croatian cyrillic dingbats greek
+             iceland roman rumanian sami 
+             thai turkish  ukraine))
 {
-    my $class = shift;
-    local $_ = shift;
-    # print "# findAlias $_\n";
-    unless (exists $alias{$_})
-    {
-       for (my $i=0; $i < @alias; $i += 2)
-       {
-           my $alias = $alias[$i];
-           my $val   = $alias[$i+1];
-           my $new;
-           if (ref($alias) eq 'Regexp' && $_ =~ $alias)
-           {
-               $new = eval $val;
-           }
-           elsif (ref($alias) eq 'CODE')
-           {
-               $new = &{$alias}($val)
-               }
-           elsif (lc($_) eq lc($alias))
-           {
-               $new = $val;
-           }
-           if (defined($new))
-           {
-               next if $new eq $_; # avoid (direct) recursion on bugs
-               my $enc = (ref($new)) ? $new : find_encoding($new);
-               if ($enc)
-               {
-                   $alias{$_} = $enc;
-                   last;
-               }
-           }
-       }
-    }
-    return $alias{$_};
+    $ExtModule{"mac$k"} = 'Encode/Byte.pm';
 }
 
-sub define_alias
+
+sub encodings
 {
-    while (@_)
+    my $class = shift;
+    my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
+    for my $m (@modules)
     {
-       my ($alias,$name) = splice(@_,0,2);
-       push(@alias, $alias => $name);
+       $DEBUG and warn "about to require $m;";
+       eval { require $m; };
     }
+    return
+       map({$_->[0]} 
+           sort({$a->[1] cmp $b->[1]}
+                map({[$_, lc $_]} 
+                    grep({ $_ ne 'Internal' }  keys %Encoding))));
 }
 
-# Allow variants of iso-8859-1 etc.
-define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
-
-# At least HP-UX has these.
-define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
-
-# More HP stuff.
-define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
-
-# The Official name of ASCII.
-define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
-
-# This is a font issue, not an encoding issue.
-# (The currency symbol of the Latin 1 upper half
-#  has been redefined as the euro symbol.)
-define_alias( qr/^(.+)\@euro$/i => '"$1"' );
-
-# Allow latin-1 style names as well
-define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
-
-# Allow winlatin1 style names as well
-define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|baltic|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );
-
-# Common names for non-latin prefered MIME names
-define_alias( 'ascii'    => 'US-ascii',
-              'cyrillic' => 'iso-8859-5',
-              'arabic'   => 'iso-8859-6',
-              'greek'    => 'iso-8859-7',
-              'hebrew'   => 'iso-8859-8',
-              'thai'     => 'iso-8859-11',
-              'tis620'   => 'iso-8859-11',
-           );
-
-# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
-# And Microsoft has their own naming (again, surprisingly).
-define_alias( qr/^(?:ibm|ms)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
-
-# Sometimes seen with a leading zero.
-define_alias( qr/^cp037$/i => '"cp37"');
-
-# Ououououou.
-define_alias( qr/^macRomanian$/i => '"macRumanian"');
-
-# Standardize on the dashed versions.
-define_alias( qr/^utf8$/i  => 'utf-8' );
-define_alias( qr/^koi8r$/i => 'koi8-r' );
-define_alias( qr/^koi8u$/i => 'koi8-u' );
-
-# Seen in some Linuxes.
-define_alias( qr/^ujis$/i => 'euc-jp' );
-
-# CP936 doesn't have vendor-addon for GBK, so they're identical.
-define_alias( qr/^gbk$/i => '"cp936"');
-
-# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
-# TODO: HP-UX '15' encodings japanese15 korean15 roi15
-# TODO: Cyrillic encoding ISO-IR-111 (useful?)
-# TODO: Armenian encoding ARMSCII-8
-# TODO: Hebrew encoding ISO-8859-8-1
-# TODO: Thai encoding TCVN
-# TODO: Korean encoding Johab
-# TODO: Vietnamese encodings VPS
-# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
-#       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
-#       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
-#       Kannada Khmer Korean Laotian Malayalam Mongolian
-#       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
-
-# Map white space and _ to '-'
-define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
-
 sub define_encoding
 {
     my $obj  = shift;
     my $name = shift;
-    $encoding{$name} = $obj;
+    $Encoding{$name} = $obj;
     my $lc = lc($name);
     define_alias($lc => $obj) unless $lc eq $name;
     while (@_)
@@ -236,25 +138,25 @@ sub getEncoding
        return $name;
     }
     my $lc = lc $name;
-    if (exists $encoding{$name})
+    if (exists $Encoding{$name})
     {
-       return $encoding{$name};
+       return $Encoding{$name};
     }
-    if (exists $encoding{$lc})
+    if (exists $Encoding{$lc})
     {
-       return $encoding{$lc};
+       return $Encoding{$lc};
     }
 
-    my $oc = $class->findAlias($name);
+    my $oc = $class->find_alias($name);
     return $oc if defined $oc;
 
-    $oc = $class->findAlias($lc) if $lc ne $name;
+    $oc = $class->find_alias($lc) if $lc ne $name;
     return $oc if defined $oc;
 
-    if (!$skip_external and exists $external_tables{$lc})
+    if (!$skip_external and exists $ExtModule{$lc})
     {
-       require $external_tables{$lc};
-       return $encoding{$name} if exists $encoding{$name};
+       eval{ require $ExtModule{$lc}; };
+       return $Encoding{$name} if exists $Encoding{$name};
     }
 
     return;
@@ -319,7 +221,7 @@ require Encode::XS;
 require Encode::Internal;
 require Encode::Unicode;
 require Encode::utf8;
-require Encode::iso10646_1;
+require Encode::10646_1;
 require Encode::ucs2_le;
 
 1;
@@ -340,7 +242,7 @@ The C<Encode> module provides the interfaces between Perl's strings
 and the rest of the system.  Perl strings are sequences of B<characters>.
 
 To find more about character encodings, please consult
-L<Encode::Description> . This document focuses on programming references.
+L<Encode::Details>. This document focuses on programming references.
 
 =head1 PERL ENCODING API
 
@@ -348,9 +250,7 @@ L<Encode::Description> . This document focuses on programming references.
 
 =over 4
 
-=item *
-
-        $bytes  = encode(ENCODING, $string[, CHECK])
+=item $bytes  = encode(ENCODING, $string[, CHECK])
 
 Encodes string from Perl's internal form into I<ENCODING> and returns
 a sequence of octets.  For CHECK see L</"Handling Malformed Data">.
@@ -360,9 +260,7 @@ to octets:
 
        $octets = encode("utf8", $unicode);
 
-=item *
-
-        $string = decode(ENCODING, $bytes[, CHECK])
+=item $string = decode(ENCODING, $bytes[, CHECK])
 
 Decode sequence of octets assumed to be in I<ENCODING> into Perl's
 internal form and returns the resulting string.  For CHECK see
@@ -372,9 +270,7 @@ For example to convert ISO-8859-1 data to UTF-8:
 
        $utf8 = decode("latin1", $latin1);
 
-=item *
-
-       from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
+=item from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
 
 Convert B<in-place> the data between two encodings.  How did the data
 in $string originally get to be in FROM_ENCODING?  Either using
@@ -448,32 +344,28 @@ Hybrids of above.
 
 Multiple return values rather than in-place modifications.
 
-Index into the string could be pos($str) allowing s/\G...//.
+Index into the string could be C<pos($str)> allowing C<s/\G...//>.
 
 =back
 
 =head2 UTF-8 / utf8
 
 The Unicode consortium defines the UTF-8 standard as a way of encoding
-the entire Unicode repertiore as sequences of octets.  This encoding is
-expected to become very widespread. Perl can use this form internaly
+the entire Unicode repertoire as sequences of octets.  This encoding is
+expected to become very widespread. Perl can use this form internally
 to represent strings, so conversions to and from this form are
 particularly efficient (as octets in memory do not have to change,
 just the meta-data that tells Perl how to treat them).
 
 =over 4
 
-=item *
-
-        $bytes = encode_utf8($string);
+=item $bytes = encode_utf8($string);
 
 The characters that comprise string are encoded in Perl's superset of UTF-8
 and the resulting octets returned as a sequence of bytes. All possible
 characters have a UTF-8 representation so this function cannot fail.
 
-=item *
-
-        $string = decode_utf8($bytes [,CHECK]);
+=item $string = decode_utf8($bytes [, CHECK]);
 
 The sequence of octets represented by $bytes is decoded from UTF-8
 into a sequence of logical characters. Not all sequences of octets
@@ -484,50 +376,40 @@ For CHECK see L</"Handling Malformed Data">.
 
 =head2 Listing available encodings
 
-  use Encode qw(encodings);
-  @list = encodings();
-
-Returns a list of the canonical names of the available encodings.
-
-=head2 Defining Aliases
-
-  use Encode qw(define_alias);
-  define_alias( newName => ENCODING);
-
-Allows newName to be used as am alias for ENCODING. ENCODING may be
-either the name of an encoding or and encoding object (as above).
+  use Encode;
+  @list = Encode->encodings();
 
-Currently I<newName> can be specified in the following ways:
+Returns a list of the canonical names of the available encodings that
+are loaded.  To get a list of all available encodings including the
+ones that are not loaded yet, say
 
-=over 4
+  @all_encodings = Encode->encodings(":all");
 
-=item As a simple string.
+Or you can give the name of specific module.
 
-=item As a qr// compiled regular expression, e.g.:
+  @with_jp = Encode->encodings("Encode/JP.pm");
 
-  define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
+Note in this case you have to say C<"Encode/JP.pm"> instead of
+C<"Encode::JP">.
 
-In this case if I<ENCODING> is not a reference it is C<eval>-ed to
-allow C<$1> etc. to be subsituted.  The example is one way to names as
-used in X11 font names to alias the MIME names for the iso-8859-*
-family.  Note the double quote inside the single quote.  If you are
-using regex here, you have to do so or it won't work in this case.
+To find which encodings are supported by this package in details, 
+see L<Encode::Supported>.
 
-=item As a code reference, e.g.:
+=head2 Defining Aliases
 
-  define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
+  use Encode;
+  use Encode::Alias;
+  define_alias(newName => ENCODING);
 
-In this case C<$_> will be set to the name that is being looked up and
-I<ENCODING> is passed to the sub as its first argument.  The example
-is another way to names as used in X11 font names to alias the MIME
-names for the iso-8859-* family.
+Allows newName to be used as am alias for ENCODING. ENCODING may be
+either the name of an encoding or and encoding object (as above).
 
-=back
+See L<Encode::Alias> on details.
 
 =head1 Defining Encodings
 
     use Encode qw(define_alias);
-    define_encoding( $object, 'canonicalName' [,alias...]);
+    define_encoding($object, 'canonicalName' [, alias...]);
 
 Causes I<canonicalName> to be associated with I<$object>.  The object
 should provide the interface described in L<Encode::Encoding>
@@ -607,15 +489,13 @@ implementation.  As such they are efficient, but may change.
 
 =over 4
 
-=item * is_utf8(STRING [, CHECK])
+=item is_utf8(STRING [, CHECK])
 
 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 If CHECK is true, also checks the data in STRING for being well-formed
 UTF-8.  Returns true if successful, false otherwise.
 
-=item *
-
-        _utf8_on(STRING)
+=item _utf8_on(STRING)
 
 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 B<not> checked for being well-formed UTF-8.  Do not use unless you
@@ -623,9 +503,7 @@ B<know> that the STRING is well-formed UTF-8.  Returns the previous
 state of the UTF-8 flag (so please don't test the return value as
 I<not> success or failure), or C<undef> if STRING is not a string.
 
-=item *
-
-        _utf8_off(STRING)
+=item _utf8_off(STRING)
 
 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 Returns the previous state of the UTF-8 flag (so please don't test the
@@ -636,9 +514,15 @@ not a string.
 
 =head1 SEE ALSO
 
-L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>,
-L<utf8>, L<Encode::Description>, L<Encode::Encoding> the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
-
+L<Encode::Details>, 
+L<Encode::Encoding>,
+L<Encode::Supported>,
+L<PerlIO>, 
+L<encoding>,
+L<perlebcdic>, 
+L<perlfunc/open>, 
+L<perlunicode>, 
+L<utf8>, 
+the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
 
 =cut
-