package Encode;
+use strict;
-$VERSION = 0.01;
+our $VERSION = 0.02;
require DynaLoader;
require Exporter;
-@ISA = qw(Exporter DynaLoader);
+our @ISA = qw(Exporter DynaLoader);
# Public, encouraged API is exported by default
-@EXPORT = qw (
+our @EXPORT = qw (
encode
decode
encode_utf8
decode_utf8
find_encoding
+ encodings
);
-@EXPORT_OK =
+our @EXPORT_OK =
qw(
- encodings
+ define_encoding
+ define_alias
from_to
is_utf8
is_8bit
use Carp;
-# The global hash is declared in XS code
-$encoding{Unicode} = bless({},'Encode::Unicode');
-$encoding{utf8} = bless({},'Encode::utf8');
-$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
+# Make a %encoding package variable to allow a certain amount of cheating
+our %encoding;
+my @alias; # ordered matching list
+my %alias; # cached known aliases
sub encodings
{
my ($class) = @_;
- foreach my $dir (@INC)
+ return keys %encoding;
+}
+
+sub findAlias
+{
+ my $class = shift;
+ local $_ = shift;
+ unless (exists $alias{$_})
{
- if (opendir(my $dh,"$dir/Encode"))
+ for (my $i=0; $i < @alias; $i += 2)
{
- while (defined(my $name = readdir($dh)))
+ my $alias = $alias[$i];
+ my $val = $alias[$i+1];
+ my $new;
+
+ if (ref($alias) eq 'Regexp' && $_ =~ $alias)
+ {
+ $new = eval $val;
+ }
+ elsif (ref($alias) eq 'CODE')
+ {
+ $new = &{$alias}($val)
+ }
+ elsif (lc($_) eq lc($alias))
+ {
+ $new = $val;
+ }
+ if (defined($new))
{
- if ($name =~ /^(.*)\.enc$/)
+ next if $new eq $_; # avoid (direct) recursion on bugs
+ my $enc = (ref($new)) ? $new : find_encoding($new);
+ if ($enc)
{
- next if exists $encoding{$1};
- $encoding{$1} = "$dir/$name";
+ $alias{$_} = $enc;
+ last;
}
}
- closedir($dh);
}
}
- return keys %encoding;
+ return $alias{$_};
}
-sub loadEncoding
+sub define_alias
{
- my ($class,$name,$file) = @_;
- if (open(my $fh,$file))
+ while (@_)
{
- my $type;
- while (1)
- {
- my $line = <$fh>;
- $type = substr($line,0,1);
- last unless $type eq '#';
- }
- $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
- #warn "Loading $file";
- return $class->read($fh,$name,$type);
+ my ($alias,$name) = splice(@_,0,2);
+ push(@alias, $alias => $name);
}
- else
+}
+
+# Allow variants of iso-8859-1 etc.
+define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
+
+# Allow latin-1 style names as well
+ # 0 1 2 3 4 5 6 7 8 9 10
+my @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
+define_alias( qr/^latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
+
+# Common names for non-latin prefered MIME names
+define_alias( 'ascii' => 'US-ascii',
+ 'cyrillic' => 'iso-8859-5',
+ 'arabic' => 'iso-8859-6',
+ 'greek' => 'iso-8859-7',
+ 'hebrew' => 'iso-8859-8');
+
+define_alias( 'ibm-1047' => 'cp1047');
+
+# Map white space and _ to '-'
+define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
+
+sub define_encoding
+{
+ my $obj = shift;
+ my $name = shift;
+ $encoding{$name} = $obj;
+ my $lc = lc($name);
+ define_alias($lc => $obj) unless $lc eq $name;
+ while (@_)
{
- return undef;
+ my $alias = shift;
+ define_alias($alias,$obj);
}
+ return $obj;
}
sub getEncoding
{
my ($class,$name) = @_;
my $enc;
- unless (ref($enc = $encoding{$name}))
+ if (exists $encoding{$name})
{
- $enc = $class->loadEncoding($name,$enc) if defined $enc;
- unless (ref($enc))
- {
- foreach my $dir (@INC)
- {
- last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
- }
- }
- $encoding{$name} = $enc;
+ return $encoding{$name};
+ }
+ else
+ {
+ return $class->findAlias($name);
}
- return $enc;
}
sub find_encoding
sub encode_utf8
{
my ($str) = @_;
- utf8_encode($str);
+ utf8::encode($str);
return $str;
}
sub decode_utf8
{
my ($str) = @_;
- return undef unless utf8_decode($str);
+ return undef unless utf8::decode($str);
return $str;
}
package Encode::Encoding;
# Base class for classes which implement encodings
+sub Define
+{
+ my $obj = shift;
+ my $canonical = shift;
+ $obj = bless { Name => $canonical },$obj unless ref $obj;
+ # warn "$canonical => $obj\n";
+ Encode::define_encoding($obj, $canonical, @_);
+}
+
+sub name { shift->{'Name'} }
+
# Temporary legacy methods
sub toUnicode { shift->decode(@_) }
sub fromUnicode { shift->encode(@_) }
package Encode::XS;
use base 'Encode::Encoding';
-package Encode::Unicode;
+package Encode::Internal;
use base 'Encode::Encoding';
# Dummy package that provides the encode interface but leaves data
-# as UTF-8 encoded. It is here so that from_to() works.
+# as UTF-X encoded. It is here so that from_to() works.
+
+__PACKAGE__->Define('Internal');
-sub name { 'Unicode' }
+Encode::define_alias( 'Unicode' => 'Internal' ) if ord('A') == 65;
sub decode
{
my ($obj,$str,$chk) = @_;
- Encode::utf8_upgrade($str);
+ utf8::upgrade($str);
$_[1] = '' if $chk;
return $str;
}
*encode = \&decode;
-package Encode::utf8;
+package Encoding::Unicode;
use base 'Encode::Encoding';
-# package to allow long-hand
-# $octets = encode( utf8 => $string );
-#
-
-sub name { 'utf8' }
+__PACKAGE__->Define('Unicode') unless ord('A') == 65;
sub decode
{
- my ($obj,$octets,$chk) = @_;
- my $str = Encode::decode_utf8($octets);
- if (defined $str)
+ my ($obj,$str,$chk) = @_;
+ my $res = '';
+ for (my $i = 0; $i < length($str); $i++)
{
- $_[1] = '' if $chk;
- return $str;
+ $res .= chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
}
- return undef;
-}
-
-sub encode
-{
- my ($obj,$string,$chk) = @_;
- my $octets = Encode::encode_utf8($string);
$_[1] = '' if $chk;
- return $octets;
+ return $res;
}
-package Encode::Table;
-use base 'Encode::Encoding';
-
-sub read
+sub encode
{
- my ($class,$fh,$name,$type) = @_;
- my $rep = $class->can("rep_$type");
- my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
- my @touni;
- my %fmuni;
- my $count = 0;
- $def = hex($def);
- while ($pages--)
+ my ($obj,$str,$chk) = @_;
+ my $res = '';
+ for (my $i = 0; $i < length($str); $i++)
{
- my $line = <$fh>;
- chomp($line);
- my $page = hex($line);
- my @page;
- my $ch = $page * 256;
- for (my $i = 0; $i < 16; $i++)
- {
- my $line = <$fh>;
- for (my $j = 0; $j < 16; $j++)
- {
- my $val = hex(substr($line,0,4,''));
- if ($val || !$ch)
- {
- my $uch = chr($val);
- push(@page,$uch);
- $fmuni{$uch} = $ch;
- $count++;
- }
- else
- {
- push(@page,undef);
- }
- $ch++;
- }
- }
- $touni[$page] = \@page;
+ $res .= chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
}
-
- return bless {Name => $name,
- Rep => $rep,
- ToUni => \@touni,
- FmUni => \%fmuni,
- Def => $def,
- Num => $count,
- },$class;
+ $_[1] = '' if $chk;
+ return $res;
}
-sub name { shift->{'Name'} }
-
-sub rep_S { 'C' }
-
-sub rep_D { 'n' }
-sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
+package Encode::utf8;
+use base 'Encode::Encoding';
+# package to allow long-hand
+# $octets = encode( utf8 => $string );
+#
-sub representation
-{
- my ($obj,$ch) = @_;
- $ch = 0 unless @_ > 1;
- $obj-{'Rep'}->($ch);
-}
+__PACKAGE__->Define(qw(UTF-8 utf8));
sub decode
{
- my ($obj,$str,$chk) = @_;
- my $rep = $obj->{'Rep'};
- my $touni = $obj->{'ToUni'};
- my $uni = '';
- while (length($str))
+ my ($obj,$octets,$chk) = @_;
+ my $str = Encode::decode_utf8($octets);
+ if (defined $str)
{
- my $ch = ord(substr($str,0,1,''));
- my $x;
- if (&$rep($ch) eq 'C')
- {
- $x = $touni->[0][$ch];
- }
- else
- {
- $x = $touni->[$ch][ord(substr($str,0,1,''))];
- }
- unless (defined $x)
- {
- last if $chk;
- # What do we do here ?
- $x = '';
- }
- $uni .= $x;
+ $_[1] = '' if $chk;
+ return $str;
}
- $_[1] = $str if $chk;
- return $uni;
+ return undef;
}
sub encode
{
- my ($obj,$uni,$chk) = @_;
- my $fmuni = $obj->{'FmUni'};
- my $str = '';
- my $def = $obj->{'Def'};
- my $rep = $obj->{'Rep'};
- while (length($uni))
- {
- my $ch = substr($uni,0,1,'');
- my $x = $fmuni->{chr(ord($ch))};
- unless (defined $x)
- {
- last if ($chk);
- $x = $def;
- }
- $str .= pack(&$rep($x),$x);
- }
- $_[1] = $uni if $chk;
- return $str;
+ my ($obj,$string,$chk) = @_;
+ my $octets = Encode::encode_utf8($string);
+ $_[1] = '' if $chk;
+ return $octets;
}
package Encode::iso10646_1;
use base 'Encode::Encoding';
-
-# Encoding is 16-bit network order Unicode
+# Encoding is 16-bit network order Unicode (no surogates)
# Used for X font encodings
-sub name { 'iso10646-1' }
+__PACKAGE__->Define(qw(UCS-2 iso-10646-1));
sub decode
{
$uni .= chr($code);
}
$_[1] = $str if $chk;
- Encode::utf8_upgrade($uni);
+ utf8::upgrade($uni);
return $uni;
}
return $str;
}
-
-package Encode::Escape;
-use base 'Encode::Encoding';
-
-use Carp;
-
-sub read
-{
- my ($class,$fh,$name) = @_;
- my %self = (Name => $name, Num => 0);
- while (<$fh>)
- {
- my ($key,$val) = /^(\S+)\s+(.*)$/;
- $val =~ s/^\{(.*?)\}/$1/g;
- $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
- $self{$key} = $val;
- }
- return bless \%self,$class;
-}
-
-sub name { shift->{'Name'} }
-
-sub decode
-{
- croak("Not implemented yet");
-}
-
-sub encode
-{
- croak("Not implemented yet");
-}
-
# switch back to Encode package in case we ever add AutoLoader
package Encode;
=head2 TERMINOLOGY
-=over
+=over 4
=item *
encodings have state C<Encode> uses the encoding object internally
once an operation is in progress.
-I<Aliasing is not yet implemented.>
-
=head1 PERL ENCODING API
=head2 Generic Encoding Interface
UCS-2 can only represent 0..0xFFFF, while UTF-16 has a "surogate pair"
scheme which allows it to cover the whole Unicode range.
-Encode implements big-endian UCS-2 as the encoding "iso10646-1" as that
+Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
happens to be the name used by that representation when used with X11 fonts.
UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
depending on the endian required.
-No UTF-32 encodings are not yet implemented.
+No UTF-32 encodings are implemented yet.
Both UCS-2 and UCS-4 style encodings can have "byte order marks" by representing
the code point 0xFFFE as the very first thing in a file.
+=head2 Listing available encodings
+
+ use Encode qw(encodings);
+ @list = encodings();
+
+Returns a list of the canonical names of the available encodings.
+
+=head2 Defining Aliases
+
+ use Encode qw(define_alias);
+ define_alias( newName => ENCODING);
+
+Allows newName to be used as am alias for ENCODING. ENCODING may be either the
+name of an encoding or and encoding object (as above).
+
+Currently I<newName> can be specified in the following ways:
+
+=over 4
+
+=item As a simple string.
+
+=item As a qr// compiled regular expression, e.g.:
+
+ define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
+
+In this case if I<ENCODING> is not a reference it is C<eval>-ed to allow
+C<$1> etc. to be subsituted.
+The example is one way to names as used in X11 font names to alias the MIME names for the
+iso-8859-* family.
+
+=item As a code reference, e.g.:
+
+ define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
+
+In this case C<$_> will be set to the name that is being looked up and
+I<ENCODING> is passed to the sub as its first argument.
+The example is another way to names as used in X11 font names to alias the MIME names for
+the iso-8859-* family.
+
+=back
+
+=head2 Defining Encodings
+
+ use Encode qw(define_alias);
+ define_encoding( $object, 'canonicalName' [,alias...]);
+
+Causes I<canonicalName> to be associated with I<$object>.
+The object should provide the interface described in L</"IMPLEMENTATION CLASSES"> below.
+If more than two arguments are provided then additional arguments are taken
+as aliases for I<$object> as for C<define_alias>.
+
=head1 Encoding and IO
It is very common to want to do encoding transformations when
C<Encode> provides a "layer" (See L<perliol>) which can transform
data as it is read or written.
- open(my $ilyad,'>:encoding(iso8859-7)','ilyad.greek');
+ open(my $ilyad,'>:encoding(iso-8859-7)','ilyad.greek');
print $ilyad @epic;
In addition the new IO system can also be configured to read/write
=over 4
-=item *
-
- $num_octets = utf8_upgrade($string);
-
-Converts internal representation of string to the UTF-8 form.
-Returns the number of octets necessary to represent the string as UTF-8.
-
-=item * utf8_downgrade($string[, CHECK])
-
-Converts internal representation of string to be un-encoded bytes.
-
=item * is_utf8(STRING [, CHECK])
[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
As mentioned above encodings are (in the current implementation at least)
defined by objects. The mapping of encoding name to object is via the
-C<%Encode::encodings> hash. (It is a package hash to allow XS code to get
-at it.)
+C<%encodings> hash.
The values of the hash can currently be either strings or objects.
The string form may go away in the future. The string form occurs
It is also highly desirable that encoding classes inherit from C<Encode::Encoding>
as a base class. This allows that class to define additional behaviour for
-all encoding objects.
+all encoding objects. For example built in Unicode, UCS-2 and UTF-8 classes
+use :
+
+ package Encode::MyEncoding;
+ use base qw(Encode::Encoding);
+
+ __PACKAGE__->Define(qw(myCanonical myAlias));
+
+To create an object with bless {Name => ...},$class, and call define_encoding.
+They inherit their C<name> method from C<Encode::Encoding>.
=head2 Compiled Encodings