#
-# $Id: Encode.pm,v 1.64 2002/04/29 06:54:06 dankogai Exp $
+# $Id: Encode.pm,v 2.0 2004/05/16 20:55:15 dankogai Exp $
#
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.64 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
-our $DEBUG = 0;
+our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+sub DEBUG () { 0 }
use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);
our @EXPORT = qw(
decode decode_utf8 encode encode_utf8
- encodings find_encoding
+ encodings find_encoding clone_encoding
);
our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
}else{
%enc = %Encoding;
for my $mod (map {m/::/o ? $_ : "Encode::$_" } @_){
- $DEBUG and warn $mod;
+ DEBUG and warn $mod;
for my $enc (keys %ExtModule){
$ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
}
{
my ($class, $name, $skip_external) = @_;
- ref($name) && $name->can('new_sequence') and return $name;
+ ref($name) && $name->can('renew') and return $name;
exists $Encoding{$name} and return $Encoding{$name};
my $lc = lc $name;
exists $Encoding{$lc} and return $Encoding{$lc};
return;
}
-sub find_encoding
+sub find_encoding($;$)
{
my ($name, $skip_external) = @_;
return __PACKAGE__->getEncoding($name,$skip_external);
}
-sub resolve_alias {
+sub resolve_alias($){
my $obj = find_encoding(shift);
defined $obj and return $obj->name;
return;
}
+sub clone_encoding($){
+ my $obj = find_encoding(shift);
+ ref $obj or return;
+ eval { require Storable };
+ $@ and return;
+ return Storable::dclone($obj);
+}
+
sub encode($$;$)
{
- my ($name,$string,$check) = @_;
+ my ($name, $string, $check) = @_;
+ return undef unless defined $string;
$check ||=0;
my $enc = find_encoding($name);
unless(defined $enc){
Carp::croak("Unknown encoding '$name'");
}
my $octets = $enc->encode($string,$check);
- return undef if ($check && length($string));
+ $_[1] = $string if $check;
return $octets;
}
sub decode($$;$)
{
my ($name,$octets,$check) = @_;
+ return undef unless defined $octets;
$check ||=0;
my $enc = find_encoding($name);
unless(defined $enc){
sub from_to($$$;$)
{
my ($string,$from,$to,$check) = @_;
+ return undef unless defined $string;
$check ||=0;
my $f = find_encoding($from);
unless (defined $f){
return $str;
}
-sub decode_utf8($)
+sub decode_utf8($;$)
{
- my ($str) = @_;
- return undef unless utf8::decode($str);
- return $str;
+ my ($str, $check) = @_;
+ if ($check){
+ return decode("utf8", $str, $check);
+ }else{
+ return undef unless utf8::decode($str);
+ return $str;
+ }
}
-predefine_encodings();
+predefine_encodings(1);
#
# This is to restore %Encoding if really needed;
sub predefine_encodings{
use Encode::Encoding;
+ no warnings 'redefine';
+ my $use_xs = shift;
if ($ON_EBCDIC) {
# was in Encode::UTF_EBCDIC
package Encode::UTF_EBCDIC;
# was in Encode::utf8
package Encode::utf8;
push @Encode::utf8::ISA, 'Encode::Encoding';
- *decode = sub{
- my ($obj,$octets,$chk) = @_;
- my $str = Encode::decode_utf8($octets);
- if (defined $str) {
+ #
+ if ($use_xs){
+ Encode::DEBUG and warn __PACKAGE__, " XS on";
+ *decode = \&decode_xs;
+ *encode = \&encode_xs;
+ }else{
+ Encode::DEBUG and warn __PACKAGE__, " XS off";
+ *decode = sub{
+ my ($obj,$octets,$chk) = @_;
+ my $str = Encode::decode_utf8($octets);
+ if (defined $str) {
+ $_[1] = '' if $chk;
+ return $str;
+ }
+ return undef;
+ };
+ *encode = sub {
+ my ($obj,$string,$chk) = @_;
+ my $octets = Encode::encode_utf8($string);
$_[1] = '' if $chk;
- return $str;
+ return $octets;
+ };
+ }
+ *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
+ my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
+ my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
+ use bytes;
+ if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) {
+ $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm));
+ $$rpos = $npos + length($trm);
+ return 1;
}
- return undef;
- };
- *encode = sub {
- my ($obj,$string,$chk) = @_;
- my $octets = Encode::encode_utf8($string);
- $_[1] = '' if $chk;
- return $octets;
+ $$rdst .= substr($$rsrc, $pos);
+ $$rpos = length($$rsrc);
+ return '';
};
$Encode::Encoding{utf8} =
bless {Name => "utf8"} => "Encode::utf8";
the result is always off, even when it contains completely valid utf8
string. See L</"The UTF-8 flag"> below.
+encode($valid_encoding, undef) is harmless but warns you for
+C<Use of uninitialized value in subroutine entry>.
+encode($valid_encoding, '') is harmless and warnless.
+
=item $string = decode(ENCODING, $octets [, CHECK])
Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
below.
+decode($valid_encoding, undef) is harmless but warns you for
+C<Use of uninitialized value in subroutine entry>.
+decode($valid_encoding, '') is harmless and warnless.
+
=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
Converts B<in-place> data between two encodings. The data in $octets
=head1 Handling Malformed Data
-=over 2
-
The I<CHECK> argument is used as follows. When you omit it,
the behaviour is the same as if you had passed a value of 0 for
I<CHECK>.
+=over 2
+
=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
If I<CHECK> is 0, (en|de)code will put a I<substitution character>
while(defined(read $fh, $buffer, 256)){
# buffer may end in a partial character so we append
$data .= $buffer;
- $utf8 .= decode($encoding, $data, ENCODE::FB_QUIET);
+ $utf8 .= decode($encoding, $data, Encode::FB_QUIET);
# $data now contains the unprocessed partial character
}
FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
DIE_ON_ERR 0x0001 X
- WARN_ON_ER 0x0002 X
+ WARN_ON_ERR 0x0002 X
RETURN_ON_ERR 0x0004 X X
LEAVE_SRC 0x0008
PERLQQ 0x0100 X
HTMLCREF 0x0200
XMLCREF 0x0400
+=back
+
=head2 Unimplemented fallback schemes
In the future, you will be able to use a code reference to a callback
function for the value of I<CHECK> but its API is still undecided.
+The fallback scheme does not work on EBCDIC platforms.
+
=head1 Defining Encodings
To define a new encoding, use:
When you encode, the resulting utf8 flag is always off.
-=item
+=item *
When you decode, the resulting utf8 flag is on unless you can
unambiguously represent data. Here is the definition of
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8. Returns true if successful, false otherwise.
+As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
+
=item _utf8_on(STRING)
[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is