#
-# $Id: Encode.pm,v 2.18 2006/06/03 20:28:48 dankogai Exp dankogai $
+# $Id: Encode.pm,v 2.35 2009/07/13 00:49:38 dankogai Exp $
#
package Encode;
use strict;
use warnings;
-our $VERSION = "2.18_01";
+our $VERSION = sprintf "%d.%02d", q$Revision: 2.35 $ =~ /(\d+)/g;
sub DEBUG () { 0 }
use XSLoader ();
XSLoader::load( __PACKAGE__, $VERSION );
our %EXPORT_TAGS = (
all => [ @EXPORT, @EXPORT_OK ],
- fallbacks => [@FB_CONSTS],
+ default => [ @EXPORT ],
+ fallbacks => [ @FB_CONSTS ],
fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);
our %Encoding;
our %ExtModule;
require Encode::Config;
-eval { require Encode::ConfigLocal };
+# See
+# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
+# to find why sig handlers inside eval{} are disabled.
+eval {
+ local $SIG{__DIE__};
+ local $SIG{__WARN__};
+ require Encode::ConfigLocal;
+};
sub encodings {
my $class = shift;
return undef unless defined $string;
$string .= '' if ref $string; # stringify;
$check ||= 0;
+ unless ( defined $name ) {
+ require Carp;
+ Carp::croak("Encoding name should not be undef");
+ }
my $enc = find_encoding($name);
unless ( defined $enc ) {
require Carp;
Carp::croak("Unknown encoding '$name'");
}
my $octets = $enc->encode( $string, $check );
- $_[1] = $string if $check and !( $check & LEAVE_SRC() );
+ $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() );
return $octets;
}
*str2bytes = \&encode;
Carp::croak("Unknown encoding '$name'");
}
my $string = $enc->decode( $octets, $check );
- $_[1] = $octets if $check and !( $check & LEAVE_SRC() );
+ $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
return $string;
}
*bytes2str = \&decode;
#
sub predefine_encodings {
- use Encode::Encoding;
+ require Encode::Encoding;
no warnings 'redefine';
my $use_xs = shift;
if ($ON_EBCDIC) {
$octets = encode("iso-8859-1", $string);
-B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
-B<may not be equal to> $string. Though they both contain the same data, the UTF8 flag
-for $octets is B<always> off. When you encode anything, UTF8 flag of
-the result is always off, even when it contains completely valid utf8
-string. See L</"The UTF8 flag"> below.
+B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
+$octets B<may not be equal to> $string. Though they both contain the
+same data, the UTF8 flag for $octets is B<always> off. When you
+encode anything, the UTF8 flag of the result is always off, even when it
+contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
If the $string is C<undef> then C<undef> is returned.
If the $string is C<undef> then C<undef> is returned.
+=item [$obj =] find_encoding(ENCODING)
+
+Returns the I<encoding object> corresponding to ENCODING. Returns
+undef if no matching ENCODING is found.
+
+This object is what does the actual (en|de)coding.
+
+ $utf8 = decode($name, $bytes);
+
+is in fact
+
+ $utf8 = do{
+ $obj = find_encoding($name);
+ croak qq(encoding "$name" not found) unless ref $obj;
+ $obj->decode($bytes)
+ };
+
+with more error checking.
+
+Therefore you can save time by reusing this object as follows:
+
+ my $enc = find_encoding("iso-8859-1");
+ while(<>){
+ my $utf8 = $enc->decode($_);
+ # and do something with $utf8;
+ }
+
+Besides C<< ->decode >> and C<< ->encode >>, other methods are
+available as well. For instance, C<< ->name >> returns the canonical
+name of the encoding object.
+
+ find_encoding("latin1")->name; # iso-8859-1
+
+See L<Encode::Encoding> for details.
+
=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
Converts B<in-place> data between two encodings. The data in $octets
See L</"The UTF8 flag"> below.
+Also note that
+
+ from_to($octets, $from, $to, $check);
+
+is equivalent to
+
+ $octets = encode($to, decode($from, $octets), $check);
+
+Yes, it does not respect the $check during decoding. It is
+deliberately done that way. If you need minute control, C<decode>
+then C<encode> as follows:
+
+ $octets = encode($to, decode($from, $octets, $check_from), $check_to);
+
=item $octets = encode_utf8($string);
Equivalent to C<$octets = encode("utf8", $string);> The characters
See L<Encode::Alias> for details.
+=head2 Finding IANA Character Set Registry names
+
+The canonical name of a given encoding does not necessarily agree with
+the IANA Character Set Registry name, commonly seen as C<< Content-Type:
+text/plain; charset=I<whatever> >>. In most cases the canonical names
+work, but sometimes they do not (notably 'utf-8-strict').
+
+Therefore as of Encode version 2.21, a new method C<mime_name()> is added.
+
+ use Encode;
+ my $enc = find_encoding('UTF-8');
+ warn $enc->name; # utf-8-strict
+ warn $enc->mime_name; # UTF-8
+
+See also: L<Encode::Encoding>
+
=head1 Encoding via PerlIO
-If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
-and encode directly via a filehandle. The following two examples
-are totally identical in their functionality.
+If your perl supports I<PerlIO> (which is the default), you can use a
+PerlIO layer to decode and encode directly via a filehandle. The
+following two examples are totally identical in their functionality.
# via PerlIO
open my $in, "<:encoding(shiftjis)", $infile or die;
=back
+=over 2
+
+=item Encode::LEAVE_SRC
+
+If the C<Encode::LEAVE_SRC> bit is not set, but I<CHECK> is, then the second
+argument to C<encode()> or C<decode()> may be assigned to by the functions. If
+you do not want this, bitwise-or C<Encode::LEAVE_SRC> into the I<CHECK> bitmask.
+
+=back
+
=head2 coderef for CHECK
As of Encode 2.12 CHECK can also be a code reference which takes the
state of the UTF8 flag (so please don't treat the return value as
indicating success or failure), or C<undef> if STRING is not a string.
+This function does not work on tainted values.
+
=item _utf8_off(STRING)
[INTERNAL] Turns off the UTF8 flag in STRING. Do not use frivolously.
return value as indicating success or failure), or C<undef> if STRING is
not a string.
+This function does not work on tainted values.
+
=back
=head1 UTF-8 vs. utf8 vs. UTF8
L<encoding>,
L<perlebcdic>,
L<perlfunc/open>,
-L<perlunicode>,
+L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
L<utf8>,
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>