#
-# $Id: Encode.pm,v 2.16 2006/05/03 18:32:25 dankogai Exp dankogai $
+# $Id: Encode.pm,v 2.23 2007/05/29 18:15:32 dankogai Exp dankogai $
#
package Encode;
use strict;
-our $VERSION = sprintf "%d.%02d", q$Revision: 2.16 $ =~ /(\d+)/g;
+use warnings;
+our $VERSION = sprintf "%d.%02d", q$Revision: 2.23 $ =~ /(\d+)/g;
# Debug switch: constant sub (empty prototype) that always returns 0.
sub DEBUG () { 0 }
use XSLoader ();
XSLoader::load( __PACKAGE__, $VERSION );
Carp::croak("Unknown encoding '$name'");
}
my $octets = $enc->encode( $string, $check );
- $_[1] = $string if $check and !( $check & LEAVE_SRC() );
+ $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() );
return $octets;
}
*str2bytes = \&encode;
Carp::croak("Unknown encoding '$name'");
}
my $string = $enc->decode( $octets, $check );
- $_[1] = $octets if $check and !( $check & LEAVE_SRC() );
+ $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
return $string;
}
*bytes2str = \&decode;
#
sub predefine_encodings {
- use Encode::Encoding;
+ require Encode::Encoding;
no warnings 'redefine';
my $use_xs = shift;
if ($ON_EBCDIC) {
$octets = encode("iso-8859-1", $string);
-B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
-B<may not be equal to> $string. Though they both contain the same data, the utf8 flag
-for $octets is B<always> off. When you encode anything, utf8 flag of
-the result is always off, even when it contains completely valid utf8
-string. See L</"The UTF-8 flag"> below.
+B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
+$octets B<may not be equal to> $string. Though they both contain the
+same data, the UTF8 flag for $octets is B<always> off. When you
+encode anything, UTF8 flag of the result is always off, even when it
+contains completely valid utf8 string. See L</"The UTF8 flag"> below.
If the $string is C<undef> then C<undef> is returned.
B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
B<may not be equal to> $octets. Though they both contain the same data,
-the utf8 flag for $string is on unless $octets entirely consists of
-ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
+the UTF8 flag for $string is on unless $octets entirely consists of
+ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF8 flag">
below.
If the $string is C<undef> then C<undef> is returned.
+=item [$obj =] find_encoding(ENCODING)
+
+Returns the I<encoding object> corresponding to ENCODING. Returns
+undef if no matching ENCODING is found.
+
+This object is what actually performs the (en|de)coding.
+
+ $utf8 = decode($name, $bytes);
+
+is in fact
+
+ $utf8 = do{
+ $obj = find_encoding($name);
+ croak qq(encoding "$name" not found) unless ref $obj;
+ $obj->decode($bytes)
+ };
+
+with more error checking.
+
+Therefore you can save time by reusing this object as follows:
+
+ my $enc = find_encoding("iso-8859-1");
+ while(<>){
+ my $utf8 = $enc->decode($_);
+ # and do something with $utf8;
+ }
+
+Besides C<< ->decode >> and C<< ->encode >>, other methods are
+available as well. For instance, C<< ->name >> returns the canonical
+name of the encoding object.
+
+ find_encoding("latin1")->name; # iso-8859-1
+
+See L<Encode::Encoding> for details.
+
=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
Converts B<in-place> data between two encodings. The data in $octets
$data = decode("iso-8859-1", $data); #2
Both #1 and #2 make $data consist of a completely valid UTF-8 string
-but only #2 turns utf8 flag on. #1 is equivalent to
+but only #2 turns UTF8 flag on. #1 is equivalent to
$data = encode("utf8", decode("iso-8859-1", $data));
-See L</"The UTF-8 flag"> below.
+See L</"The UTF8 flag"> below.
+
+Also note that
+
+ from_to($octets, $from, $to, $check);
+
+is equivalent to
+
+ $octets = encode($to, decode($from, $octets), $check);
+
+Yes, it does not respect the $check during decoding. It is
+deliberately done that way. If you need minute control, C<decode>
+then C<encode> as follows:
+
+ $octets = encode($to, decode($from, $octets, $check_from), $check_to);
=item $octets = encode_utf8($string);
See L<Encode::Alias> for details.
+=head2 Finding IANA Character Set Registry names
+
+The canonical name of a given encoding does not necessarily agree with
+the IANA Character Set Registry name, commonly seen as C<< Content-Type:
+text/plain; charset=I<whatever> >>. For most cases canonical names
+work, but sometimes they do not (notably 'utf-8-strict').
+
+Therefore as of Encode version 2.21, a new method C<mime_name()> is added.
+
+ use Encode;
+ my $enc = find_encoding('UTF-8');
+ warn $enc->name; # utf-8-strict
+ warn $enc->mime_name; # UTF-8
+
+See also: L<Encode::Encoding>
+
=head1 Encoding via PerlIO
-If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
-and encode directly via a filehandle. The following two examples
-are totally identical in their functionality.
+If your perl supports I<PerlIO> (which is the default), you can use a
+PerlIO layer to decode and encode directly via a filehandle. The
+following two examples are totally identical in their functionality.
# via PerlIO
open my $in, "<:encoding(shiftjis)", $infile or die;
=back
-=head2 coderef for CHECK
+=over 2
+
+=item Encode::LEAVE_SRC
+
+If the C<Encode::LEAVE_SRC> bit is not set, but I<CHECK> is, then the second
+argument to C<encode()> or C<decode()> may be assigned to by the functions. If
+you're not interested in this, then bitwise-or the bitmask with it.
+
+=back
+
+=head2 coderef for CHECK
As of Encode 2.12 CHECK can also be a code reference which takes the
ord value of the unmapped character as an argument and returns a string
See L<Encode::Encoding> for more details.
-=head1 The UTF-8 flag
+=head1 The UTF8 flag
-Before the introduction of utf8 support in perl, The C<eq> operator
+Before the introduction of Unicode support in perl, the C<eq> operator
just compared the strings represented by two scalars. Beginning with
-perl 5.8, C<eq> compares two strings with simultaneous consideration
-of I<the utf8 flag>. To explain why we made it so, I will quote page
-402 of C<Programming Perl, 3rd ed.>
+perl 5.8, C<eq> compares two strings with simultaneous consideration of
+I<the UTF8 flag>. To explain why we made it so, I will quote page 402 of
+C<Programming Perl, 3rd ed.>
=over 2
Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
was born and many features documented in the book remained
unimplemented for a long time. Perl 5.8 corrected this and the introduction
-of the UTF-8 flag is one of them. You can think of this perl notion as of a
-byte-oriented mode (utf8 flag off) and a character-oriented mode (utf8
+of the UTF8 flag is one of them. You can think of this perl notion as of a
+byte-oriented mode (UTF8 flag off) and a character-oriented mode (UTF8
flag on).
-Here is how Encode takes care of the utf8 flag.
+Here is how Encode takes care of the UTF8 flag.
=over 2
=item *
-When you encode, the resulting utf8 flag is always off.
+When you encode, the resulting UTF8 flag is always off.
=item *
-When you decode, the resulting utf8 flag is on unless you can
+When you decode, the resulting UTF8 flag is on unless you can
unambiguously represent data. Here is the definition of
dis-ambiguity.
After C<$utf8 = decode('foo', $octet);>,
- When $octet is... The utf8 flag in $utf8 is
+ When $octet is... The UTF8 flag in $utf8 is
---------------------------------------------
In ASCII only (or EBCDIC only) OFF
In ISO-8859-1 ON
Goal #1. And with Encode Goal #2 is assumed but you still have to be
careful in such cases mentioned in B<CAVEAT> paragraphs.
-This utf8 flag is not visible in perl scripts, exactly for the same
+This UTF8 flag is not visible in perl scripts, exactly for the same
reason you cannot (or you I<don't have to>) see if a scalar contains a
string, integer, or floating point number. But you can still peek
and poke these if you will. See the section below.
=item is_utf8(STRING [, CHECK])
-[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
+[INTERNAL] Tests whether the UTF8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8. Returns true if successful, false otherwise.
=item _utf8_on(STRING)
-[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
+[INTERNAL] Turns on the UTF8 flag in STRING. The data in STRING is
B<not> checked for being well-formed UTF-8. Do not use unless you
B<know> that the STRING is well-formed UTF-8. Returns the previous
-state of the UTF-8 flag (so please don't treat the return value as
+state of the UTF8 flag (so please don't treat the return value as
indicating success or failure), or C<undef> if STRING is not a string.
=item _utf8_off(STRING)
-[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
-Returns the previous state of the UTF-8 flag (so please don't treat the
+[INTERNAL] Turns off the UTF8 flag in STRING. Do not use frivolously.
+Returns the previous state of the UTF8 flag (so please don't treat the
return value as indicating success or failure), or C<undef> if STRING is
not a string.
=back
-=head1 UTF-8 vs. utf8
+=head1 UTF-8 vs. utf8 vs. UTF8
....We now view strings not as sequences of bytes, but as sequences
of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
find_encoding("utf_8")->name # ditto. "_" are treated as "-"
find_encoding("UTF8")->name # is 'utf8'.
+The UTF8 flag is internally called UTF8, without a hyphen. It indicates
+whether a string is internally encoded as utf8, also without a hyphen.
=head1 SEE ALSO
L<encoding>,
L<perlebcdic>,
L<perlfunc/open>,
-L<perlunicode>,
+L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
L<utf8>,
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>