X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2FEncode.pm;h=b502e8fdc469a5a8570ac4f7af7e60ae9ce6499b;hb=7e19fb92789b07f9ae6ba1ee1b4f5fbb72612161;hp=b03d93d707865eb8f32b4efd7daeb4a3854bac18;hpb=ef175861651dea779f900fb460eac98bb0784db1;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index b03d93d..b502e8f 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,12 +1,12 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.58 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.61 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; use XSLoader (); XSLoader::load 'Encode'; require Exporter; -our @ISA = qw(Exporter); +use base qw/Exporter/; # Public, encouraged API is exported by default @@ -15,8 +15,10 @@ our @EXPORT = qw( encodings find_encoding ); -our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ); -our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK); +our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC + PERLQQ HTMLCREF XMLCREF); +our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN + FB_PERLQQ FB_HTMLCREF FB_XMLCREF); our @EXPORT_OK = ( @@ -194,6 +196,11 @@ sub predefine_encodings{ package Encode::UTF_EBCDIC; *name = sub{ shift->{'Name'} }; *new_sequence = sub{ return $_[0] }; + *needs_lines = sub{ 0 }; + *perlio_ok = sub { + eval{ require PerlIO::encoding }; + return $@ ? 0 : 1; + }; *decode = sub{ my ($obj,$str,$chk) = @_; my $res = ''; @@ -221,6 +228,11 @@ sub predefine_encodings{ package Encode::Internal; *name = sub{ shift->{'Name'} }; *new_sequence = sub{ return $_[0] }; + *needs_lines = sub{ 0 }; + *perlio_ok = sub { + eval{ require PerlIO::encoding }; + return $@ ? 
0 : 1; + }; *decode = sub{ my ($obj,$str,$chk) = @_; utf8::upgrade($str); @@ -237,6 +249,11 @@ sub predefine_encodings{ package Encode::utf8; *name = sub{ shift->{'Name'} }; *new_sequence = sub{ return $_[0] }; + *needs_lines = sub{ 0 }; + *perlio_ok = sub { + eval{ require PerlIO::encoding }; + return $@ ? 0 : 1; + }; *decode = sub{ my ($obj,$octets,$chk) = @_; my $str = Encode::decode_utf8($octets); @@ -314,7 +331,7 @@ byte has 256 possible values, it easily fits in Perl's much larger =head2 TERMINOLOGY -=over 4 +=over 2 =item * @@ -339,7 +356,7 @@ and such details may change in future releases. =head1 PERL ENCODING API -=over 4 +=over 2 =item $octets = encode(ENCODING, $string[, CHECK]) @@ -351,7 +368,13 @@ For CHECK, see L</"Handling Malformed Data">. For example, to convert (internally UTF-8 encoded) Unicode string to iso-8859-1 (also known as Latin1), - $octets = encode("iso-8859-1", $unicode); + $octets = encode("iso-8859-1", $utf8); + +B<CAVEAT>: When you C<$octets = encode("utf8", $utf8)>, then $octets +B<may not be equal to> $utf8. Though they both contain the same data, the utf8 flag +for $octets is B<always> off. When you encode anything, utf8 flag of +the result is always off, even when it contains completely valid utf8 +string. See L</"The UTF-8 flag"> below. =item $string = decode(ENCODING, $octets[, CHECK]) @@ -365,16 +388,22 @@ For example, to convert ISO-8859-1 data to UTF-8: $utf8 = decode("iso-8859-1", $latin1); -=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK]) +B<CAVEAT>: When you C<$utf8 = decode("utf8", $octets)>, then $utf8 +B<may not be equal to> $octets. Though they both contain the same data, +the utf8 flag for $utf8 is on unless $octets entirely consists of +ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag"> +below. -Converts B<in-place> data between two encodings. -For example, to convert ISO-8859-1 data to UTF-8: +=item [$length =] from_to($string, FROM_ENC, TO_ENC [, CHECK]) + +Converts B<in-place> data between two encodings.
For example, to +convert ISO-8859-1 data to UTF-8: - from_to($data, "iso-8859-1", "utf-8"); + from_to($data, "iso-8859-1", "utf8"); and to convert it back: - from_to($data, "utf-8", "iso-8859-1"); + from_to($data, "utf8", "iso-8859-1"); Note that because the conversion happens in place, the data to be converted cannot be a string constant; it must be a scalar variable. @@ -382,32 +411,34 @@ converted cannot be a string constant; it must be a scalar variable. from_to() returns the length of the converted string on success, undef otherwise. -=back +B<CAVEAT>: The following operations look the same but are not quite so; + + from_to($data, "iso-8859-1", "utf8"); #1 + $data = decode("iso-8859-1", $data); #2 -=head2 UTF-8 / utf8 +Both #1 and #2 make $data consist of a completely valid UTF-8 string +but only #2 turns the utf8 flag on. #1 is equivalent to -The Unicode Consortium defines the UTF-8 transformation format as a -way of encoding the entire Unicode repertoire as sequences of octets. -This encoding is expected to become very widespread. Perl can use this -form internally to represent strings, so conversions to and from this -form are particularly efficient (as octets in memory do not have to -change, just the meta-data that tells Perl how to treat them). + $data = encode("utf8", decode("iso-8859-1", $data)); -=over 4 +See L</"The UTF-8 flag"> below. =item $octets = encode_utf8($string); -The characters that comprise $string are encoded in Perl's superset of -UTF-8 and the resulting octets are returned as a sequence of bytes. All -possible characters have a UTF-8 representation so this function cannot -fail. +Equivalent to C<$octets = encode("utf8", $string);> The characters +that comprise $string are encoded in Perl's superset of UTF-8 and the +resulting octets are returned as a sequence of bytes. All possible +characters have a UTF-8 representation so this function cannot fail.
+ =item $string = decode_utf8($octets [, CHECK]); -The sequence of octets represented by $octets is decoded from UTF-8 -into a sequence of logical characters. Not all sequences of octets -form valid UTF-8 encodings, so it is possible for this call to fail. -For CHECK, see L</"Handling Malformed Data">. +Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. +The sequence of octets represented by +$octets is decoded from UTF-8 into a sequence of logical +characters. Not all sequences of octets form valid UTF-8 encodings, so +it is possible for this call to fail. For CHECK, see +L</"Handling Malformed Data">. =back @@ -493,7 +524,7 @@ For gory details, see L<Encode::PerlIO>. =head1 Handling Malformed Data -=over 4 +=over 2 The I<CHECK> argument is used as follows. When you omit it, the behaviour is the same as if you had passed a value of 0 for @@ -507,7 +538,7 @@ E<lt>subcharE<gt> will be used. For Unicode, "\x{FFFD}" is used. If the data is supposed to be UTF-8, an optional lexical warning (category utf8) is given. -=item I<CHECK> = Encode::DIE_ON_ERROR (== 1) +=item I<CHECK> = Encode::FB_CROAK ( == 1) If I<CHECK> is 1, methods will die immediately with an error message. Therefore, when I<CHECK> is set to 1, you should trap the @@ -539,6 +570,10 @@ you are debugging the mode above. =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) +=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF) + +=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) + For encodings that are implemented by Encode::XS, CHECK == Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode. @@ -548,6 +583,10 @@ decoded to utf8. And when you encode, '\x{I<xxxx>}' will be inserted, where I<xxxx> is the Unicode ID of the character that cannot be found in the character repertoire of the encoding. +HTML/XML character reference modes are about the same; in place of +\x{I<xxxx>}, HTML uses &#I<1234>; where I<1234> is a decimal number and +XML uses &#xI<hhhh>; where I<hhhh> is a hexadecimal number. + =item The bitmask These modes are actually set via a bitmask. Here is how the FB_XX @@ -561,6 +600,8 @@ constants via C<use Encode qw(:fallback_all)>.
RETURN_ON_ERR 0x0004 X X LEAVE_SRC 0x0008 PERLQQ 0x0100 X + HTMLCREF 0x0200 + XMLCREF 0x0400 =head2 Unimplemented fallback schemes @@ -581,12 +622,84 @@ arguments are taken as aliases for I<$object>, as for C<define_alias>. See L<Encode::Encoding> for more details. -=head1 Messing with Perl's Internals +=head1 The UTF-8 flag + +Before the introduction of utf8 support in perl, the C<eq> operator +just compared internal data of the scalars. Now C<eq> means internal +data equality AND I<the utf8 flag>. To explain why we made it so, I +will quote page 402 of C<Programming Perl, 3rd ed.> + +=over 2 + +=item Goal #1: + +Old byte-oriented programs should not spontaneously break on the old +byte-oriented data they used to work on. + +=item Goal #2: + +Old byte-oriented programs should magically start working on the new +character-oriented data when appropriate. + +=item Goal #3: + +Programs should run just as fast in the new character-oriented mode +as in the old byte-oriented mode. + +=item Goal #4: + +Perl should remain one language, rather than forking into a +byte-oriented Perl and a character-oriented Perl. + +=back + +Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 +was born and many features documented in the book remained +unimplemented. Perl 5.8 hopefully corrects this and the introduction +of UTF-8 flag is one of them. You can think of this as perl's notion of +byte-oriented mode (utf8 flag off) and character-oriented mode (utf8 +flag on). + +Here is how Encode takes care of the utf8 flag. + +=over 2 + +=item * + +When you encode, the resulting utf8 flag is always off. + +=item * + +When you decode, the resulting utf8 flag is on unless you can +unambiguously represent data. Here is the definition of +dis-ambiguity. + + After C<$utf8 = decode('foo', $octet);>, + + When $octet is... The utf8 flag in $utf8 is + --------------------------------------------- + In ASCII only (or EBCDIC only) OFF + In ISO-8859-1 ON + In any other Encoding ON + --------------------------------------------- + +As you see, there is one exception, in ASCII.
That way you can assume +Goal #1. And with Encode Goal #2 is assumed but you still have to be +careful in such cases mentioned in B<CAVEAT> paragraphs. + +This utf8 flag is not visible in perl scripts, exactly for the same +reason you cannot (or you I<don't have to>) see if a scalar contains a +string, integer, or floating point number. But you can still peek +and poke these if you will. See the section below. + +=back + +=head2 Messing with Perl's Internals The following API uses parts of Perl's internals in the current implementation. As such, they are efficient but may change. -=over 4 +=over 2 =item is_utf8(STRING [, CHECK]) @@ -626,8 +739,8 @@ the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt> =head1 MAINTAINER This project was originated by Nick Ing-Simmons and later maintained -by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full list -of people involved. For any questions, use -E<lt>perl-unicode@perl.orgE<gt> so others can share. +by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full +list of people involved. For any questions, use +E<lt>perl-unicode@perl.orgE<gt> so we can all share. =cut