From: Rafael Garcia-Suarez Date: Tue, 12 Apr 2005 15:30:23 +0000 (+0000) Subject: Upgrade to Encode 2.0902 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=7f0d54d76fcb154445173f08ae0ae11c9cf150ff;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 2.0902 p4raw-id: //depot/perl@24231 --- diff --git a/MANIFEST b/MANIFEST index 2a4d8f5..c791a84 100644 --- a/MANIFEST +++ b/MANIFEST @@ -428,6 +428,7 @@ ext/Encode/t/Mod_EUCJP.pm module that t/enc_module.enc uses ext/Encode/t/perlio.t test script ext/Encode/t/rt.pl test script ext/Encode/t/unibench.pl benchmark script +ext/Encode/t/utf8strict.t test script ext/Encode/t/Unicode.t test script ext/Encode/TW/Makefile.PL Encode extension ext/Encode/TW/TW.pm Encode extension diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 249d298..3dd1fce 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -3,6 +3,29 @@ # $Id: Changes,v 2.9 2004/12/03 19:16:53 dankogai Exp dankogai $ # $Revision: 2.9 $ $Date: 2004/12/03 19:16:53 $ +! Encode.pm + New Pod section: "UTF-8 vs utf8"; explains utf-8-strict ++ t/utf8strict.t + Tests utf-8-strict, accordingly to + UTF-8 decoder capability and stress test" by Markus Kuhn + http://smontagu.damowmow.com/utf8test.html + Note that malformed and overlong sequences are not test here + because perl already does that for you, utf-8-strict or not. +! Encode.pm Encode/encode.h t/fallback.t + Addressed "encode(..., Encode::LEAVE_SRC) does not work". + Now FB_(PERLQQ|HTMLCREF|XMLCREF) implies LEAVE_SRC so + you can (en|de)code constant strings with these fallbacks. + http://rt.cpan.org/NoAuth/Bug.html?id=8736 +! Encode.pm Encode.xs lib/Encode/Alias.pm t/Aliases.t + Make Encode.pm support the real UTF-8, by GAAS + Message-Id: + Message-Id: +! Encode.pm Encode.xs + post-2.09 comment patches from GAAS applied. + Message-Id: + Message-Id: + +2.09 2004/12/03 19:16:53 ! Encode.pm Encode.xs Addressed " :encoding(utf8) broken in perl-5.8.6". Message-Id: diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 5e67e4c..49813d5 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -3,7 +3,8 @@ # package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 2.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +# our $VERSION = do { my @r = (q$Revision: 2.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = '2.0902'; sub DEBUG () { 0 } use XSLoader (); XSLoader::load(__PACKAGE__, $VERSION); @@ -148,7 +149,7 @@ sub encode($$;$) Carp::croak("Unknown encoding '$name'"); } my $octets = $enc->encode($string,$check); - $_[1] = $string if $check; + $_[1] = $string if $check and !($check & LEAVE_SRC()); return $octets; } @@ -164,7 +165,7 @@ sub decode($$;$) Carp::croak("Unknown encoding '$name'"); } my $string = $enc->decode($octets,$check); - $_[1] = $octets if $check; + $_[1] = $octets if $check and !($check & LEAVE_SRC()); return $string; } @@ -300,6 +301,8 @@ sub predefine_encodings{ }; $Encode::Encoding{utf8} = bless {Name => "utf8"} => "Encode::utf8"; + $Encode::Encoding{"utf-8-strict"} = + bless {Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8"; } } @@ -401,7 +404,7 @@ for $octets is B off. When you encode anything, utf8 flag of the result is always off, even when it contains completely valid utf8 string. See L below. -If the $string is C or a reference then C is returned. +If the $string is C then C is returned. =item $string = decode(ENCODING, $octets [, CHECK]) @@ -421,7 +424,7 @@ the utf8 flag for $string is on unless $octets entirely consists of ASCII data (or EBCDIC on EBCDIC machines). See L below. -If the $string is C or a reference then C is returned. +If the $string is C then C is returned. =item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) @@ -626,6 +629,8 @@ HTML/XML character reference modes are about the same, in place of C<\x{I}>, HTML uses C<&#I;> where I is a decimal number and XML uses C<&#xI;> where I is the hexadecimal number. +In Encode 2.10 or later, C is also implied. + =item The bitmask These modes are actually set via a bitmask. Here is how the FB_XX @@ -637,7 +642,7 @@ constants via C. DIE_ON_ERR 0x0001 X WARN_ON_ERR 0x0002 X RETURN_ON_ERR 0x0004 X X - LEAVE_SRC 0x0008 + LEAVE_SRC 0x0008 X PERLQQ 0x0100 X HTMLCREF 0x0200 XMLCREF 0x0400 @@ -770,6 +775,54 @@ not a string. =back +=head1 UTF-8 vs. utf8 + + ....We now view strings not as sequences of bytes, but as sequences + of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit + computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. + +That has been the perl's notion of UTF-8 but official UTF-8 is more +strict; Its ranges is much narrower (0 .. 10FFFF), some sequences are +not allowed (i.e. Those used in the surrogate pair, 0xFFFE, et al). + +Now that is overruled by Larry Wall himself. + + From: Larry Wall + Date: December 04, 2004 11:51:58 JST + To: perl-unicode@perl.org + Subject: Re: Make Encode.pm support the real UTF-8 + Message-Id: <20041204025158.GA28754@wall.org> + + On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: + : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, + : but "UTF-8" is the name of the standard and should give the + : corresponding behaviour. + + For what it's worth, that's how I've always kept them straight in my + head. + + Also for what it's worth, Perl 6 will mostly default to strict but + make it easy to switch back to lax. + + Larry + +Do you copy? As of Perl 5.8.7, B means strict, official UTF-8 +while B means liberal, lax, version thereof. And Encode version +2.10 or later thus groks the difference between C and C"utf8". + + encode("utf8", "\x{FFFF_FFFF}", 1); # okay + encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks + +C in Encode is actually a canonical name for C. +Yes, the hyphen between "UTF" and "8" is important. Without it Encode +goes "liberal" + + find_encoding("UTF-8")->name # is 'utf-8-strict' + find_encoding("utf-8")->name # ditto. names are case insensitive + find_encoding("utf8")->name # ditto. "_" are treated as "-" + find_encoding("UTF8")->name # is 'utf8'. + + =head1 SEE ALSO L, diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 4d64fb1..de7028c 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -29,6 +29,12 @@ UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) +#define UTF8_ALLOW_STRICT 0 +#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \ + ~(UTF8_ALLOW_CONTINUATION | \ + UTF8_ALLOW_NON_CONTINUATION | \ + UTF8_ALLOW_LONG)) + void Encode_XSEncoding(pTHX_ encode_t * enc) { @@ -247,6 +253,115 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, return dst; } +static bool +strict_utf8(pTHX_ SV* sv) +{ + HV* hv; + SV** svp; + sv = SvRV(sv); + if (!sv || SvTYPE(sv) != SVt_PVHV) + return 0; + hv = (HV*)sv; + svp = hv_fetch(hv, "strict_utf8", 11, 0); + if (!svp) + return 0; + return SvTRUE(*svp); +} + +static U8* +process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check, + bool encode, bool strict, bool stop_at_partial) +{ + UV uv; + STRLEN ulen; + + SvPOK_only(dst); + SvCUR_set(dst,0); + + while (s < e) { + if (UTF8_IS_INVARIANT(*s)) { + sv_catpvn(dst, (char *)s, 1); + s++; + continue; + } + + if (UTF8_IS_START(*s)) { + U8 skip = UTF8SKIP(s); + if ((s + skip) > e) { + /* Partial character */ + /* XXX could check that rest of bytes are UTF8_IS_CONTINUATION(ch) */ + if (stop_at_partial) + break; + + goto malformed_byte; + } + + uv = utf8n_to_uvuni(s, e - s, &ulen, + UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT : + UTF8_ALLOW_NONSTRICT) + ); +#if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */ + if (strict && uv > PERL_UNICODE_MAX) + ulen = -1; +#endif + if (ulen == -1) { + if (strict) { + uv = utf8n_to_uvuni(s, e - s, &ulen, + UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT); + if (ulen == -1) + goto malformed_byte; + goto malformed; + } + goto malformed_byte; + } + + + /* Whole char is good */ + sv_catpvn(dst,(char *)s,skip); + s += skip; + continue; + } + + /* If we get here there is something wrong with alleged UTF-8 */ + malformed_byte: + uv = (UV)*s; + ulen = 1; + + malformed: + if (check & ENCODE_DIE_ON_ERR){ + if (encode) + Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8"); + else + Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv); + } + if (check & ENCODE_WARN_ON_ERR){ + if (encode) + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_ENCODE_NOMAP, uv, "utf8"); + else + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, "utf8", uv); + } + if (check & ENCODE_RETURN_ON_ERR) { + break; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}"): + check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : + "&#x%" UVxf ";", uv); + sv_catsv(dst, subchar); + SvREFCNT_dec(subchar); + } else { + sv_catpv(dst, FBCHAR_UTF8); + } + s += ulen; + } + *SvEND(dst) = '\0'; + + return s; +} + + MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ PROTOTYPES: DISABLE @@ -264,8 +379,7 @@ CODE: SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ /* - * PerlO check -- we assume the object is of PerlIO if renewed - * and if so, we set RETURN_ON_ERR for partial character + * PerlIO check -- we assume the object is of PerlIO if renewed */ int renewed = 0; dSP; ENTER; SAVETMPS; @@ -283,8 +397,6 @@ CODE: FREETMPS; LEAVE; /* end PerlIO check */ - SvPOK_only(dst); - SvCUR_set(dst,0); if (SvUTF8(src)) { s = utf8_to_bytes(s,&slen); if (s) { @@ -296,53 +408,8 @@ CODE: croak("Cannot decode string with wide characters"); } } - while (s < e) { - if (UTF8_IS_INVARIANT(*s) || UTF8_IS_START(*s)) { - U8 skip = UTF8SKIP(s); - if ((s + skip) > e) { - /* Partial character - done */ - if (renewed) - break; - goto decode_utf8_fallback; - } - else if (is_utf8_char(s)) { - /* Whole char is good */ - sv_catpvn(dst,(char *)s,skip); - s += skip; - continue; - } - else { - /* starts ok but isn't "good" */ - } - } - else { - /* Invalid start byte */ - } - /* If we get here there is something wrong with alleged UTF-8 */ - decode_utf8_fallback: - if (check & ENCODE_DIE_ON_ERR){ - Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", (UV)*s); - XSRETURN(0); - } - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner(aTHX_ packWARN(WARN_UTF8), - ERR_DECODE_NOMAP, "utf8", (UV)*s); - } - if (check & ENCODE_RETURN_ON_ERR) { - break; - } - if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ - SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? "\\x%02" UVXf : - check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : - "&#x%" UVxf ";", (UV)*s); - sv_catsv(dst, subchar); - SvREFCNT_dec(subchar); - } else { - sv_catpv(dst, FBCHAR_UTF8); - } - s++; - } - *SvEND(dst) = '\0'; + + s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), renewed); /* Clear out translated part of source unless asked not to */ if (check && !(check & ENCODE_LEAVE_SRC)){ @@ -369,9 +436,15 @@ CODE: U8 *e = (U8 *) SvEND(src); SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ if (SvUTF8(src)) { - /* Already encoded - trust it and just copy the octets */ - sv_setpvn(dst,(char *)s,(e-s)); - s = e; + /* Already encoded */ + if (strict_utf8(aTHX_ obj)) { + s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0); + } + else { + /* trust it and just copy the octets */ + sv_setpvn(dst,(char *)s,(e-s)); + s = e; + } } else { /* Native bytes - can always encode */ diff --git a/ext/Encode/Encode/encode.h b/ext/Encode/Encode/encode.h index fc8301a..d7a57a4 100644 --- a/ext/Encode/Encode/encode.h +++ b/ext/Encode/Encode/encode.h @@ -103,8 +103,8 @@ extern void Encode_DefineEncoding(encode_t *enc); #define ENCODE_FB_CROAK 0x0001 #define ENCODE_FB_QUIET ENCODE_RETURN_ON_ERR #define ENCODE_FB_WARN (ENCODE_RETURN_ON_ERR|ENCODE_WARN_ON_ERR) -#define ENCODE_FB_PERLQQ ENCODE_PERLQQ -#define ENCODE_FB_HTMLCREF ENCODE_HTMLCREF -#define ENCODE_FB_XMLCREF ENCODE_XMLCREF +#define ENCODE_FB_PERLQQ (ENCODE_PERLQQ|ENCODE_LEAVE_SRC) +#define ENCODE_FB_HTMLCREF (ENCODE_HTMLCREF|ENCODE_LEAVE_SRC) +#define ENCODE_FB_XMLCREF (ENCODE_XMLCREF|ENCODE_LEAVE_SRC) #endif /* ENCODE_H */ diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index 6a6aab8..7f31c3c 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -89,6 +89,7 @@ t/mime-header.t test script t/perlio.t test script t/rt.pl even more test script t/unibench.pl benchmark script +t/utf8strict.t test script ucm/8859-1.ucm Unicode Character Map ucm/8859-10.ucm Unicode Character Map ucm/8859-11.ucm Unicode Character Map diff --git a/ext/Encode/META.yml b/ext/Encode/META.yml index 6a52035..3853ffa 100644 --- a/ext/Encode/META.yml +++ b/ext/Encode/META.yml @@ -1,7 +1,7 @@ # http://module-build.sourceforge.net/META-spec.html #XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: Encode -version: 2.09 +version: 2.0902 version_from: Encode.pm installdirs: perl requires: diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index a1cc253..f9bc3fe 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -187,7 +187,6 @@ sub init_aliases # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); # Standardize on the dashed versions. - # define_alias( qr/\butf8$/i => '"utf-8"' ); define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); unless ($Encode::ON_EBCDIC){ @@ -222,7 +221,7 @@ sub init_aliases define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); } # utf8 is blessed :) - define_alias( qr/^UTF-8$/i => '"utf8"',); + define_alias( qr/^UTF-8$/i => '"utf-8-strict"'); # At last, Map white space and _ to '-' define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); } diff --git a/ext/Encode/t/Aliases.t b/ext/Encode/t/Aliases.t index 64a42a4..2fce73e 100644 --- a/ext/Encode/t/Aliases.t +++ b/ext/Encode/t/Aliases.t @@ -22,7 +22,7 @@ sub init_a2c{ %a2c = ( 'US-ascii' => 'ascii', 'ISO-646-US' => 'ascii', - 'UTF-8' => 'utf8', + 'UTF-8' => 'utf-8-strict', 'UCS-2' => 'UCS-2BE', 'UCS2' => 'UCS-2BE', 'iso-10646-1' => 'UCS-2BE', diff --git a/ext/Encode/t/fallback.t b/ext/Encode/t/fallback.t index e030414..4a04f54 100644 --- a/ext/Encode/t/fallback.t +++ b/ext/Encode/t/fallback.t @@ -125,29 +125,29 @@ is($src, $residue, "FB_QUIET residue utf8"); $src = $uo; $dst = $ascii->encode($src, FB_PERLQQ); is($dst, $ap, "FB_PERLQQ ascii"); -is($src, '', "FB_PERLQQ residue ascii"); +is($src, $uo, "FB_PERLQQ residue ascii"); $src = $ao; $dst = $utf8->decode($src, FB_PERLQQ); is($dst, $up, "FB_PERLQQ utf8"); -is($src, '', "FB_PERLQQ residue utf8"); +is($src, $ao, "FB_PERLQQ residue utf8"); $src = $uo; $dst = $ascii->encode($src, FB_HTMLCREF); is($dst, $ah, "FB_HTMLCREF ascii"); -is($src, '', "FB_HTMLCREF residue ascii"); +is($src, $uo, "FB_HTMLCREF residue ascii"); $src = $ao; $dst = $utf8->decode($src, FB_HTMLCREF); is($dst, $uh, "FB_HTMLCREF utf8"); -is($src, '', "FB_HTMLCREF residue utf8"); +is($src, $ao, "FB_HTMLCREF residue utf8"); $src = $uo; $dst = $ascii->encode($src, FB_XMLCREF); is($dst, $ax, "FB_XMLCREF ascii"); -is($src, '', "FB_XMLCREF residue ascii"); +is($src, $uo, "FB_XMLCREF residue ascii"); $src = $ao; $dst = $utf8->decode($src, FB_XMLCREF); is($dst, $ax, "FB_XMLCREF utf8"); -is($src, '', "FB_XMLCREF residue utf8"); +is($src, $ao, "FB_XMLCREF residue utf8"); diff --git a/ext/Encode/t/utf8strict.t b/ext/Encode/t/utf8strict.t new file mode 100644 index 0000000..dac5d6f --- /dev/null +++ b/ext/Encode/t/utf8strict.t @@ -0,0 +1,78 @@ +#!../perl +our $DEBUG = @ARGV; +our (%ORD, %SEQ, $NTESTS); +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if ($] <= 5.008 and !$Config{perl_patchlevel}){ + print "1..0 # Skip: Perl 5.8.1 or later required\n"; + exit 0; + } + # http://smontagu.damowmow.com/utf8test.html + %ORD = ( + 0x00000080 => 0, # 2.1.2 + 0x00000800 => 0, # 2.1.3 + 0x00010000 => 0, # 2.1.4 + 0x00200000 => 1, # 2.1.5 + 0x00400000 => 1, # 2.1.6 + 0x0000007F => 0, # 2.2.1 -- unmapped okay + 0x000007FF => 0, # 2.2.2 + 0x0000FFFF => 1, # 2.2.3 + 0x001FFFFF => 1, # 2.2.4 + 0x03FFFFFF => 1, # 2.2.5 + 0x7FFFFFFF => 1, # 2.2.6 + 0x0000D800 => 1, # 5.1.1 + 0x0000DB7F => 1, # 5.1.2 + 0x0000D880 => 1, # 5.1.3 + 0x0000DBFF => 1, # 5.1.4 + 0x0000DC00 => 1, # 5.1.5 + 0x0000DF80 => 1, # 5.1.6 + 0x0000DFFF => 1, # 5.1.7 + # 5.2 "Paird UTF-16 surrogates skipped + # because utf-8-strict raises exception at the first one + 0x0000FFFF => 1, # 5.3.1 + ); + $NTESTS += scalar keys %ORD; + %SEQ = ( + qq/ed 9f bf/ => 0, # 2.3.1 + qq/ee 80 80/ => 0, # 2.3.2 + qq/f4 8f bf bf/ => 0, # 2.3.3 + qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG + # "3 Malformed sequences" are checked by perl. + # "4 Overlong sequences" are checked by perl. + ); + $NTESTS += scalar keys %SEQ; +} +use strict; +use Encode; +use utf8; +use Test::More tests => $NTESTS; + +local($SIG{__WARN__}) = sub { $DEBUG and $@ and print STDERR $@ }; + +my $d = find_encoding("utf-8-strict"); +for my $u (sort keys %ORD){ + my $c = chr($u); + eval { $d->encode($c,1) }; + $DEBUG and $@ and warn $@; + my $t = $@ ? 1 : 0; + is($t, $ORD{$u}, sprintf "U+%04X", $u); +} +for my $s (sort keys %SEQ){ + my $o = pack "C*" => map {hex} split /\s+/, $s; + eval { $d->decode($o,1) }; + $DEBUG and $@ and warn $@; + my $t = $@ ? 1 : 0; + is($t, $SEQ{$s}, $s); +} + +__END__ + +