From: Marc Lehmann Date: Thu, 12 Apr 2007 08:41:53 +0000 (+0200) Subject: Re: Compress::Zlib, pack "C" and utf-8 [PATCH] X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1651fc447620d3610b694c35696c13530282f981;p=p5sagit%2Fp5-mst-13.2.git Re: Compress::Zlib, pack "C" and utf-8 [PATCH] Message-ID: <20070412064153.GA22475@schmorp.de> p4raw-id: //depot/perl@31194 --- diff --git a/ext/Encode/t/encoding.t b/ext/Encode/t/encoding.t index 67ea068..b17b11f 100644 --- a/ext/Encode/t/encoding.t +++ b/ext/Encode/t/encoding.t @@ -57,7 +57,7 @@ print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf; print "ok 8\n"; # the first octet of UTF-8 encoded 0x3af -print "not " unless unpack("C", chr(0xdf)) == 0xce; +print "not " unless unpack("U0 C", chr(0xdf)) == 0xce; print "ok 9\n"; print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf; diff --git a/ext/Storable/t/utf8hash.t b/ext/Storable/t/utf8hash.t index eeb80eb..a529ea5 100644 --- a/ext/Storable/t/utf8hash.t +++ b/ext/Storable/t/utf8hash.t @@ -34,7 +34,7 @@ use Storable qw(store nstore retrieve thaw freeze); } # Better than no plan, because I was getting out of memory errors, at which # point Test::More tidily prints up 1..79 as if I meant to finish there. -use Test::More tests=>148; +use Test::More tests=>144; use bytes (); my %utf8hash; @@ -57,13 +57,10 @@ my @ords = ( foreach my $i (@ords){ my $u = chr($i); utf8::upgrade($u); # warn sprintf "%d,%d", bytes::length($u), is_utf8($u); - my $b = pack("C*", unpack("C*", $u)); + my $b = chr($i); utf8::encode($b); # warn sprintf "%d,%d" ,bytes::length($b), is_utf8($b); - isnt($u, $b, - "equivalence - with utf8flag"); - is (pack("C*", unpack("C*", $u)), pack("C*", unpack("C*", $b)), - "equivalence - without utf8flag"); + isnt($u, $b, "equivalence - with utf8flag"); $utf8hash{$u} = $utf8hash{$b} = $i; } diff --git a/ext/Unicode/Normalize/t/short.t b/ext/Unicode/Normalize/t/short.t index d799f4a..a9e444f 100644 --- a/ext/Unicode/Normalize/t/short.t +++ b/ext/Unicode/Normalize/t/short.t @@ -35,7 +35,7 @@ print "ok 1\n"; no warnings qw(utf8); # U+3042 is 3-byte length (in UTF-8/UTF-EBCDIC) -our $a = pack 'U0C', unpack 'C', "\x{3042}"; +our $a = pack 'U0C', unpack 'U0C', "\x{3042}"; print NFD($a) eq "\0" ? "ok" : "not ok", " 2\n"; diff --git a/lib/CGI/Util.pm b/lib/CGI/Util.pm index 9cef416..0cb6e51 100644 --- a/lib/CGI/Util.pm +++ b/lib/CGI/Util.pm @@ -200,8 +200,8 @@ sub escape { shift() if @_ > 1 and ( ref($_[0]) || (defined $_[1] && $_[0] eq $CGI::DefaultClass)); my $toencode = shift; return undef unless defined($toencode); - # force bytes while preserving backward compatibility -- dankogai - $toencode = pack("C*", unpack("C*", $toencode)); + # we enforce UTF-8 encoding for URLs for no good reason except UTF-8 being the future + utf8::encode $toencode; if ($EBCDIC) { $toencode=~s/([^a-zA-Z0-9_.~-])/uc sprintf("%%%02x",$E2A[ord($1)])/eg; } else { diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 1755bce..101d10e 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -3505,8 +3505,7 @@ of values, as follows: H A hex string (high nybble first). c A signed char (8-bit) value. - C An unsigned C char (octet) even under Unicode. Should normally not - be used. See U and W instead. + C An unsigned char (octet) value. W An unsigned char value (can be greater than 255). s A signed short (16-bit) value. @@ -3547,8 +3546,8 @@ of values, as follows: P A pointer to a structure (fixed-length string). u A uuencoded string. - U A Unicode character number. Encodes to UTF-8 internally - (or UTF-EBCDIC in EBCDIC platforms). + U A Unicode character number. Encodes to a character in character mode + and UTF-8 (or UTF-EBCDIC in EBCDIC platforms) in byte mode. w A BER compressed integer (not an ASN.1 BER, see perlpacktut for details). Its bytes represent an unsigned integer in base 128, diff --git a/pp_pack.c b/pp_pack.c index 7aa95a9..76e6315 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -26,7 +26,6 @@ * other pp*.c files for the rest of the pp_ functions. */ - #include "EXTERN.h" #define PERL_IN_PP_PACK_C #include "perl.h" @@ -381,7 +380,7 @@ STATIC const packprops_t packprops[512] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* C */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE, + /* C */ sizeof(unsigned char), #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) /* D */ LONG_DOUBLESIZE, #else @@ -532,7 +531,7 @@ STATIC const packprops_t packprops[512] = { /* w */ sizeof(char) | PACK_SIZE_UNPREDICTABLE | PACK_SIZE_CANNOT_CSUM, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* C */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE, + /* C */ sizeof(unsigned char), #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) /* D */ LONG_DOUBLESIZE, #else @@ -1562,10 +1561,29 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c XPUSHs(sv); break; } + case 'C': + if (len == 0) { + if (explicit_length) + /* Switch to "character" mode */ + utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; + break; + } + /* FALL THROUGH */ case 'c': - while (len-- > 0) { - int aint = SHIFT_BYTE(utf8, s, strend, datumtype); - if (aint >= 128) /* fake up signed chars */ + while (len-- > 0 && s < strend) { + int aint; + if (utf8) + { + STRLEN retlen; + aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + if (retlen == (STRLEN) -1 || retlen == 0) + Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); + s += retlen; + } + else + aint = *(U8 *)(s)++; + if (aint >= 128 && datumtype != 'C') /* fake up signed chars */ aint -= 256; if (!checksum) PUSHs(sv_2mortal(newSViv((IV)aint))); @@ -1575,18 +1593,9 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c cuv += aint; } break; - case 'C': case 'W': W_checksum: - if (len == 0) { - if (explicit_length && datumtype == 'C') - /* Switch to "character" mode */ - utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; - break; - } - if (datumtype == 'C' ? - (symptr->flags & FLAG_DO_UTF8) && - !(symptr->flags & FLAG_WAS_UTF8) : utf8) { + if (utf8) { while (len-- > 0 && s < strend) { STRLEN retlen; const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, @@ -2930,7 +2939,6 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; break; } - GROWING(0, cat, start, cur, len); while (len-- > 0) { IV aiv; fromstr = NEXTFROM; @@ -2939,7 +2947,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) ckWARN(WARN_PACK)) Perl_warner(aTHX_ packWARN(WARN_PACK), "Character in 'C' format wrapped in pack"); - *cur++ = (char)(aiv & 0xff); + PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff)); } break; case 'W': { diff --git a/t/op/chr.t b/t/op/chr.t index 056f11a..5ac453f 100644 --- a/t/op/chr.t +++ b/t/op/chr.t @@ -37,7 +37,7 @@ SKIP: { sub hexes { no warnings 'utf8'; # avoid surrogate and beyond Unicode warnings - join(" ",map{sprintf"%02x",$_}unpack("C*",chr($_[0]))); + join(" ",unpack "U0 (H2)*", chr $_[0]); } # The following code points are some interesting steps in UTF-8. diff --git a/t/op/pack.t b/t/op/pack.t index f37c73f..ef88540 100755 --- a/t/op/pack.t +++ b/t/op/pack.t @@ -12,7 +12,7 @@ my $no_endianness = $] > 5.009 ? '' : my $no_signedness = $] > 5.009 ? '' : "Signed/unsigned pack modifiers not available on this perl"; -plan tests => 14697; +plan tests => 14696; use strict; use warnings qw(FATAL all); @@ -918,7 +918,7 @@ SKIP: { isnt(v1.20.300.4000, sprintf "%vd", pack("C0U*",1,20,300,4000)); my $rslt = $Is_EBCDIC ? "156 67" : "199 162"; -is(join(" ", unpack("C*", chr(0x1e2))), $rslt); +is(join(" ", unpack("U0 C*", chr(0x1e2))), $rslt); # does pack U create Unicode? is(ord(pack('U', 300)), 300); @@ -936,9 +936,6 @@ is("@{[unpack('U*', pack('U*', 100, 200))]}", "100 200"); SKIP: { skip "Not for EBCDIC", 4 if $Is_EBCDIC; - # does unpack C unravel pack U? - is("@{[unpack('C*', pack('U*', 100, 200))]}", "100 195 136"); - # does pack U0C create Unicode? is("@{[pack('U0C*', 100, 195, 136)]}", v100.v200); @@ -1648,7 +1645,7 @@ is(unpack('c'), 65, "one-arg unpack (change #18751)"); # defaulting to $_ } { - # C is *not* neutral + # C *is* neutral my $down = "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x05\x06"; my $up = $down; utf8::upgrade($up); @@ -1658,7 +1655,7 @@ is(unpack('c'), 65, "one-arg unpack (change #18751)"); # defaulting to $_ is(pack("C*", @down), $down, "byte join"); my @up = unpack("C*", $up); - my @expect_up = (0xc3, 0xb8, 0xc3, 0xb9, 0xc3, 0xba, 0xc3, 0xbb, 0xc3, 0xbc, 0xc3, 0xbd, 0xc3, 0xbe, 0xc3, 0xbf, 0x05, 0x06); + my @expect_up = (0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0x05, 0x06); is("@up", "@expect_up", "UTF-8 expand"); is(pack("U0C0C*", @up), $up, "UTF-8 join"); } diff --git a/t/op/utftaint.t b/t/op/utftaint.t index d6e900d..df99c8d 100644 --- a/t/op/utftaint.t +++ b/t/op/utftaint.t @@ -23,10 +23,7 @@ plan(tests => 3*10 + 3*8 + 2*16 + 2); my $arg = $ENV{PATH}; # a tainted value use constant UTF8 => "\x{1234}"; -sub is_utf8 { - my $s = shift; - return 0xB6 != unpack('C', chr(0xB6).$s); -} +*is_utf8 = \&utf8::is_utf8; for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) { my $encode = $ary->[0];