From: Marc Lehmann <pcg@goof.com>
Date: Thu, 12 Apr 2007 08:41:53 +0000 (+0200)
Subject: Re: Compress::Zlib, pack "C" and utf-8 [PATCH]
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1651fc447620d3610b694c35696c13530282f981;p=p5sagit%2Fp5-mst-13.2.git

Re: Compress::Zlib, pack "C" and utf-8 [PATCH]
Message-ID: <20070412064153.GA22475@schmorp.de>

p4raw-id: //depot/perl@31194
---

diff --git a/ext/Encode/t/encoding.t b/ext/Encode/t/encoding.t
index 67ea068..b17b11f 100644
--- a/ext/Encode/t/encoding.t
+++ b/ext/Encode/t/encoding.t
@@ -57,7 +57,7 @@ print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf;
 print "ok 8\n";
 
 # the first octet of UTF-8 encoded 0x3af 
-print "not " unless unpack("C", chr(0xdf)) == 0xce;
+print "not " unless unpack("U0 C", chr(0xdf)) == 0xce;
 print "ok 9\n";
 
 print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf;
diff --git a/ext/Storable/t/utf8hash.t b/ext/Storable/t/utf8hash.t
index eeb80eb..a529ea5 100644
--- a/ext/Storable/t/utf8hash.t
+++ b/ext/Storable/t/utf8hash.t
@@ -34,7 +34,7 @@ use Storable qw(store nstore retrieve thaw freeze);
 }
 # Better than no plan, because I was getting out of memory errors, at which
 # point Test::More tidily prints up 1..79 as if I meant to finish there.
-use Test::More tests=>148;
+use Test::More tests=>144;
 use bytes ();
 my %utf8hash;
 
@@ -57,13 +57,10 @@ my @ords = (
 foreach my $i (@ords){
     my $u = chr($i); utf8::upgrade($u);
     # warn sprintf "%d,%d", bytes::length($u), is_utf8($u);
-    my $b = pack("C*", unpack("C*", $u));
+    my $b = chr($i); utf8::encode($b);
     # warn sprintf "%d,%d" ,bytes::length($b), is_utf8($b);
 
-    isnt($u,	                        $b, 
-	 "equivalence - with utf8flag");
-    is   (pack("C*", unpack("C*", $u)), pack("C*", unpack("C*", $b)),
-	  "equivalence - without utf8flag");
+    isnt($u, $b, "equivalence - with utf8flag");
 
     $utf8hash{$u} = $utf8hash{$b} = $i;
 }
diff --git a/ext/Unicode/Normalize/t/short.t b/ext/Unicode/Normalize/t/short.t
index d799f4a..a9e444f 100644
--- a/ext/Unicode/Normalize/t/short.t
+++ b/ext/Unicode/Normalize/t/short.t
@@ -35,7 +35,7 @@ print "ok 1\n";
 no warnings qw(utf8);
 
 # U+3042 is 3-byte length (in UTF-8/UTF-EBCDIC)
-our $a = pack 'U0C', unpack 'C', "\x{3042}";
+our $a = pack 'U0C', unpack 'U0C', "\x{3042}";
 
 print NFD($a) eq "\0"
    ? "ok" : "not ok", " 2\n";
diff --git a/lib/CGI/Util.pm b/lib/CGI/Util.pm
index 9cef416..0cb6e51 100644
--- a/lib/CGI/Util.pm
+++ b/lib/CGI/Util.pm
@@ -200,8 +200,8 @@ sub escape {
   shift() if @_ > 1 and ( ref($_[0]) || (defined $_[1] && $_[0] eq $CGI::DefaultClass));
   my $toencode = shift;
   return undef unless defined($toencode);
-  # force bytes while preserving backward compatibility -- dankogai
-  $toencode = pack("C*", unpack("C*", $toencode));
+  # we enforce UTF-8 encoding for URLs for no good reason except UTF-8 being the future
+  utf8::encode $toencode;
     if ($EBCDIC) {
       $toencode=~s/([^a-zA-Z0-9_.~-])/uc sprintf("%%%02x",$E2A[ord($1)])/eg;
     } else {
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
index 1755bce..101d10e 100644
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -3505,8 +3505,7 @@ of values, as follows:
     H	A hex string (high nybble first).
 
     c	A signed char (8-bit) value.
-    C	An unsigned C char (octet) even under Unicode. Should normally not
-        be used. See U and W instead.
+    C	An unsigned char (octet) value.
     W   An unsigned char value (can be greater than 255).
 
     s	A signed short (16-bit) value.
@@ -3547,8 +3546,8 @@ of values, as follows:
     P	A pointer to a structure (fixed-length string).
 
     u	A uuencoded string.
-    U	A Unicode character number.  Encodes to UTF-8 internally
-	(or UTF-EBCDIC in EBCDIC platforms).
+    U	A Unicode character number.  Encodes to a character in character mode
+        and UTF-8 (or UTF-EBCDIC in EBCDIC platforms) in byte mode.
 
     w	A BER compressed integer (not an ASN.1 BER, see perlpacktut for
 	details).  Its bytes represent an unsigned integer in base 128,
diff --git a/pp_pack.c b/pp_pack.c
index 7aa95a9..76e6315 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -26,7 +26,6 @@
  * other pp*.c files for the rest of the pp_ functions.
  */
 
-
 #include "EXTERN.h"
 #define PERL_IN_PP_PACK_C
 #include "perl.h"
@@ -381,7 +380,7 @@ STATIC const packprops_t packprops[512] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0,
-    /* C */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE,
+    /* C */ sizeof(unsigned char),
 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
     /* D */ LONG_DOUBLESIZE,
 #else
@@ -532,7 +531,7 @@ STATIC const packprops_t packprops[512] = {
     /* w */ sizeof(char) | PACK_SIZE_UNPREDICTABLE | PACK_SIZE_CANNOT_CSUM,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* C */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE,
+    /* C */ sizeof(unsigned char),
 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
     /* D */ LONG_DOUBLESIZE,
 #else
@@ -1562,10 +1561,29 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
 	    XPUSHs(sv);
 	    break;
 	}
+	case 'C':
+            if (len == 0) {
+                if (explicit_length)
+		    /* Switch to "character" mode */
+		    utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
+		break;
+	    }
+	    /* FALL THROUGH */
 	case 'c':
-	    while (len-- > 0) {
-		int aint = SHIFT_BYTE(utf8, s, strend, datumtype);
-		if (aint >= 128)	/* fake up signed chars */
+	    while (len-- > 0 && s < strend) {
+		int aint;
+		if (utf8)
+		  {
+		    STRLEN retlen;
+		    aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
+				 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+		    if (retlen == (STRLEN) -1 || retlen == 0)
+			Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
+		    s += retlen;
+		  }
+		else
+		  aint = *(U8 *)(s)++;
+		if (aint >= 128 && datumtype != 'C')	/* fake up signed chars */
 		    aint -= 256;
 		if (!checksum)
 		    PUSHs(sv_2mortal(newSViv((IV)aint)));
@@ -1575,18 +1593,9 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
 		    cuv += aint;
 	    }
 	    break;
-	case 'C':
 	case 'W':
 	  W_checksum:
-            if (len == 0) {
-                if (explicit_length && datumtype == 'C')
-		    /* Switch to "character" mode */
-		    utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
-		break;
-	    }
-	    if (datumtype == 'C' ?
-		 (symptr->flags & FLAG_DO_UTF8) &&
-		!(symptr->flags & FLAG_WAS_UTF8) : utf8) {
+	    if (utf8) {
 		while (len-- > 0 && s < strend) {
 		    STRLEN retlen;
 		    const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
@@ -2930,7 +2939,6 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
 		utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
 		break;
 	    }
-	    GROWING(0, cat, start, cur, len);
 	    while (len-- > 0) {
 		IV aiv;
 		fromstr = NEXTFROM;
@@ -2939,7 +2947,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
 		    ckWARN(WARN_PACK))
 		    Perl_warner(aTHX_ packWARN(WARN_PACK),
 				"Character in 'C' format wrapped in pack");
-		*cur++ = (char)(aiv & 0xff);
+		PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff));
 	    }
 	    break;
 	case 'W': {
diff --git a/t/op/chr.t b/t/op/chr.t
index 056f11a..5ac453f 100644
--- a/t/op/chr.t
+++ b/t/op/chr.t
@@ -37,7 +37,7 @@ SKIP: {
 
 sub hexes {
     no warnings 'utf8'; # avoid surrogate and beyond Unicode warnings
-    join(" ",map{sprintf"%02x",$_}unpack("C*",chr($_[0])));
+    join(" ",unpack "U0 (H2)*", chr $_[0]);
 }
 
 # The following code points are some interesting steps in UTF-8.
diff --git a/t/op/pack.t b/t/op/pack.t
index f37c73f..ef88540 100755
--- a/t/op/pack.t
+++ b/t/op/pack.t
@@ -12,7 +12,7 @@ my $no_endianness = $] > 5.009 ? '' :
 my $no_signedness = $] > 5.009 ? '' :
   "Signed/unsigned pack modifiers not available on this perl";
 
-plan tests => 14697;
+plan tests => 14696;
 
 use strict;
 use warnings qw(FATAL all);
@@ -918,7 +918,7 @@ SKIP: {
 isnt(v1.20.300.4000, sprintf "%vd", pack("C0U*",1,20,300,4000));
 
 my $rslt = $Is_EBCDIC ? "156 67" : "199 162";
-is(join(" ", unpack("C*", chr(0x1e2))), $rslt);
+is(join(" ", unpack("U0 C*", chr(0x1e2))), $rslt);
 
 # does pack U create Unicode?
 is(ord(pack('U', 300)), 300);
@@ -936,9 +936,6 @@ is("@{[unpack('U*', pack('U*', 100, 200))]}", "100 200");
 SKIP: {
     skip "Not for EBCDIC", 4 if $Is_EBCDIC;
 
-    # does unpack C unravel pack U?
-    is("@{[unpack('C*', pack('U*', 100, 200))]}", "100 195 136");
-
     # does pack U0C create Unicode?
     is("@{[pack('U0C*', 100, 195, 136)]}", v100.v200);
 
@@ -1648,7 +1645,7 @@ is(unpack('c'), 65, "one-arg unpack (change #18751)"); # defaulting to $_
 }
 
 {
-    # C is *not* neutral
+    # C *is* neutral
     my $down = "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x05\x06";
     my $up   = $down;
     utf8::upgrade($up);
@@ -1658,7 +1655,7 @@ is(unpack('c'), 65, "one-arg unpack (change #18751)"); # defaulting to $_
     is(pack("C*", @down), $down, "byte join");
 
     my @up   = unpack("C*", $up);
-    my @expect_up = (0xc3, 0xb8, 0xc3, 0xb9, 0xc3, 0xba, 0xc3, 0xbb, 0xc3, 0xbc, 0xc3, 0xbd, 0xc3, 0xbe, 0xc3, 0xbf, 0x05, 0x06);
+    my @expect_up = (0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0x05, 0x06);
     is("@up", "@expect_up", "UTF-8 expand");
     is(pack("U0C0C*", @up), $up, "UTF-8 join");
 }
diff --git a/t/op/utftaint.t b/t/op/utftaint.t
index d6e900d..df99c8d 100644
--- a/t/op/utftaint.t
+++ b/t/op/utftaint.t
@@ -23,10 +23,7 @@ plan(tests => 3*10 + 3*8 + 2*16 + 2);
 my $arg = $ENV{PATH}; # a tainted value
 use constant UTF8 => "\x{1234}";
 
-sub is_utf8 {
-    my $s = shift;
-    return 0xB6 != unpack('C', chr(0xB6).$s);
-}
+*is_utf8 = \&utf8::is_utf8;
 
 for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) {
     my $encode = $ary->[0];