From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Tue, 11 Sep 2001 02:27:25 +0000 (+0000)
Subject: Fix unpack U to be the reverse of pack U
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=35bcd33832d74e56bb99eb6538654e7d815f1ecb;p=p5sagit%2Fp5-mst-13.2.git

Fix unpack U to be the reverse of pack U
(but implement unpack U0U as a backdoor to get
 the UTF-8 malformed warnings from un-UTF-8 data)

p4raw-id: //depot/perl@11993
---

diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 9609cdc..e6a14a7 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -456,7 +456,9 @@ outside of the utf8 pragma too.)
 The C<chr()> and C<ord()> functions work on characters.  This is like
 C<pack("U")> and C<unpack("U")>, not like C<pack("C")> and
 C<unpack("C")>.  In fact, the latter are how you now emulate
-byte-oriented C<chr()> and C<ord()> under utf8.
+byte-oriented C<chr()> and C<ord()> for Unicode strings.
+(Note that this reveals the internal UTF-8 encoding of strings and
+you are not supposed to do that unless you know what you are doing.)
 
 =item *
 
diff --git a/pp_pack.c b/pp_pack.c
index 7dc2874..54ed0b7 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -167,6 +167,7 @@ PP(pp_unpack)
     int natint;		/* native integer */
     int unatint;	/* unsigned native integer */
 #endif
+    bool do_utf8 = DO_UTF8(right);
 
     if (gimme != G_ARRAY) {		/* arrange to do first one only */
 	/*SUPPRESS 530*/
@@ -205,7 +206,7 @@ PP(pp_unpack)
 		DIE(aTHX_ "'!' allowed only after types %s", natstr);
 	}
 	star = 0;
-	if (pat >= patend)
+	if (pat > patend)
 	    len = 1;
 	else if (*pat == '*') {
 	    len = strend - strbeg;	/* long enough */
@@ -416,6 +417,11 @@ PP(pp_unpack)
 	    }
 	    break;
 	case 'C':
+	unpack_C: /* unpack U will jump here if not UTF-8 */
+            if (len == 0) {
+		do_utf8 = FALSE;
+		break;
+	    }
 	    if (len > strend - s)
 		len = strend - s;
 	    if (checksum) {
@@ -437,6 +443,12 @@ PP(pp_unpack)
 	    }
 	    break;
 	case 'U':
+	    if (len == 0) {
+		do_utf8 = TRUE;
+		break;
+	    }
+	    if (!do_utf8)
+		 goto unpack_C;
 	    if (len > strend - s)
 		len = strend - s;
 	    if (checksum) {
diff --git a/t/op/pack.t b/t/op/pack.t
index 1c6222e..8d32746 100755
--- a/t/op/pack.t
+++ b/t/op/pack.t
@@ -26,7 +26,7 @@ sub ok {
 }
 
 
-print "1..161\n";
+print "1..169\n";
 
 # Note: All test numbers in comments are off by 1 after the comment below..
 
@@ -457,7 +457,46 @@ print 'not ' unless v1.20.300.4000 ne
                     sprintf "%vd", pack("C0U*",1,20,300,4000);
 print "ok $test\n"; $test++;
 
-# 160
+# 161
 print "not " unless join(" ", unpack("C*", chr(0x1e2)))
         eq ((ord(A) == 193) ? "156 67" : "199 162");
 print "ok $test\n"; $test++;
+
+# 162: does pack U create Unicode?
+print "not " unless ord(pack('U', 300)) == 300;
+print "ok $test\n"; $test++;
+
+# 163: does unpack U deref Unicode?
+print "not " unless (unpack('U', chr(300)))[0] == 300;
+print "ok $test\n"; $test++;
+
+# 164: is unpack U the reverse of pack U for Unicode string?
+print "not "
+    unless "@{[unpack('U*', pack('U*', 100, 200, 300))]}" eq "100 200 300";
+print "ok $test\n"; $test++;
+
+# 165: is unpack U the reverse of pack U for byte string?
+print "not "
+    unless "@{[unpack('U*', pack('U*', 100, 200))]}" eq "100 200";
+print "ok $test\n"; $test++;
+
+# 166: does unpack C unravel pack U?
+print "not " unless "@{[unpack('C*', pack('U*', 100, 200))]}" eq "100 195 136";
+print "ok $test\n"; $test++;
+
+# 167: does pack U0C create Unicode?
+print "not " unless "@{[pack('U0C*', 100, 195, 136)]}" eq v100.v200;
+print "ok $test\n"; $test++;
+
+# 168: does pack C0U create characters?
+print "not " unless "@{[pack('C0U*', 100, 200)]}" eq pack("C*", 100, 195, 136);
+print "ok $test\n"; $test++;
+
+# 169: does unpack U0U on byte data warn?
+{
+    local $SIG{__WARN__} = sub { $@ = "@_" };
+    my @null = unpack('U0U', chr(255));
+    print "not " unless $@ =~ /^Malformed UTF-8 character /;
+    print "ok $test\n"; $test++;
+}
+
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t
index cc2b26a..499049a 100644
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -136,24 +136,21 @@ __EOMK__
 
 # 104..181
 {
-    my $WARNCNT;
     my $id;
 
-    local $SIG{__WARN__} =
-	sub {
-	    print "# $id: @_";
-	    $WARNCNT++;
-	    $WARNMSG = "@_";
-	};
+    local $SIG{__WARN__} = sub {
+	print "# $id: @_";
+	$@ = "@_";
+    };
 
     sub moan {
 	print "$id: @_";
     }
 
-    sub test_unpack_U {
-	$WARNCNT = 0;
-	$WARNMSG = "";
-	unpack('U*', $_[0]);
+    sub warn_unpack_U {
+	$@ = '';
+	my @null = unpack('U0U*', $_[0]);
+	return $@;
     }
 
     for (@MK) {
@@ -161,7 +158,7 @@ __EOMK__
 	    # print "# $_\n";
 	} elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) {
 	    $id = $1;
-	    my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $error) =
+	    my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $experr) =
 		($2, $3, $4, $5, $6, $7, $8);
 	    my @hex = split(/:/, $hex);
 	    unless (@hex == $byteslen) {
@@ -175,20 +172,19 @@ __EOMK__
 		    moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
 		}
 	    }
+	    my $warn = warn_unpack_U($bytes);
 	    if ($okay eq 'y') {
-		test_unpack_U($bytes);
-		if ($WARNCNT) {
-		    moan "unpack('U*') false negative\n";
+		if ($warn) {
+		    moan "unpack('U0U*') false negative\n";
 		    print "not ";
 		}
 	    } elsif ($okay eq 'n') {
-		test_unpack_U($bytes);
-		if ($WARNCNT == 0 || ($error ne '' && $WARNMSG !~ /$error/)) {
-		    moan "unpack('U*') false positive\n";
+		if (not $warn || ($experr ne '' && $warn !~ /$experr/)) {
+		    moan "unpack('U0U*') false positive\n";
 		    print "not ";
 		}
 	    }
-	    print "ok $test\n";
+	    print "ok $test # $id $okay\n";
 	    $test++;
  	} else {
 	    moan "unknown format\n";