From: Jarkko Hietaniemi Date: Tue, 11 Sep 2001 02:27:25 +0000 (+0000) Subject: Fix unpack U to be the reverse of pack U X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=35bcd33832d74e56bb99eb6538654e7d815f1ecb;p=p5sagit%2Fp5-mst-13.2.git Fix unpack U to be the reverse of pack U (but implement unpack U0U as a backdoor to get the UTF-8 malformed warnings from un-UTF-8 data) p4raw-id: //depot/perl@11993 --- diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 9609cdc..e6a14a7 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -456,7 +456,9 @@ outside of the utf8 pragma too.) The C and C functions work on characters. This is like C and C, not like C and C. In fact, the latter are how you now emulate -byte-oriented C and C under utf8. +byte-oriented C and C for Unicode strings. +(Note that this reveals the internal UTF-8 encoding of strings and +you are not supposed to do that unless you know what you are doing.) =item * diff --git a/pp_pack.c b/pp_pack.c index 7dc2874..54ed0b7 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -167,6 +167,7 @@ PP(pp_unpack) int natint; /* native integer */ int unatint; /* unsigned native integer */ #endif + bool do_utf8 = DO_UTF8(right); if (gimme != G_ARRAY) { /* arrange to do first one only */ /*SUPPRESS 530*/ @@ -205,7 +206,7 @@ PP(pp_unpack) DIE(aTHX_ "'!' allowed only after types %s", natstr); } star = 0; - if (pat >= patend) + if (pat > patend) len = 1; else if (*pat == '*') { len = strend - strbeg; /* long enough */ @@ -416,6 +417,11 @@ PP(pp_unpack) } break; case 'C': + unpack_C: /* unpack U will jump here if not UTF-8 */ + if (len == 0) { + do_utf8 = FALSE; + break; + } if (len > strend - s) len = strend - s; if (checksum) { @@ -437,6 +443,12 @@ PP(pp_unpack) } break; case 'U': + if (len == 0) { + do_utf8 = TRUE; + break; + } + if (!do_utf8) + goto unpack_C; if (len > strend - s) len = strend - s; if (checksum) { diff --git a/t/op/pack.t b/t/op/pack.t index 1c6222e..8d32746 100755 --- a/t/op/pack.t +++ b/t/op/pack.t @@ -26,7 +26,7 @@ sub ok { } -print "1..161\n"; +print "1..169\n"; # Note: All test numbers in comments are off by 1 after the comment below.. @@ -457,7 +457,46 @@ print 'not ' unless v1.20.300.4000 ne sprintf "%vd", pack("C0U*",1,20,300,4000); print "ok $test\n"; $test++; -# 160 +# 161 print "not " unless join(" ", unpack("C*", chr(0x1e2))) eq ((ord(A) == 193) ? "156 67" : "199 162"); print "ok $test\n"; $test++; + +# 162: does pack U create Unicode? +print "not " unless ord(pack('U', 300)) == 300; +print "ok $test\n"; $test++; + +# 163: does unpack U deref Unicode? +print "not " unless (unpack('U', chr(300)))[0] == 300; +print "ok $test\n"; $test++; + +# 164: is unpack U the reverse of pack U for Unicode string? +print "not " + unless "@{[unpack('U*', pack('U*', 100, 200, 300))]}" eq "100 200 300"; +print "ok $test\n"; $test++; + +# 165: is unpack U the reverse of pack U for byte string? +print "not " + unless "@{[unpack('U*', pack('U*', 100, 200))]}" eq "100 200"; +print "ok $test\n"; $test++; + +# 166: does unpack C unravel pack U? +print "not " unless "@{[unpack('C*', pack('U*', 100, 200))]}" eq "100 195 136"; +print "ok $test\n"; $test++; + +# 167: does pack U0C create Unicode? +print "not " unless "@{[pack('U0C*', 100, 195, 136)]}" eq v100.v200; +print "ok $test\n"; $test++; + +# 168: does pack C0U create characters? +print "not " unless "@{[pack('C0U*', 100, 200)]}" eq pack("C*", 100, 195, 136); +print "ok $test\n"; $test++; + +# 169: does unpack U0U on byte data warn? +{ + local $SIG{__WARN__} = sub { $@ = "@_" }; + my @null = unpack('U0U', chr(255)); + print "not " unless $@ =~ /^Malformed UTF-8 character /; + print "ok $test\n"; $test++; +} + diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t index cc2b26a..499049a 100644 --- a/t/op/utf8decode.t +++ b/t/op/utf8decode.t @@ -136,24 +136,21 @@ __EOMK__ # 104..181 { - my $WARNCNT; my $id; - local $SIG{__WARN__} = - sub { - print "# $id: @_"; - $WARNCNT++; - $WARNMSG = "@_"; - }; + local $SIG{__WARN__} = sub { + print "# $id: @_"; + $@ = "@_"; + }; sub moan { print "$id: @_"; } - sub test_unpack_U { - $WARNCNT = 0; - $WARNMSG = ""; - unpack('U*', $_[0]); + sub warn_unpack_U { + $@ = ''; + my @null = unpack('U0U*', $_[0]); + return $@; } for (@MK) { @@ -161,7 +158,7 @@ __EOMK__ # print "# $_\n"; } elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) { $id = $1; - my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $error) = + my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $experr) = ($2, $3, $4, $5, $6, $7, $8); my @hex = split(/:/, $hex); unless (@hex == $byteslen) { @@ -175,20 +172,19 @@ __EOMK__ moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n"; } } + my $warn = warn_unpack_U($bytes); if ($okay eq 'y') { - test_unpack_U($bytes); - if ($WARNCNT) { - moan "unpack('U*') false negative\n"; + if ($warn) { + moan "unpack('U0U*') false negative\n"; print "not "; } } elsif ($okay eq 'n') { - test_unpack_U($bytes); - if ($WARNCNT == 0 || ($error ne '' && $WARNMSG !~ /$error/)) { - moan "unpack('U*') false positive\n"; + if (not $warn || ($experr ne '' && $warn !~ /$experr/)) { + moan "unpack('U0U*') false positive\n"; print "not "; } } - print "ok $test\n"; + print "ok $test # $id $okay\n"; $test++; } else { moan "unknown format\n";