X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=t%2Fop%2Futf8decode.t;h=499049aab93398d8f22c716f4d8e957694ff610e;hb=84281c3142b669e898404773f536776423f8c11b;hp=4d05a6b8d3793e0349a3038a2402dded2e50fe9c;hpb=0e06870bf080a38cda51c06c6612359afc2334e1;p=p5sagit%2Fp5-mst-13.2.git diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t index 4d05a6b..499049a 100644 --- a/t/op/utf8decode.t +++ b/t/op/utf8decode.t @@ -5,6 +5,20 @@ BEGIN { @INC = '../lib'; } +{ + my $wide = v256; + use bytes; + my $ordwide = ord($wide); + printf "# under use bytes ord(v256) = 0x%02x\n", $ordwide; + if ($ordwide == 140) { + print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n"; + exit 0; + } + elsif ($ordwide != 196) { + printf "# v256 starts with 0x%02x\n", $ordwide; + } +} + no utf8; print "1..78\n"; @@ -13,7 +27,7 @@ my $test = 1; # This table is based on Markus Kuhn's UTF-8 Decode Stress Tester, # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt, -# version dated 2000-09-02. +# version dated 2000-09-02. # We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff # because e.g. many patch programs have issues with binary data. @@ -21,7 +35,7 @@ my $test = 1; my @MK = split(/\n/, <<__EOMK__); 1 Correct UTF-8 1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5 -2 Boundary conditions +2 Boundary conditions 2.1 First possible sequence of certain length 2.1.1 y "\x00" 0 1 00 1 2.1.2 y "\xc2\x80" 80 2 c2:80 1 @@ -122,24 +136,21 @@ __EOMK__ # 104..181 { - my $WARNCNT; my $id; - local $SIG{__WARN__} = - sub { - print "# $id: @_"; - $WARNCNT++; - $WARNMSG = "@_"; - }; + local $SIG{__WARN__} = sub { + print "# $id: @_"; + $@ = "@_"; + }; sub moan { print "$id: @_"; } - - sub test_unpack_U { - $WARNCNT = 0; - $WARNMSG = ""; - unpack('U*', $_[0]); + + sub warn_unpack_U { + $@ = ''; + my @null = unpack('U0U*', $_[0]); + return $@; } for (@MK) { @@ -147,7 +158,7 @@ __EOMK__ # print "# $_\n"; } elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) { $id = $1; - my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $error) = + my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $experr) = ($2, $3, $4, $5, $6, $7, $8); my @hex = split(/:/, $hex); unless (@hex == $byteslen) { @@ -161,20 +172,19 @@ __EOMK__ moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n"; } } + my $warn = warn_unpack_U($bytes); if ($okay eq 'y') { - test_unpack_U($bytes); - if ($WARNCNT) { - moan "unpack('U*') false negative\n"; + if ($warn) { + moan "unpack('U0U*') false negative\n"; print "not "; } } elsif ($okay eq 'n') { - test_unpack_U($bytes); - if ($WARNCNT == 0 || ($error ne '' && $WARNMSG !~ /$error/)) { - moan "unpack('U*') false positive\n"; + if (not $warn || ($experr ne '' && $warn !~ /$experr/)) { + moan "unpack('U0U*') false positive\n"; print "not "; } } - print "ok $test\n"; + print "ok $test # $id $okay\n"; $test++; } else { moan "unknown format\n";