Integrate perlio:
[p5sagit/p5-mst-13.2.git] / t / op / utf8decode.t
CommitLineData
a9917092 1#!./perl
2
# Compile-time setup: move into the t/ directory when one exists below the
# current directory, and make ../lib the only module search path.
BEGIN {
    if (-d 't') {
        chdir 't';
    }
    @INC = ('../lib');
}
7
# Platform guard.  Under "use bytes", ord() of the wide character v256 gives
# the first byte of its internal encoding.  A value of 140 is taken to mean
# UTF-EBCDIC, in which case the whole file is skipped with a "1..0" plan;
# 196 (0xc4) is the value expected for UTF-8.  Any other value is only
# reported as a diagnostic and the tests still run.
#
# This check used to appear twice in a row; the single copy kept here is the
# one that also prints the observed byte value.
{
    my $wide = v256;
    use bytes;
    my $ordwide = ord($wide);
    printf "# under use bytes ord(v256) = 0x%02x\n", $ordwide;
    if ($ordwide == 140) {
        print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
        exit 0;
    }
    elsif ($ordwide != 196) {
        printf "# v256 starts with 0x%02x\n", $ordwide;
    }
}
33
# Make sure the utf8 pragma is off for the rest of the file.
no utf8;

# Number of the next result line to print; the driver loop bumps it after
# every case.
my $test = 1;

# TAP plan: one result per data line of the table.
print "1..78\n";
39
# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
# version dated 2000-09-02.

# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
# because e.g. many patch programs have issues with binary data.
a9917092 46
# @MK holds the test table, one element per line of the here-doc below.
# The here-doc terminator is unquoted, so the text is interpolated like a
# double-quoted string and each \xNN escape becomes the byte it names.
#
# Data lines are whitespace-separated fields, in the order the parsing loop
# further down captures them: test id, y/n (y = no warning expected from
# unpack('U*'), n = a warning is required), the quoted byte string, a hex
# code point or "-", the byte count, the bytes as colon-separated hex pairs,
# a character count or "-", and optionally the text the warning is matched
# against.  Section headings and lines beginning with "#" are ignored by
# that loop.
47my @MK = split(/\n/, <<__EOMK__);
481 Correct UTF-8
3b0e0cb6 491.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
daf0f78e 502 Boundary conditions
a9917092 512.1 First possible sequence of certain length
3b0e0cb6 522.1.1 y "\x00" 0 1 00 1
532.1.2 y "\xc2\x80" 80 2 c2:80 1
542.1.3 y "\xe0\xa0\x80" 800 3 e0:a0:80 1
552.1.4 y "\xf0\x90\x80\x80" 10000 4 f0:90:80:80 1
562.1.5 y "\xf8\x88\x80\x80\x80" 200000 5 f8:88:80:80:80 1
572.1.6 y "\xfc\x84\x80\x80\x80\x80" 4000000 6 fc:84:80:80:80:80 1
a9917092 582.2 Last possible sequence of certain length
3b0e0cb6 592.2.1 y "\x7f" 7f 1 7f 1
602.2.2 y "\xdf\xbf" 7ff 2 df:bf 1
a9917092 61# The ffff is illegal unless UTF8_ALLOW_FFFF
3b0e0cb6 622.2.3 n "\xef\xbf\xbf" ffff 3 ef:bf:bf 1 character 0xffff
632.2.4 y "\xf7\xbf\xbf\xbf" 1fffff 4 f7:bf:bf:bf 1
642.2.5 y "\xfb\xbf\xbf\xbf\xbf" 3ffffff 5 fb:bf:bf:bf:bf 1
652.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf" 7fffffff 6 fd:bf:bf:bf:bf:bf 1
a9917092 662.3 Other boundary conditions
3b0e0cb6 672.3.1 y "\xed\x9f\xbf" d7ff 3 ed:9f:bf 1
682.3.2 y "\xee\x80\x80" e000 3 ee:80:80 1
692.3.3 y "\xef\xbf\xbd" fffd 3 ef:bf:bd 1
702.3.4 y "\xf4\x8f\xbf\xbf" 10ffff 4 f4:8f:bf:bf 1
712.3.5 y "\xf4\x90\x80\x80" 110000 4 f4:90:80:80 1
a9917092 723 Malformed sequences
733.1 Unexpected continuation bytes
3b0e0cb6 743.1.1 n "\x80" - 1 80 - unexpected continuation byte 0x80
753.1.2 n "\xbf" - 1 bf - unexpected continuation byte 0xbf
763.1.3 n "\x80\xbf" - 2 80:bf - unexpected continuation byte 0x80
773.1.4 n "\x80\xbf\x80" - 3 80:bf:80 - unexpected continuation byte 0x80
783.1.5 n "\x80\xbf\x80\xbf" - 4 80:bf:80:bf - unexpected continuation byte 0x80
793.1.6 n "\x80\xbf\x80\xbf\x80" - 5 80:bf:80:bf:80 - unexpected continuation byte 0x80
803.1.7 n "\x80\xbf\x80\xbf\x80\xbf" - 6 80:bf:80:bf:80:bf - unexpected continuation byte 0x80
813.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80" - 7 80:bf:80:bf:80:bf:80 - unexpected continuation byte 0x80
823.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf - unexpected continuation byte 0x80
a9917092 833.2 Lonely start characters
3b0e0cb6 843.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 - unexpected non-continuation byte 0x20 after start byte 0xc0
853.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 - unexpected non-continuation byte 0x20 after start byte 0xe0
863.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 - unexpected non-continuation byte 0x20 after start byte 0xf0
873.2.4 n "\xf8 \xf9 \xfa \xfb " - 8 f8:20:f9:20:fa:20:fb:20 - unexpected non-continuation byte 0x20 after start byte 0xf8
883.2.5 n "\xfc \xfd " - 4 fc:20:fd:20 - unexpected non-continuation byte 0x20 after start byte 0xfc
a9917092 893.3 Sequences with last continuation byte missing
3b0e0cb6 903.3.1 n "\xc0" - 1 c0 - 1 byte, need 2
913.3.2 n "\xe0\x80" - 2 e0:80 - 2 bytes, need 3
923.3.3 n "\xf0\x80\x80" - 3 f0:80:80 - 3 bytes, need 4
933.3.4 n "\xf8\x80\x80\x80" - 4 f8:80:80:80 - 4 bytes, need 5
943.3.5 n "\xfc\x80\x80\x80\x80" - 5 fc:80:80:80:80 - 5 bytes, need 6
953.3.6 n "\xdf" - 1 df - 1 byte, need 2
963.3.7 n "\xef\xbf" - 2 ef:bf - 2 bytes, need 3
973.3.8 n "\xf7\xbf\xbf" - 3 f7:bf:bf - 3 bytes, need 4
983.3.9 n "\xfb\xbf\xbf\xbf" - 4 fb:bf:bf:bf - 4 bytes, need 5
993.3.10 n "\xfd\xbf\xbf\xbf\xbf" - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
a9917092 1003.4 Concatenation of incomplete sequences
3b0e0cb6 1013.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0 after start byte 0xc0
a9917092 1023.5 Impossible bytes
3b0e0cb6 1033.5.1 n "\xfe" - 1 fe - byte 0xfe
1043.5.2 n "\xff" - 1 ff - byte 0xff
1053.5.3 n "\xfe\xfe\xff\xff" - 4 fe:fe:ff:ff - byte 0xfe
a9917092 1064 Overlong sequences
1074.1 Examples of an overlong ASCII character
3b0e0cb6 1084.1.1 n "\xc0\xaf" - 2 c0:af - 2 bytes, need 1
1094.1.2 n "\xe0\x80\xaf" - 3 e0:80:af - 3 bytes, need 1
1104.1.3 n "\xf0\x80\x80\xaf" - 4 f0:80:80:af - 4 bytes, need 1
1114.1.4 n "\xf8\x80\x80\x80\xaf" - 5 f8:80:80:80:af - 5 bytes, need 1
1124.1.5 n "\xfc\x80\x80\x80\x80\xaf" - 6 fc:80:80:80:80:af - 6 bytes, need 1
a9917092 1134.2 Maximum overlong sequences
3b0e0cb6 1144.2.1 n "\xc1\xbf" - 2 c1:bf - 2 bytes, need 1
1154.2.2 n "\xe0\x9f\xbf" - 3 e0:9f:bf - 3 bytes, need 2
1164.2.3 n "\xf0\x8f\xbf\xbf" - 4 f0:8f:bf:bf - 4 bytes, need 3
1174.2.4 n "\xf8\x87\xbf\xbf\xbf" - 5 f8:87:bf:bf:bf - 5 bytes, need 4
1184.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf" - 6 fc:83:bf:bf:bf:bf - 6 bytes, need 5
a9917092 1194.3 Overlong representation of the NUL character
3b0e0cb6 1204.3.1 n "\xc0\x80" - 2 c0:80 - 2 bytes, need 1
1214.3.2 n "\xe0\x80\x80" - 3 e0:80:80 - 3 bytes, need 1
1224.3.3 n "\xf0\x80\x80\x80" - 4 f0:80:80:80 - 4 bytes, need 1
1234.3.4 n "\xf8\x80\x80\x80\x80" - 5 f8:80:80:80:80 - 5 bytes, need 1
1244.3.5 n "\xfc\x80\x80\x80\x80\x80" - 6 fc:80:80:80:80:80 - 6 bytes, need 1
a9917092 1255 Illegal code positions
1265.1 Single UTF-16 surrogates
3b0e0cb6 1275.1.1 n "\xed\xa0\x80" - 3 ed:a0:80 - UTF-16 surrogate 0xd800
1285.1.2 n "\xed\xad\xbf" - 3 ed:ad:bf - UTF-16 surrogate 0xdb7f
1295.1.3 n "\xed\xae\x80" - 3 ed:ae:80 - UTF-16 surrogate 0xdb80
1305.1.4 n "\xed\xaf\xbf" - 3 ed:af:bf - UTF-16 surrogate 0xdbff
1315.1.5 n "\xed\xb0\x80" - 3 ed:b0:80 - UTF-16 surrogate 0xdc00
1325.1.6 n "\xed\xbe\x80" - 3 ed:be:80 - UTF-16 surrogate 0xdf80
1335.1.7 n "\xed\xbf\xbf" - 3 ed:bf:bf - UTF-16 surrogate 0xdfff
a9917092 1345.2 Paired UTF-16 surrogates
3b0e0cb6 1355.2.1 n "\xed\xa0\x80\xed\xb0\x80" - 6 ed:a0:80:ed:b0:80 - UTF-16 surrogate 0xd800
1365.2.2 n "\xed\xa0\x80\xed\xbf\xbf" - 6 ed:a0:80:ed:bf:bf - UTF-16 surrogate 0xd800
1375.2.3 n "\xed\xad\xbf\xed\xb0\x80" - 6 ed:ad:bf:ed:b0:80 - UTF-16 surrogate 0xdb7f
1385.2.4 n "\xed\xad\xbf\xed\xbf\xbf" - 6 ed:ad:bf:ed:bf:bf - UTF-16 surrogate 0xdb7f
1395.2.5 n "\xed\xae\x80\xed\xb0\x80" - 6 ed:ae:80:ed:b0:80 - UTF-16 surrogate 0xdb80
1405.2.6 n "\xed\xae\x80\xed\xbf\xbf" - 6 ed:ae:80:ed:bf:bf - UTF-16 surrogate 0xdb80
1415.2.7 n "\xed\xaf\xbf\xed\xb0\x80" - 6 ed:af:bf:ed:b0:80 - UTF-16 surrogate 0xdbff
1425.2.8 n "\xed\xaf\xbf\xed\xbf\xbf" - 6 ed:af:bf:ed:bf:bf - UTF-16 surrogate 0xdbff
a9917092 1435.3 Other illegal code positions
3b0e0cb6 1445.3.1 n "\xef\xbf\xbe" - 3 ef:bf:be - byte order mark 0xfffe
a9917092 145# The ffff is illegal unless UTF8_ALLOW_FFFF
3b0e0cb6 1465.3.2 n "\xef\xbf\xbf" - 3 ef:bf:bf - character 0xffff
a9917092 147__EOMK__
148
# Tests 1..78: feed every data line of @MK to unpack('U*') and print one
# TAP result per case.
#
# A data line consists of whitespace-separated fields:
#   id        dotted test number such as 3.1.4 (an optional b/u suffix is
#             accepted)
#   okay      y = decoding must not warn, n = decoding must warn
#   bytes     the byte string under test, in double quotes
#   Unicode   hex code point or "-"            (parsed, not checked here)
#   byteslen  number of bytes in the string
#   hex       the same bytes as colon-separated hex pairs
#   charslen  character count or "-"           (parsed, not checked here)
#   error     text that the warning must contain (optional)
# Section headings ("3.1 ...") and lines starting with "#" are skipped.
{
    my $WARNCNT;    # warnings raised while decoding the current case
    my $WARNMSG;    # text of the most recent of those warnings
    my $id;         # id of the current case, used to label diagnostics

    # Warnings are counted and recorded instead of going to STDERR; they
    # are echoed as TAP diagnostics so they remain visible in the output.
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $WARNCNT++;
        $WARNMSG = "@_";
    };

    # Print a diagnostic labelled with the current case id.  The "# "
    # prefix keeps it a TAP comment, the same way the warn handler above
    # reports its messages.
    sub moan {
        print "# $id: @_";
    }

    # Reset the warning bookkeeping, then decode the given string with
    # unpack('U*').  The decoded values are discarded; only the warnings
    # raised along the way matter.
    sub test_unpack_U {
        $WARNCNT = 0;
        $WARNMSG = "";
        unpack('U*', $_[0]);
    }

    for my $line (@MK) {
        # Section headings and comment lines carry no test case.
        next if $line =~ /^(?:\d+(?:\.\d+)?)\s/ || $line =~ /^#/;

        if ($line =~ /^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) {
            $id = $1;
            # $4 (code point) and $7 (character count plus message) are
            # matched for format validation only.
            my ($okay, $bytes, $byteslen, $hex, $error) = ($2, $3, $5, $6, $8);

            # Sanity checks on the table itself; a mismatch is reported
            # but does not fail the test.
            my @hex = split(/:/, $hex);
            unless (@hex == $byteslen) {
                my $nhex = @hex;
                moan("amount of hex ($nhex) not equal to byteslen ($byteslen)\n");
            }
            {
                use bytes;
                my $bytesbyteslen = length($bytes);
                unless ($bytesbyteslen == $byteslen) {
                    moan("bytes length() ($bytesbyteslen) not equal to $byteslen\n");
                }
            }

            my $failed = 0;
            if ($okay eq 'y') {
                # Well-formed input: any warning is a failure.
                test_unpack_U($bytes);
                if ($WARNCNT) {
                    moan("unpack('U*') false negative\n");
                    $failed = 1;
                }
            }
            elsif ($okay eq 'n') {
                # Malformed input: a warning is required and, when the
                # table names one, its text must appear in the warning.
                # \Q...\E makes the table text match literally.
                test_unpack_U($bytes);
                if ($WARNCNT == 0
                    || (defined $error && $error ne ''
                        && $WARNMSG !~ /\Q$error\E/)) {
                    moan("unpack('U*') false positive\n");
                    $failed = 1;
                }
            }
            print $failed ? "not ok $test\n" : "ok $test\n";
            $test++;
        }
        else {
            moan("unknown format\n");
        }
    }
}