# Platform probe: take ord() of $wide to find out which internal encoding this
# perl uses before any decode test is run.
# NOTE(review): $wide is set above this excerpt; the diagnostic text below
# implies it is v256 examined under "use bytes" -- confirm.
12 my $ordwide = ord($wide);
13 printf "# under use bytes ord(v256) = 0x%02x\n", $ordwide;
# 140 (0x8c) is treated as the UTF-EBCDIC signature: print a TAP "skip all"
# plan instead of running the tests.
14 if ($ordwide == 140) {
15 print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
# 196 (0xc4) is the UTF-8 lead byte of U+0100 (encoded as c4 80).  Any other
# value is unexpected, so the value actually seen is reported as a diagnostic.
18 elsif ($ordwide != 196) {
19 printf "# v256 starts with 0x%02x\n", $ordwide;
29 # This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
30 # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
31 # version dated 2000-09-02.
33 # We use the \x notation instead of raw binary bytes for the ranges
34 # \x00-\x1f and \x7f-\xff, because many patch programs (among other tools)
34 # have issues with binary data.
36 my @MK = split(/\n/, <<__EOMK__);
38 1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
40 2.1 First possible sequence of certain length
41 2.1.1 y "\x00" 0 1 00 1
42 2.1.2 y "\xc2\x80" 80 2 c2:80 1
43 2.1.3 y "\xe0\xa0\x80" 800 3 e0:a0:80 1
44 2.1.4 y "\xf0\x90\x80\x80" 10000 4 f0:90:80:80 1
45 2.1.5 y "\xf8\x88\x80\x80\x80" 200000 5 f8:88:80:80:80 1
46 2.1.6 y "\xfc\x84\x80\x80\x80\x80" 4000000 6 fc:84:80:80:80:80 1
47 2.2 Last possible sequence of certain length
48 2.2.1 y "\x7f" 7f 1 7f 1
49 2.2.2 y "\xdf\xbf" 7ff 2 df:bf 1
50 # The ffff is illegal unless UTF8_ALLOW_FFFF
51 2.2.3 n "\xef\xbf\xbf" ffff 3 ef:bf:bf 1 character 0xffff
52 2.2.4 y "\xf7\xbf\xbf\xbf" 1fffff 4 f7:bf:bf:bf 1
53 2.2.5 y "\xfb\xbf\xbf\xbf\xbf" 3ffffff 5 fb:bf:bf:bf:bf 1
54 2.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf" 7fffffff 6 fd:bf:bf:bf:bf:bf 1
55 2.3 Other boundary conditions
56 2.3.1 y "\xed\x9f\xbf" d7ff 3 ed:9f:bf 1
57 2.3.2 y "\xee\x80\x80" e000 3 ee:80:80 1
58 2.3.3 y "\xef\xbf\xbd" fffd 3 ef:bf:bd 1
59 2.3.4 y "\xf4\x8f\xbf\xbf" 10ffff 4 f4:8f:bf:bf 1
60 2.3.5 y "\xf4\x90\x80\x80" 110000 4 f4:90:80:80 1
62 3.1 Unexpected continuation bytes
63 3.1.1 n "\x80" - 1 80 - unexpected continuation byte 0x80
64 3.1.2 n "\xbf" - 1 bf - unexpected continuation byte 0xbf
65 3.1.3 n "\x80\xbf" - 2 80:bf - unexpected continuation byte 0x80
66 3.1.4 n "\x80\xbf\x80" - 3 80:bf:80 - unexpected continuation byte 0x80
67 3.1.5 n "\x80\xbf\x80\xbf" - 4 80:bf:80:bf - unexpected continuation byte 0x80
68 3.1.6 n "\x80\xbf\x80\xbf\x80" - 5 80:bf:80:bf:80 - unexpected continuation byte 0x80
69 3.1.7 n "\x80\xbf\x80\xbf\x80\xbf" - 6 80:bf:80:bf:80:bf - unexpected continuation byte 0x80
70 3.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80" - 7 80:bf:80:bf:80:bf:80 - unexpected continuation byte 0x80
71 3.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf - unexpected continuation byte 0x80
72 3.2 Lonely start characters
73 3.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 - unexpected non-continuation byte 0x20 after start byte 0xc0
74 3.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 - unexpected non-continuation byte 0x20 after start byte 0xe0
75 3.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 - unexpected non-continuation byte 0x20 after start byte 0xf0
76 3.2.4 n "\xf8 \xf9 \xfa \xfb " - 8 f8:20:f9:20:fa:20:fb:20 - unexpected non-continuation byte 0x20 after start byte 0xf8
77 3.2.5 n "\xfc \xfd " - 4 fc:20:fd:20 - unexpected non-continuation byte 0x20 after start byte 0xfc
78 3.3 Sequences with last continuation byte missing
79 3.3.1 n "\xc0" - 1 c0 - 1 byte, need 2
80 3.3.2 n "\xe0\x80" - 2 e0:80 - 2 bytes, need 3
81 3.3.3 n "\xf0\x80\x80" - 3 f0:80:80 - 3 bytes, need 4
82 3.3.4 n "\xf8\x80\x80\x80" - 4 f8:80:80:80 - 4 bytes, need 5
83 3.3.5 n "\xfc\x80\x80\x80\x80" - 5 fc:80:80:80:80 - 5 bytes, need 6
84 3.3.6 n "\xdf" - 1 df - 1 byte, need 2
85 3.3.7 n "\xef\xbf" - 2 ef:bf - 2 bytes, need 3
86 3.3.8 n "\xf7\xbf\xbf" - 3 f7:bf:bf - 3 bytes, need 4
87 3.3.9 n "\xfb\xbf\xbf\xbf" - 4 fb:bf:bf:bf - 4 bytes, need 5
88 3.3.10 n "\xfd\xbf\xbf\xbf\xbf" - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
89 3.4 Concatenation of incomplete sequences
90 3.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0 after start byte 0xc0
92 3.5.1 n "\xfe" - 1 fe - byte 0xfe
93 3.5.2 n "\xff" - 1 ff - byte 0xff
94 3.5.3 n "\xfe\xfe\xff\xff" - 4 fe:fe:ff:ff - byte 0xfe
96 4.1 Examples of an overlong ASCII character
97 4.1.1 n "\xc0\xaf" - 2 c0:af - 2 bytes, need 1
98 4.1.2 n "\xe0\x80\xaf" - 3 e0:80:af - 3 bytes, need 1
99 4.1.3 n "\xf0\x80\x80\xaf" - 4 f0:80:80:af - 4 bytes, need 1
100 4.1.4 n "\xf8\x80\x80\x80\xaf" - 5 f8:80:80:80:af - 5 bytes, need 1
101 4.1.5 n "\xfc\x80\x80\x80\x80\xaf" - 6 fc:80:80:80:80:af - 6 bytes, need 1
102 4.2 Maximum overlong sequences
103 4.2.1 n "\xc1\xbf" - 2 c1:bf - 2 bytes, need 1
104 4.2.2 n "\xe0\x9f\xbf" - 3 e0:9f:bf - 3 bytes, need 2
105 4.2.3 n "\xf0\x8f\xbf\xbf" - 4 f0:8f:bf:bf - 4 bytes, need 3
106 4.2.4 n "\xf8\x87\xbf\xbf\xbf" - 5 f8:87:bf:bf:bf - 5 bytes, need 4
107 4.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf" - 6 fc:83:bf:bf:bf:bf - 6 bytes, need 5
108 4.3 Overlong representation of the NUL character
109 4.3.1 n "\xc0\x80" - 2 c0:80 - 2 bytes, need 1
110 4.3.2 n "\xe0\x80\x80" - 3 e0:80:80 - 3 bytes, need 1
111 4.3.3 n "\xf0\x80\x80\x80" - 4 f0:80:80:80 - 4 bytes, need 1
112 4.3.4 n "\xf8\x80\x80\x80\x80" - 5 f8:80:80:80:80 - 5 bytes, need 1
113 4.3.5 n "\xfc\x80\x80\x80\x80\x80" - 6 fc:80:80:80:80:80 - 6 bytes, need 1
114 5 Illegal code positions
115 5.1 Single UTF-16 surrogates
116 5.1.1 n "\xed\xa0\x80" - 3 ed:a0:80 - UTF-16 surrogate 0xd800
117 5.1.2 n "\xed\xad\xbf" - 3 ed:ad:bf - UTF-16 surrogate 0xdb7f
118 5.1.3 n "\xed\xae\x80" - 3 ed:ae:80 - UTF-16 surrogate 0xdb80
119 5.1.4 n "\xed\xaf\xbf" - 3 ed:af:bf - UTF-16 surrogate 0xdbff
120 5.1.5 n "\xed\xb0\x80" - 3 ed:b0:80 - UTF-16 surrogate 0xdc00
121 5.1.6 n "\xed\xbe\x80" - 3 ed:be:80 - UTF-16 surrogate 0xdf80
122 5.1.7 n "\xed\xbf\xbf" - 3 ed:bf:bf - UTF-16 surrogate 0xdfff
123 5.2 Paired UTF-16 surrogates
124 5.2.1 n "\xed\xa0\x80\xed\xb0\x80" - 6 ed:a0:80:ed:b0:80 - UTF-16 surrogate 0xd800
125 5.2.2 n "\xed\xa0\x80\xed\xbf\xbf" - 6 ed:a0:80:ed:bf:bf - UTF-16 surrogate 0xd800
126 5.2.3 n "\xed\xad\xbf\xed\xb0\x80" - 6 ed:ad:bf:ed:b0:80 - UTF-16 surrogate 0xdb7f
127 5.2.4 n "\xed\xad\xbf\xed\xbf\xbf" - 6 ed:ad:bf:ed:bf:bf - UTF-16 surrogate 0xdb7f
128 5.2.5 n "\xed\xae\x80\xed\xb0\x80" - 6 ed:ae:80:ed:b0:80 - UTF-16 surrogate 0xdb80
129 5.2.6 n "\xed\xae\x80\xed\xbf\xbf" - 6 ed:ae:80:ed:bf:bf - UTF-16 surrogate 0xdb80
130 5.2.7 n "\xed\xaf\xbf\xed\xb0\x80" - 6 ed:af:bf:ed:b0:80 - UTF-16 surrogate 0xdbff
131 5.2.8 n "\xed\xaf\xbf\xed\xbf\xbf" - 6 ed:af:bf:ed:bf:bf - UTF-16 surrogate 0xdbff
132 5.3 Other illegal code positions
133 5.3.1 n "\xef\xbf\xbe" - 3 ef:bf:be - byte order mark 0xfffe
134 # The ffff is illegal unless UTF8_ALLOW_FFFF
135 5.3.2 n "\xef\xbf\xbf" - 3 ef:bf:bf - character 0xffff
# Intercept warnings while the table is processed.
# NOTE(review): the handler body is outside this excerpt; $WARNCNT and
# $WARNMSG, tested further down, are presumably maintained by it -- confirm.
143 local $SIG{__WARN__} =
# Section headings ("2" or "2.1" followed by whitespace) and '#' comment lines
# of the table are recognised here; they carry no test data.
161 if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) {
# A test row: id (n.n.n, optional b/u suffix), y/n expectation, quoted byte
# string, code point in hex (or '-'), byte count, colon-separated hex dump,
# then optionally a character count (or '-') followed by an error text.
163 } elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) {
# NOTE(review): capture group 8 is nested inside group 7, so whenever an error
# text is present $charslen ($7) holds the count AND the error text.
165 my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $error) =
166 ($2, $3, $4, $5, $6, $7, $8);
# Consistency check: the hex dump must list exactly $byteslen bytes.
167 my @hex = split(/:/, $hex);
168 unless (@hex == $byteslen) {
# NOTE(review): $nhex is assigned on a line not shown in this excerpt.
170 moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
# Consistency check: length($bytes) must also equal $byteslen.
# NOTE(review): this presumably relies on the \x escapes in $bytes having been
# expanded, and on byte semantics for length(), by lines not shown -- confirm.
174 my $bytesbyteslen = length($bytes);
175 unless ($bytesbyteslen == $byteslen) {
176 moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
# Presumably the branch for rows marked 'y': the bytes are run through
# test_unpack_U and a failure is reported as a false negative.  The condition
# guarding the moan is on a line not shown.
180 test_unpack_U($bytes);
182 moan "unpack('U*') false negative\n";
# Rows marked 'n': a false positive is reported when no warning was counted,
# or when the row names an error text and the captured warning does not match
# it.  Note that $error is interpolated as a regex, not as a literal string.
185 } elsif ($okay eq 'n') {
186 test_unpack_U($bytes);
187 if ($WARNCNT == 0 || ($error ne '' && $WARNMSG !~ /$error/)) {
188 moan "unpack('U*') false positive\n";
# Fallback report; presumably reached for table lines that match neither
# pattern above (the enclosing else is not visible).
195 moan "unknown format\n";