Fix for
[p5sagit/p5-mst-13.2.git] / t / pragma / utf8.t

Software error:

Malformed UTF-8 character (fatal) at /var/www/git.shadowcat.co.uk/docroot/gitweb/gitweb.cgi line 1024.

For help, please send mail to the webmaster (chrisj@shadowcatsystems.co.uk), giving this error message and the time and date of the error.

CommitLineData
f96ec2a2 1#!./perl
2
3BEGIN {
4 chdir 't' if -d 't';
20822f61 5 @INC = '../lib';
f96ec2a2 6 $ENV{PERL5LIB} = '../lib';
f70c35af 7 if ( ord("\t") != 9 ) { # skip on ebcdic platforms
8 print "1..0 # Skip utf8 tests on ebcdic platform.\n";
9 exit;
10 }
f96ec2a2 11}
12
89491803 13print "1..191\n";
f96ec2a2 14
15my $test = 1;
16
17sub ok {
18 my ($got,$expect) = @_;
19 print "# expected [$expect], got [$got]\nnot " if $got ne $expect;
20 print "ok $test\n";
21}
22
7bbb0251 23sub nok {
24 my ($got,$expect) = @_;
25 print "# expected not [$expect], got [$got]\nnot " if $got eq $expect;
26 print "ok $test\n";
27}
28
be341bce 29sub ok_bytes {
30 use bytes;
31 my ($got,$expect) = @_;
32 print "# expected [$expect], got [$got]\nnot " if $got ne $expect;
33 print "ok $test\n";
34}
35
7bbb0251 36sub nok_bytes {
37 use bytes;
38 my ($got,$expect) = @_;
39 print "# expected not [$expect], got [$got]\nnot " if $got eq $expect;
40 print "ok $test\n";
41}
be341bce 42
f96ec2a2 43{
44 use utf8;
45 $_ = ">\x{263A}<";
46 s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg;
47 ok $_, '>&#9786;<';
c5cc3500 48 $test++; # 1
f96ec2a2 49
50 $_ = ">\x{263A}<";
51 my $rx = "\x{80}-\x{10ffff}";
52 s/([$rx])/"&#".ord($1).";"/eg;
53 ok $_, '>&#9786;<';
c5cc3500 54 $test++; # 2
f96ec2a2 55
56 $_ = ">\x{263A}<";
57 my $rx = "\\x{80}-\\x{10ffff}";
58 s/([$rx])/"&#".ord($1).";"/eg;
59 ok $_, '>&#9786;<';
c5cc3500 60 $test++; # 3
b8c5462f 61
62 $_ = "alpha,numeric";
63 m/([[:alpha:]]+)/;
64 ok $1, 'alpha';
c5cc3500 65 $test++; # 4
b8c5462f 66
67 $_ = "alphaNUMERICstring";
68 m/([[:^lower:]]+)/;
69 ok $1, 'NUMERIC';
c5cc3500 70 $test++; # 5
b8c5462f 71
72 $_ = "alphaNUMERICstring";
73 m/(\p{Ll}+)/;
74 ok $1, 'alpha';
c5cc3500 75 $test++; # 6
b8c5462f 76
77 $_ = "alphaNUMERICstring";
78 m/(\p{Lu}+)/;
79 ok $1, 'NUMERIC';
c5cc3500 80 $test++; # 7
b8c5462f 81
82 $_ = "alpha,numeric";
83 m/([\p{IsAlpha}]+)/;
84 ok $1, 'alpha';
c5cc3500 85 $test++; # 8
b8c5462f 86
87 $_ = "alphaNUMERICstring";
88 m/([^\p{IsLower}]+)/;
89 ok $1, 'NUMERIC';
c5cc3500 90 $test++; # 9
b8c5462f 91
0f4b6630 92 $_ = "alpha123numeric456";
93 m/([\p{IsDigit}]+)/;
94 ok $1, '123';
c5cc3500 95 $test++; # 10
b8c5462f 96
0f4b6630 97 $_ = "alpha123numeric456";
98 m/([^\p{IsDigit}]+)/;
99 ok $1, 'alpha';
c5cc3500 100 $test++; # 11
b8c5462f 101
0f4b6630 102 $_ = ",123alpha,456numeric";
103 m/([\p{IsAlnum}]+)/;
104 ok $1, '123alpha';
c5cc3500 105 $test++; # 12
0f4b6630 106}
a197cbdd 107{
108 use utf8;
109
110 $_ = "\x{263A}>\x{263A}\x{263A}";
111
112 ok length, 4;
c5cc3500 113 $test++; # 13
a197cbdd 114
115 ok length((m/>(.)/)[0]), 1;
c5cc3500 116 $test++; # 14
a197cbdd 117
118 ok length($&), 2;
c5cc3500 119 $test++; # 15
a197cbdd 120
121 ok length($'), 1;
c5cc3500 122 $test++; # 16
a197cbdd 123
124 ok length($`), 1;
c5cc3500 125 $test++; # 17
a197cbdd 126
127 ok length($1), 1;
c5cc3500 128 $test++; # 18
a197cbdd 129
130 ok length($tmp=$&), 2;
c5cc3500 131 $test++; # 19
a197cbdd 132
133 ok length($tmp=$'), 1;
c5cc3500 134 $test++; # 20
a197cbdd 135
136 ok length($tmp=$`), 1;
c5cc3500 137 $test++; # 21
a197cbdd 138
139 ok length($tmp=$1), 1;
c5cc3500 140 $test++; # 22
a197cbdd 141
c5cc3500 142 {
be341bce 143 use bytes;
c5cc3500 144
be341bce 145 my $tmp = $&;
c5cc3500 146 ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
147 $test++; # 23
a197cbdd 148
c5cc3500 149 $tmp = $';
150 ok $tmp, pack("C*", 0342, 0230, 0272);
151 $test++; # 24
a197cbdd 152
c5cc3500 153 $tmp = $`;
154 ok $tmp, pack("C*", 0342, 0230, 0272);
155 $test++; # 25
a197cbdd 156
c5cc3500 157 $tmp = $1;
158 ok $tmp, pack("C*", 0342, 0230, 0272);
159 $test++; # 26
160 }
a197cbdd 161
be341bce 162 ok_bytes $&, pack("C*", ord(">"), 0342, 0230, 0272);
163 $test++; # 27
164
165 ok_bytes $', pack("C*", 0342, 0230, 0272);
166 $test++; # 28
167
168 ok_bytes $`, pack("C*", 0342, 0230, 0272);
169 $test++; # 29
170
171 ok_bytes $1, pack("C*", 0342, 0230, 0272);
172 $test++; # 30
173
a197cbdd 174 {
175 use bytes;
176 no utf8;
177
178 ok length, 10;
be341bce 179 $test++; # 31
a197cbdd 180
181 ok length((m/>(.)/)[0]), 1;
be341bce 182 $test++; # 32
a197cbdd 183
184 ok length($&), 2;
be341bce 185 $test++; # 33
a197cbdd 186
187 ok length($'), 5;
be341bce 188 $test++; # 34
a197cbdd 189
190 ok length($`), 3;
be341bce 191 $test++; # 35
a197cbdd 192
193 ok length($1), 1;
be341bce 194 $test++; # 36
a197cbdd 195
196 ok $&, pack("C*", ord(">"), 0342);
be341bce 197 $test++; # 37
a197cbdd 198
199 ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
be341bce 200 $test++; # 38
a197cbdd 201
202 ok $`, pack("C*", 0342, 0230, 0272);
be341bce 203 $test++; # 39
a197cbdd 204
205 ok $1, pack("C*", 0342);
be341bce 206 $test++; # 40
a197cbdd 207
208 }
209
210
211 {
212 no utf8;
213 $_="\342\230\272>\342\230\272\342\230\272";
214 }
215
216 ok length, 10;
be341bce 217 $test++; # 41
a197cbdd 218
219 ok length((m/>(.)/)[0]), 1;
be341bce 220 $test++; # 42
a197cbdd 221
222 ok length($&), 2;
be341bce 223 $test++; # 43
a197cbdd 224
225 ok length($'), 1;
be341bce 226 $test++; # 44
a197cbdd 227
228 ok length($`), 1;
be341bce 229 $test++; # 45
a197cbdd 230
231 ok length($1), 1;
be341bce 232 $test++; # 46
a197cbdd 233
234 ok length($tmp=$&), 2;
be341bce 235 $test++; # 47
a197cbdd 236
237 ok length($tmp=$'), 1;
be341bce 238 $test++; # 48
a197cbdd 239
240 ok length($tmp=$`), 1;
be341bce 241 $test++; # 49
a197cbdd 242
243 ok length($tmp=$1), 1;
be341bce 244 $test++; # 50
a197cbdd 245
c5cc3500 246 {
247 use bytes;
a197cbdd 248
c5cc3500 249 my $tmp = $&;
250 ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
be341bce 251 $test++; # 51
a197cbdd 252
c5cc3500 253 $tmp = $';
254 ok $tmp, pack("C*", 0342, 0230, 0272);
be341bce 255 $test++; # 52
a197cbdd 256
c5cc3500 257 $tmp = $`;
258 ok $tmp, pack("C*", 0342, 0230, 0272);
be341bce 259 $test++; # 53
a197cbdd 260
c5cc3500 261 $tmp = $1;
262 ok $tmp, pack("C*", 0342, 0230, 0272);
be341bce 263 $test++; # 54
c5cc3500 264 }
a197cbdd 265 {
266 use bytes;
267 no utf8;
268
269 ok length, 10;
be341bce 270 $test++; # 55
a197cbdd 271
272 ok length((m/>(.)/)[0]), 1;
be341bce 273 $test++; # 56
a197cbdd 274
275 ok length($&), 2;
be341bce 276 $test++; # 57
a197cbdd 277
278 ok length($'), 5;
be341bce 279 $test++; # 58
a197cbdd 280
281 ok length($`), 3;
be341bce 282 $test++; # 59
a197cbdd 283
284 ok length($1), 1;
be341bce 285 $test++; # 60
a197cbdd 286
287 ok $&, pack("C*", ord(">"), 0342);
be341bce 288 $test++; # 61
a197cbdd 289
290 ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
be341bce 291 $test++; # 62
a197cbdd 292
293 ok $`, pack("C*", 0342, 0230, 0272);
be341bce 294 $test++; # 63
a197cbdd 295
296 ok $1, pack("C*", 0342);
be341bce 297 $test++; # 64
a197cbdd 298
299 }
de35ba6f 300
301 ok "\x{ab}" =~ /^\x{ab}$/, 1;
be341bce 302 $test++; # 65
a197cbdd 303}
aaa68c4a 304
305{
306 use utf8;
307 ok_bytes chr(0xe2), pack("C*", 0xc3, 0xa2);
308 $test++; # 66
309}
28cb3359 310
311{
312 use utf8;
313 my @a = map ord, split(//, join("", map chr, (1234, 123, 2345)));
314 ok "@a", "1234 123 2345";
315 $test++; # 67
316}
317
318{
319 use utf8;
320 my $x = chr(123);
321 my @a = map ord, split(/$x/, join("", map chr, (1234, 123, 2345)));
322 ok "@a", "1234 2345";
323 $test++; # 68
324}
31067593 325
7bbb0251 326{
da450f52 327 # bug id 20001009.001
328
89491803 329 my ($a, $b);
330
331 { use bytes; $a = "\xc3\xa4" }
332 { use utf8; $b = "\xe4" } # \xXX must not produce UTF-8
333
334 print "not " if $a eq $b;
335 print "ok $test\n"; $test++;
336
337 { use utf8; print "not " if $a eq $b; }
338 print "ok $test\n"; $test++;
7bbb0251 339}
31067593 340
341{
da450f52 342 # bug id 20001008.001
343
31067593 344 my @x = ("stra\337e 138","stra\337e 138");
345 for (@x) {
346 s/(\d+)\s*([\w\-]+)/$1 . uc $2/e;
347 my($latin) = /^(.+)(?:\s+\d)/;
348 print $latin eq "stra\337e" ? "ok $test\n" :
349 "#latin[$latin]\nnot ok $test\n";
350 $test++;
351 $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
352 use utf8;
353 $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a
354 }
355}
b7018214 356
357{
da450f52 358 # bug id 20000819.004
359
b7018214 360 $_ = $dx = "\x{10f2}";
361 s/($dx)/$dx$1/;
362 {
363 use bytes;
364 print "not " unless $_ eq "$dx$dx";
365 print "ok $test\n";
366 $test++;
367 }
368
369 $_ = $dx = "\x{10f2}";
370 s/($dx)/$1$dx/;
371 {
372 use bytes;
373 print "not " unless $_ eq "$dx$dx";
374 print "ok $test\n";
375 $test++;
376 }
377
378 $dx = "\x{10f2}";
379 $_ = "\x{10f2}\x{10f2}";
380 s/($dx)($dx)/$1$2/;
381 {
382 use bytes;
383 print "not " unless $_ eq "$dx$dx";
384 print "ok $test\n";
385 $test++;
386 }
387}
8b72f7e2 388
389{
da450f52 390 # bug id 20000323.056
391
8b72f7e2 392 use utf8;
393
394 print "not " unless "\x{41}" eq +v65;
395 print "ok $test\n";
396 $test++;
397
398 print "not " unless "\x41" eq +v65;
399 print "ok $test\n";
400 $test++;
401
402 print "not " unless "\x{c8}" eq +v200;
403 print "ok $test\n";
404 $test++;
405
406 print "not " unless "\xc8" eq +v200;
407 print "ok $test\n";
408 $test++;
409
410 print "not " unless "\x{221b}" eq v8731;
411 print "ok $test\n";
412 $test++;
413}
da450f52 414
415{
416 # bug id 20000427.003
417
418 use utf8;
419 use warnings;
420 use strict;
421
422 my $sushi = "\x{b36c}\x{5a8c}\x{ff5b}\x{5079}\x{505b}";
423
424 my @charlist = split //, $sushi;
425 my $r = '';
426 foreach my $ch (@charlist) {
427 $r = $r . " " . sprintf "U+%04X", ord($ch);
428 }
429
430 print "not " unless $r eq " U+B36C U+5A8C U+FF5B U+5079 U+505B";
431 print "ok $test\n";
432 $test++;
433}
434
435{
436 # bug id 20000901.092
437 # test that undef left and right of utf8 results in a valid string
438
439 my $a;
440 $a .= "\x{1ff}";
441 print "not " unless $a eq "\x{1ff}";
442 print "ok $test\n";
443 $test++;
444}
93f04dac 445
446{
447 # bug id 20000426.003
448
449 use utf8;
450
451 my $s = "\x20\x40\x{80}\x{100}\x{80}\x40\x20";
452
453 my ($a, $b, $c) = split(/\x40/, $s);
454 print "not "
455 unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a;
456 print "ok $test\n";
457 $test++;
458
459 my ($a, $b) = split(/\x{100}/, $s);
460 print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20";
461 print "ok $test\n";
462 $test++;
463
464 my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s);
465 print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20";
466 print "ok $test\n";
467 $test++;
468
469 my ($a, $b) = split(/\x40\x{80}/, $s);
470 print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20";
471 print "ok $test\n";
472 $test++;
473
474 my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s);
475 print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20";
476 print "ok $test\n";
477 $test++;
478}
60ff4832 479
480{
481 # bug id 20000730.004
482
483 use utf8;
484
485 my $smiley = "\x{263a}";
486
487 for my $s ("\x{263a}", # 1
488 $smiley, # 2
489
490 "" . $smiley, # 3
491 "" . "\x{263a}", # 4
492
493 $smiley . "", # 5
494 "\x{263a}" . "", # 6
495 ) {
496 my $length_chars = length($s);
497 my $length_bytes;
498 { use bytes; $length_bytes = length($s) }
499 my @regex_chars = $s =~ m/(.)/g;
500 my $regex_chars = @regex_chars;
501 my @split_chars = split //, $s;
502 my $split_chars = @split_chars;
503 print "not "
504 unless "$length_chars/$regex_chars/$split_chars/$length_bytes" eq
505 "1/1/1/3";
506 print "ok $test\n";
507 $test++;
508 }
509
510 for my $s ("\x{263a}" . "\x{263a}", # 7
511 $smiley . $smiley, # 8
512
513 "\x{263a}\x{263a}", # 9
514 "$smiley$smiley", # 10
515
516 "\x{263a}" x 2, # 11
517 $smiley x 2, # 12
518 ) {
519 my $length_chars = length($s);
520 my $length_bytes;
521 { use bytes; $length_bytes = length($s) }
522 my @regex_chars = $s =~ m/(.)/g;
523 my $regex_chars = @regex_chars;
524 my @split_chars = split //, $s;
525 my $split_chars = @split_chars;
526 print "not "
527 unless "$length_chars/$regex_chars/$split_chars/$length_bytes" eq
528 "2/2/2/6";
529 print "ok $test\n";
530 $test++;
531 }
532}
689440ec 533
534{
535 # ID 20001020.006
536
537 "x" =~ /(.)/; # unset $2
538
539 # Without the fix this will croak:
540 # Modification of a read-only value attempted at ...
541 "$2\x{1234}";
542
543 print "ok $test\n";
544 $test++;
545
546 # For symmetry with the above.
547 "\x{1234}$2";
548
549 print "ok $test\n";
550 $test++;
5280a8e5 551
552 *pi = \undef;
553 # This bug existed earlier than the $2 bug, but is fixed with the same
554 # patch. Without the fix this will also croak:
555 # Modification of a read-only value attempted at ...
556 "$pi\x{1234}";
557
558 print "ok $test\n";
559 $test++;
560
561 # For symmetry with the above.
562 "\x{1234}$pi";
563
564 print "ok $test\n";
565 $test++;
689440ec 566}
ba210ebe 567
568# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
569# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
570# version dated 2000-09-02.
571
572my @MK = split(/\n/, <<__EOMK__);
5731 Correct UTF-8
5741.1.1 y "κόσμε" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
5752 Boundary conditions
5762.1 First possible sequence of certain length
5772.1.1 y "\0" 0 1 00 1
5782.1.2 y "\80" 80 2 c2:80 1
5792.1.3 y "ࠀ" 800 3 e0:a0:80 1
5802.1.4 y "𐀀" 10000 4 f0:90:80:80 1
5812.1.5 y "" 200000 5 f8:88:80:80:80 1
5822.1.6 y "" 4000000 6 fc:84:80:80:80:80 1
5832.2 Last possible sequence of certain length
5842.2.1 y "\7f" 7f 1 7f 1
5852.2.2 y "߿" 7ff 2 df:bf 1
fcc8fcf6 586# The ffff is illegal unless UTF8_ALLOW_FFFF
587