t/pragma/utf8.t

   1 #!./perl
   2
   3 BEGIN {
   4     chdir 't' if -d 't';
   5     @INC = '../lib';
   6     $ENV{PERL5LIB} = '../lib';
   7     if ( ord("\t") != 9 ) { # skip on ebcdic platforms
   8         print "1..0 # Skip utf8 tests on ebcdic platform.\n";
   9         exit;
  10     }
  11 }
  12
  13 print "1..191\n";
  14
  15 my $test = 1;
  16
  17 sub ok {
  18     my ($got,$expect) = @_;
  19     print "# expected [$expect], got [$got]\nnot " if $got ne $expect;
  20     print "ok $test\n";
  21 }
  22
  23 sub nok {
  24     my ($got,$expect) = @_;
  25     print "# expected not [$expect], got [$got]\nnot " if $got eq $expect;
  26     print "ok $test\n";
  27 }
  28
  29 sub ok_bytes {
  30     use bytes;
  31     my ($got,$expect) = @_;
  32     print "# expected [$expect], got [$got]\nnot " if $got ne $expect;
  33     print "ok $test\n";
  34 }
  35
  36 sub nok_bytes {
  37     use bytes;
  38     my ($got,$expect) = @_;
  39     print "# expected not [$expect], got [$got]\nnot " if $got eq $expect;
  40     print "ok $test\n";
  41 }
  42
  43 {
  44     use utf8;
  45     $_ = ">\x{263A}<";
  46     s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg;
  47     ok $_, '>&#9786;<';
  48     $test++;                            # 1
  49
  50     $_ = ">\x{263A}<";
  51     my $rx = "\x{80}-\x{10ffff}";
  52     s/([$rx])/"&#".ord($1).";"/eg;
  53     ok $_, '>&#9786;<';
  54     $test++;                            # 2
  55
  56     $_ = ">\x{263A}<";
  57     my $rx = "\\x{80}-\\x{10ffff}";
  58     s/([$rx])/"&#".ord($1).";"/eg;
  59     ok $_, '>&#9786;<';
  60     $test++;                            # 3
  61
  62     $_ = "alpha,numeric";
  63     m/([[:alpha:]]+)/;
  64     ok $1, 'alpha';
  65     $test++;                            # 4
  66
  67     $_ = "alphaNUMERICstring";
  68     m/([[:^lower:]]+)/;
  69     ok $1, 'NUMERIC';
  70     $test++;                            # 5
  71
  72     $_ = "alphaNUMERICstring";
  73     m/(\p{Ll}+)/;
  74     ok $1, 'alpha';
  75     $test++;                            # 6
  76
  77     $_ = "alphaNUMERICstring";
  78     m/(\p{Lu}+)/;
  79     ok $1, 'NUMERIC';
  80     $test++;                            # 7
  81
  82     $_ = "alpha,numeric";
  83     m/([\p{IsAlpha}]+)/;
  84     ok $1, 'alpha';
  85     $test++;                            # 8
  86
  87     $_ = "alphaNUMERICstring";
  88     m/([^\p{IsLower}]+)/;
  89     ok $1, 'NUMERIC';
  90     $test++;                            # 9
  91
  92     $_ = "alpha123numeric456";
  93     m/([\p{IsDigit}]+)/;
  94     ok $1, '123';
  95     $test++;                            # 10
  96
  97     $_ = "alpha123numeric456";
  98     m/([^\p{IsDigit}]+)/;
  99     ok $1, 'alpha';
 100     $test++;                            # 11
 101
 102     $_ = ",123alpha,456numeric";
 103     m/([\p{IsAlnum}]+)/;
 104     ok $1, '123alpha';
 105     $test++;                            # 12
 106 }
 107 {
 108     use utf8;
 109
 110     $_ = "\x{263A}>\x{263A}\x{263A}";
 111
 112     ok length, 4;
 113     $test++;                            # 13
 114
 115     ok length((m/>(.)/)[0]), 1;
 116     $test++;                            # 14
 117
 118     ok length($&), 2;
 119     $test++;                            # 15
 120
 121     ok length($'), 1;
 122     $test++;                            # 16
 123
 124     ok length($`), 1;
 125     $test++;                            # 17
 126
 127     ok length($1), 1;
 128     $test++;                            # 18
 129
 130     ok length($tmp=$&), 2;
 131     $test++;                            # 19
 132
 133     ok length($tmp=$'), 1;
 134     $test++;                            # 20
 135
 136     ok length($tmp=$`), 1;
 137     $test++;                            # 21
 138
 139     ok length($tmp=$1), 1;
 140     $test++;                            # 22
 141
 142     {
 143         use bytes;
 144
 145         my $tmp = $&;
 146         ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
 147         $test++;                                # 23
 148
 149         $tmp = $';
 150         ok $tmp, pack("C*", 0342, 0230, 0272);
 151         $test++;                                # 24
 152
 153         $tmp = $`;
 154         ok $tmp, pack("C*", 0342, 0230, 0272);
 155         $test++;                                # 25
 156
 157         $tmp = $1;
 158         ok $tmp, pack("C*", 0342, 0230, 0272);
 159         $test++;                                # 26
 160     }
 161
 162     ok_bytes $&, pack("C*", ord(">"), 0342, 0230, 0272);
 163     $test++;                            # 27
 164
 165     ok_bytes $', pack("C*", 0342, 0230, 0272);
 166     $test++;                            # 28
 167
 168     ok_bytes $`, pack("C*", 0342, 0230, 0272);
 169     $test++;                            # 29
 170
 171     ok_bytes $1, pack("C*", 0342, 0230, 0272);
 172     $test++;                            # 30
 173
 174     {
 175         use bytes;
 176         no utf8;
 177
 178         ok length, 10;
 179         $test++;                                # 31
 180
 181         ok length((m/>(.)/)[0]), 1;
 182         $test++;                                # 32
 183
 184         ok length($&), 2;
 185         $test++;                                # 33
 186
 187         ok length($'), 5;
 188         $test++;                                # 34
 189
 190         ok length($`), 3;
 191         $test++;                                # 35
 192
 193         ok length($1), 1;
 194         $test++;                                # 36
 195
 196         ok $&, pack("C*", ord(">"), 0342);
 197         $test++;                                # 37
 198
 199         ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
 200         $test++;                                # 38
 201
 202         ok $`, pack("C*", 0342, 0230, 0272);
 203         $test++;                                # 39
 204
 205         ok $1, pack("C*", 0342);
 206         $test++;                                # 40
 207
 208     }
 209
 210
 211     {
 212         no utf8;
 213         $_="\342\230\272>\342\230\272\342\230\272";
 214     }
 215
 216     ok length, 10;
 217     $test++;                            # 41
 218
 219     ok length((m/>(.)/)[0]), 1;
 220     $test++;                            # 42
 221
 222     ok length($&), 2;
 223     $test++;                            # 43
 224
 225     ok length($'), 1;
 226     $test++;                            # 44
 227
 228     ok length($`), 1;
 229     $test++;                            # 45
 230
 231     ok length($1), 1;
 232     $test++;                            # 46
 233
 234     ok length($tmp=$&), 2;
 235     $test++;                            # 47
 236
 237     ok length($tmp=$'), 1;
 238     $test++;                            # 48
 239
 240     ok length($tmp=$`), 1;
 241     $test++;                            # 49
 242
 243     ok length($tmp=$1), 1;
 244     $test++;                            # 50
 245
 246     {
 247         use bytes;
 248
 249         my $tmp = $&;
 250         ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
 251         $test++;                                # 51
 252
 253         $tmp = $';
 254         ok $tmp, pack("C*", 0342, 0230, 0272);
 255         $test++;                                # 52
 256
 257         $tmp = $`;
 258         ok $tmp, pack("C*", 0342, 0230, 0272);
 259         $test++;                                # 53
 260
 261         $tmp = $1;
 262         ok $tmp, pack("C*", 0342, 0230, 0272);
 263         $test++;                                # 54
 264     }
 265     {
 266         use bytes;
 267         no utf8;
 268
 269         ok length, 10;
 270         $test++;                                # 55
 271
 272         ok length((m/>(.)/)[0]), 1;
 273         $test++;                                # 56
 274
 275         ok length($&), 2;
 276         $test++;                                # 57
 277
 278         ok length($'), 5;
 279         $test++;                                # 58
 280
 281         ok length($`), 3;
 282         $test++;                                # 59
 283
 284         ok length($1), 1;
 285         $test++;                                # 60
 286
 287         ok $&, pack("C*", ord(">"), 0342);
 288         $test++;                                # 61
 289
 290         ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
 291         $test++;                                # 62
 292
 293         ok $`, pack("C*", 0342, 0230, 0272);
 294         $test++;                                # 63
 295
 296         ok $1, pack("C*", 0342);
 297         $test++;                                # 64
 298
 299     }
 300
 301     ok "\x{ab}" =~ /^\x{ab}$/, 1;
 302     $test++;                                    # 65
 303 }
 304
 305 {
 306     use utf8;
 307     ok_bytes chr(0xe2), pack("C*", 0xc3, 0xa2);
 308     $test++;                # 66
 309 }
 310
 311 {
 312     use utf8;
 313     my @a = map ord, split(//, join("", map chr, (1234, 123, 2345)));
 314     ok "@a", "1234 123 2345";
 315     $test++;                # 67
 316 }
 317
 318 {
 319     use utf8;
 320     my $x = chr(123);
 321     my @a = map ord, split(/$x/, join("", map chr, (1234, 123, 2345)));
 322     ok "@a", "1234 2345";
 323     $test++;                # 68
 324 }
 325
 326 {
 327     # bug id 20001009.001
 328
 329     my ($a, $b);
 330
 331     { use bytes; $a = "\xc3\xa4" }
 332     { use utf8;  $b = "\xe4"     } # \xXX must not produce UTF-8
 333
 334     print "not " if $a eq $b;
 335     print "ok $test\n"; $test++;
 336
 337     { use utf8; print "not " if $a eq $b; }
 338     print "ok $test\n"; $test++;
 339 }
 340
 341 {
 342     # bug id 20001008.001
 343
 344     my @x = ("stra\337e 138","stra\337e 138");
 345     for (@x) {
 346         s/(\d+)\s*([\w\-]+)/$1 . uc $2/e;
 347         my($latin) = /^(.+)(?:\s+\d)/;
 348         print $latin eq "stra\337e" ? "ok $test\n" :
 349             "#latin[$latin]\nnot ok $test\n";
 350         $test++;
 351         $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
 352         use utf8;
 353         $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a
 354     }
 355 }
 356
 357 {
 358     # bug id 20000819.004
 359
 360     $_ = $dx = "\x{10f2}";
 361     s/($dx)/$dx$1/;
 362     {
 363         use bytes;
 364         print "not " unless $_ eq "$dx$dx";
 365         print "ok $test\n";
 366         $test++;
 367     }
 368
 369     $_ = $dx = "\x{10f2}";
 370     s/($dx)/$1$dx/;
 371     {
 372         use bytes;
 373         print "not " unless $_ eq "$dx$dx";
 374         print "ok $test\n";
 375         $test++;
 376     }
 377
 378     $dx = "\x{10f2}";
 379     $_  = "\x{10f2}\x{10f2}";
 380     s/($dx)($dx)/$1$2/;
 381     {
 382         use bytes;
 383         print "not " unless $_ eq "$dx$dx";
 384         print "ok $test\n";
 385         $test++;
 386     }
 387 }
 388
 389 {
 390     # bug id 20000323.056
 391
 392     use utf8;
 393
 394     print "not " unless "\x{41}" eq +v65;
 395     print "ok $test\n";
 396     $test++;
 397
 398     print "not " unless "\x41" eq +v65;
 399     print "ok $test\n";
 400     $test++;
 401
 402     print "not " unless "\x{c8}" eq +v200;
 403     print "ok $test\n";
 404     $test++;
 405
 406     print "not " unless "\xc8" eq +v200;
 407     print "ok $test\n";
 408     $test++;
 409
 410     print "not " unless "\x{221b}" eq v8731;
 411     print "ok $test\n";
 412     $test++;
 413 }
 414
 415 {
 416     # bug id 20000427.003
 417
 418     use utf8;
 419     use warnings;
 420     use strict;
 421
 422     my $sushi = "\x{b36c}\x{5a8c}\x{ff5b}\x{5079}\x{505b}";
 423
 424     my @charlist = split //, $sushi;
 425     my $r = '';
 426     foreach my $ch (@charlist) {
 427         $r = $r . " " . sprintf "U+%04X", ord($ch);
 428     }
 429
 430     print "not " unless $r eq " U+B36C U+5A8C U+FF5B U+5079 U+505B";
 431     print "ok $test\n";
 432     $test++;
 433 }
 434
 435 {
 436     # bug id 20000901.092
 437     # test that undef left and right of utf8 results in a valid string
 438
 439     my $a;
 440     $a .= "\x{1ff}";
 441     print "not " unless $a eq "\x{1ff}";
 442     print "ok $test\n";
 443     $test++;
 444 }
 445
 446 {
 447     # bug id 20000426.003
 448
 449     use utf8;
 450
 451     my $s = "\x20\x40\x{80}\x{100}\x{80}\x40\x20";
 452
 453     my ($a, $b, $c) = split(/\x40/, $s);
 454     print "not "
 455         unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a;
 456     print "ok $test\n";
 457     $test++;
 458
 459     my ($a, $b) = split(/\x{100}/, $s);
 460     print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20";
 461     print "ok $test\n";
 462     $test++;
 463
 464     my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s);
 465     print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20";
 466     print "ok $test\n";
 467     $test++;
 468
 469     my ($a, $b) = split(/\x40\x{80}/, $s);
 470     print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20";
 471     print "ok $test\n";
 472     $test++;
 473
 474     my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s);
 475     print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20";
 476     print "ok $test\n";
 477     $test++;
 478 }
 479
 480 {
 481     # bug id 20000730.004
 482
 483     use utf8;
 484
 485     my $smiley = "\x{263a}";
 486
 487     for my $s ("\x{263a}",                     #  1
 488                $smiley,                        #  2
 489
 490                "" . $smiley,                   #  3
 491                "" . "\x{263a}",                #  4
 492
 493                $smiley    . "",                #  5
 494                "\x{263a}" . "",                #  6
 495                ) {
 496         my $length_chars = length($s);
 497         my $length_bytes;
 498         { use bytes; $length_bytes = length($s) }
 499         my @regex_chars = $s =~ m/(.)/g;
 500         my $regex_chars = @regex_chars;
 501         my @split_chars = split //, $s;
 502         my $split_chars = @split_chars;
 503         print "not "
 504             unless "$length_chars/$regex_chars/$split_chars/$length_bytes" eq
 505                    "1/1/1/3";
 506         print "ok $test\n";
 507         $test++;
 508     }
 509
 510     for my $s ("\x{263a}" . "\x{263a}",        #  7
 511                $smiley    . $smiley,           #  8
 512
 513                "\x{263a}\x{263a}",             #  9
 514                "$smiley$smiley",               # 10
 515
 516                "\x{263a}" x 2,                 # 11
 517                $smiley    x 2,                 # 12
 518                ) {
 519         my $length_chars = length($s);
 520         my $length_bytes;
 521         { use bytes; $length_bytes = length($s) }
 522         my @regex_chars = $s =~ m/(.)/g;
 523         my $regex_chars = @regex_chars;
 524         my @split_chars = split //, $s;
 525         my $split_chars = @split_chars;
 526         print "not "
 527             unless "$length_chars/$regex_chars/$split_chars/$length_bytes" eq
 528                    "2/2/2/6";
 529         print "ok $test\n";
 530         $test++;
 531     }
 532 }
 533
 534 {
 535     # ID 20001020.006
 536
 537     "x" =~ /(.)/; # unset $2
 538
 539     # Without the fix this will croak:
 540     # Modification of a read-only value attempted at ...
 541     "$2\x{1234}";
 542
 543     print "ok $test\n";
 544     $test++;
 545
 546     # For symmetry with the above.
 547     "\x{1234}$2";
 548
 549     print "ok $test\n";
 550     $test++;
 551
 552     *pi = \undef;
 553     # This bug existed earlier than the $2 bug, but is fixed with the same
 554     # patch. Without the fix this will also croak:
 555     # Modification of a read-only value attempted at ...
 556     "$pi\x{1234}";
 557
 558     print "ok $test\n";
 559     $test++;
 560
 561     # For symmetry with the above.
 562     "\x{1234}$pi";
 563
 564     print "ok $test\n";
 565     $test++;
 566 }
 567
 568 # This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
 569 # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
 570 # version dated 2000-09-02.
 571
 572 my @MK = split(/\n/, <<__EOMK__);
 573 1       Correct UTF-8
 574 1.1.1 y "κόσμε"   -               11      ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5        5
 575 2       Boundary conditions
 576 2.1     First possible sequence of certain length
 577 2.1.1 y "\0"                     0               1       00      1
 578 2.1.2 y "\80"                    80              2       c2:80   1
 579 2.1.3 y "ࠀ"           800             3       e0:a0:80        1
 580 2.1.4 y "𐀀"          10000           4       f0:90:80:80     1
 581 2.1.5 y "�����" 200000          5       f8:88:80:80:80  1
 582 2.1.6 y "������"        4000000         6       fc:84:80:80:80:80       1
 583 2.2     Last possible sequence of certain length
 584 2.2.1 y "\7f"                     7f              1       7f      1
 585 2.2.2 y "߿"                    7ff             2       df:bf   1
 586 # The ffff is illegal unless UTF8_ALLOW_FFFF