lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 #
   4 # mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
   5 # from the Unicode database files (lib/unicore/*.txt).
   6 #
   7
   8 use strict;
   9
  10 my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
  11
  12 mkdir("In", 0755);
  13 mkdir("Is", 0755);
  14 mkdir("To", 0755);
  15
  16 sub extend {
  17     my ($table, $last) = @_;
  18
  19     $table->[-1]->[1] = $last;
  20 }
  21
  22 sub append {
  23     my ($table, $code, $name) = @_;
  24     if (@$table &&
  25         hex($table->[-1]->[1]) == hex($code) - 1 &&
  26         (!defined $name || $table->[-1]->[2] eq $name)) {
  27         extend($table, $code);
  28     } else {
  29         push @$table, [$code, $code, $name];
  30     }
  31 }
  32
  33 sub append_range {
  34     my ($table, $code_ini, $code_fin, $name) = @_;
  35     append($table, $code_ini, $name);
  36     extend($table, $code_fin);
  37 }
  38
  39 sub inverse {
  40     my ($table) = @_;
  41     my $inverse = [];
  42     my ($first, $last);
  43     if ($table->[0]->[0]) {
  44         $last = hex($table->[0]->[0]);
  45         push @$inverse, [ "0000",
  46                           sprintf("%04X", $last - 1) ];
  47     }
  48     for my $i (0..$#$table-1) {
  49         $first = defined $table->[$i    ]->[1] ?
  50                      hex($table->[$i    ]->[1]) : 0;
  51         $last  = defined $table->[$i + 1]->[0] ?
  52                      hex($table->[$i + 1]->[0]) : $first;
  53         push @$inverse, [ sprintf("%04X", $first + 1),
  54                           sprintf("%04X", $last  - 1) ]
  55                               unless $first + 1 == $last;
  56     }
  57     return $inverse;
  58 }
  59
  60 sub header {
  61     my $fh = shift;
  62
  63     print $fh <<EOT;
  64 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
  65 # This file is built by $0 from e.g. Unicode.txt.
  66 # Any changes made here will be lost!
  67 EOT
  68 }
  69
  70 sub begin {
  71     my $fh = shift;
  72
  73     print $fh <<EOT;
  74 return <<'END';
  75 EOT
  76 }
  77
  78 sub end {
  79     my $fh = shift;
  80
  81     print $fh <<EOT;
  82 END
  83 EOT
  84 }
  85
  86 sub flush {
  87     my ($table, $file) = @_;
  88     print "$file\n";
  89     if (open(my $fh, ">$file")) {
  90         header($fh);
  91         begin($fh);
  92         for my $i (@$table) {
  93             print $fh $i->[0], "\t",
  94                       $i->[1] ne $i->[0] ? $i->[1] : "", "\t",
  95                       defined $i->[2] ? $i->[2] : "", "\n";
  96         }
  97         end($fh);
  98         close($fh);
  99     } else {
 100         die "$0: $file: $!\n";
 101     }
 102 }
 103
 104 #
 105 # The %In contains the mapping of the script/block name into a number.
 106 #
 107
 108 my %In;
 109 my $InId = 0;
 110 my %InIn;
 111
 112 my %InScript;
 113 my %InBlock;
 114
 115 #
 116 # Read in the Unicode.txt, the main Unicode database.
 117 #
 118
 119 my %Cat;
 120 my %General;
 121 my @General;
 122
 123 sub gencat {
 124     my ($Name, $GeneralH, $GeneralA, $Cat,
 125         $name, $cat, $code, $op) = @_;
 126
 127     $op->($Name,                     $code, $name);
 128     $op->($GeneralA,                 $code, $cat);
 129
 130     $op->($GeneralH->{$name} ||= [], $code, $name);
 131
 132     $op->($Cat->{$cat}       ||= [], $code);
 133     $op->($Cat->{substr($cat, 0, 1)}
 134                             ||= [],  $code);
 135     # 005F: SPACING UNDERSCORE
 136     $op->($Cat->{Word}       ||= [], $code)
 137         if $cat =~ /^[LMN]/ or $code eq "005F";
 138     $op->($Cat->{Alnum}      ||= [], $code)
 139         if $cat =~ /^[LMN]/;
 140     $op->($Cat->{Alpha}      ||= [], $code)
 141         if $cat =~ /^[LM]/;
 142     # 0009: HORIZONTAL TABULATION
 143     # 000A: LINE FEED
 144     # 000B: VERTICAL TABULATION
 145     # 000C: FORM FEED
 146     # 000D: CARRIAGE RETURN
 147     # 0020: SPACE
 148     $op->($Cat->{Space}      ||= [], $code)
 149         if $cat  =~ /^Z/ ||
 150             $code =~ /^(0009|000A|000B|000C|000D)$/;
 151     $op->($Cat->{SpacePerl}  ||= [], $code)
 152         if $cat  =~ /^Z/ ||
 153             $code =~ /^(0009|000A|000C|000D)$/;
 154     $op->($Cat->{Blank}      ||= [], $code)
 155         if $code =~ /^(0020|0009)$/ ||
 156             $cat  =~ /^Z[^lp]$/;
 157     $op->($Cat->{Digit}      ||= [], $code) if $cat eq "Nd";
 158     $op->($Cat->{Upper}      ||= [], $code) if $cat eq "Lu";
 159     $op->($Cat->{Lower}      ||= [], $code) if $cat eq "Ll";
 160     $op->($Cat->{Title}      ||= [], $code) if $cat eq "Lt";
 161     $op->($Cat->{ASCII}      ||= [], $code) if $code le "007F";
 162     $op->($Cat->{Cntrl}      ||= [], $code) if $cat =~ /^C/;
 163     $op->($Cat->{Graph}      ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
 164     $op->($Cat->{Print}      ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
 165     $op->($Cat->{Punct}      ||= [], $code) if $cat =~ /^P/;
 166     # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
 167     $op->($Cat->{XDigit}     ||= [], $code)
 168         if $code =~ /^00(3[0-9]|[46][1-6])$/;
 169
 170 }
 171
 172 if (open(my $Unicode, "Unicode.txt")) {
 173     my @Name;
 174     my @Bidi;
 175     my %Bidi;
 176     my @Comb;
 177     my @Deco;
 178     my %Deco;
 179     my %DC;
 180     my @Number;
 181     my @Mirrored;
 182     my %To;
 183
 184
 185     my $LastCodeInt = -1; # a numeric, not a hexadecimal string.
 186
 187     # UnicodeData-3.1.0.html says
 188     # no characters in the file have the property, Cn, Not Assigned.
 189
 190     sub check_no_characters { # in the scope of my $LastCodeInt;
 191         my $code = shift;
 192         my $diff_from_last = hex($code) - $LastCodeInt;
 193         my $code_ini = sprintf("%04X", $LastCodeInt + 1);
 194         $LastCodeInt = hex($code);
 195         if ($diff_from_last == 1) {
 196             return;
 197         } elsif ($diff_from_last == 2) {
 198             append($Cat{Cn}             ||= [], $code_ini);
 199             append($Cat{C}              ||= [], $code_ini);
 200         } else {
 201             my $code_fin = sprintf("%04X", hex($code) - 1);
 202             append_range($Cat{Cn}       ||= [], $code_ini, $code_fin);
 203             append_range($Cat{C}        ||= [], $code_ini, $code_fin);
 204         }
 205     }
 206
 207     while (<$Unicode>) {
 208         next unless /^[0-9A-Fa-f]+;/;
 209         s/\s+$//;
 210
 211         my ($code, $name, $cat, $comb, $bidi, $deco,
 212             $decimal, $digit, $number,
 213             $mirrored, $unicode10, $comment,
 214             $upper, $lower, $title) = split(/\s*;\s*/);
 215
 216         if ($name =~ /^<(.+), (First|Last)>$/) {
 217             if($2 eq 'First') {
 218                 check_no_characters($code);
 219             } else {
 220                 $LastCodeInt = hex($code);
 221             }
 222
 223             $name = $1;
 224             gencat(\@Name, \%General, \@General, \%Cat,
 225                    $name, $cat, $code,
 226                    $2 eq 'First' ? \&append : \&extend);
 227             unless (defined $In{$name}) {
 228                 $In{$name}   = $InId++;
 229                 $InIn{$name} = $General{$name};
 230             }
 231         } else {
 232             check_no_characters($code);
 233
 234             gencat(\@Name, \%General, \@General, \%Cat,
 235                    $name, $cat, $code, \&append);
 236
 237             append($To{Upper}       ||= [], $code, $upper)   if $upper;
 238             append($To{Lower}       ||= [], $code, $lower)   if $lower;
 239             append($To{Title}       ||= [], $code, $title)   if $title;
 240             append($To{Digit}       ||= [], $code, $decimal) if $decimal;
 241
 242             append(\@Bidi,                  $code, $bidi);
 243             append($Bidi{$bidi}     ||= [], $code);
 244
 245             append(\@Comb,                  $code, $comb) if $comb;
 246
 247             if ($deco) {
 248                 append(\@Deco,                  $code, $deco);
 249                 if ($deco =~/^<(\w+)>/) {
 250                     append($Deco{Compat} ||= [], $code);
 251                     append($DC{$1}       ||= [], $code);
 252                 } else {
 253                     append($Deco{Canon}  ||= [], $code);
 254                 }
 255             }
 256
 257             append(\@Number,                     $code, $number) if $number;
 258
 259             append(\@Mirrored,                   $code) if $mirrored eq "Y";
 260         }
 261     }
 262
 263     check_no_characters(sprintf("%X", $LastUnicodeCodepoint + 1));
 264
 265     flush(\@Name, "Name.pl");
 266
 267     foreach my $cat (sort keys %Cat) {
 268         flush($Cat{$cat}, "Is/$cat.pl");
 269     }
 270
 271     foreach my $to (sort keys %To) {
 272         flush($To{$to}, "To/$to.pl");
 273     }
 274
 275     flush(\@Bidi, "Bidirectional.pl");
 276     foreach my $bidi (sort keys %Bidi) {
 277         flush($Bidi{$bidi}, "Is/Bidi$bidi.pl");
 278     }
 279
 280     flush(\@Comb, "CombiningClass.pl");
 281
 282     flush(\@Deco, "Decomposition.pl");
 283     foreach my $deco (sort keys %Deco) {
 284         flush($Deco{$deco}, "Is/Deco$deco.pl");
 285     }
 286     foreach my $dc (sort keys %DC) {
 287         flush($DC{$dc}, "Is/DC$dc.pl");
 288     }
 289
 290     flush(\@Number, "Number.pl");
 291
 292     flush(\@Mirrored, "Is/Mirrored.pl");
 293 } else {
 294     die "$0: Unicode.txt: $!\n";
 295 }
 296
 297 #  The general cateory can be written out already now.
 298
 299 flush(\@General, "Category.pl");
 300
 301 #
 302 # Read in the LineBrk.txt.
 303 #
 304
 305 if (open(my $LineBrk, "LineBrk.txt")) {
 306     my @Lbrk;
 307     my %Lbrk;
 308
 309     while (<$LineBrk>) {
 310         next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
 311
 312         my ($first, $last, $lbrk) = ($1, $2, $3);
 313
 314         append(\@Lbrk,              $first, $lbrk);
 315         append($Lbrk{$lbrk} ||= [], $first);
 316         if (defined $last) {
 317             extend(\@Lbrk,          $last);
 318             extend($Lbrk{$lbrk},    $last);
 319         }
 320     }
 321
 322     flush(\@Lbrk, "Lbrk.pl");
 323     foreach my $lbrk (sort keys %Lbrk) {
 324         flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl");
 325     }
 326 } else {
 327     die "$0: LineBrk.txt: $!\n";
 328 }
 329
 330 #
 331 # Read in the ArabShap.txt.
 332 #
 333
 334 if (open(my $ArabShap, "ArabShap.txt")) {
 335     my @ArabLink;
 336     my @ArabLinkGroup;
 337
 338     while (<$ArabShap>) {
 339         next unless /^[0-9A-Fa-f]+;/;
 340         s/\s+$//;
 341
 342         my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/);
 343
 344         append(\@ArabLink,      $code, $link);
 345         append(\@ArabLinkGroup, $code, $linkgroup);
 346     }
 347
 348     flush(\@ArabLink,      "ArabLink.pl");
 349     flush(\@ArabLinkGroup, "ArabLnkGrp.pl");
 350 } else {
 351     die "$0: ArabShap.txt: $!\n";
 352 }
 353
 354 #
 355 # Read in the Jamo.txt.
 356 #
 357
 358 if (open(my $Jamo, "Jamo.txt")) {
 359     my @Short;
 360
 361     while (<$Jamo>) {
 362         next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/;
 363
 364         my ($code, $short) = ($1, $2);
 365
 366         append(\@Short, $code, $short);
 367     }
 368
 369     flush(\@Short, "JamoShort.pl");
 370 } else {
 371     die "$0: Jamo.txt: $!\n";
 372 }
 373
 374 #
 375 # Read in the Scripts.txt.
 376 #
 377
 378 my @Scripts;
 379
 380 if (open(my $Scripts, "Scripts.txt")) {
 381     while (<$Scripts>) {
 382         next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
 383
 384         # Wait until all the scripts have been read since
 385         # they are not listed in numeric order.
 386         push @Scripts, [ hex($1), $1, $2, $3 ];
 387     }
 388 } else {
 389     die "$0: Scripts.txt: $!\n";
 390 }
 391
 392 # Now append the scripts properties in their code point order.
 393
 394 my %Script;
 395 my $Scripts = [];
 396
 397 for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
 398     my ($code, $first, $last, $name) = @$script;
 399     append($Scripts,              $first, $name);
 400     append($Script{$name} ||= [], $first, $name);
 401     if (defined $last) {
 402         extend($Scripts,       $last);
 403         extend($Script{$name}, $last);
 404     }
 405     unless (defined $In{$name}) {
 406         $InScript{$InId} = $name;
 407         $In{$name}       = $InId++;
 408         $InIn{$name}     = $Script{$name};
 409     }
 410 }
 411
 412 # Scripts.pl can be written out already now.
 413
 414 flush(\@Scripts, "Scripts.pl");
 415
 416 # Common is everything not explicitly assigned to a Script
 417
 418 $In{Common} = $InId++;
 419 my $Common = inverse($Scripts);
 420 $InIn{Common} = $Common;
 421
 422 #
 423 # Read in the Blocks.txt.
 424 #
 425
 426 my @Blocks;
 427 my %Blocks;
 428
 429 if (open(my $Blocks, "Blocks.txt")) {
 430     while (<$Blocks>) {
 431         next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
 432
 433         my ($first, $last, $name) = ($1, $2, $3);
 434         my $origname = $name;
 435
 436         # If there's a naming conflict (the script names are
 437         # in uppercase), the name of the block has " Block"
 438         # appended to it.
 439         my $pat = $name;
 440         $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
 441         for my $i (values %InScript) {
 442             if ($i =~ /^$pat$/i) {
 443                 $name .= " Block";
 444                 last;
 445             }
 446         }
 447
 448         append(\@Blocks,              $first, $name);
 449         append($Blocks{$name} ||= [], $first, $name);
 450         if (defined $last) {
 451             extend(\@Blocks,       $last);
 452             extend($Blocks{$name}, $last);
 453         }
 454         unless (defined $In{$name}) {
 455             $InBlock{$InId} = $origname;
 456             $In{$name}      = $InId++;
 457             $InIn{$name}    = $Blocks{$name};
 458         }
 459     }
 460 } else {
 461     die "$0: Blocks.txt: $!\n";
 462 }
 463
 464 # Blocks.pl can be written out already now.
 465
 466 flush(\@Blocks, "Blocks.pl");
 467
 468 #
 469 # Read in the PropList.txt.  It contains extended properties not
 470 # listed in the Unicode.txt, such as 'Other_Alphabetic':
 471 # alphabetic but not of the general category L; many modifiers
 472 # belong to this extended property category: while they are not
 473 # alphabets, they are alphabetic in nature.
 474 #
 475
 476 my @Props;
 477
 478 if (open(my $Props, "PropList.txt")) {
 479     while (<$Props>) {
 480         next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
 481
 482         # Wait until all the extended properties have been read since
 483         # they are not listed in numeric order.
 484         push @Props, [ hex($1), $1, $2, $3 ];
 485     }
 486 } else {
 487     die "$0: PropList.txt: $!\n";
 488 }
 489
 490 # Now append the extended properties in their code point order.
 491
 492 my %Prop;
 493 my $Props = [];
 494
 495 for my $prop (sort { $a->[0] <=> $b->[0] } @Props) {
 496     my ($code, $first, $last, $name) = @$prop;
 497     append($Props,              $first, $name);
 498     append($Prop{$name} ||= [], $first, $name);
 499     if (defined $last) {
 500         extend($Props,       $last);
 501         extend($Prop{$name}, $last);
 502     }
 503     unless (defined $In{$name}) {
 504         $In{$name}   = $InId++;
 505         $InIn{$name} = $Prop{$name};
 506     }
 507 }
 508
 509 # Assigned is everything not Cn
 510
 511 $In{Assigned} = $InId++;
 512 my $Assigned = inverse($Cat{Cn});
 513 $InIn{Assigned} = $Assigned;
 514
 515 # Unassigned is everything not Assigned
 516
 517 $In{Unassigned} = $InId++;
 518 my $Unassigned = $Cat{Cn};
 519 $InIn{Unassigned} = $Unassigned;
 520
 521 # Unassigned is everything not Assigned
 522 sub merge_general_and_extended {
 523     my ($name, $general, $extended) = @_;
 524     my $merged;
 525
 526     push @$merged,
 527          map { pop @{$_}; $_ }
 528              sort { $a->[2] <=> $b->[2] }
 529                   map { [ $_->[0], $_->[1], hex($_->[0]) ] }
 530                       ($general ?
 531                          map { ref $_ ? @$_ : $_ }
 532                              @Cat {ref $general  ? @$general  : $general } :
 533                          (),
 534                        $extended ?
 535                          map { ref $_ ? @$_ : $_ }
 536                              @Prop{ref $extended ? @$extended : $extended} :
 537                          ());
 538
 539     $In{$name}   = $InId++;
 540     $InIn{$name} = $merged;
 541
 542     return $merged;
 543 }
 544
 545 # Alphabetic is L and Other_Alphabetic.
 546
 547 my $Alphabetic =
 548     merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic');
 549
 550 # Lowercase is Ll and Other_Lowercase.
 551
 552 my $Lowercase =
 553     merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase');
 554
 555 # Uppercase is Lu and Other_Uppercase.
 556
 557 my $Uppercase =
 558     merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase');
 559
 560 # Math is Sm and Other_Math.
 561
 562 my $Math =
 563     merge_general_and_extended('Math', 'Sm', 'Other_Math');
 564
 565 # Lampersand is Ll, Lu, and Lt.
 566
 567 my $Lampersand =
 568     merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]);
 569
 570 # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
 571
 572 my $ID_Start =
 573     merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]);
 574
 575 # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
 576
 577 my $ID_Continue =
 578     merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl
 579                                                    Mn Mc Nd Pc) ]);
 580
 581 #
 582 # Any is any.
 583 #
 584
 585 $In{Any} = $InId++;
 586 my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ];
 587 $InIn{Any} = $Any;
 588
 589 #
 590 # All is any, too.
 591 #
 592
 593 $In{All} = $InId++;
 594 $InIn{All} = $Any;
 595
 596 #
 597 # mapping() will be used to write out the In and Is virtual mappings.
 598 #
 599
 600 sub mapping {
 601     my ($map, $name) = @_;
 602
 603     if (open(my $fh, ">$name.pl")) {
 604         print "$name.pl\n";
 605         header($fh);
 606
 607         # The %pat will hold a hash that maps the first two
 608         # lowercased letters of a class to a 'fuzzified' regular
 609         # expression that points to the real mapping.
 610
 611         my %pat;
 612
 613         # But first write out the offical name to real name
 614         # (the filename) mapping.
 615
 616         print $fh <<EOT;
 617 %utf8::${name} =
 618 (
 619 EOT
 620         for my $i (sort { lc $a cmp lc $b } keys %$map) {
 621             my $pat = $i;
 622             # Here is the 'fuzzification': accept any space,
 623             # dash, or underbar where in the official name
 624             # there is space or a dash (or underbar, but
 625             # there never is).
 626             $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
 627             # The prefix length of 2 is enough spread,
 628             # and besides, we have 'Yi' as an In category.
 629             push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
 630             printf $fh "%-45s => '$map->{$i}',\n", "'$i'";
 631         }
 632         print $fh <<EOT;
 633 );
 634 EOT
 635
 636         # Now write out the %pat mapping.
 637
 638         print $fh <<EOT;
 639 %utf8::${name}Pat =
 640 (
 641 EOT
 642         foreach my $prefix (sort keys %pat) {
 643             print $fh "'$prefix' => {\n";
 644             foreach my $ipat (@{$pat{$prefix}}) {
 645                 my ($i, $pat) = @$ipat;
 646                 print $fh "\t'$pat' => '$map->{$i}',\n";
 647             }
 648             print $fh "},\n";
 649         }
 650         print $fh <<EOT;
 651 );
 652 EOT
 653
 654         close($fh);
 655     } else {
 656         die "$0: $name.pl: $!\n";
 657     }
 658 }
 659
 660 #
 661 # Write out the virtual In mappings.
 662 #
 663
 664 mapping(\%In, "In");
 665
 666 #
 667 # Append the InScript and InBlock mappings.
 668 # These are needed only if Script= and Block= syntaxes are used.
 669 #
 670
 671 if (open(my $In, ">>In.pl")) {
 672     print $In <<EOT;
 673
 674 %utf8::InScript =
 675 (
 676 EOT
 677     for my $i (sort { $a <=> $b } keys %InScript) {
 678         printf $In "%4d => '$InScript{$i}',\n", $i;
 679     }
 680     print $In <<EOT;
 681 );
 682 EOT
 683
 684     print $In <<EOT;
 685
 686 %utf8::InBlock =
 687 (
 688 EOT
 689     for my $i (sort { $a <=> $b } keys %InBlock) {
 690         printf $In "%4d => '$InBlock{$i}',\n", $i;
 691     }
 692     print $In <<EOT;
 693 );
 694 EOT
 695 } else {
 696     die "$0: In.pl: $!\n";
 697 }
 698
 699 #
 700 # Write out the real In mappings
 701 # (the In.pl written out just above has the virtual In mappings)
 702 #
 703
 704 foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) {
 705     flush($InIn{$in}, "In/$In{$in}.pl");
 706 }
 707
 708 #
 709 # The mapping from General Category long forms to short forms is
 710 # currently hardwired here since no simple data file in the UCD
 711 # seems to do that.  Unicode 3.2 will assumedly correct this.
 712 #
 713
 714 my %Is = (
 715         'Letter'                        =>      'L',
 716         'Uppercase_Letter'              =>      'Lu',
 717         'Lowercase_Letter'              =>      'Ll',
 718         'Titlecase_Letter'              =>      'Lt',
 719         'Modifier_Letter'               =>      'Lm',
 720         'Other_Letter'                  =>      'Lo',
 721
 722         'Mark'                          =>      'M',
 723         'Non_Spacing_Mark'              =>      'Mn',
 724         'Spacing_Mark'                  =>      'Mc',
 725         'Enclosing_Mark'                =>      'Me',
 726
 727         'Separator'                     =>      'Z',
 728         'Space_Separator'               =>      'Zs',
 729         'Line_Separator'                =>      'Zl',
 730         'Paragraph_Separator'           =>      'Zp',
 731
 732         'Number'                        =>      'N',
 733         'Decimal_Number'                =>      'Nd',
 734         'Letter_Number'                 =>      'Nl',
 735         'Other_Number'                  =>      'No',
 736
 737         'Punctuation'                   =>      'P',
 738         'Connector_Punctuation'         =>      'Pc',
 739         'Dash_Punctuation'              =>      'Pd',
 740         'Open_Punctuation'              =>      'Ps',
 741         'Close_Punctuation'             =>      'Pe',
 742         'Initial_Punctuation'           =>      'Pi',
 743         'Final_Punctuation'             =>      'Pf',
 744         'Other_Punctuation'             =>      'Po',
 745
 746         'Symbol'                        =>      'S',
 747         'Math_Symbol'                   =>      'Sm',
 748         'Currency_Symbol'               =>      'Sc',
 749         'Modifier_Symbol'               =>      'Sk',
 750         'Other_Symbol'                  =>      'So',
 751
 752         'Other'                         =>      'C',
 753         'Control'                       =>      'Cc',
 754         'Format'                        =>      'Cf',
 755         'Surrogate'                     =>      'Cs',
 756         'Private Use'                   =>      'Co',
 757         'Unassigned'                    =>      'Cn',
 758 );
 759
 760 #
 761 # Write out the virtual Is mappings.
 762 #
 763
 764 mapping(\%Is, "Is");
 765
 766 #
 767 # Read in the special cases.
 768 #
 769
 770 my %Case;
 771
 772 if (open(my $SpecCase, "SpecCase.txt")) {
 773     while (<$SpecCase>) {
 774         next unless /^[0-9A-Fa-f]+;/;
 775         s/\#.*//;
 776         s/\s+$//;
 777
 778         my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/);
 779
 780         if ($condition) { # not implemented yet
 781             print "# SKIPPING $_\n";
 782             next;
 783         }
 784
 785         # Wait until all the special cases have been read since
 786         # they are not listed in numeric order.
 787         my $ix = hex($code);
 788         push @{$Case{Lower}}, [ $ix, $code, $lower ];
 789         push @{$Case{Title}}, [ $ix, $code, $title ];
 790         push @{$Case{Upper}}, [ $ix, $code, $upper ];
 791     }
 792 } else {
 793     die "$0: SpecCase.txt: $!\n";
 794 }
 795
 796 # Now write out the special cases properties in their code point order.
 797 # Prepend them to the To/{Upper,Lower,Title}.pl.
 798
 799 for my $case (qw(Lower Title Upper)) {
 800     my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n";
 801     if (open(my $Case, ">To/$case.pl")) {
 802         header($Case);
 803         print $Case <<EOT;
 804
 805 %utf8::ToSpec$case = (
 806 EOT
 807         for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
 808             my ($ix, $code, $to) = @$prop;
 809             my $tostr =
 810                 join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
 811             printf $Case qq['%04X' => "$tostr",\n], $ix;
 812         }
 813         print $Case <<EOT;
 814 );
 815
 816 EOT
 817         begin($Case);
 818         print $Case $NormalCase;
 819         end($Case);
 820     } else {
 821         die "$0: To/$case.txt: $!\n";
 822     }
 823 }
 824
 825 #
 826 # Read in the case foldings.
 827 #
 828 # We will do full case folding, C + F + I (see CaseFold.txt).
 829 #
 830
 831 if (open(my $CaseFold, "CaseFold.txt")) {
 832     my @Fold;
 833     my %Fold;
 834
 835     while (<$CaseFold>) {
 836         next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
 837
 838         my ($code, $status, $fold) = ($1, $2, $3);
 839
 840         if ($status eq 'C') { # Common: one-to-one folding
 841             append(\@Fold, $code, $fold);
 842         } else { # F: full, or I: dotted uppercase I -> dotless lowercase I
 843             $Fold{hex($code)} = $fold;
 844         }
 845     }
 846
 847     flush(\@Fold, "To/Fold.pl");
 848
 849     #
 850     # Prepend the special foldings to the common foldings.
 851     #
 852
 853     my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
 854     if (open(my $Fold, ">To/Fold.pl")) {
 855         header($Fold);
 856         print $Fold <<EOT;
 857
 858 %utf8::ToSpecFold = (
 859 EOT
 860         for my $code (sort { $a <=> $b } keys %Fold) {
 861             my $foldstr =
 862                 join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
 863             printf $Fold qq['%04X' => "$foldstr",\n], $code;
 864         }
 865         print $Fold <<EOT;
 866 );
 867
 868 EOT
 869         begin($Fold);
 870         print $Fold $CommonFold;
 871         end($Fold);
 872     } else {
 873         die "$0: To/Fold.pl: $!\n";
 874     }
 875 } else {
 876     die "$0: CaseFold.txt: $!\n";
 877 }
 878
 879 # That's all, folks!
 880