lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 #
   4 # mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
   5 # from the Unicode database files (lib/unicore/*.txt).
   6 #
   7
   8 use strict;
   9
  10 my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
  11
  12 mkdir("In", 0755);
  13 mkdir("Is", 0755);
  14 mkdir("To", 0755);
  15
  16 sub extend {
  17     my ($table, $last) = @_;
  18
  19     $table->[-1]->[1] = $last;
  20 }
  21
  22 sub append {
  23     my ($table, $code, $name) = @_;
  24     if (@$table &&
  25         hex($table->[-1]->[1]) == hex($code) - 1 &&
  26         (!defined $name || $table->[-1]->[2] eq $name)) {
  27         extend($table, $code);
  28     } else {
  29         push @$table, [$code, $code, $name];
  30     }
  31 }
  32
  33 sub inverse {
  34     my ($table) = @_;
  35     my $inverse = [];
  36     my ($first, $last);
  37     if ($table->[0]->[0]) {
  38         $last = hex($table->[0]->[0]);
  39         push @$inverse, [ "0000",
  40                           sprintf("%04X", $last - 1) ];
  41     }
  42     for my $i (0..$#$table-1) {
  43         $first = defined $table->[$i    ]->[1] ?
  44                      hex($table->[$i    ]->[1]) : 0;
  45         $last  = defined $table->[$i + 1]->[0] ?
  46                      hex($table->[$i + 1]->[0]) : $first;
  47         push @$inverse, [ sprintf("%04X", $first + 1),
  48                           sprintf("%04X", $last  - 1) ]
  49                               unless $first + 1 == $last;
  50     }
  51     return $inverse;
  52 }
  53
  54 sub header {
  55     my $fh = shift;
  56
  57     print $fh <<EOT;
  58 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
  59 # This file is built by $0 from e.g. Unicode.txt.
  60 # Any changes made here will be lost!
  61 EOT
  62 }
  63
  64 sub begin {
  65     my $fh = shift;
  66
  67     print $fh <<EOT;
  68 return <<'END';
  69 EOT
  70 }
  71
  72 sub end {
  73     my $fh = shift;
  74
  75     print $fh <<EOT;
  76 END
  77 EOT
  78 }
  79
  80 sub flush {
  81     my ($table, $file) = @_;
  82     print "$file\n";
  83     if (open(my $fh, ">$file")) {
  84         header($fh);
  85         begin($fh);
  86         for my $i (@$table) {
  87             print $fh $i->[0], "\t",
  88                       $i->[1] ne $i->[0] ? $i->[1] : "", "\t",
  89                       defined $i->[2] ? $i->[2] : "", "\n";
  90         }
  91         end($fh);
  92         close($fh);
  93     } else {
  94         die "$0: $file: $!\n";
  95     }
  96 }
  97
  98 #
  99 # The %In contains the mapping of the script/block name into a number.
 100 #
 101
 102 my %In;
 103 my $InId = 0;
 104 my %InIn;
 105
 106 my %InScript;
 107 my %InBlock;
 108
 109 #
 110 # Read in the Unicode.txt, the main Unicode database.
 111 #
 112
 113 my %Cat;
 114 my %General;
 115 my @General;
 116
 117 sub gencat {
 118     my ($Name, $GeneralH, $GeneralA, $Cat,
 119         $name, $cat, $code, $op) = @_;
 120
 121     $op->($Name,                     $code, $name);
 122     $op->($GeneralA,                 $code, $cat);
 123
 124     $op->($GeneralH->{$name} ||= [], $code, $name);
 125
 126     $op->($Cat->{$cat}       ||= [], $code);
 127     $op->($Cat->{substr($cat, 0, 1)}
 128                             ||= [],  $code);
 129     # 005F: SPACING UNDERSCORE
 130     $op->($Cat->{Word}       ||= [], $code)
 131         if $cat =~ /^[LMN]/ or $code eq "005F";
 132     $op->($Cat->{Alnum}      ||= [], $code)
 133         if $cat =~ /^[LMN]/;
 134     $op->($Cat->{Alpha}      ||= [], $code)
 135         if $cat =~ /^[LM]/;
 136     # 0009: HORIZONTAL TABULATION
 137     # 000A: LINE FEED
 138     # 000B: VERTICAL TABULATION
 139     # 000C: FORM FEED
 140     # 000D: CARRIAGE RETURN
 141     # 0020: SPACE
 142     $op->($Cat->{Space}      ||= [], $code)
 143         if $cat  =~ /^Z/ ||
 144             $code =~ /^(0009|000A|000B|000C|000D)$/;
 145     $op->($Cat->{SpacePerl}  ||= [], $code)
 146         if $cat  =~ /^Z/ ||
 147             $code =~ /^(0009|000A|000C|000D)$/;
 148     $op->($Cat->{Blank}      ||= [], $code)
 149         if $code =~ /^(0020|0009)$/ ||
 150             $cat  =~ /^Z[^lp]$/;
 151     $op->($Cat->{Digit}      ||= [], $code) if $cat eq "Nd";
 152     $op->($Cat->{Upper}      ||= [], $code) if $cat eq "Lu";
 153     $op->($Cat->{Lower}      ||= [], $code) if $cat eq "Ll";
 154     $op->($Cat->{Title}      ||= [], $code) if $cat eq "Lt";
 155     $op->($Cat->{ASCII}      ||= [], $code) if $code le "007F";
 156     $op->($Cat->{Cntrl}      ||= [], $code) if $cat =~ /^C/;
 157     $op->($Cat->{Graph}      ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
 158     $op->($Cat->{Print}      ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
 159     $op->($Cat->{Punct}      ||= [], $code) if $cat =~ /^P/;
 160     # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
 161     $op->($Cat->{XDigit}     ||= [], $code)
 162         if $code =~ /^00(3[0-9]|[46][1-6])$/;
 163
 164 }
 165
 166 if (open(my $Unicode, "Unicode.txt")) {
 167     my @Name;
 168     my @Bidi;
 169     my %Bidi;
 170     my @Comb;
 171     my @Deco;
 172     my %Deco;
 173     my %DC;
 174     my @Number;
 175     my @Mirrored;
 176     my %To;
 177     while (<$Unicode>) {
 178         next unless /^[0-9A-Fa-f]+;/;
 179         s/\s+$//;
 180
 181         my ($code, $name, $cat, $comb, $bidi, $deco,
 182             $decimal, $digit, $number,
 183             $mirrored, $unicode10, $comment,
 184             $upper, $lower, $title) = split(/\s*;\s*/);
 185
 186         if ($name =~ /^<(.+), (First|Last)>$/) {
 187             $name = $1;
 188             gencat(\@Name, \%General, \@General, \%Cat,
 189                    $name, $cat, $code,
 190                    $2 eq 'First' ? \&append : \&extend);
 191             unless (defined $In{$name}) {
 192                 $In{$name}   = $InId++;
 193                 $InIn{$name} = $General{$name};
 194             }
 195         } else {
 196
 197             gencat(\@Name, \%General, \@General, \%Cat,
 198                    $name, $cat, $code, \&append);
 199
 200             append($To{Upper}       ||= [], $code, $upper)   if $upper;
 201             append($To{Lower}       ||= [], $code, $lower)   if $lower;
 202             append($To{Title}       ||= [], $code, $title)   if $title;
 203             append($To{Digit}       ||= [], $code, $decimal) if $decimal;
 204
 205             append(\@Bidi,                  $code, $bidi);
 206             append($Bidi{$bidi}     ||= [], $code);
 207
 208             append(\@Comb,                  $code, $comb) if $comb;
 209
 210             if ($deco) {
 211                 append(\@Deco,                  $code, $deco);
 212                 if ($deco =~/^<(\w+)>/) {
 213                     append($Deco{Compat} ||= [], $code);
 214                     append($DC{$1}       ||= [], $code);
 215                 } else {
 216                     append($Deco{Canon}  ||= [], $code);
 217                 }
 218             }
 219
 220             append(\@Number,                     $code, $number) if $number;
 221
 222             append(\@Mirrored,                   $code) if $mirrored eq "Y";
 223         }
 224     }
 225
 226     flush(\@Name, "Name.pl");
 227
 228     foreach my $cat (sort keys %Cat) {
 229         flush($Cat{$cat}, "Is/$cat.pl");
 230     }
 231
 232     foreach my $to (sort keys %To) {
 233         flush($To{$to}, "To/$to.pl");
 234     }
 235
 236     flush(\@Bidi, "Bidirectional.pl");
 237     foreach my $bidi (sort keys %Bidi) {
 238         flush($Bidi{$bidi}, "Is/Bidi$bidi.pl");
 239     }
 240
 241     flush(\@Comb, "CombiningClass.pl");
 242
 243     flush(\@Deco, "Decomposition.pl");
 244     foreach my $deco (sort keys %Deco) {
 245         flush($Deco{$deco}, "Is/Deco$deco.pl");
 246     }
 247     foreach my $dc (sort keys %DC) {
 248         flush($DC{$dc}, "Is/DC$dc.pl");
 249     }
 250
 251     flush(\@Number, "Number.pl");
 252
 253     flush(\@Mirrored, "Is/Mirrored.pl");
 254 } else {
 255     die "$0: Unicode.txt: $!\n";
 256 }
 257
 258 #  The general cateory can be written out already now.
 259
 260 flush(\@General, "Category.pl");
 261
 262 #
 263 # Read in the LineBrk.txt.
 264 #
 265
 266 if (open(my $LineBrk, "LineBrk.txt")) {
 267     my @Lbrk;
 268     my %Lbrk;
 269
 270     while (<$LineBrk>) {
 271         next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
 272
 273         my ($first, $last, $lbrk) = ($1, $2, $3);
 274
 275         append(\@Lbrk,              $first, $lbrk);
 276         append($Lbrk{$lbrk} ||= [], $first);
 277         if (defined $last) {
 278             extend(\@Lbrk,          $last);
 279             extend($Lbrk{$lbrk},    $last);
 280         }
 281     }
 282
 283     flush(\@Lbrk, "Lbrk.pl");
 284     foreach my $lbrk (sort keys %Lbrk) {
 285         flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl");
 286     }
 287 } else {
 288     die "$0: LineBrk.txt: $!\n";
 289 }
 290
 291 #
 292 # Read in the ArabShap.txt.
 293 #
 294
 295 if (open(my $ArabShap, "ArabShap.txt")) {
 296     my @ArabLink;
 297     my @ArabLinkGroup;
 298
 299     while (<$ArabShap>) {
 300         next unless /^[0-9A-Fa-f]+;/;
 301         s/\s+$//;
 302
 303         my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/);
 304
 305         append(\@ArabLink,      $code, $link);
 306         append(\@ArabLinkGroup, $code, $linkgroup);
 307     }
 308
 309     flush(\@ArabLink,      "ArabLink.pl");
 310     flush(\@ArabLinkGroup, "ArabLnkGrp.pl");
 311 } else {
 312     die "$0: ArabShap.txt: $!\n";
 313 }
 314
 315 #
 316 # Read in the Jamo.txt.
 317 #
 318
 319 if (open(my $Jamo, "Jamo.txt")) {
 320     my @Short;
 321
 322     while (<$Jamo>) {
 323         next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/;
 324
 325         my ($code, $short) = ($1, $2);
 326
 327         append(\@Short, $code, $short);
 328     }
 329
 330     flush(\@Short, "JamoShort.pl");
 331 } else {
 332     die "$0: Jamo.txt: $!\n";
 333 }
 334
 335 #
 336 # Read in the Scripts.txt.
 337 #
 338
 339 my @Scripts;
 340
 341 if (open(my $Scripts, "Scripts.txt")) {
 342     while (<$Scripts>) {
 343         next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
 344
 345         # Wait until all the scripts have been read since
 346         # they are not listed in numeric order.
 347         push @Scripts, [ hex($1), $1, $2, $3 ];
 348     }
 349 } else {
 350     die "$0: Scripts.txt: $!\n";
 351 }
 352
 353 # Now append the scripts properties in their code point order.
 354
 355 my %Script;
 356 my $Scripts = [];
 357
 358 for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
 359     my ($code, $first, $last, $name) = @$script;
 360     append($Scripts,              $first, $name);
 361     append($Script{$name} ||= [], $first, $name);
 362     if (defined $last) {
 363         extend($Scripts,       $last);
 364         extend($Script{$name}, $last);
 365     }
 366     unless (defined $In{$name}) {
 367         $InScript{$InId} = $name;
 368         $In{$name}       = $InId++;
 369         $InIn{$name}     = $Script{$name};
 370     }
 371 }
 372
 373 # Scripts.pl can be written out already now.
 374
 375 flush(\@Scripts, "Scripts.pl");
 376
 377 # Common is everything not explicitly assigned to a Script
 378
 379 $In{Common} = $InId++;
 380 my $Common = inverse($Scripts);
 381 $InIn{Common} = $Common;
 382
 383 #
 384 # Read in the Blocks.txt.
 385 #
 386
 387 my @Blocks;
 388 my %Blocks;
 389
 390 if (open(my $Blocks, "Blocks.txt")) {
 391     while (<$Blocks>) {
 392         next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
 393
 394         my ($first, $last, $name) = ($1, $2, $3);
 395         my $origname = $name;
 396
 397         # If there's a naming conflict (the script names are
 398         # in uppercase), the name of the block has " Block"
 399         # appended to it.
 400         my $pat = $name;
 401         $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
 402         for my $i (values %InScript) {
 403             if ($i =~ /^$pat$/i) {
 404                 $name .= " Block";
 405                 last;
 406             }
 407         }
 408
 409         append(\@Blocks,              $first, $name);
 410         append($Blocks{$name} ||= [], $first, $name);
 411         if (defined $last) {
 412             extend(\@Blocks,       $last);
 413             extend($Blocks{$name}, $last);
 414         }
 415         unless (defined $In{$name}) {
 416             $InBlock{$InId} = $origname;
 417             $In{$name}      = $InId++;
 418             $InIn{$name}    = $Blocks{$name};
 419         }
 420     }
 421 } else {
 422     die "$0: Blocks.txt: $!\n";
 423 }
 424
 425 # Blocks.pl can be written out already now.
 426
 427 flush(\@Blocks, "Blocks.pl");
 428
 429 #
 430 # Read in the PropList.txt.  It contains extended properties not
 431 # listed in the Unicode.txt, such as 'Other_Alphabetic':
 432 # alphabetic but not of the general category L; many modifiers
 433 # belong to this extended property category: while they are not
 434 # alphabets, they are alphabetic in nature.
 435 #
 436
 437 my @Props;
 438
 439 if (open(my $Props, "PropList.txt")) {
 440     while (<$Props>) {
 441         next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
 442
 443         # Wait until all the extended properties have been read since
 444         # they are not listed in numeric order.
 445         push @Props, [ hex($1), $1, $2, $3 ];
 446     }
 447 } else {
 448     die "$0: PropList.txt: $!\n";
 449 }
 450
 451 # Now append the extended properties in their code point order.
 452
 453 my %Prop;
 454 my $Props = [];
 455
 456 for my $prop (sort { $a->[0] <=> $b->[0] } @Props) {
 457     my ($code, $first, $last, $name) = @$prop;
 458     append($Props,              $first, $name);
 459     append($Prop{$name} ||= [], $first, $name);
 460     if (defined $last) {
 461         extend($Props,       $last);
 462         extend($Prop{$name}, $last);
 463     }
 464     unless (defined $In{$name}) {
 465         $In{$name}   = $InId++;
 466         $InIn{$name} = $Prop{$name};
 467     }
 468 }
 469
 470 # Assigned is everything not Cn aka Noncharacter_Code_Point
 471
 472 $In{Assigned} = $InId++;
 473 my $Assigned = inverse($Prop{Noncharacter_Code_Point});
 474 $InIn{Assigned} = $Assigned;
 475
 476 sub merge_general_and_extended {
 477     my ($name, $general, $extended) = @_;
 478     my $merged;
 479
 480     push @$merged,
 481          map { pop @{$_}; $_ }
 482              sort { $a->[2] <=> $b->[2] }
 483                   map { [ $_->[0], $_->[1], hex($_->[0]) ] }
 484                       ($general ?
 485                          map { ref $_ ? @$_ : $_ }
 486                              @Cat {ref $general  ? @$general  : $general } :
 487                          (),
 488                        $extended ?
 489                          map { ref $_ ? @$_ : $_ }
 490                              @Prop{ref $extended ? @$extended : $extended} :
 491                          ());
 492
 493     $In{$name}   = $InId++;
 494     $InIn{$name} = $merged;
 495
 496     return $merged;
 497 }
 498
 499 # Alphabetic is L and Other_Alphabetic.
 500
 501 my $Alphabetic =
 502     merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic');
 503
 504 # Lowercase is Ll and Other_Lowercase.
 505
 506 my $Lowercase =
 507     merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase');
 508
 509 # Uppercase is Lu and Other_Uppercase.
 510
 511 my $Uppercase =
 512     merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase');
 513
 514 # Math is Sm and Other_Math.
 515
 516 my $Math =
 517     merge_general_and_extended('Math', 'Sm', 'Other_Math');
 518
 519 # Lampersand is Ll, Lu, and Lt.
 520
 521 my $Lampersand =
 522     merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]);
 523
 524 # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
 525
 526 my $ID_Start =
 527     merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]);
 528
 529 # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
 530
 531 my $ID_Continue =
 532     merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl
 533                                                    Mn Mc Nd Pc) ]);
 534
 535 #
 536 # Any is any.
 537 #
 538
 539 $In{Any} = $InId++;
 540 my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ];
 541 $InIn{Any} = $Any;
 542
 543 #
 544 # mapping() will be used to write out the In and Is virtual mappings.
 545 #
 546
 547 sub mapping {
 548     my ($map, $name) = @_;
 549
 550     if (open(my $fh, ">$name.pl")) {
 551         print "$name.pl\n";
 552         header($fh);
 553
 554         # The %pat will hold a hash that maps the first two
 555         # lowercased letters of a class to a 'fuzzified' regular
 556         # expression that points to the real mapping.
 557
 558         my %pat;
 559
 560         # But first write out the offical name to real name
 561         # (the filename) mapping.
 562
 563         print $fh <<EOT;
 564 %utf8::${name} =
 565 (
 566 EOT
 567         for my $i (sort { lc $a cmp lc $b } keys %$map) {
 568             my $pat = $i;
 569             # Here is the 'fuzzification': accept any space,
 570             # dash, or underbar where in the official name
 571             # there is space or a dash (or underbar, but
 572             # there never is).
 573             $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
 574             # The prefix length of 2 is enough spread,
 575             # and besides, we have 'Yi' as an In category.
 576             push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
 577             printf $fh "%-45s => '$map->{$i}',\n", "'$i'";
 578         }
 579         print $fh <<EOT;
 580 );
 581 EOT
 582
 583         # Now write out the %pat mapping.
 584
 585         print $fh <<EOT;
 586 %utf8::${name}Pat =
 587 (
 588 EOT
 589         foreach my $prefix (sort keys %pat) {
 590             print $fh "'$prefix' => {\n";
 591             foreach my $ipat (@{$pat{$prefix}}) {
 592                 my ($i, $pat) = @$ipat;
 593                 print $fh "\t'$pat' => '$map->{$i}',\n";
 594             }
 595             print $fh "},\n";
 596         }
 597         print $fh <<EOT;
 598 );
 599 EOT
 600
 601         close($fh);
 602     } else {
 603         die "$0: $name.pl: $!\n";
 604     }
 605 }
 606
 607 #
 608 # Write out the virtual In mappings.
 609 #
 610
 611 mapping(\%In, "In");
 612
 613 #
 614 # Append the InScript and InBlock mappings.
 615 # These are needed only if Script= and Block= syntaxes are used.
 616 #
 617
 618 if (open(my $In, ">>In.pl")) {
 619     print $In <<EOT;
 620
 621 %utf8::InScript =
 622 (
 623 EOT
 624     for my $i (sort { $a <=> $b } keys %InScript) {
 625         printf $In "%4d => '$InScript{$i}',\n", $i;
 626     }
 627     print $In <<EOT;
 628 );
 629 EOT
 630
 631     print $In <<EOT;
 632
 633 %utf8::InBlock =
 634 (
 635 EOT
 636     for my $i (sort { $a <=> $b } keys %InBlock) {
 637         printf $In "%4d => '$InBlock{$i}',\n", $i;
 638     }
 639     print $In <<EOT;
 640 );
 641 EOT
 642 } else {
 643     die "$0: In.pl: $!\n";
 644 }
 645
 646 # Easy low-calorie cheat.
 647 use File::Copy;
 648 copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl");
 649
 650 #
 651 # Write out the real In mappings
 652 # (the In.pl written out just above has the virtual In mappings)
 653 #
 654
 655 foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) {
 656     flush($InIn{$in}, "In/$In{$in}.pl");
 657 }
 658
 659 #
 660 # The mapping from General Category long forms to short forms is
 661 # currently hardwired here since no simple data file in the UCD
 662 # seems to do that.  Unicode 3.2 will assumedly correct this.
 663 #
 664
 665 my %Is = (
 666         'Letter'                        =>      'L',
 667         'Uppercase_Letter'              =>      'Lu',
 668         'Lowercase_Letter'              =>      'Ll',
 669         'Titlecase_Letter'              =>      'Lt',
 670         'Modifier_Letter'               =>      'Lm',
 671         'Other_Letter'                  =>      'Lo',
 672
 673         'Mark'                          =>      'M',
 674         'Non_Spacing_Mark'              =>      'Mn',
 675         'Spacing_Mark'                  =>      'Mc',
 676         'Enclosing_Mark'                =>      'Me',
 677
 678         'Separator'                     =>      'Z',
 679         'Space_Separator'               =>      'Zs',
 680         'Line_Separator'                =>      'Zl',
 681         'Paragraph_Separator'           =>      'Zp',
 682
 683         'Number'                        =>      'N',
 684         'Decimal_Number'                =>      'Nd',
 685         'Letter_Number'                 =>      'Nl',
 686         'Other_Number'                  =>      'No',
 687
 688         'Punctuation'                   =>      'P',
 689         'Connector_Punctuation'         =>      'Pc',
 690         'Dash_Punctuation'              =>      'Pd',
 691         'Open_Punctuation'              =>      'Ps',
 692         'Close_Punctuation'             =>      'Pe',
 693         'Initial_Punctuation'           =>      'Pi',
 694         'Final_Punctuation'             =>      'Pf',
 695         'Other_Punctuation'             =>      'Po',
 696
 697         'Symbol'                        =>      'S',
 698         'Math_Symbol'                   =>      'Sm',
 699         'Currency_Symbol'               =>      'Sc',
 700         'Modifier_Symbol'               =>      'Sk',
 701         'Other_Symbol'                  =>      'So',
 702
 703         'Other'                         =>      'C',
 704         'Control'                       =>      'Cc',
 705         'Format'                        =>      'Cf',
 706         'Surrogate'                     =>      'Cs',
 707         'Private Use'                   =>      'Co',
 708         'Unassigned'                    =>      'Cn',
 709 );
 710
 711 #
 712 # Write out the virtual Is mappings.
 713 #
 714
 715 mapping(\%Is, "Is");
 716
 717 #
 718 # Read in the special cases.
 719 #
 720
 721 my %Case;
 722
 723 if (open(my $SpecCase, "SpecCase.txt")) {
 724     while (<$SpecCase>) {
 725         next unless /^[0-9A-Fa-f]+;/;
 726         s/\#.*//;
 727         s/\s+$//;
 728
 729         my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/);
 730
 731         if ($condition) { # not implemented yet
 732             print "# SKIPPING $_\n";
 733             next;
 734         }
 735
 736         # Wait until all the special cases have been read since
 737         # they are not listed in numeric order.
 738         my $ix = hex($code);
 739         push @{$Case{Lower}}, [ $ix, $code, $lower ];
 740         push @{$Case{Title}}, [ $ix, $code, $title ];
 741         push @{$Case{Upper}}, [ $ix, $code, $upper ];
 742     }
 743 } else {
 744     die "$0: SpecCase.txt: $!\n";
 745 }
 746
 747 # Now write out the special cases properties in their code point order.
 748 # Prepend them to the To/{Upper,Lower,Title}.pl.
 749
 750 for my $case (qw(Lower Title Upper)) {
 751     my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n";
 752     if (open(my $Case, ">To/$case.pl")) {
 753         header($Case);
 754         print $Case <<EOT;
 755
 756 %utf8::ToSpec$case = (
 757 EOT
 758         for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
 759             my ($ix, $code, $to) = @$prop;
 760             my $tostr =
 761                 join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
 762             printf $Case qq['%04X' => "$tostr",\n], $ix;
 763         }
 764         print $Case <<EOT;
 765 );
 766
 767 EOT
 768         begin($Case);
 769         print $Case $NormalCase;
 770         end($Case);
 771     } else {
 772         die "$0: To/$case.txt: $!\n";
 773     }
 774 }
 775
 776 #
 777 # Read in the case foldings.
 778 #
 779 # We will do full case folding, C + F + I (see CaseFold.txt).
 780 #
 781
 782 if (open(my $CaseFold, "CaseFold.txt")) {
 783     my @Fold;
 784     my %Fold;
 785
 786     while (<$CaseFold>) {
 787         next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
 788
 789         my ($code, $status, $fold) = ($1, $2, $3);
 790
 791         if ($status eq 'C') { # Common: one-to-one folding
 792             append(\@Fold, $code, $fold);
 793         } else { # F: full, or I: dotted uppercase I -> dotless lowercase I
 794             $Fold{hex($code)} = $fold;
 795         }
 796     }
 797
 798     flush(\@Fold, "To/Fold.pl");
 799
 800     #
 801     # Prepend the special foldings to the common foldings.
 802     #
 803
 804     my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
 805     if (open(my $Fold, ">To/Fold.pl")) {
 806         header($Fold);
 807         print $Fold <<EOT;
 808
 809 %utf8::ToSpecFold = (
 810 EOT
 811         for my $code (sort { $a <=> $b } keys %Fold) {
 812             my $foldstr =
 813                 join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
 814             printf $Fold qq['%04X' => "$foldstr",\n], $code;
 815         }
 816         print $Fold <<EOT;
 817 );
 818
 819 EOT
 820         begin($Fold);
 821         print $Fold $CommonFold;
 822         end($Fold);
 823     } else {
 824         die "$0: To/Fold.pl: $!\n";
 825     }
 826 } else {
 827     die "$0: CaseFold.txt: $!\n";
 828 }
 829
 830 # That's all, folks!
 831