lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 #
   4 # mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
   5 # from the Unicode database files (lib/unicore/*.txt).
   6 #
   7
   8 use strict;
   9
  10 my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
  11
  12 mkdir("In", 0755);
  13 mkdir("Is", 0755);
  14 mkdir("To", 0755);
  15
  16 sub extend {
  17     my ($table, $last) = @_;
  18
  19     $table->[-1]->[1] = $last;
  20 }
  21
  22 sub append {
  23     my ($table, $code, $name) = @_;
  24     if (@$table &&
  25         hex($table->[-1]->[1]) == hex($code) - 1 &&
  26         (!defined $name || $table->[-1]->[2] eq $name)) {
  27         extend($table, $code);
  28     } else {
  29         push @$table, [$code, $code, $name];
  30     }
  31 }
  32
  33 sub append_range {
  34     my ($table, $code_ini, $code_fin, $name) = @_;
  35     append($table, $code_ini, $name);
  36     extend($table, $code_fin);
  37 }
  38
  39 sub inverse {
  40     my ($table) = @_;
  41     my $inverse = [];
  42     my ($first, $last);
  43     if ($table->[0]->[0]) {
  44         $last = hex($table->[0]->[0]);
  45         push @$inverse, [ "0000",
  46                           sprintf("%04X", $last - 1) ];
  47     }
  48     for my $i (0..$#$table-1) {
  49         $first = defined $table->[$i    ]->[1] ?
  50                      hex($table->[$i    ]->[1]) : 0;
  51         $last  = defined $table->[$i + 1]->[0] ?
  52                      hex($table->[$i + 1]->[0]) : $first;
  53         push @$inverse, [ sprintf("%04X", $first + 1),
  54                           sprintf("%04X", $last  - 1) ]
  55                               unless $first + 1 == $last;
  56     }
  57     return $inverse;
  58 }
  59
  60 sub header {
  61     my $fh = shift;
  62
  63     print $fh <<EOT;
  64 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
  65 # This file is built by $0 from e.g. Unicode.txt.
  66 # Any changes made here will be lost!
  67 EOT
  68 }
  69
  70 sub begin {
  71     my $fh = shift;
  72
  73     print $fh <<EOT;
  74 return <<'END';
  75 EOT
  76 }
  77
  78 sub end {
  79     my $fh = shift;
  80
  81     print $fh <<EOT;
  82 END
  83 EOT
  84 }
  85
  86 sub flush {
  87     my ($table, $file) = @_;
  88     print "$file\n";
  89     if (open(my $fh, ">$file")) {
  90         header($fh);
  91         begin($fh);
  92         for my $i (@$table) {
  93             print $fh $i->[0], "\t",
  94                       $i->[1] ne $i->[0] ? $i->[1] : "", "\t",
  95                       defined $i->[2] ? $i->[2] : "", "\n";
  96         }
  97         end($fh);
  98         close($fh);
  99     } else {
 100         die "$0: $file: $!\n";
 101     }
 102 }
 103
 104 #
 105 # The %In contains the mapping of the script/block name into a number.
 106 #
 107
 108 my %In;
 109 my $InId = 0;
 110 my %InIn;
 111
 112 my %InScript;
 113 my %InBlock;
 114
 115 #
 116 # Read in the Unicode.txt, the main Unicode database.
 117 #
 118
 119 my %Cat;
 120 my %General;
 121 my @General;
 122
 123 sub gencat {
 124     my ($Name, $GeneralH, $GeneralA, $Cat,
 125         $name, $cat, $code, $op) = @_;
 126
 127     $op->($Name,                     $code, $name);
 128     $op->($GeneralA,                 $code, $cat);
 129
 130     $op->($GeneralH->{$name} ||= [], $code, $name);
 131
 132     $op->($Cat->{$cat}       ||= [], $code);
 133     $op->($Cat->{substr($cat, 0, 1)}
 134                             ||= [],  $code);
 135     # 005F: SPACING UNDERSCORE
 136     $op->($Cat->{Word}       ||= [], $code)
 137         if $cat =~ /^[LMN]/ || $code eq "005F";
 138     $op->($Cat->{Alnum}      ||= [], $code)
 139         if $cat =~ /^[LMN]/;
 140     $op->($Cat->{Alpha}      ||= [], $code)
 141         if $cat =~ /^[LM]/;
 142     # 0009: HORIZONTAL TABULATION
 143     # 000A: LINE FEED
 144     # 000B: VERTICAL TABULATION
 145     # 000C: FORM FEED
 146     # 000D: CARRIAGE RETURN
 147     # 0020: SPACE
 148     $op->($Cat->{Space}      ||= [], $code)
 149         if $cat  =~ /^Z/ ||
 150             $code =~ /^(0009|000A|000B|000C|000D)$/;
 151     $op->($Cat->{SpacePerl}  ||= [], $code)
 152         if $cat  =~ /^Z/ || $code =~ /^(?:0085|2028|2029)$/ ||
 153             $code =~ /^(0009|000A|000C|000D)$/;
 154     $op->($Cat->{Blank}      ||= [], $code)
 155         if $code =~ /^(0020|0009)$/ ||
 156             $cat  =~ /^Z[^lp]$/;
 157     $op->($Cat->{Digit}      ||= [], $code) if $cat eq "Nd";
 158     $op->($Cat->{Upper}      ||= [], $code) if $cat eq "Lu";
 159     $op->($Cat->{Lower}      ||= [], $code) if $cat eq "Ll";
 160     $op->($Cat->{Title}      ||= [], $code) if $cat eq "Lt";
 161     $op->($Cat->{ASCII}      ||= [], $code) if $code le "007F";
 162     $op->($Cat->{Cntrl}      ||= [], $code) if $cat =~ /^C/;
 163     $op->($Cat->{Graph}      ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
 164     $op->($Cat->{Print}      ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
 165     $op->($Cat->{Punct}      ||= [], $code) if $cat =~ /^P/;
 166     # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
 167     $op->($Cat->{XDigit}     ||= [], $code)
 168         if $code =~ /^00(3[0-9]|[46][1-6])$/;
 169
 170 }
 171
 172 if (open(my $Unicode, "Unicode.txt")) {
 173     my @Name;
 174     my @Bidi;
 175     my %Bidi;
 176     my @Comb;
 177     my @Deco;
 178     my %Deco;
 179     my %DC;
 180     my @Number;
 181     my @Mirrored;
 182     my %To;
 183
 184
 185     my $LastCodeInt = -1; # a numeric, not a hexadecimal string.
 186
 187     # UnicodeData-3.1.0.html says
 188     # no characters in the file have the property, Cn, Not Assigned.
 189
 190     sub check_no_characters { # in the scope of my $LastCodeInt;
 191         my $code = shift;
 192         my $diff_from_last = hex($code) - $LastCodeInt;
 193         my $code_ini = sprintf("%04X", $LastCodeInt + 1);
 194         $LastCodeInt = hex($code);
 195         if ($diff_from_last == 1) {
 196             return;
 197         } elsif ($diff_from_last == 2) {
 198             append($Cat{Cn}             ||= [], $code_ini);
 199             append($Cat{C}              ||= [], $code_ini);
 200         } else {
 201             my $code_fin = sprintf("%04X", hex($code) - 1);
 202             append_range($Cat{Cn}       ||= [], $code_ini, $code_fin);
 203             append_range($Cat{C}        ||= [], $code_ini, $code_fin);
 204         }
 205     }
 206
 207     while (<$Unicode>) {
 208         next unless /^[0-9A-Fa-f]+;/;
 209         s/\s+$//;
 210
 211         my ($code, $name, $cat, $comb, $bidi, $deco,
 212             $decimal, $digit, $number,
 213             $mirrored, $unicode10, $comment,
 214             $upper, $lower, $title) = split(/\s*;\s*/);
 215
 216         if ($name =~ /^<(.+), (First|Last)>$/) {
 217             if($2 eq 'First') {
 218                 check_no_characters($code);
 219             } else {
 220                 $LastCodeInt = hex($code);
 221             }
 222
 223             $name = $1;
 224             gencat(\@Name, \%General, \@General, \%Cat,
 225                    $name, $cat, $code,
 226                    $2 eq 'First' ? \&append : \&extend);
 227             unless (defined $In{$name}) {
 228                 $In{$name}   = $InId++;
 229                 $InIn{$name} = $General{$name};
 230             }
 231         } else {
 232             check_no_characters($code);
 233
 234             gencat(\@Name, \%General, \@General, \%Cat,
 235                    $name, $cat, $code, \&append);
 236
 237             # No append() here since since several codes may map into one.
 238             push @{$To{Upper}}, [ $code, $code, $upper ]     if $upper;
 239             push @{$To{Lower}}, [ $code, $code, $lower ]     if $lower;
 240             push @{$To{Title}}, [ $code, $code, $title ]     if $title;
 241
 242             append($To{Digit}       ||= [], $code, $decimal) if $decimal;
 243
 244             append(\@Bidi,                  $code, $bidi);
 245             append($Bidi{$bidi}     ||= [], $code);
 246
 247             append(\@Comb,                  $code, $comb) if $comb;
 248
 249             if ($deco) {
 250                 append(\@Deco,                  $code, $deco);
 251                 if ($deco =~/^<(\w+)>/) {
 252                     append($Deco{Compat} ||= [], $code);
 253                     append($DC{$1}       ||= [], $code);
 254                 } else {
 255                     append($Deco{Canon}  ||= [], $code);
 256                 }
 257             }
 258
 259             append(\@Number,                     $code, $number) if $number;
 260
 261             append(\@Mirrored,                   $code) if $mirrored eq "Y";
 262         }
 263     }
 264
 265     check_no_characters(sprintf("%X", $LastUnicodeCodepoint + 1));
 266
 267     flush(\@Name, "Name.pl");
 268
 269     foreach my $cat (sort keys %Cat) {
 270         flush($Cat{$cat}, "Is/$cat.pl");
 271     }
 272
 273     foreach my $to (sort keys %To) {
 274         flush($To{$to}, "To/$to.pl");
 275     }
 276
 277     flush(\@Bidi, "Bidirectional.pl");
 278     foreach my $bidi (sort keys %Bidi) {
 279         flush($Bidi{$bidi}, "Is/Bidi$bidi.pl");
 280     }
 281
 282     flush(\@Comb, "CombiningClass.pl");
 283
 284     flush(\@Deco, "Decomposition.pl");
 285     foreach my $deco (sort keys %Deco) {
 286         flush($Deco{$deco}, "Is/Deco$deco.pl");
 287     }
 288     foreach my $dc (sort keys %DC) {
 289         flush($DC{$dc}, "Is/DC$dc.pl");
 290     }
 291
 292     flush(\@Number, "Number.pl");
 293
 294     flush(\@Mirrored, "Is/Mirrored.pl");
 295 } else {
 296     die "$0: Unicode.txt: $!\n";
 297 }
 298
 299 #  The general cateory can be written out already now.
 300
 301 flush(\@General, "Category.pl");
 302
 303 #
 304 # Read in the LineBrk.txt.
 305 #
 306
 307 if (open(my $LineBrk, "LineBrk.txt")) {
 308     my @Lbrk;
 309     my %Lbrk;
 310
 311     while (<$LineBrk>) {
 312         next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
 313
 314         my ($first, $last, $lbrk) = ($1, $2, $3);
 315
 316         append(\@Lbrk,              $first, $lbrk);
 317         append($Lbrk{$lbrk} ||= [], $first);
 318         if (defined $last) {
 319             extend(\@Lbrk,          $last);
 320             extend($Lbrk{$lbrk},    $last);
 321         }
 322     }
 323
 324     flush(\@Lbrk, "Lbrk.pl");
 325     foreach my $lbrk (sort keys %Lbrk) {
 326         flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl");
 327     }
 328 } else {
 329     die "$0: LineBrk.txt: $!\n";
 330 }
 331
 332 #
 333 # Read in the ArabShap.txt.
 334 #
 335
 336 if (open(my $ArabShap, "ArabShap.txt")) {
 337     my @ArabLink;
 338     my @ArabLinkGroup;
 339
 340     while (<$ArabShap>) {
 341         next unless /^[0-9A-Fa-f]+;/;
 342         s/\s+$//;
 343
 344         my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/);
 345
 346         append(\@ArabLink,      $code, $link);
 347         append(\@ArabLinkGroup, $code, $linkgroup);
 348     }
 349
 350     flush(\@ArabLink,      "ArabLink.pl");
 351     flush(\@ArabLinkGroup, "ArabLnkGrp.pl");
 352 } else {
 353     die "$0: ArabShap.txt: $!\n";
 354 }
 355
 356 #
 357 # Read in the Jamo.txt.
 358 #
 359
 360 if (open(my $Jamo, "Jamo.txt")) {
 361     my @Short;
 362
 363     while (<$Jamo>) {
 364         next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/;
 365
 366         my ($code, $short) = ($1, $2);
 367
 368         append(\@Short, $code, $short);
 369     }
 370
 371     flush(\@Short, "JamoShort.pl");
 372 } else {
 373     die "$0: Jamo.txt: $!\n";
 374 }
 375
 376 #
 377 # Read in the Scripts.txt.
 378 #
 379
 380 my @Scripts;
 381
 382 if (open(my $Scripts, "Scripts.txt")) {
 383     while (<$Scripts>) {
 384         next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
 385
 386         # Wait until all the scripts have been read since
 387         # they are not listed in numeric order.
 388         push @Scripts, [ hex($1), $1, $2, $3 ];
 389     }
 390 } else {
 391     die "$0: Scripts.txt: $!\n";
 392 }
 393
 394 # Now append the scripts properties in their code point order.
 395
 396 my %Script;
 397 my $Scripts = [];
 398
 399 for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
 400     my ($code, $first, $last, $name) = @$script;
 401     append($Scripts,              $first, $name);
 402     append($Script{$name} ||= [], $first, $name);
 403     if (defined $last) {
 404         extend($Scripts,       $last);
 405         extend($Script{$name}, $last);
 406     }
 407     unless (defined $In{$name}) {
 408         $InScript{$InId} = $name;
 409         $In{$name}       = $InId++;
 410         $InIn{$name}     = $Script{$name};
 411     }
 412 }
 413
 414 # Scripts.pl can be written out already now.
 415
 416 flush(\@Scripts, "Scripts.pl");
 417
 418 # Common is everything not explicitly assigned to a Script
 419
 420 $In{Common} = $InId++;
 421 my $Common = inverse($Scripts);
 422 $InIn{Common} = $Common;
 423
 424 #
 425 # Read in the Blocks.txt.
 426 #
 427
 428 my @Blocks;
 429 my %Blocks;
 430
 431 if (open(my $Blocks, "Blocks.txt")) {
 432     while (<$Blocks>) {
 433         next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
 434
 435         my ($first, $last, $name) = ($1, $2, $3);
 436         my $origname = $name;
 437
 438         # If there's a naming conflict (the script names are
 439         # in uppercase), the name of the block has " Block"
 440         # appended to it.
 441         my $pat = $name;
 442         $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
 443         for my $i (values %InScript) {
 444             if ($i =~ /^$pat$/i) {
 445                 $name .= " Block";
 446                 last;
 447             }
 448         }
 449
 450         append(\@Blocks,              $first, $name);
 451         append($Blocks{$name} ||= [], $first, $name);
 452         if (defined $last) {
 453             extend(\@Blocks,       $last);
 454             extend($Blocks{$name}, $last);
 455         }
 456         unless (defined $In{$name}) {
 457             $InBlock{$InId} = $origname;
 458             $In{$name}      = $InId++;
 459             $InIn{$name}    = $Blocks{$name};
 460         }
 461     }
 462 } else {
 463     die "$0: Blocks.txt: $!\n";
 464 }
 465
 466 # Blocks.pl can be written out already now.
 467
 468 flush(\@Blocks, "Blocks.pl");
 469
 470 #
 471 # Read in the PropList.txt.  It contains extended properties not
 472 # listed in the Unicode.txt, such as 'Other_Alphabetic':
 473 # alphabetic but not of the general category L; many modifiers
 474 # belong to this extended property category: while they are not
 475 # alphabets, they are alphabetic in nature.
 476 #
 477
 478 my @Props;
 479
 480 if (open(my $Props, "PropList.txt")) {
 481     while (<$Props>) {
 482         next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
 483
 484         # Wait until all the extended properties have been read since
 485         # they are not listed in numeric order.
 486         push @Props, [ hex($1), $1, $2, $3 ];
 487     }
 488 } else {
 489     die "$0: PropList.txt: $!\n";
 490 }
 491
 492 # Now append the extended properties in their code point order.
 493
 494 my %Prop;
 495 my $Props = [];
 496
 497 for my $prop (sort { $a->[0] <=> $b->[0] } @Props) {
 498     my ($code, $first, $last, $name) = @$prop;
 499     append($Props,              $first, $name);
 500     append($Prop{$name} ||= [], $first, $name);
 501     if (defined $last) {
 502         extend($Props,       $last);
 503         extend($Prop{$name}, $last);
 504     }
 505     unless (defined $In{$name}) {
 506         $In{$name}   = $InId++;
 507         $InIn{$name} = $Prop{$name};
 508     }
 509 }
 510
 511 # Assigned is everything not Cn
 512
 513 $In{Assigned} = $InId++;
 514 my $Assigned = inverse($Cat{Cn});
 515 $InIn{Assigned} = $Assigned;
 516
 517 # Unassigned is everything not Assigned
 518
 519 $In{Unassigned} = $InId++;
 520 my $Unassigned = $Cat{Cn};
 521 $InIn{Unassigned} = $Unassigned;
 522
 523 # Unassigned is everything not Assigned
 524 sub merge_general_and_extended {
 525     my ($name, $general, $extended) = @_;
 526     my $merged;
 527
 528     push @$merged,
 529          map { pop @{$_}; $_ }
 530              sort { $a->[2] <=> $b->[2] }
 531                   map { [ $_->[0], $_->[1], hex($_->[0]) ] }
 532                       ($general ?
 533                          map { ref $_ ? @$_ : $_ }
 534                              @Cat {ref $general  ? @$general  : $general } :
 535                          (),
 536                        $extended ?
 537                          map { ref $_ ? @$_ : $_ }
 538                              @Prop{ref $extended ? @$extended : $extended} :
 539                          ());
 540
 541     $In{$name}   = $InId++;
 542     $InIn{$name} = $merged;
 543
 544     return $merged;
 545 }
 546
 547 # Alphabetic is L and Other_Alphabetic.
 548
 549 my $Alphabetic =
 550     merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic');
 551
 552 # Lowercase is Ll and Other_Lowercase.
 553
 554 my $Lowercase =
 555     merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase');
 556
 557 # Uppercase is Lu and Other_Uppercase.
 558
 559 my $Uppercase =
 560     merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase');
 561
 562 # Math is Sm and Other_Math.
 563
 564 my $Math =
 565     merge_general_and_extended('Math', 'Sm', 'Other_Math');
 566
 567 # Lampersand is Ll, Lu, and Lt.
 568
 569 my $Lampersand =
 570     merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]);
 571
 572 # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
 573
 574 my $ID_Start =
 575     merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]);
 576
 577 # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
 578
 579 my $ID_Continue =
 580     merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl
 581                                                    Mn Mc Nd Pc) ]);
 582
 583 #
 584 # Any is any.
 585 #
 586
 587 $In{Any} = $InId++;
 588 my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ];
 589 $InIn{Any} = $Any;
 590
 591 #
 592 # All is any, too.
 593 #
 594
 595 $In{All} = $InId++;
 596 $InIn{All} = $Any;
 597
 598 #
 599 # mapping() will be used to write out the In and Is virtual mappings.
 600 #
 601
 602 sub mapping {
 603     my ($map, $name) = @_;
 604
 605     if (open(my $fh, ">$name.pl")) {
 606         print "$name.pl\n";
 607         header($fh);
 608
 609         # The %pat will hold a hash that maps the first two
 610         # lowercased letters of a class to a 'fuzzified' regular
 611         # expression that points to the real mapping.
 612
 613         my %pat;
 614
 615         # But first write out the offical name to real name
 616         # (the filename) mapping.
 617
 618         print $fh <<EOT;
 619 %utf8::${name} =
 620 (
 621 EOT
 622         for my $i (sort { lc $a cmp lc $b } keys %$map) {
 623             my $pat = $i;
 624             # Here is the 'fuzzification': accept any space,
 625             # dash, or underbar where in the official name
 626             # there is space or a dash (or underbar, but
 627             # there never is).
 628             $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
 629             # The prefix length of 2 is enough spread,
 630             # and besides, we have 'Yi' as an In category.
 631             push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
 632             printf $fh "%-45s => '$map->{$i}',\n", "'$i'";
 633         }
 634         print $fh <<EOT;
 635 );
 636 EOT
 637
 638         # Now write out the %pat mapping.
 639
 640         print $fh <<EOT;
 641 %utf8::${name}Pat =
 642 (
 643 EOT
 644         foreach my $prefix (sort keys %pat) {
 645             print $fh "'$prefix' => {\n";
 646             foreach my $ipat (@{$pat{$prefix}}) {
 647                 my ($i, $pat) = @$ipat;
 648                 print $fh "\t'$pat' => '$map->{$i}',\n";
 649             }
 650             print $fh "},\n";
 651         }
 652         print $fh <<EOT;
 653 );
 654 EOT
 655
 656         close($fh);
 657     } else {
 658         die "$0: $name.pl: $!\n";
 659     }
 660 }
 661
 662 #
 663 # Write out the virtual In mappings.
 664 #
 665
 666 mapping(\%In, "In");
 667
 668 #
 669 # Append the InScript and InBlock mappings.
 670 # These are needed only if Script= and Block= syntaxes are used.
 671 #
 672
 673 if (open(my $In, ">>In.pl")) {
 674     print $In <<EOT;
 675
 676 %utf8::InScript =
 677 (
 678 EOT
 679     for my $i (sort { $a <=> $b } keys %InScript) {
 680         printf $In "%4d => '$InScript{$i}',\n", $i;
 681     }
 682     print $In <<EOT;
 683 );
 684 EOT
 685
 686     print $In <<EOT;
 687
 688 %utf8::InBlock =
 689 (
 690 EOT
 691     for my $i (sort { $a <=> $b } keys %InBlock) {
 692         printf $In "%4d => '$InBlock{$i}',\n", $i;
 693     }
 694     print $In <<EOT;
 695 );
 696 EOT
 697 } else {
 698     die "$0: In.pl: $!\n";
 699 }
 700
 701 #
 702 # Write out the real In mappings
 703 # (the In.pl written out just above has the virtual In mappings)
 704 #
 705
 706 foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) {
 707     flush($InIn{$in}, "In/$In{$in}.pl");
 708 }
 709
 710 #
 711 # The mapping from General Category long forms to short forms is
 712 # currently hardwired here since no simple data file in the UCD
 713 # seems to do that.  Unicode 3.2 will assumedly correct this.
 714 #
 715
 716 my %Is = (
 717         'Letter'                        =>      'L',
 718         'Uppercase_Letter'              =>      'Lu',
 719         'Lowercase_Letter'              =>      'Ll',
 720         'Titlecase_Letter'              =>      'Lt',
 721         'Modifier_Letter'               =>      'Lm',
 722         'Other_Letter'                  =>      'Lo',
 723
 724         'Mark'                          =>      'M',
 725         'Non_Spacing_Mark'              =>      'Mn',
 726         'Spacing_Mark'                  =>      'Mc',
 727         'Enclosing_Mark'                =>      'Me',
 728
 729         'Separator'                     =>      'Z',
 730         'Space_Separator'               =>      'Zs',
 731         'Line_Separator'                =>      'Zl',
 732         'Paragraph_Separator'           =>      'Zp',
 733
 734         'Number'                        =>      'N',
 735         'Decimal_Number'                =>      'Nd',
 736         'Letter_Number'                 =>      'Nl',
 737         'Other_Number'                  =>      'No',
 738
 739         'Punctuation'                   =>      'P',
 740         'Connector_Punctuation'         =>      'Pc',
 741         'Dash_Punctuation'              =>      'Pd',
 742         'Open_Punctuation'              =>      'Ps',
 743         'Close_Punctuation'             =>      'Pe',
 744         'Initial_Punctuation'           =>      'Pi',
 745         'Final_Punctuation'             =>      'Pf',
 746         'Other_Punctuation'             =>      'Po',
 747
 748         'Symbol'                        =>      'S',
 749         'Math_Symbol'                   =>      'Sm',
 750         'Currency_Symbol'               =>      'Sc',
 751         'Modifier_Symbol'               =>      'Sk',
 752         'Other_Symbol'                  =>      'So',
 753
 754         'Other'                         =>      'C',
 755         'Control'                       =>      'Cc',
 756         'Format'                        =>      'Cf',
 757         'Surrogate'                     =>      'Cs',
 758         'Private Use'                   =>      'Co',
 759         'Unassigned'                    =>      'Cn',
 760 );
 761
 762 #
 763 # Write out the virtual Is mappings.
 764 #
 765
 766 mapping(\%Is, "Is");
 767
 768 #
 769 # Read in the special cases.
 770 #
 771
 772 my %Case;
 773
 774 if (open(my $SpecCase, "SpecCase.txt")) {
 775     while (<$SpecCase>) {
 776         next unless /^[0-9A-Fa-f]+;/;
 777         s/\#.*//;
 778         s/\s+$//;
 779
 780         my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/);
 781
 782         if ($condition) { # not implemented yet
 783             print "# SKIPPING $_\n";
 784             next;
 785         }
 786
 787         # Wait until all the special cases have been read since
 788         # they are not listed in numeric order.
 789         my $ix = hex($code);
 790         push @{$Case{Lower}}, [ $ix, $code, $lower ];
 791         push @{$Case{Title}}, [ $ix, $code, $title ];
 792         push @{$Case{Upper}}, [ $ix, $code, $upper ];
 793     }
 794 } else {
 795     die "$0: SpecCase.txt: $!\n";
 796 }
 797
 798 # Now write out the special cases properties in their code point order.
 799 # Prepend them to the To/{Upper,Lower,Title}.pl.
 800
 801 for my $case (qw(Lower Title Upper)) {
 802     my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n";
 803     if (open(my $Case, ">To/$case.pl")) {
 804         header($Case);
 805         print $Case <<EOT;
 806
 807 %utf8::ToSpec$case = (
 808 EOT
 809         for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
 810             my ($ix, $code, $to) = @$prop;
 811             my $tostr =
 812                 join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
 813             printf $Case qq['%04X' => "$tostr",\n], $ix;
 814         }
 815         print $Case <<EOT;
 816 );
 817
 818 EOT
 819         begin($Case);
 820         print $Case $NormalCase;
 821         end($Case);
 822     } else {
 823         die "$0: To/$case.txt: $!\n";
 824     }
 825 }
 826
 827 #
 828 # Read in the case foldings.
 829 #
 830 # We will do full case folding, C + F + I (see CaseFold.txt).
 831 #
 832
 833 if (open(my $CaseFold, "CaseFold.txt")) {
 834     my @Fold;
 835     my %Fold;
 836
 837     while (<$CaseFold>) {
 838         # Skip status 'S', simple case folding
 839         next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
 840
 841         my ($code, $status, $fold) = ($1, $2, $3);
 842
 843         if ($status eq 'C') { # Common: one-to-one folding
 844             # No append() since several codes may fold into one.
 845             push @Fold, [ $code, $code, $fold ];
 846         } else { # F: full, or I: dotted uppercase I -> dotless lowercase I
 847             $Fold{hex($code)} = $fold;
 848         }
 849     }
 850
 851     flush(\@Fold, "To/Fold.pl");
 852
 853     #
 854     # Prepend the special foldings to the common foldings.
 855     #
 856
 857     my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
 858     if (open(my $Fold, ">To/Fold.pl")) {
 859         header($Fold);
 860         print $Fold <<EOT;
 861
 862 %utf8::ToSpecFold = (
 863 EOT
 864         for my $code (sort { $a <=> $b } keys %Fold) {
 865             my $foldstr =
 866                 join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
 867             printf $Fold qq['%04X' => "$foldstr",\n], $code;
 868         }
 869         print $Fold <<EOT;
 870 );
 871
 872 EOT
 873         begin($Fold);
 874         print $Fold $CommonFold;
 875         end($Fold);
 876     } else {
 877         die "$0: To/Fold.pl: $!\n";
 878     }
 879 } else {
 880     die "$0: CaseFold.txt: $!\n";
 881 }
 882
 883 # That's all, folks!
 884