lib/unicode/mktables.PL

   1 #!../../miniperl
   2
   3 # Create the equivalence mappings.
   4
   5 $UnicodeData = "UnicodeData-Latest.txt";
   6
   7 # Note: we try to keep filenames unique within first 8 chars.  Using
   8 # subdirectories for the following helps.
   9 mkdir "In", 0777;
  10 mkdir "Is", 0777;
  11 mkdir "To", 0777;
  12 mkdir "Eq", 0777;
  13
  14 @todo = (
  15 # typical
  16
  17     ['IsWord',  '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"',     ''],
  18     ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/',        ''],
  19     ['IsAlpha',  '$cat =~ /^L[ulo]/',   ''],
  20     ['IsSpace',  '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
  21     ['IsDigit',  '$cat =~ /^Nd$/',      ''],
  22     ['IsUpper',  '$cat =~ /^Lu$/',      ''],
  23     ['IsLower',  '$cat =~ /^Ll$/',      ''],
  24     ['IsASCII',  'hex $code <= 127',    ''],
  25     ['IsCntrl',  '$cat =~ /^C/',        ''],
  26     ['IsGraph',  '$cat =~ /^[^C]/ and $code ne "0020"', ''],
  27     ['IsPrint',  '$cat =~ /^[^C]/',     ''],
  28     ['IsPunct',  '$cat =~ /^P/',        ''],
  29     ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/',   ''],
  30     ['ToUpper',  '$up',                 '$up'],
  31     ['ToLower',  '$down',               '$down'],
  32     ['ToTitle',  '$title',              '$title'],
  33     ['ToDigit',  '$dec ne ""',          '$dec'],
  34
  35 # Name
  36
  37     ['Name',    '$name',                '$name'],
  38
  39 # Category
  40
  41     ['Category', '$cat',                '$cat'],
  42
  43 # Normative
  44
  45     ['IsM',     '$cat =~ /^M/',         ''],    # Mark
  46     ['IsMn',    '$cat eq "Mn"',         ''],    # Mark, Non-Spacing
  47     ['IsMc',    '$cat eq "Mc"',         ''],    # Mark, Combining
  48
  49     ['IsN',     '$cat =~ /^N/',         ''],    # Number
  50     ['IsNd',    '$cat eq "Nd"',         ''],    # Number, Decimal Digit
  51     ['IsNo',    '$cat eq "No"',         ''],    # Number, Other
  52
  53     ['IsZ',     '$cat =~ /^Z/',         ''],    # Zeparator
  54     ['IsZs',    '$cat eq "Zs"',         ''],    # Separator, Space
  55     ['IsZl',    '$cat eq "Zl"',         ''],    # Separator, Line
  56     ['IsZp',    '$cat eq "Zp"',         ''],    # Separator, Paragraph
  57
  58     ['IsC',     '$cat =~ /^C/',         ''],    # Crazy
  59     ['IsCc',    '$cat eq "Cc"',         ''],    # Other, Control or Format
  60     ['IsCo',    '$cat eq "Co"',         ''],    # Other, Private Use
  61     ['IsCn',    '$cat eq "Cn"',         ''],    # Other, Not Assigned
  62
  63 # Informative
  64
  65     ['IsL',     '$cat =~ /^L/',         ''],    # Letter
  66     ['IsLu',    '$cat eq "Lu"',         ''],    # Letter, Uppercase
  67     ['IsLl',    '$cat eq "Ll"',         ''],    # Letter, Lowercase
  68     ['IsLt',    '$cat eq "Lt"',         ''],    # Letter, Titlecase
  69     ['IsLm',    '$cat eq "Lm"',         ''],    # Letter, Modifier
  70     ['IsLo',    '$cat eq "Lo"',         ''],    # Letter, Other
  71
  72     ['IsP',     '$cat =~ /^P/',         ''],    # Punctuation
  73     ['IsPd',    '$cat eq "Pd"',         ''],    # Punctuation, Dash
  74     ['IsPs',    '$cat eq "Ps"',         ''],    # Punctuation, Open
  75     ['IsPe',    '$cat eq "Pe"',         ''],    # Punctuation, Close
  76     ['IsPo',    '$cat eq "Po"',         ''],    # Punctuation, Other
  77
  78     ['IsS',     '$cat =~ /^S/',         ''],    # Symbol
  79     ['IsSm',    '$cat eq "Sm"',         ''],    # Symbol, Math
  80     ['IsSc',    '$cat eq "Sc"',         ''],    # Symbol, Currency
  81     ['IsSo',    '$cat eq "So"',         ''],    # Symbol, Other
  82
  83 # Combining class
  84     ['CombiningClass', '$comb',         '$comb'],
  85
  86 # BIDIRECTIONAL PROPERTIES
  87
  88     ['Bidirectional', '$bid',           '$bid'],
  89
  90 # Strong types:
  91
  92     ['IsBidiL', '$bid eq "L"',          ''],    # Left-Right; Most alphabetic,
  93                                                 # syllabic, and logographic
  94                                                 # characters (e.g., CJK
  95                                                 # ideographs)
  96     ['IsBidiR', '$bid eq "R"',          ''],    # Right-Left; Arabic, Hebrew,
  97                                                 # and punctuation specific to
  98                                                 # those scripts
  99
 100 # Weak types:
 101
 102     ['IsBidiEN','$bid eq "EN"',         ''],    # European Number
 103     ['IsBidiES','$bid eq "ES"',         ''],    # European Number Separator
 104     ['IsBidiET','$bid eq "ET"',         ''],    # European Number Terminator
 105     ['IsBidiAN','$bid eq "AN"',         ''],    # Arabic Number
 106     ['IsBidiCS','$bid eq "CS"',         ''],    # Common Number Separator
 107
 108 # Separators:
 109
 110     ['IsBidiB', '$bid eq "B"',          ''],    # Block Separator
 111     ['IsBidiS', '$bid eq "S"',          ''],    # Segment Separator
 112
 113 # Neutrals:
 114
 115     ['IsBidiWS','$bid eq "WS"',         ''],    # Whitespace
 116     ['IsBidiON','$bid eq "ON"',         ''],    # Other Neutrals ; All other
 117                                                 # characters: punctuation,
 118                                                 # symbols
 119
 120 # Decomposition
 121
 122     ['Decomposition',   '$decomp',      '$decomp'],
 123     ['IsDecoCanon',     '$decomp && $decomp !~ /^</',   ''],
 124     ['IsDecoCompat',    '$decomp =~ /^</',              ''],
 125     ['IsDCfont',        '$decomp =~ /^<font>/',         ''],
 126     ['IsDCnoBreak',     '$decomp =~ /^<noBreak>/',      ''],
 127     ['IsDCinitial',     '$decomp =~ /^<initial>/',      ''],
 128     ['IsDCinital',      '$decomp =~ /^<medial>/',       ''],
 129     ['IsDCfinal',       '$decomp =~ /^<final>/',        ''],
 130     ['IsDCisolated',    '$decomp =~ /^<isolated>/',     ''],
 131     ['IsDCcircle',      '$decomp =~ /^<circle>/',       ''],
 132     ['IsDCsuper',       '$decomp =~ /^<super>/',        ''],
 133     ['IsDCsub',         '$decomp =~ /^<sub>/',          ''],
 134     ['IsDCvertical',    '$decomp =~ /^<vertical>/',     ''],
 135     ['IsDCwide',        '$decomp =~ /^<wide>/',         ''],
 136     ['IsDCnarrow',      '$decomp =~ /^<narrow>/',       ''],
 137     ['IsDCsmall',       '$decomp =~ /^<small>/',        ''],
 138     ['IsDCsquare',      '$decomp =~ /^<square>/',       ''],
 139     ['IsDCcompat',      '$decomp =~ /^<compat>/',       ''],
 140
 141 # Number
 142
 143     ['Number',  '$num',                 '$num'],
 144
 145 # Mirrored
 146
 147     ['IsMirrored', '$mir eq "Y"',       ''],
 148
 149 # Arabic
 150
 151     ['ArabLink',        '1',            '$link'],
 152     ['ArabLnkGrp',      '1',            '$linkgroup'],
 153
 154 # Jamo
 155
 156     ['JamoShort',       '1',            '$short'],
 157 );
 158
 159 # This is not written for speed...
 160
 161 foreach $file (@todo) {
 162     my ($table, $wanted, $val) = @$file;
 163     next if @ARGV and not grep { $_ eq $table } @ARGV;
 164     print $table,"\n";
 165     if ($table =~ /^(Is|In|To)(.*)/) {
 166         open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
 167     }
 168     else {
 169         open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
 170     }
 171     print OUT <<"END";
 172 return <<'END';
 173 END
 174     print OUT proplist($table, $wanted, $val);
 175     print OUT "END\n";
 176     close OUT;
 177 }
 178
 179 # Must treat blocks specially.
 180
 181 exit if @ARGV and not grep { $_ eq Block } @ARGV;
 182 print "Block\n";
 183 open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n";
 184 open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
 185 print OUT <<"END";
 186 return <<'END';
 187 END
 188
 189 while (<UD>) {
 190     next if /^#/;
 191     next if /^$/;
 192     chomp;
 193     ($code, $last, $name) = split(/; */);
 194     if ($name) {
 195         print OUT "$code        $last   $name\n";
 196         $name =~ s/\s+//g;
 197         open(BLOCK, ">In/$name.pl");
 198         print BLOCK <<"END2";
 199 return <<'END';
 200 $code   $last
 201 END
 202 END2
 203         close BLOCK;
 204     }
 205 }
 206
 207 print OUT "END\n";
 208 close OUT;
 209
 210 ##################################################
 211
 212 sub proplist {
 213     my ($table, $wanted, $val) = @_;
 214     my @wanted;
 215     my $out;
 216     my $split;
 217
 218     if ($table =~ /^Arab/) {
 219         open(UD, "arabshp.txt") or warn "Can't open $table: $!";
 220
 221         $split = '($code, $name, $link, $linkgroup) = split(/; */);';
 222     }
 223     elsif ($table =~ /^Jamo/) {
 224         open(UD, "jamo2.txt") or warn "Can't open $table: $!";
 225
 226         $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
 227     }
 228     else {
 229         open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
 230
 231         $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
 232                 $comment, $up, $down, $title) = split(/;/);';
 233     }
 234
 235     if ($table =~ /^(?:To|Is)[A-Z]/) {
 236         eval <<"END";
 237             while (<UD>) {
 238                 next if /^#/;
 239                 next if /^\s/;
 240                 chop;
 241                 $split
 242                 if ($wanted) {
 243                     push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
 244                 }
 245             }
 246 END
 247         die $@ if $@;
 248
 249         while (@wanted) {
 250             $beg = shift @wanted;
 251             $last = $beg;
 252             while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
 253                 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
 254                     $last = shift @wanted;
 255             }
 256             $out .= sprintf "%04x", $beg->[0];
 257             if ($beg->[2]) {
 258                 $last = shift @wanted;
 259             }
 260             if ($beg == $last) {
 261                 $out .= "\t";
 262             }
 263             else {
 264                 $out .= sprintf "\t%04x", $last->[0];
 265             }
 266             $out .= sprintf "\t%04x", $beg->[1] if $val;
 267             $out .= "\n";
 268         }
 269     }
 270     else {
 271         eval <<"END";
 272             while (<UD>) {
 273                 next if /^#/;
 274                 next if /^\s*\$/;
 275                 chop;
 276                 $split
 277                 if ($wanted) {
 278                     push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
 279                 }
 280             }
 281 END
 282         die $@ if $@;
 283
 284         while (@wanted) {
 285             $beg = shift @wanted;
 286             $last = $beg;
 287             while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
 288                 ($wanted[0]->[1] eq $last->[1])) {
 289                     $last = shift @wanted;
 290             }
 291             $out .= sprintf "%04x", $beg->[0];
 292             if ($beg->[2]) {
 293                 $last = shift @wanted;
 294             }
 295             if ($beg == $last) {
 296                 $out .= "\t";
 297             }
 298             else {
 299                 $out .= sprintf "\t%04x", $last->[0];
 300             }
 301             $out .= sprintf "\t%s\n", $beg->[1];
 302         }
 303     }
 304     $out;
 305 }
 306
 307 open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
 308
 309 while (<UNICODEDATA>) {
 310     ($code, $name) = split /;/;
 311
 312     $code{$name} = $code;
 313     $name{$code} = $name;
 314
 315     if ($name =~ /^((?:LATIN|GREEK|CYRILLIC|HEBREW|BENGALI) .+? LETTER .+?) WITH /) {
 316         push @base, [ $code, $1 ];
 317     } elsif ($name =~ /^(ARABIC LETTER \w+?) WITH .+ (\w+ FORM)$/) {
 318         push @base, [ $code, "$1 $2" ];
 319     } elsif ($name =~ /^(ARABIC LETTER \w+?) WITH /) {
 320         push @base, [ $code, $1 ];
 321 # Is the concept of turning ligatures into character classes sound?
 322     } elsif ($name =~ /^(ARABIC) LIGATURE (.+?) (WITH .+ )+(\w+ FORM)$/) {
 323         my $script = $1;
 324         my $base   = $2;
 325         my $with   = $3;
 326         my $form   = $4;
 327         push @base, [ $code, "$script LETTER $base" ];
 328         push @base, [ $code, "$script LETTER $base $form" ];
 329         my @with = split(/\bWITH\s+/, $with);
 330         shift @with;
 331         @with = grep { ! /^ (?:ABOVE|BELOW)/ } @with;
 332         foreach my $base (@with) {
 333             push @base, [ $code, "$script LETTER $base" ];
 334             push @base, [ $code, "$script LETTER $base $form" ];
 335         }
 336     } elsif ($name =~ /^((?:ARMENIAN|CYRILLIC) .+) LIGATURE (\w+) (\w+)$/) {
 337         push @base, [ $code, "$1 LETTER $2" ];
 338         push @base, [ $code, "$1 LETTER $3" ];
 339 # Latin ligatures (ae, oe, ij, ff, fi, fl, ffi, ffl, long st, st) ignored.
 340 # Hebrew Yiddish ligatures (double vav, vav yod, double yod, yod yod patah,
 341 # alef lamed) ignored.
 342     } else {
 343         next;
 344     }
 345
 346 }
 347
 348 foreach my $b (@base) {
 349     ($code, $base) = @$b;
 350     next unless exists $code{$base};
 351     push @{$unicode{$code{$base}}}, $code;
 352     print "$code: $name{$code} -> $base\n",
 353 }
 354
 355 @unicode = sort keys %unicode;
 356
 357 if (open(EQ_UNICODE, ">Eq/Unicode")) {
 358     foreach my $c (@unicode) {
 359         print EQ_UNICODE "$c @{$unicode{$c}}\n";
 360     }
 361     close EQ_UNICODE;
 362 } else {
 363     die "$0: failed to open Eq/Unicode for writing: $!\n";
 364 }
 365
 366 if (open(EQ_LATIN1, ">Eq/Latin1")) {
 367     foreach my $c (@unicode) {
 368         last if hex($c) > 255;
 369         my @c = grep { hex($_) <= 255 } @{$unicode{$c}};
 370         next unless @c;
 371         print EQ_LATIN1 "$c @c\n";
 372     }
 373     close EQ_LATIN1;
 374 } else {
 375     die "$0: failed to open Eq/Latin1 for writing: $!\n";
 376 }
 377