3 # Create the equivalence mappings.
5 $UnicodeData = "UnicodeData-Latest.txt";
7 # Note: we try to keep filenames unique within first 8 chars. Using
8 # subdirectories for the following helps.
17 ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
18 ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
19 ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
20 ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
21 ['IsDigit', '$cat =~ /^Nd$/', ''],
22 ['IsUpper', '$cat =~ /^Lu$/', ''],
23 ['IsLower', '$cat =~ /^Ll$/', ''],
24 ['IsASCII', 'hex $code <= 127', ''],
25 ['IsCntrl', '$cat =~ /^C/', ''],
26 ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
27 ['IsPrint', '$cat =~ /^[^C]/', ''],
28 ['IsPunct', '$cat =~ /^P/', ''],
29 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
30 ['ToUpper', '$up', '$up'],
31 ['ToLower', '$down', '$down'],
32 ['ToTitle', '$title', '$title'],
33 ['ToDigit', '$dec ne ""', '$dec'],
37 ['Name', '$name', '$name'],
41 ['Category', '$cat', '$cat'],
45 ['IsM', '$cat =~ /^M/', ''], # Mark
46 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
47 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
49 ['IsN', '$cat =~ /^N/', ''], # Number
50 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
51 ['IsNo', '$cat eq "No"', ''], # Number, Other
53 ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
54 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
55 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
56 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
58 ['IsC', '$cat =~ /^C/', ''], # Crazy
59 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
60 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
61 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
65 ['IsL', '$cat =~ /^L/', ''], # Letter
66 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
67 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
68 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
69 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
70 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
72 ['IsP', '$cat =~ /^P/', ''], # Punctuation
73 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
74 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
75 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
76 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
78 ['IsS', '$cat =~ /^S/', ''], # Symbol
79 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
80 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
81 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
84 ['CombiningClass', '$comb', '$comb'],
86 # BIDIRECTIONAL PROPERTIES
88 ['Bidirectional', '$bid', '$bid'],
92 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
93 # syllabic, and logographic
94 # characters (e.g., CJK
96 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
97 # and punctuation specific to
102 ['IsBidiEN','$bid eq "EN"', ''], # European Number
103 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
104 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
105 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
106 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
110 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
111 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
115 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
116 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
117 # characters: punctuation,
122 ['Decomposition', '$decomp', '$decomp'],
123 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
124 ['IsDecoCompat', '$decomp =~ /^</', ''],
125 ['IsDCfont', '$decomp =~ /^<font>/', ''],
126 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
127 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
128 ['IsDCinital', '$decomp =~ /^<medial>/', ''],
129 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
130 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
131 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
132 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
133 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
134 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
135 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
136 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
137 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
138 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
139 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
143 ['Number', '$num', '$num'],
147 ['IsMirrored', '$mir eq "Y"', ''],
151 ['ArabLink', '1', '$link'],
152 ['ArabLnkGrp', '1', '$linkgroup'],
156 ['JamoShort', '1', '$short'],
159 # This is not written for speed...
161 foreach $file (@todo) {
162 my ($table, $wanted, $val) = @$file;
163 next if @ARGV and not grep { $_ eq $table } @ARGV;
165 if ($table =~ /^(Is|In|To)(.*)/) {
166 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
169 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
174 print OUT proplist($table, $wanted, $val);
179 # Must treat blocks specially.
181 exit if @ARGV and not grep { $_ eq Block } @ARGV;
183 open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n";
184 open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
193 ($code, $last, $name) = split(/; */);
195 print OUT "$code $last $name\n";
197 open(BLOCK, ">In/$name.pl");
198 print BLOCK <<"END2";
210 ##################################################
213 my ($table, $wanted, $val) = @_;
218 if ($table =~ /^Arab/) {
219 open(UD, "arabshp.txt") or warn "Can't open $table: $!";
221 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
223 elsif ($table =~ /^Jamo/) {
224 open(UD, "jamo2.txt") or warn "Can't open $table: $!";
226 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
229 open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
231 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
232 $comment, $up, $down, $title) = split(/;/);';
235 if ($table =~ /^(?:To|Is)[A-Z]/) {
243 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
250 $beg = shift @wanted;
252 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
253 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
254 $last = shift @wanted;
256 $out .= sprintf "%04x", $beg->[0];
258 $last = shift @wanted;
264 $out .= sprintf "\t%04x", $last->[0];
266 $out .= sprintf "\t%04x", $beg->[1] if $val;
278 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
285 $beg = shift @wanted;
287 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
288 ($wanted[0]->[1] eq $last->[1])) {
289 $last = shift @wanted;
291 $out .= sprintf "%04x", $beg->[0];
293 $last = shift @wanted;
299 $out .= sprintf "\t%04x", $last->[0];
301 $out .= sprintf "\t%s\n", $beg->[1];
307 open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
309 while (<UNICODEDATA>) {
310 ($code, $name) = split /;/;
312 $code{$name} = $code;
313 $name{$code} = $name;
315 if ($name =~ /^((?:LATIN|GREEK|CYRILLIC|HEBREW|BENGALI) .+? LETTER .+?) WITH /) {
316 push @base, [ $code, $1 ];
317 } elsif ($name =~ /^(ARABIC LETTER \w+?) WITH .+ (\w+ FORM)$/) {
318 push @base, [ $code, "$1 $2" ];
319 } elsif ($name =~ /^(ARABIC LETTER \w+?) WITH /) {
320 push @base, [ $code, $1 ];
321 # Is the concept of turning ligatures into character classes sound?
322 } elsif ($name =~ /^(ARABIC) LIGATURE (.+?) (WITH .+ )+(\w+ FORM)$/) {
327 push @base, [ $code, "$script LETTER $base" ];
328 push @base, [ $code, "$script LETTER $base $form" ];
329 my @with = split(/\bWITH\s+/, $with);
331 @with = grep { ! /^ (?:ABOVE|BELOW)/ } @with;
332 foreach my $base (@with) {
333 push @base, [ $code, "$script LETTER $base" ];
334 push @base, [ $code, "$script LETTER $base $form" ];
336 } elsif ($name =~ /^((?:ARMENIAN|CYRILLIC) .+) LIGATURE (\w+) (\w+)$/) {
337 push @base, [ $code, "$1 LETTER $2" ];
338 push @base, [ $code, "$1 LETTER $3" ];
339 # Latin ligatures (ae, oe, ij, ff, fi, fl, ffi, ffl, long st, st) ignored.
340 # Hebrew Yiddish ligatures (double vav, vav yod, double yod, yod yod patah,
341 # alef lamed) ignored.
348 foreach my $b (@base) {
349 ($code, $base) = @$b;
350 next unless exists $code{$base};
351 push @{$unicode{$code{$base}}}, $code;
352 print "$code: $name{$code} -> $base\n",
355 @unicode = sort keys %unicode;
357 if (open(EQ_UNICODE, ">Eq/Unicode")) {
358 foreach my $c (@unicode) {
359 print EQ_UNICODE "$c @{$unicode{$c}}\n";
363 die "$0: failed to open Eq/Unicode for writing: $!\n";
366 if (open(EQ_LATIN1, ">Eq/Latin1")) {
367 foreach my $c (@unicode) {
368 last if hex($c) > 255;
369 my @c = grep { hex($_) <= 255 } @{$unicode{$c}};
371 print EQ_LATIN1 "$c @c\n";
375 die "$0: failed to open Eq/Latin1 for writing: $!\n";