Compute equivalence classes (diacritics stripping) only
[p5sagit/p5-mst-13.2.git] / lib / unicode / mktables.PL
CommitLineData
a0ed51b3 1#!../../miniperl
2
11695a73 3$UnicodeData = "UnicodeData-Latest.txt";
4
a0ed51b3 5# Note: we try to keep filenames unique within first 8 chars. Using
6# subdirectories for the following helps.
7mkdir "In", 0777;
8mkdir "Is", 0777;
9mkdir "To", 0777;
11695a73 10mkdir "Eq", 0777;
a0ed51b3 11
c939d428 12open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
13
14while (<UNICODEDATA>) {
15 ($code, $name) = split /;/;
16
17 $code{$name} = $code;
18 $name{$code} = $name;
19
20 next unless $name =~ /^(.+? LETTER .+?) WITH .+( \w+ FORM)?$/;
21
22 push @base, [ $code, $1 ];
23 push @base, [ $code, $1.$2 ] if $2 ne '';
24
25 # Before this "diacritics stripping" phase (and for Arabic, also
26 # "form stripping" phase) all ligatures could be decomposed into
27 # their constituent letters.
28 #
29 # For example the ligature
30 # ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ALEF ISOLATED FORM
31 # would go first through ligature decomposition producing the two letters
32 # ARABIC LETTER YEH WITH HAMZA ABOVE ISOLATED FORM
33 # ARABIC LETTER ALEF WITH HAMZA ABOVE ISOLATED FORM
34 # and those with diacritics stripping
35 # ARABIC LETTER YEH ISOLATED FORM
36 # ARABIC LETTER ALEF ISOLATED FORM
37 # and those with the Arabic form stripping
38 # ARABIC LETTER YEH
39 # ARABIC LETTER ALEF ISOLATED FORM
40 # ARABIC LETTER YEH
41 # ARABIC LETTER ALEF ISOLATED FORM
42 #
43 # Similarly for ligatures from other scripts.
44 # Effectively this would mean that ligatures turn into categories
45 # (Unicodese for character classes).
46}
47
48foreach my $b (@base) {
49 ($code, $base) = @$b;
50 next unless exists $code{$base};
51 push @{$unicode{$code{$base}}}, $code;
52# print "$code: $name{$code} -> $base\n",
53}
54
55@unicode = sort keys %unicode;
56
57print "Eq/Unicode\n";
58if (open(EQ_UNICODE, ">Eq/Unicode")) {
59 foreach my $c (@unicode) {
60 print EQ_UNICODE "$c @{$unicode{$c}}\n";
61 }
62 close EQ_UNICODE;
63} else {
64 die "$0: failed to open Eq/Unicode for writing: $!\n";
65}
66
67print "Eq/Latin1\n";
68if (open(EQ_LATIN1, ">Eq/Latin1")) {
69 foreach my $c (@unicode) {
70 last if hex($c) > 255;
71 my @c = grep { hex($_) <= 255 } @{$unicode{$c}};
72 next unless @c;
73 print EQ_LATIN1 "$c @c\n";
74 }
75 close EQ_LATIN1;
76} else {
77 die "$0: failed to open Eq/Latin1 for writing: $!\n";
78}
79
a0ed51b3 80@todo = (
81# typical
82
b8c5462f 83 ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
84 ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
85 ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
86 ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
87 ['IsDigit', '$cat =~ /^Nd$/', ''],
88 ['IsUpper', '$cat =~ /^Lu$/', ''],
89 ['IsLower', '$cat =~ /^Ll$/', ''],
90 ['IsASCII', 'hex $code <= 127', ''],
91 ['IsCntrl', '$cat =~ /^C/', ''],
92 ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
93 ['IsPrint', '$cat =~ /^[^C]/', ''],
94 ['IsPunct', '$cat =~ /^P/', ''],
95 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
96 ['ToUpper', '$up', '$up'],
97 ['ToLower', '$down', '$down'],
98 ['ToTitle', '$title', '$title'],
99 ['ToDigit', '$dec ne ""', '$dec'],
a0ed51b3 100
101# Name
102
103 ['Name', '$name', '$name'],
104
105# Category
106
107 ['Category', '$cat', '$cat'],
108
109# Normative
110
111 ['IsM', '$cat =~ /^M/', ''], # Mark
112 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
113 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
114
115 ['IsN', '$cat =~ /^N/', ''], # Number
116 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
117 ['IsNo', '$cat eq "No"', ''], # Number, Other
118
119 ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
120 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
121 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
122 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
123
124 ['IsC', '$cat =~ /^C/', ''], # Crazy
125 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
126 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
127 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
128
129# Informative
130
131 ['IsL', '$cat =~ /^L/', ''], # Letter
132 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
133 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
134 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
135 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
136 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
137
138 ['IsP', '$cat =~ /^P/', ''], # Punctuation
139 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
140 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
141 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
142 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
143
144 ['IsS', '$cat =~ /^S/', ''], # Symbol
145 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
146 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
147 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
148
149# Combining class
150 ['CombiningClass', '$comb', '$comb'],
151
152# BIDIRECTIONAL PROPERTIES
153
154 ['Bidirectional', '$bid', '$bid'],
155
156# Strong types:
157
158 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
159 # syllabic, and logographic
160 # characters (e.g., CJK
161 # ideographs)
162 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
163 # and punctuation specific to
164 # those scripts
165
166# Weak types:
167
168 ['IsBidiEN','$bid eq "EN"', ''], # European Number
169 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
170 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
171 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
172 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
173
174# Separators:
175
176 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
177 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
178
179# Neutrals:
180
181 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
182 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
183 # characters: punctuation,
184 # symbols
185
186# Decomposition
187
188 ['Decomposition', '$decomp', '$decomp'],
189 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
190 ['IsDecoCompat', '$decomp =~ /^</', ''],
191 ['IsDCfont', '$decomp =~ /^<font>/', ''],
192 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
193 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
194 ['IsDCinital', '$decomp =~ /^<medial>/', ''],
195 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
196 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
197 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
198 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
199 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
200 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
201 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
202 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
203 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
204 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
205 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
206
207# Number
208
209 ['Number', '$num', '$num'],
210
211# Mirrored
212
213 ['IsMirrored', '$mir eq "Y"', ''],
214
215# Arabic
216
217 ['ArabLink', '1', '$link'],
218 ['ArabLnkGrp', '1', '$linkgroup'],
219
220# Jamo
221
222 ['JamoShort', '1', '$short'],
223);
224
225# This is not written for speed...
226
227foreach $file (@todo) {
228 my ($table, $wanted, $val) = @$file;
229 next if @ARGV and not grep { $_ eq $table } @ARGV;
230 print $table,"\n";
231 if ($table =~ /^(Is|In|To)(.*)/) {
232 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
233 }
234 else {
235 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
236 }
237 print OUT <<"END";
238return <<'END';
239END
240 print OUT proplist($table, $wanted, $val);
241 print OUT "END\n";
242 close OUT;
243}
244
245# Must treat blocks specially.
246
247exit if @ARGV and not grep { $_ eq Block } @ARGV;
248print "Block\n";
249open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n";
250open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
251print OUT <<"END";
252return <<'END';
253END
254
255while (<UD>) {
256 next if /^#/;
257 next if /^$/;
258 chomp;
259 ($code, $last, $name) = split(/; */);
260 if ($name) {
261 print OUT "$code $last $name\n";
262 $name =~ s/\s+//g;
263 open(BLOCK, ">In/$name.pl");
264 print BLOCK <<"END2";
265return <<'END';
266$code $last
267END
268END2
269 close BLOCK;
270 }
271}
272
273print OUT "END\n";
274close OUT;
275
276##################################################
277
278sub proplist {
279 my ($table, $wanted, $val) = @_;
280 my @wanted;
281 my $out;
282 my $split;
283
284 if ($table =~ /^Arab/) {
285 open(UD, "arabshp.txt") or warn "Can't open $table: $!";
286
287 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
288 }
289 elsif ($table =~ /^Jamo/) {
290 open(UD, "jamo2.txt") or warn "Can't open $table: $!";
291
292 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
293 }
294 else {
11695a73 295 open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
a0ed51b3 296
297 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
298 $comment, $up, $down, $title) = split(/;/);';
299 }
300
301 if ($table =~ /^(?:To|Is)[A-Z]/) {
302 eval <<"END";
303 while (<UD>) {
304 next if /^#/;
305 next if /^\s/;
306 chop;
307 $split
308 if ($wanted) {
309 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
310 }
311 }
312END
313 die $@ if $@;
314
315 while (@wanted) {
316 $beg = shift @wanted;
317 $last = $beg;
318 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
319 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
320 $last = shift @wanted;
321 }
322 $out .= sprintf "%04x", $beg->[0];
323 if ($beg->[2]) {
324 $last = shift @wanted;
325 }
326 if ($beg == $last) {
327 $out .= "\t";
328 }
329 else {
330 $out .= sprintf "\t%04x", $last->[0];
331 }
332 $out .= sprintf "\t%04x", $beg->[1] if $val;
333 $out .= "\n";
334 }
335 }
336 else {
337 eval <<"END";
338 while (<UD>) {
339 next if /^#/;
340 next if /^\s*\$/;
341 chop;
342 $split
343 if ($wanted) {
344 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
345 }
346 }
347END
348 die $@ if $@;
349
350 while (@wanted) {
351 $beg = shift @wanted;
352 $last = $beg;
353 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
354 ($wanted[0]->[1] eq $last->[1])) {
355 $last = shift @wanted;
356 }
357 $out .= sprintf "%04x", $beg->[0];
358 if ($beg->[2]) {
359 $last = shift @wanted;
360 }
361 if ($beg == $last) {
362 $out .= "\t";
363 }
364 else {
365 $out .= sprintf "\t%04x", $last->[0];
366 }
367 $out .= sprintf "\t%s\n", $beg->[1];
368 }
369 }
370 $out;
371}
11695a73 372
11eeea96 373# Create the equivalence mappings.
374
11695a73 375