More Fcntl constants. (This process really needs
[p5sagit/p5-mst-13.2.git] / lib / unicode / mktables.PL
CommitLineData
a0ed51b3 1#!../../miniperl
2
3# Note: we try to keep filenames unique within first 8 chars. Using
4# subdirectories for the following helps.
5mkdir "In", 0777;
6mkdir "Is", 0777;
7mkdir "To", 0777;
8
9@todo = (
10# typical
11
b8c5462f 12 ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
13 ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
14 ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
15 ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
16 ['IsDigit', '$cat =~ /^Nd$/', ''],
17 ['IsUpper', '$cat =~ /^Lu$/', ''],
18 ['IsLower', '$cat =~ /^Ll$/', ''],
19 ['IsASCII', 'hex $code <= 127', ''],
20 ['IsCntrl', '$cat =~ /^C/', ''],
21 ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
22 ['IsPrint', '$cat =~ /^[^C]/', ''],
23 ['IsPunct', '$cat =~ /^P/', ''],
24 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
25 ['ToUpper', '$up', '$up'],
26 ['ToLower', '$down', '$down'],
27 ['ToTitle', '$title', '$title'],
28 ['ToDigit', '$dec ne ""', '$dec'],
a0ed51b3 29
30# Name
31
32 ['Name', '$name', '$name'],
33
34# Category
35
36 ['Category', '$cat', '$cat'],
37
38# Normative
39
40 ['IsM', '$cat =~ /^M/', ''], # Mark
41 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
42 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
43
44 ['IsN', '$cat =~ /^N/', ''], # Number
45 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
46 ['IsNo', '$cat eq "No"', ''], # Number, Other
47
48 ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
49 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
50 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
51 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
52
53 ['IsC', '$cat =~ /^C/', ''], # Crazy
54 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
55 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
56 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
57
58# Informative
59
60 ['IsL', '$cat =~ /^L/', ''], # Letter
61 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
62 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
63 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
64 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
65 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
66
67 ['IsP', '$cat =~ /^P/', ''], # Punctuation
68 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
69 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
70 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
71 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
72
73 ['IsS', '$cat =~ /^S/', ''], # Symbol
74 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
75 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
76 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
77
78# Combining class
79 ['CombiningClass', '$comb', '$comb'],
80
81# BIDIRECTIONAL PROPERTIES
82
83 ['Bidirectional', '$bid', '$bid'],
84
85# Strong types:
86
87 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
88 # syllabic, and logographic
89 # characters (e.g., CJK
90 # ideographs)
91 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
92 # and punctuation specific to
93 # those scripts
94
95# Weak types:
96
97 ['IsBidiEN','$bid eq "EN"', ''], # European Number
98 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
99 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
100 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
101 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
102
103# Separators:
104
105 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
106 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
107
108# Neutrals:
109
110 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
111 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
112 # characters: punctuation,
113 # symbols
114
115# Decomposition
116
117 ['Decomposition', '$decomp', '$decomp'],
118 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
119 ['IsDecoCompat', '$decomp =~ /^</', ''],
120 ['IsDCfont', '$decomp =~ /^<font>/', ''],
121 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
122 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
123 ['IsDCinital', '$decomp =~ /^<medial>/', ''],
124 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
125 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
126 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
127 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
128 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
129 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
130 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
131 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
132 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
133 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
134 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
135
136# Number
137
138 ['Number', '$num', '$num'],
139
140# Mirrored
141
142 ['IsMirrored', '$mir eq "Y"', ''],
143
144# Arabic
145
146 ['ArabLink', '1', '$link'],
147 ['ArabLnkGrp', '1', '$linkgroup'],
148
149# Jamo
150
151 ['JamoShort', '1', '$short'],
152);
153
154# This is not written for speed...
155
156foreach $file (@todo) {
157 my ($table, $wanted, $val) = @$file;
158 next if @ARGV and not grep { $_ eq $table } @ARGV;
159 print $table,"\n";
160 if ($table =~ /^(Is|In|To)(.*)/) {
161 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
162 }
163 else {
164 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
165 }
166 print OUT <<"END";
167return <<'END';
168END
169 print OUT proplist($table, $wanted, $val);
170 print OUT "END\n";
171 close OUT;
172}
173
174# Must treat blocks specially.
175
176exit if @ARGV and not grep { $_ eq Block } @ARGV;
177print "Block\n";
178open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n";
179open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
180print OUT <<"END";
181return <<'END';
182END
183
184while (<UD>) {
185 next if /^#/;
186 next if /^$/;
187 chomp;
188 ($code, $last, $name) = split(/; */);
189 if ($name) {
190 print OUT "$code $last $name\n";
191 $name =~ s/\s+//g;
192 open(BLOCK, ">In/$name.pl");
193 print BLOCK <<"END2";
194return <<'END';
195$code $last
196END
197END2
198 close BLOCK;
199 }
200}
201
202print OUT "END\n";
203close OUT;
204
205##################################################
206
207sub proplist {
208 my ($table, $wanted, $val) = @_;
209 my @wanted;
210 my $out;
211 my $split;
212
213 if ($table =~ /^Arab/) {
214 open(UD, "arabshp.txt") or warn "Can't open $table: $!";
215
216 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
217 }
218 elsif ($table =~ /^Jamo/) {
219 open(UD, "jamo2.txt") or warn "Can't open $table: $!";
220
221 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
222 }
223 else {
224 open(UD, "UnicodeData-Latest.txt") or warn "Can't open $table: $!";
225
226 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
227 $comment, $up, $down, $title) = split(/;/);';
228 }
229
230 if ($table =~ /^(?:To|Is)[A-Z]/) {
231 eval <<"END";
232 while (<UD>) {
233 next if /^#/;
234 next if /^\s/;
235 chop;
236 $split
237 if ($wanted) {
238 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
239 }
240 }
241END
242 die $@ if $@;
243
244 while (@wanted) {
245 $beg = shift @wanted;
246 $last = $beg;
247 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
248 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
249 $last = shift @wanted;
250 }
251 $out .= sprintf "%04x", $beg->[0];
252 if ($beg->[2]) {
253 $last = shift @wanted;
254 }
255 if ($beg == $last) {
256 $out .= "\t";
257 }
258 else {
259 $out .= sprintf "\t%04x", $last->[0];
260 }
261 $out .= sprintf "\t%04x", $beg->[1] if $val;
262 $out .= "\n";
263 }
264 }
265 else {
266 eval <<"END";
267 while (<UD>) {
268 next if /^#/;
269 next if /^\s*\$/;
270 chop;
271 $split
272 if ($wanted) {
273 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
274 }
275 }
276END
277 die $@ if $@;
278
279 while (@wanted) {
280 $beg = shift @wanted;
281 $last = $beg;
282 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
283 ($wanted[0]->[1] eq $last->[1])) {
284 $last = shift @wanted;
285 }
286 $out .= sprintf "%04x", $beg->[0];
287 if ($beg->[2]) {
288 $last = shift @wanted;
289 }
290 if ($beg == $last) {
291 $out .= "\t";
292 }
293 else {
294 $out .= sprintf "\t%04x", $last->[0];
295 }
296 $out .= sprintf "\t%s\n", $beg->[1];
297 }
298 }
299 $out;
300}