Commit | Line | Data |
a0ed51b3 |
1 | #!../../miniperl |
2 | |
3 | # Note: we try to keep filenames unique within first 8 chars. Using |
4 | # subdirectories for the following helps. |
5 | mkdir "In", 0777; |
6 | mkdir "Is", 0777; |
7 | mkdir "To", 0777; |
8 | |
9 | @todo = ( |
10 | # typical |
11 | |
12 | ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''], |
13 | ['IsAlpha', '$cat =~ /^L[ulo]/', ''], |
14 | ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''], |
15 | ['IsDigit', '$cat =~ /^Nd$/', ''], |
16 | ['IsUpper', '$cat =~ /^Lu$/', ''], |
17 | ['IsLower', '$cat =~ /^Ll$/', ''], |
18 | ['IsPrint', '$cat =~ /^[^C]/', ''], |
19 | ['ToUpper', '$up', '$up'], |
20 | ['ToLower', '$down', '$down'], |
21 | ['ToTitle', '$title', '$title'], |
22 | ['ToDigit', '$dec ne ""', '$dec'], |
23 | |
24 | # Name |
25 | |
26 | ['Name', '$name', '$name'], |
27 | |
28 | # Category |
29 | |
30 | ['Category', '$cat', '$cat'], |
31 | |
32 | # Normative |
33 | |
34 | ['IsM', '$cat =~ /^M/', ''], # Mark |
35 | ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing |
36 | ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining |
37 | |
38 | ['IsN', '$cat =~ /^N/', ''], # Number |
39 | ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit |
40 | ['IsNo', '$cat eq "No"', ''], # Number, Other |
41 | |
42 | ['IsZ', '$cat =~ /^Z/', ''], # Zeparator |
43 | ['IsZs', '$cat eq "Zs"', ''], # Separator, Space |
44 | ['IsZl', '$cat eq "Zl"', ''], # Separator, Line |
45 | ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph |
46 | |
47 | ['IsC', '$cat =~ /^C/', ''], # Crazy |
48 | ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format |
49 | ['IsCo', '$cat eq "Co"', ''], # Other, Private Use |
50 | ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned |
51 | |
52 | # Informative |
53 | |
54 | ['IsL', '$cat =~ /^L/', ''], # Letter |
55 | ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase |
56 | ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase |
57 | ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase |
58 | ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier |
59 | ['IsLo', '$cat eq "Lo"', ''], # Letter, Other |
60 | |
61 | ['IsP', '$cat =~ /^P/', ''], # Punctuation |
62 | ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash |
63 | ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open |
64 | ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close |
65 | ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other |
66 | |
67 | ['IsS', '$cat =~ /^S/', ''], # Symbol |
68 | ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math |
69 | ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency |
70 | ['IsSo', '$cat eq "So"', ''], # Symbol, Other |
71 | |
72 | # Combining class |
73 | ['CombiningClass', '$comb', '$comb'], |
74 | |
75 | # BIDIRECTIONAL PROPERTIES |
76 | |
77 | ['Bidirectional', '$bid', '$bid'], |
78 | |
79 | # Strong types: |
80 | |
81 | ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic, |
82 | # syllabic, and logographic |
83 | # characters (e.g., CJK |
84 | # ideographs) |
85 | ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew, |
86 | # and punctuation specific to |
87 | # those scripts |
88 | |
89 | # Weak types: |
90 | |
91 | ['IsBidiEN','$bid eq "EN"', ''], # European Number |
92 | ['IsBidiES','$bid eq "ES"', ''], # European Number Separator |
93 | ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator |
94 | ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number |
95 | ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator |
96 | |
97 | # Separators: |
98 | |
99 | ['IsBidiB', '$bid eq "B"', ''], # Block Separator |
100 | ['IsBidiS', '$bid eq "S"', ''], # Segment Separator |
101 | |
102 | # Neutrals: |
103 | |
104 | ['IsBidiWS','$bid eq "WS"', ''], # Whitespace |
105 | ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other |
106 | # characters: punctuation, |
107 | # symbols |
108 | |
109 | # Decomposition |
110 | |
111 | ['Decomposition', '$decomp', '$decomp'], |
112 | ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''], |
113 | ['IsDecoCompat', '$decomp =~ /^</', ''], |
114 | ['IsDCfont', '$decomp =~ /^<font>/', ''], |
115 | ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''], |
116 | ['IsDCinitial', '$decomp =~ /^<initial>/', ''], |
117 | ['IsDCinital', '$decomp =~ /^<medial>/', ''], |
118 | ['IsDCfinal', '$decomp =~ /^<final>/', ''], |
119 | ['IsDCisolated', '$decomp =~ /^<isolated>/', ''], |
120 | ['IsDCcircle', '$decomp =~ /^<circle>/', ''], |
121 | ['IsDCsuper', '$decomp =~ /^<super>/', ''], |
122 | ['IsDCsub', '$decomp =~ /^<sub>/', ''], |
123 | ['IsDCvertical', '$decomp =~ /^<vertical>/', ''], |
124 | ['IsDCwide', '$decomp =~ /^<wide>/', ''], |
125 | ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''], |
126 | ['IsDCsmall', '$decomp =~ /^<small>/', ''], |
127 | ['IsDCsquare', '$decomp =~ /^<square>/', ''], |
128 | ['IsDCcompat', '$decomp =~ /^<compat>/', ''], |
129 | |
130 | # Number |
131 | |
132 | ['Number', '$num', '$num'], |
133 | |
134 | # Mirrored |
135 | |
136 | ['IsMirrored', '$mir eq "Y"', ''], |
137 | |
138 | # Arabic |
139 | |
140 | ['ArabLink', '1', '$link'], |
141 | ['ArabLnkGrp', '1', '$linkgroup'], |
142 | |
143 | # Jamo |
144 | |
145 | ['JamoShort', '1', '$short'], |
146 | ); |
147 | |
148 | # This is not written for speed... |
149 | |
150 | foreach $file (@todo) { |
151 | my ($table, $wanted, $val) = @$file; |
152 | next if @ARGV and not grep { $_ eq $table } @ARGV; |
153 | print $table,"\n"; |
154 | if ($table =~ /^(Is|In|To)(.*)/) { |
155 | open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n"; |
156 | } |
157 | else { |
158 | open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n"; |
159 | } |
160 | print OUT <<"END"; |
161 | return <<'END'; |
162 | END |
163 | print OUT proplist($table, $wanted, $val); |
164 | print OUT "END\n"; |
165 | close OUT; |
166 | } |
167 | |
168 | # Must treat blocks specially. |
169 | |
170 | exit if @ARGV and not grep { $_ eq Block } @ARGV; |
171 | print "Block\n"; |
172 | open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n"; |
173 | open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n"; |
174 | print OUT <<"END"; |
175 | return <<'END'; |
176 | END |
177 | |
178 | while (<UD>) { |
179 | next if /^#/; |
180 | next if /^$/; |
181 | chomp; |
182 | ($code, $last, $name) = split(/; */); |
183 | if ($name) { |
184 | print OUT "$code $last $name\n"; |
185 | $name =~ s/\s+//g; |
186 | open(BLOCK, ">In/$name.pl"); |
187 | print BLOCK <<"END2"; |
188 | return <<'END'; |
189 | $code $last |
190 | END |
191 | END2 |
192 | close BLOCK; |
193 | } |
194 | } |
195 | |
196 | print OUT "END\n"; |
197 | close OUT; |
198 | |
199 | ################################################## |
200 | |
201 | sub proplist { |
202 | my ($table, $wanted, $val) = @_; |
203 | my @wanted; |
204 | my $out; |
205 | my $split; |
206 | |
207 | if ($table =~ /^Arab/) { |
208 | open(UD, "arabshp.txt") or warn "Can't open $table: $!"; |
209 | |
210 | $split = '($code, $name, $link, $linkgroup) = split(/; */);'; |
211 | } |
212 | elsif ($table =~ /^Jamo/) { |
213 | open(UD, "jamo2.txt") or warn "Can't open $table: $!"; |
214 | |
215 | $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;'; |
216 | } |
217 | else { |
218 | open(UD, "UnicodeData-Latest.txt") or warn "Can't open $table: $!"; |
219 | |
220 | $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1, |
221 | $comment, $up, $down, $title) = split(/;/);'; |
222 | } |
223 | |
224 | if ($table =~ /^(?:To|Is)[A-Z]/) { |
225 | eval <<"END"; |
226 | while (<UD>) { |
227 | next if /^#/; |
228 | next if /^\s/; |
229 | chop; |
230 | $split |
231 | if ($wanted) { |
232 | push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]); |
233 | } |
234 | } |
235 | END |
236 | die $@ if $@; |
237 | |
238 | while (@wanted) { |
239 | $beg = shift @wanted; |
240 | $last = $beg; |
241 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
242 | (not $val or $wanted[0]->[1] == $last->[1] + 1)) { |
243 | $last = shift @wanted; |
244 | } |
245 | $out .= sprintf "%04x", $beg->[0]; |
246 | if ($beg->[2]) { |
247 | $last = shift @wanted; |
248 | } |
249 | if ($beg == $last) { |
250 | $out .= "\t"; |
251 | } |
252 | else { |
253 | $out .= sprintf "\t%04x", $last->[0]; |
254 | } |
255 | $out .= sprintf "\t%04x", $beg->[1] if $val; |
256 | $out .= "\n"; |
257 | } |
258 | } |
259 | else { |
260 | eval <<"END"; |
261 | while (<UD>) { |
262 | next if /^#/; |
263 | next if /^\s*\$/; |
264 | chop; |
265 | $split |
266 | if ($wanted) { |
267 | push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]); |
268 | } |
269 | } |
270 | END |
271 | die $@ if $@; |
272 | |
273 | while (@wanted) { |
274 | $beg = shift @wanted; |
275 | $last = $beg; |
276 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
277 | ($wanted[0]->[1] eq $last->[1])) { |
278 | $last = shift @wanted; |
279 | } |
280 | $out .= sprintf "%04x", $beg->[0]; |
281 | if ($beg->[2]) { |
282 | $last = shift @wanted; |
283 | } |
284 | if ($beg == $last) { |
285 | $out .= "\t"; |
286 | } |
287 | else { |
288 | $out .= sprintf "\t%04x", $last->[0]; |
289 | } |
290 | $out .= sprintf "\t%s\n", $beg->[1]; |
291 | } |
292 | } |
293 | $out; |
294 | } |