Commit | Line | Data |
a0ed51b3 |
1 | #!../../miniperl |
2 | |
505afebf |
3 | $UnicodeData = "Unicode.300"; |
11695a73 |
4 | |
a0ed51b3 |
5 | # Note: we try to keep filenames unique within first 8 chars. Using |
6 | # subdirectories for the following helps. |
7 | mkdir "In", 0777; |
8 | mkdir "Is", 0777; |
9 | mkdir "To", 0777; |
10 | |
11 | @todo = ( |
12 | # typical |
13 | |
b8c5462f |
14 | ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''], |
15 | ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''], |
16 | ['IsAlpha', '$cat =~ /^L[ulo]/', ''], |
17 | ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''], |
18 | ['IsDigit', '$cat =~ /^Nd$/', ''], |
19 | ['IsUpper', '$cat =~ /^Lu$/', ''], |
20 | ['IsLower', '$cat =~ /^Ll$/', ''], |
21 | ['IsASCII', 'hex $code <= 127', ''], |
22 | ['IsCntrl', '$cat =~ /^C/', ''], |
23 | ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''], |
24 | ['IsPrint', '$cat =~ /^[^C]/', ''], |
25 | ['IsPunct', '$cat =~ /^P/', ''], |
26 | ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''], |
27 | ['ToUpper', '$up', '$up'], |
28 | ['ToLower', '$down', '$down'], |
29 | ['ToTitle', '$title', '$title'], |
30 | ['ToDigit', '$dec ne ""', '$dec'], |
a0ed51b3 |
31 | |
32 | # Name |
33 | |
34 | ['Name', '$name', '$name'], |
35 | |
36 | # Category |
37 | |
38 | ['Category', '$cat', '$cat'], |
39 | |
40 | # Normative |
41 | |
42 | ['IsM', '$cat =~ /^M/', ''], # Mark |
43 | ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing |
44 | ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining |
45 | |
46 | ['IsN', '$cat =~ /^N/', ''], # Number |
47 | ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit |
48 | ['IsNo', '$cat eq "No"', ''], # Number, Other |
49 | |
50 | ['IsZ', '$cat =~ /^Z/', ''], # Zeparator |
51 | ['IsZs', '$cat eq "Zs"', ''], # Separator, Space |
52 | ['IsZl', '$cat eq "Zl"', ''], # Separator, Line |
53 | ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph |
54 | |
55 | ['IsC', '$cat =~ /^C/', ''], # Crazy |
56 | ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format |
57 | ['IsCo', '$cat eq "Co"', ''], # Other, Private Use |
58 | ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned |
59 | |
60 | # Informative |
61 | |
62 | ['IsL', '$cat =~ /^L/', ''], # Letter |
63 | ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase |
64 | ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase |
65 | ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase |
66 | ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier |
67 | ['IsLo', '$cat eq "Lo"', ''], # Letter, Other |
68 | |
69 | ['IsP', '$cat =~ /^P/', ''], # Punctuation |
70 | ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash |
71 | ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open |
72 | ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close |
73 | ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other |
74 | |
75 | ['IsS', '$cat =~ /^S/', ''], # Symbol |
76 | ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math |
77 | ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency |
78 | ['IsSo', '$cat eq "So"', ''], # Symbol, Other |
79 | |
80 | # Combining class |
81 | ['CombiningClass', '$comb', '$comb'], |
82 | |
83 | # BIDIRECTIONAL PROPERTIES |
84 | |
85 | ['Bidirectional', '$bid', '$bid'], |
86 | |
87 | # Strong types: |
88 | |
89 | ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic, |
90 | # syllabic, and logographic |
91 | # characters (e.g., CJK |
92 | # ideographs) |
93 | ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew, |
94 | # and punctuation specific to |
95 | # those scripts |
96 | |
97 | # Weak types: |
98 | |
99 | ['IsBidiEN','$bid eq "EN"', ''], # European Number |
100 | ['IsBidiES','$bid eq "ES"', ''], # European Number Separator |
101 | ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator |
102 | ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number |
103 | ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator |
104 | |
105 | # Separators: |
106 | |
107 | ['IsBidiB', '$bid eq "B"', ''], # Block Separator |
108 | ['IsBidiS', '$bid eq "S"', ''], # Segment Separator |
109 | |
110 | # Neutrals: |
111 | |
112 | ['IsBidiWS','$bid eq "WS"', ''], # Whitespace |
113 | ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other |
114 | # characters: punctuation, |
115 | # symbols |
116 | |
117 | # Decomposition |
118 | |
119 | ['Decomposition', '$decomp', '$decomp'], |
120 | ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''], |
121 | ['IsDecoCompat', '$decomp =~ /^</', ''], |
122 | ['IsDCfont', '$decomp =~ /^<font>/', ''], |
123 | ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''], |
124 | ['IsDCinitial', '$decomp =~ /^<initial>/', ''], |
125 | ['IsDCinital', '$decomp =~ /^<medial>/', ''], |
126 | ['IsDCfinal', '$decomp =~ /^<final>/', ''], |
127 | ['IsDCisolated', '$decomp =~ /^<isolated>/', ''], |
128 | ['IsDCcircle', '$decomp =~ /^<circle>/', ''], |
129 | ['IsDCsuper', '$decomp =~ /^<super>/', ''], |
130 | ['IsDCsub', '$decomp =~ /^<sub>/', ''], |
131 | ['IsDCvertical', '$decomp =~ /^<vertical>/', ''], |
132 | ['IsDCwide', '$decomp =~ /^<wide>/', ''], |
133 | ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''], |
134 | ['IsDCsmall', '$decomp =~ /^<small>/', ''], |
135 | ['IsDCsquare', '$decomp =~ /^<square>/', ''], |
136 | ['IsDCcompat', '$decomp =~ /^<compat>/', ''], |
137 | |
138 | # Number |
139 | |
140 | ['Number', '$num', '$num'], |
141 | |
142 | # Mirrored |
143 | |
144 | ['IsMirrored', '$mir eq "Y"', ''], |
145 | |
146 | # Arabic |
147 | |
148 | ['ArabLink', '1', '$link'], |
149 | ['ArabLnkGrp', '1', '$linkgroup'], |
150 | |
151 | # Jamo |
152 | |
153 | ['JamoShort', '1', '$short'], |
499bfa7a |
154 | |
155 | # Syllables |
156 | |
157 | ['IsSylV', '$syl eq "V"', ''], |
158 | ['IsSylU', '$syl eq "U"', ''], |
159 | ['IsSylI', '$syl eq "I"', ''], |
160 | ['IsSylA', '$syl eq "A"', ''], |
161 | ['IsSylE', '$syl eq "E"', ''], |
162 | ['IsSylC', '$syl eq "C"', ''], |
163 | ['IsSylO', '$syl eq "O"', ''], |
164 | ['IsSylWV', '$syl eq "V"', ''], |
165 | ['IsSylWI', '$syl eq "I"', ''], |
166 | ['IsSylWA', '$syl eq "A"', ''], |
167 | ['IsSylWE', '$syl eq "E"', ''], |
168 | ['IsSylWC', '$syl eq "C"', ''], |
a77b4ae5 |
169 | |
170 | # Line break properties - Normative |
171 | |
172 | ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break |
173 | ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return |
174 | ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed |
175 | ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks |
176 | ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates |
177 | ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue) |
178 | ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity |
179 | ['IsLbrkSP','$brk eq "SP"', ''], # Space |
180 | ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space |
181 | |
182 | # Line break properties - Informative |
183 | ['IsLbrkXX','$brk eq "XX"', ''], # Unknown |
184 | ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation |
185 | ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation |
186 | ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation |
187 | ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter |
188 | ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation |
189 | ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks |
190 | ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric) |
191 | ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric) |
192 | ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric) |
193 | ['IsLbrkNU','$brk eq "NU"', ''], # Numeric |
194 | ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters |
195 | ['IsLbrkID','$brk eq "ID"', ''], # Ideographic |
196 | ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable |
197 | ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen |
198 | ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before |
199 | ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After |
200 | ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian) |
201 | ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic) |
202 | ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After |
a0ed51b3 |
203 | ); |
204 | |
205 | # This is not written for speed... |
206 | |
207 | foreach $file (@todo) { |
208 | my ($table, $wanted, $val) = @$file; |
209 | next if @ARGV and not grep { $_ eq $table } @ARGV; |
210 | print $table,"\n"; |
211 | if ($table =~ /^(Is|In|To)(.*)/) { |
212 | open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n"; |
213 | } |
214 | else { |
215 | open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n"; |
216 | } |
14055466 |
217 | print OUT <<EOH; |
218 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
219 | # This file is built by $0 from e.g. $UnicodeData. |
220 | # Any changes made here will be lost! |
221 | EOH |
a0ed51b3 |
222 | print OUT <<"END"; |
223 | return <<'END'; |
224 | END |
225 | print OUT proplist($table, $wanted, $val); |
226 | print OUT "END\n"; |
227 | close OUT; |
228 | } |
229 | |
230 | # Must treat blocks specially. |
231 | |
232 | exit if @ARGV and not grep { $_ eq Block } @ARGV; |
233 | print "Block\n"; |
d357d9fe |
234 | open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n"; |
a0ed51b3 |
235 | open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n"; |
14055466 |
236 | print OUT <<EOH; |
237 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
238 | # This file is built by $0 from e.g. $UnicodeData. |
239 | # Any changes made here will be lost! |
240 | EOH |
a0ed51b3 |
241 | print OUT <<"END"; |
242 | return <<'END'; |
243 | END |
244 | |
245 | while (<UD>) { |
246 | next if /^#/; |
247 | next if /^$/; |
248 | chomp; |
249 | ($code, $last, $name) = split(/; */); |
250 | if ($name) { |
251 | print OUT "$code $last $name\n"; |
252 | $name =~ s/\s+//g; |
253 | open(BLOCK, ">In/$name.pl"); |
14055466 |
254 | print BLOCK <<EOH; |
255 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
256 | # This file is built by $0 from e.g. $UnicodeData. |
257 | # Any changes made here will be lost! |
258 | EOH |
a0ed51b3 |
259 | print BLOCK <<"END2"; |
260 | return <<'END'; |
261 | $code $last |
262 | END |
263 | END2 |
264 | close BLOCK; |
265 | } |
266 | } |
267 | |
268 | print OUT "END\n"; |
269 | close OUT; |
270 | |
271 | ################################################## |
272 | |
273 | sub proplist { |
274 | my ($table, $wanted, $val) = @_; |
275 | my @wanted; |
276 | my $out; |
277 | my $split; |
278 | |
279 | if ($table =~ /^Arab/) { |
d357d9fe |
280 | open(UD, "ArabShap.txt") or warn "Can't open $table: $!"; |
a0ed51b3 |
281 | |
282 | $split = '($code, $name, $link, $linkgroup) = split(/; */);'; |
283 | } |
284 | elsif ($table =~ /^Jamo/) { |
505afebf |
285 | open(UD, "Jamo.txt") or warn "Can't open $table: $!"; |
a0ed51b3 |
286 | |
287 | $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;'; |
288 | } |
499bfa7a |
289 | elsif ($table =~ /^IsSyl/) { |
290 | open(UD, "syllables.txt") or warn "Can't open $table: $!"; |
291 | |
292 | $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;'; |
293 | } |
a77b4ae5 |
294 | elsif ($table =~ /^IsLbrk/) { |
295 | open(UD, "LineBrk.txt") or warn "Can't open $table: $!"; |
296 | |
297 | $split = '($code, $brk, $name) = split(/;/);'; |
298 | } |
a0ed51b3 |
299 | else { |
11695a73 |
300 | open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!"; |
a0ed51b3 |
301 | |
302 | $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1, |
303 | $comment, $up, $down, $title) = split(/;/);'; |
304 | } |
305 | |
306 | if ($table =~ /^(?:To|Is)[A-Z]/) { |
307 | eval <<"END"; |
308 | while (<UD>) { |
309 | next if /^#/; |
310 | next if /^\s/; |
311 | chop; |
312 | $split |
313 | if ($wanted) { |
314 | push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]); |
315 | } |
316 | } |
317 | END |
318 | die $@ if $@; |
319 | |
320 | while (@wanted) { |
321 | $beg = shift @wanted; |
322 | $last = $beg; |
323 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
324 | (not $val or $wanted[0]->[1] == $last->[1] + 1)) { |
325 | $last = shift @wanted; |
326 | } |
327 | $out .= sprintf "%04x", $beg->[0]; |
328 | if ($beg->[2]) { |
329 | $last = shift @wanted; |
330 | } |
331 | if ($beg == $last) { |
332 | $out .= "\t"; |
333 | } |
334 | else { |
335 | $out .= sprintf "\t%04x", $last->[0]; |
336 | } |
337 | $out .= sprintf "\t%04x", $beg->[1] if $val; |
338 | $out .= "\n"; |
339 | } |
340 | } |
341 | else { |
342 | eval <<"END"; |
343 | while (<UD>) { |
344 | next if /^#/; |
345 | next if /^\s*\$/; |
346 | chop; |
347 | $split |
348 | if ($wanted) { |
349 | push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]); |
350 | } |
351 | } |
352 | END |
353 | die $@ if $@; |
354 | |
355 | while (@wanted) { |
356 | $beg = shift @wanted; |
357 | $last = $beg; |
358 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
359 | ($wanted[0]->[1] eq $last->[1])) { |
360 | $last = shift @wanted; |
361 | } |
362 | $out .= sprintf "%04x", $beg->[0]; |
363 | if ($beg->[2]) { |
364 | $last = shift @wanted; |
365 | } |
366 | if ($beg == $last) { |
367 | $out .= "\t"; |
368 | } |
369 | else { |
370 | $out .= sprintf "\t%04x", $last->[0]; |
371 | } |
372 | $out .= sprintf "\t%s\n", $beg->[1]; |
373 | } |
374 | } |
375 | $out; |
376 | } |
11695a73 |
377 | |
6dd159d1 |
378 | # eof |