Commit | Line | Data |
a0ed51b3 |
1 | #!../../miniperl |
2 | |
11695a73 |
3 | $UnicodeData = "UnicodeData-Latest.txt"; |
4 | |
a0ed51b3 |
5 | # Note: we try to keep filenames unique within first 8 chars. Using |
6 | # subdirectories for the following helps. |
7 | mkdir "In", 0777; |
8 | mkdir "Is", 0777; |
9 | mkdir "To", 0777; |
11695a73 |
10 | mkdir "Eq", 0777; |
a0ed51b3 |
11 | |
12 | @todo = ( |
13 | # typical |
14 | |
b8c5462f |
15 | ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''], |
16 | ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''], |
17 | ['IsAlpha', '$cat =~ /^L[ulo]/', ''], |
18 | ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''], |
19 | ['IsDigit', '$cat =~ /^Nd$/', ''], |
20 | ['IsUpper', '$cat =~ /^Lu$/', ''], |
21 | ['IsLower', '$cat =~ /^Ll$/', ''], |
22 | ['IsASCII', 'hex $code <= 127', ''], |
23 | ['IsCntrl', '$cat =~ /^C/', ''], |
24 | ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''], |
25 | ['IsPrint', '$cat =~ /^[^C]/', ''], |
26 | ['IsPunct', '$cat =~ /^P/', ''], |
27 | ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''], |
28 | ['ToUpper', '$up', '$up'], |
29 | ['ToLower', '$down', '$down'], |
30 | ['ToTitle', '$title', '$title'], |
31 | ['ToDigit', '$dec ne ""', '$dec'], |
a0ed51b3 |
32 | |
33 | # Name |
34 | |
35 | ['Name', '$name', '$name'], |
36 | |
37 | # Category |
38 | |
39 | ['Category', '$cat', '$cat'], |
40 | |
41 | # Normative |
42 | |
43 | ['IsM', '$cat =~ /^M/', ''], # Mark |
44 | ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing |
45 | ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining |
46 | |
47 | ['IsN', '$cat =~ /^N/', ''], # Number |
48 | ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit |
49 | ['IsNo', '$cat eq "No"', ''], # Number, Other |
50 | |
51 | ['IsZ', '$cat =~ /^Z/', ''], # Zeparator |
52 | ['IsZs', '$cat eq "Zs"', ''], # Separator, Space |
53 | ['IsZl', '$cat eq "Zl"', ''], # Separator, Line |
54 | ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph |
55 | |
56 | ['IsC', '$cat =~ /^C/', ''], # Crazy |
57 | ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format |
58 | ['IsCo', '$cat eq "Co"', ''], # Other, Private Use |
59 | ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned |
60 | |
61 | # Informative |
62 | |
63 | ['IsL', '$cat =~ /^L/', ''], # Letter |
64 | ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase |
65 | ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase |
66 | ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase |
67 | ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier |
68 | ['IsLo', '$cat eq "Lo"', ''], # Letter, Other |
69 | |
70 | ['IsP', '$cat =~ /^P/', ''], # Punctuation |
71 | ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash |
72 | ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open |
73 | ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close |
74 | ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other |
75 | |
76 | ['IsS', '$cat =~ /^S/', ''], # Symbol |
77 | ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math |
78 | ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency |
79 | ['IsSo', '$cat eq "So"', ''], # Symbol, Other |
80 | |
81 | # Combining class |
82 | ['CombiningClass', '$comb', '$comb'], |
83 | |
84 | # BIDIRECTIONAL PROPERTIES |
85 | |
86 | ['Bidirectional', '$bid', '$bid'], |
87 | |
88 | # Strong types: |
89 | |
90 | ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic, |
91 | # syllabic, and logographic |
92 | # characters (e.g., CJK |
93 | # ideographs) |
94 | ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew, |
95 | # and punctuation specific to |
96 | # those scripts |
97 | |
98 | # Weak types: |
99 | |
100 | ['IsBidiEN','$bid eq "EN"', ''], # European Number |
101 | ['IsBidiES','$bid eq "ES"', ''], # European Number Separator |
102 | ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator |
103 | ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number |
104 | ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator |
105 | |
106 | # Separators: |
107 | |
108 | ['IsBidiB', '$bid eq "B"', ''], # Block Separator |
109 | ['IsBidiS', '$bid eq "S"', ''], # Segment Separator |
110 | |
111 | # Neutrals: |
112 | |
113 | ['IsBidiWS','$bid eq "WS"', ''], # Whitespace |
114 | ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other |
115 | # characters: punctuation, |
116 | # symbols |
117 | |
118 | # Decomposition |
119 | |
120 | ['Decomposition', '$decomp', '$decomp'], |
121 | ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''], |
122 | ['IsDecoCompat', '$decomp =~ /^</', ''], |
123 | ['IsDCfont', '$decomp =~ /^<font>/', ''], |
124 | ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''], |
125 | ['IsDCinitial', '$decomp =~ /^<initial>/', ''], |
126 | ['IsDCinital', '$decomp =~ /^<medial>/', ''], |
127 | ['IsDCfinal', '$decomp =~ /^<final>/', ''], |
128 | ['IsDCisolated', '$decomp =~ /^<isolated>/', ''], |
129 | ['IsDCcircle', '$decomp =~ /^<circle>/', ''], |
130 | ['IsDCsuper', '$decomp =~ /^<super>/', ''], |
131 | ['IsDCsub', '$decomp =~ /^<sub>/', ''], |
132 | ['IsDCvertical', '$decomp =~ /^<vertical>/', ''], |
133 | ['IsDCwide', '$decomp =~ /^<wide>/', ''], |
134 | ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''], |
135 | ['IsDCsmall', '$decomp =~ /^<small>/', ''], |
136 | ['IsDCsquare', '$decomp =~ /^<square>/', ''], |
137 | ['IsDCcompat', '$decomp =~ /^<compat>/', ''], |
138 | |
139 | # Number |
140 | |
141 | ['Number', '$num', '$num'], |
142 | |
143 | # Mirrored |
144 | |
145 | ['IsMirrored', '$mir eq "Y"', ''], |
146 | |
147 | # Arabic |
148 | |
149 | ['ArabLink', '1', '$link'], |
150 | ['ArabLnkGrp', '1', '$linkgroup'], |
151 | |
152 | # Jamo |
153 | |
154 | ['JamoShort', '1', '$short'], |
499bfa7a |
155 | |
156 | # Syllables |
157 | |
158 | ['IsSylV', '$syl eq "V"', ''], |
159 | ['IsSylU', '$syl eq "U"', ''], |
160 | ['IsSylI', '$syl eq "I"', ''], |
161 | ['IsSylA', '$syl eq "A"', ''], |
162 | ['IsSylE', '$syl eq "E"', ''], |
163 | ['IsSylC', '$syl eq "C"', ''], |
164 | ['IsSylO', '$syl eq "O"', ''], |
165 | ['IsSylWV', '$syl eq "V"', ''], |
166 | ['IsSylWI', '$syl eq "I"', ''], |
167 | ['IsSylWA', '$syl eq "A"', ''], |
168 | ['IsSylWE', '$syl eq "E"', ''], |
169 | ['IsSylWC', '$syl eq "C"', ''], |
a0ed51b3 |
170 | ); |
171 | |
172 | # This is not written for speed... |
173 | |
174 | foreach $file (@todo) { |
175 | my ($table, $wanted, $val) = @$file; |
176 | next if @ARGV and not grep { $_ eq $table } @ARGV; |
177 | print $table,"\n"; |
178 | if ($table =~ /^(Is|In|To)(.*)/) { |
179 | open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n"; |
180 | } |
181 | else { |
182 | open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n"; |
183 | } |
14055466 |
184 | print OUT <<EOH; |
185 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
186 | # This file is built by $0 from e.g. $UnicodeData. |
187 | # Any changes made here will be lost! |
188 | EOH |
a0ed51b3 |
189 | print OUT <<"END"; |
190 | return <<'END'; |
191 | END |
192 | print OUT proplist($table, $wanted, $val); |
193 | print OUT "END\n"; |
194 | close OUT; |
195 | } |
196 | |
197 | # Must treat blocks specially. |
198 | |
199 | exit if @ARGV and not grep { $_ eq Block } @ARGV; |
200 | print "Block\n"; |
d357d9fe |
201 | open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n"; |
a0ed51b3 |
202 | open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n"; |
14055466 |
203 | print OUT <<EOH; |
204 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
205 | # This file is built by $0 from e.g. $UnicodeData. |
206 | # Any changes made here will be lost! |
207 | EOH |
a0ed51b3 |
208 | print OUT <<"END"; |
209 | return <<'END'; |
210 | END |
211 | |
212 | while (<UD>) { |
213 | next if /^#/; |
214 | next if /^$/; |
215 | chomp; |
216 | ($code, $last, $name) = split(/; */); |
217 | if ($name) { |
218 | print OUT "$code $last $name\n"; |
219 | $name =~ s/\s+//g; |
220 | open(BLOCK, ">In/$name.pl"); |
14055466 |
221 | print BLOCK <<EOH; |
222 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
223 | # This file is built by $0 from e.g. $UnicodeData. |
224 | # Any changes made here will be lost! |
225 | EOH |
a0ed51b3 |
226 | print BLOCK <<"END2"; |
227 | return <<'END'; |
228 | $code $last |
229 | END |
230 | END2 |
231 | close BLOCK; |
232 | } |
233 | } |
234 | |
235 | print OUT "END\n"; |
236 | close OUT; |
237 | |
238 | ################################################## |
239 | |
240 | sub proplist { |
241 | my ($table, $wanted, $val) = @_; |
242 | my @wanted; |
243 | my $out; |
244 | my $split; |
245 | |
246 | if ($table =~ /^Arab/) { |
d357d9fe |
247 | open(UD, "ArabShap.txt") or warn "Can't open $table: $!"; |
a0ed51b3 |
248 | |
249 | $split = '($code, $name, $link, $linkgroup) = split(/; */);'; |
250 | } |
251 | elsif ($table =~ /^Jamo/) { |
d357d9fe |
252 | open(UD, "Jamo-2.txt") or warn "Can't open $table: $!"; |
a0ed51b3 |
253 | |
254 | $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;'; |
255 | } |
499bfa7a |
256 | elsif ($table =~ /^IsSyl/) { |
257 | open(UD, "syllables.txt") or warn "Can't open $table: $!"; |
258 | |
259 | $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;'; |
260 | } |
a0ed51b3 |
261 | else { |
11695a73 |
262 | open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!"; |
a0ed51b3 |
263 | |
264 | $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1, |
265 | $comment, $up, $down, $title) = split(/;/);'; |
266 | } |
267 | |
268 | if ($table =~ /^(?:To|Is)[A-Z]/) { |
269 | eval <<"END"; |
270 | while (<UD>) { |
271 | next if /^#/; |
272 | next if /^\s/; |
273 | chop; |
274 | $split |
275 | if ($wanted) { |
276 | push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]); |
277 | } |
278 | } |
279 | END |
280 | die $@ if $@; |
281 | |
282 | while (@wanted) { |
283 | $beg = shift @wanted; |
284 | $last = $beg; |
285 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
286 | (not $val or $wanted[0]->[1] == $last->[1] + 1)) { |
287 | $last = shift @wanted; |
288 | } |
289 | $out .= sprintf "%04x", $beg->[0]; |
290 | if ($beg->[2]) { |
291 | $last = shift @wanted; |
292 | } |
293 | if ($beg == $last) { |
294 | $out .= "\t"; |
295 | } |
296 | else { |
297 | $out .= sprintf "\t%04x", $last->[0]; |
298 | } |
299 | $out .= sprintf "\t%04x", $beg->[1] if $val; |
300 | $out .= "\n"; |
301 | } |
302 | } |
303 | else { |
304 | eval <<"END"; |
305 | while (<UD>) { |
306 | next if /^#/; |
307 | next if /^\s*\$/; |
308 | chop; |
309 | $split |
310 | if ($wanted) { |
311 | push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]); |
312 | } |
313 | } |
314 | END |
315 | die $@ if $@; |
316 | |
317 | while (@wanted) { |
318 | $beg = shift @wanted; |
319 | $last = $beg; |
320 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
321 | ($wanted[0]->[1] eq $last->[1])) { |
322 | $last = shift @wanted; |
323 | } |
324 | $out .= sprintf "%04x", $beg->[0]; |
325 | if ($beg->[2]) { |
326 | $last = shift @wanted; |
327 | } |
328 | if ($beg == $last) { |
329 | $out .= "\t"; |
330 | } |
331 | else { |
332 | $out .= sprintf "\t%04x", $last->[0]; |
333 | } |
334 | $out .= sprintf "\t%s\n", $beg->[1]; |
335 | } |
336 | } |
337 | $out; |
338 | } |
11695a73 |
339 | |
11eeea96 |
340 | # Create the equivalence mappings. |
341 | |
13a0e1a7 |
342 | open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n"; |
343 | |
344 | while (<UNICODEDATA>) { |
6dd159d1 |
345 | ($code, $name, $category, $decomposition) = (split /;/)[0,1,2,5]; |
13a0e1a7 |
346 | |
347 | $code{$name} = $code; |
348 | $name{$code} = $name; |
39f2df93 |
349 | $category{$code} = $category; |
350 | |
351 | next unless $category =~ /^L/; |
352 | |
353 | # The definition of "equivalence" is twofold. |
354 | if ($decomposition ne '') { |
355 | # (1) If there's an official Unicode decomposition |
356 | # and the base is a Unicode letter. |
357 | $decomposition =~ s/^<\w+> //; |
358 | @decomposition = split(' ', $decomposition); |
359 | # Some Arabic ligatures like |
360 | # FC5E;ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM;Lo;... |
361 | # are problematic because their decomposition begins with |
362 | # a space (0020) -- which could be just skipped -- but then |
363 | # their base glyph is not a letter, for example |
364 | # the above decomposes as <isolated> 0020 064C 0651, |
365 | # but 064C is 064C;ARABIC DAMMATAN;Mn;... |
366 | # (the 0651 being ARABIC SHADDA;Mn) |
367 | ($basecode) = shift @decomposition; |
368 | push @base, [ $code, $basecode ]; |
369 | } elsif ($name =~ /^(.+?) WITH /) { |
370 | # (2) If there's a "FOO WITH ..." Unicode name and FOO |
371 | # happens to be valid Unicode letter. This is |
372 | # a debatable definition and all fault is by me (jhi). |
373 | # For example this definition adds |
374 | # LATIN SMALL LETTER O WITH STROKE |
375 | # as a derivative of |
376 | # LATIN SMALL LETTER O |
377 | # which some might rightfully contest, especially |
378 | # the speakers of languages who have the former |
379 | # phonetically as very distinct from the latter. |
380 | push @with, [ $code, $1 ]; |
381 | } |
382 | } |
13a0e1a7 |
383 | |
39f2df93 |
384 | foreach my $w (@with) { |
385 | ($code, $basename) = @$w; |
386 | next if not exists $code{$basename} or |
387 | not $category{$code{$basename}} =~ /^L/; |
388 | push @base, [ $code, $code{$basename} ]; |
13a0e1a7 |
389 | } |
390 | |
39f2df93 |
391 | @base = sort { $a->[0] cmp $b->[0] } @base; |
392 | |
13a0e1a7 |
393 | foreach my $b (@base) { |
6dd159d1 |
394 | ($code, $basecode) = @$b; |
39f2df93 |
395 | $basename = $name{$basecode}; |
396 | next if not defined $basename or |
397 | not exists $code{$basename} or |
398 | not $category{$code{$basename}} =~ /^L/; |
399 | push @{$unicode{$code{$basename}}}, $code; |
400 | # print "$code: $name{$code} -> $basename\n", |
13a0e1a7 |
401 | } |
402 | |
403 | @unicode = sort keys %unicode; |
404 | |
883d4c97 |
405 | print "EqUnicode\n"; |
14055466 |
406 | if (open(OUT, ">Eq/Unicode.pl")) { |
407 | print OUT <<EOH; |
408 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
409 | # This file is built by $0 from e.g. $UnicodeData. |
410 | # Any changes made here will be lost! |
411 | return <<'END'; |
412 | EOH |
13a0e1a7 |
413 | foreach my $c (@unicode) { |
14055466 |
414 | print OUT "$c @{$unicode{$c}}\n"; |
13a0e1a7 |
415 | } |
14055466 |
416 | print OUT "END\n"; |
417 | close OUT; |
13a0e1a7 |
418 | } else { |
419 | die "$0: failed to open Eq/Unicode for writing: $!\n"; |
420 | } |
421 | |
883d4c97 |
422 | print "EqLatin1\n"; |
14055466 |
423 | if (open(OUT, ">Eq/Latin1.pl")) { |
424 | print OUT <<EOH; |
425 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
426 | # This file is built by $0 from e.g. $UnicodeData. |
427 | # Any changes made here will be lost! |
428 | return <<'END'; |
429 | EOH |
13a0e1a7 |
430 | foreach my $c (@unicode) { |
431 | last if hex($c) > 255; |
39f2df93 |
432 | my @c = grep { hex($_) < 256 } @{$unicode{$c}}; |
13a0e1a7 |
433 | next unless @c; |
14055466 |
434 | print OUT "$c @c\n"; |
13a0e1a7 |
435 | } |
14055466 |
436 | print OUT "END\n"; |
437 | close OUT; |
13a0e1a7 |
438 | } else { |
439 | die "$0: failed to open Eq/Latin1 for writing: $!\n"; |
440 | } |
6dd159d1 |
441 | |
442 | # eof |