Commit | Line | Data |
a0ed51b3 |
1 | #!../../miniperl |
2 | |
11695a73 |
3 | $UnicodeData = "UnicodeData-Latest.txt"; |
4 | |
a0ed51b3 |
5 | # Note: we try to keep filenames unique within first 8 chars. Using |
6 | # subdirectories for the following helps. |
7 | mkdir "In", 0777; |
8 | mkdir "Is", 0777; |
9 | mkdir "To", 0777; |
11695a73 |
10 | mkdir "Eq", 0777; |
a0ed51b3 |
11 | |
c939d428 |
12 | open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n"; |
13 | |
14 | while (<UNICODEDATA>) { |
15 | ($code, $name) = split /;/; |
16 | |
17 | $code{$name} = $code; |
18 | $name{$code} = $name; |
19 | |
20 | next unless $name =~ /^(.+? LETTER .+?) WITH .+( \w+ FORM)?$/; |
21 | |
22 | push @base, [ $code, $1 ]; |
23 | push @base, [ $code, $1.$2 ] if $2 ne ''; |
24 | |
25 | # Before this "diacritics stripping" phase (and for Arabic, also |
26 | # "form stripping" phase) all ligatures could be decomposed into |
27 | # their constituent letters. |
28 | # |
29 | # For example the ligature |
30 | # ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ALEF ISOLATED FORM |
31 | # would go first through ligature decomposition producing the two letters |
32 | # ARABIC LETTER YEH WITH HAMZA ABOVE ISOLATED FORM |
33 | # ARABIC LETTER ALEF WITH HAMZA ABOVE ISOLATED FORM |
34 | # and those with diacritics stripping |
35 | # ARABIC LETTER YEH ISOLATED FORM |
36 | # ARABIC LETTER ALEF ISOLATED FORM |
37 | # and those with the Arabic form stripping |
38 | # ARABIC LETTER YEH |
39 | # ARABIC LETTER ALEF ISOLATED FORM |
40 | # ARABIC LETTER YEH |
41 | # ARABIC LETTER ALEF ISOLATED FORM |
42 | # |
43 | # Similarly for ligatures from other scripts. |
44 | # Effectively this would mean that ligatures turn into categories |
45 | # (Unicodese for character classes). |
46 | } |
47 | |
48 | foreach my $b (@base) { |
49 | ($code, $base) = @$b; |
50 | next unless exists $code{$base}; |
51 | push @{$unicode{$code{$base}}}, $code; |
52 | # print "$code: $name{$code} -> $base\n", |
53 | } |
54 | |
55 | @unicode = sort keys %unicode; |
56 | |
57 | print "Eq/Unicode\n"; |
58 | if (open(EQ_UNICODE, ">Eq/Unicode")) { |
59 | foreach my $c (@unicode) { |
60 | print EQ_UNICODE "$c @{$unicode{$c}}\n"; |
61 | } |
62 | close EQ_UNICODE; |
63 | } else { |
64 | die "$0: failed to open Eq/Unicode for writing: $!\n"; |
65 | } |
66 | |
67 | print "Eq/Latin1\n"; |
68 | if (open(EQ_LATIN1, ">Eq/Latin1")) { |
69 | foreach my $c (@unicode) { |
70 | last if hex($c) > 255; |
71 | my @c = grep { hex($_) <= 255 } @{$unicode{$c}}; |
72 | next unless @c; |
73 | print EQ_LATIN1 "$c @c\n"; |
74 | } |
75 | close EQ_LATIN1; |
76 | } else { |
77 | die "$0: failed to open Eq/Latin1 for writing: $!\n"; |
78 | } |
79 | |
a0ed51b3 |
80 | @todo = ( |
81 | # typical |
82 | |
b8c5462f |
83 | ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''], |
84 | ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''], |
85 | ['IsAlpha', '$cat =~ /^L[ulo]/', ''], |
86 | ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''], |
87 | ['IsDigit', '$cat =~ /^Nd$/', ''], |
88 | ['IsUpper', '$cat =~ /^Lu$/', ''], |
89 | ['IsLower', '$cat =~ /^Ll$/', ''], |
90 | ['IsASCII', 'hex $code <= 127', ''], |
91 | ['IsCntrl', '$cat =~ /^C/', ''], |
92 | ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''], |
93 | ['IsPrint', '$cat =~ /^[^C]/', ''], |
94 | ['IsPunct', '$cat =~ /^P/', ''], |
95 | ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''], |
96 | ['ToUpper', '$up', '$up'], |
97 | ['ToLower', '$down', '$down'], |
98 | ['ToTitle', '$title', '$title'], |
99 | ['ToDigit', '$dec ne ""', '$dec'], |
a0ed51b3 |
100 | |
101 | # Name |
102 | |
103 | ['Name', '$name', '$name'], |
104 | |
105 | # Category |
106 | |
107 | ['Category', '$cat', '$cat'], |
108 | |
109 | # Normative |
110 | |
111 | ['IsM', '$cat =~ /^M/', ''], # Mark |
112 | ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing |
113 | ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining |
114 | |
115 | ['IsN', '$cat =~ /^N/', ''], # Number |
116 | ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit |
117 | ['IsNo', '$cat eq "No"', ''], # Number, Other |
118 | |
119 | ['IsZ', '$cat =~ /^Z/', ''], # Zeparator |
120 | ['IsZs', '$cat eq "Zs"', ''], # Separator, Space |
121 | ['IsZl', '$cat eq "Zl"', ''], # Separator, Line |
122 | ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph |
123 | |
124 | ['IsC', '$cat =~ /^C/', ''], # Crazy |
125 | ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format |
126 | ['IsCo', '$cat eq "Co"', ''], # Other, Private Use |
127 | ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned |
128 | |
129 | # Informative |
130 | |
131 | ['IsL', '$cat =~ /^L/', ''], # Letter |
132 | ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase |
133 | ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase |
134 | ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase |
135 | ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier |
136 | ['IsLo', '$cat eq "Lo"', ''], # Letter, Other |
137 | |
138 | ['IsP', '$cat =~ /^P/', ''], # Punctuation |
139 | ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash |
140 | ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open |
141 | ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close |
142 | ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other |
143 | |
144 | ['IsS', '$cat =~ /^S/', ''], # Symbol |
145 | ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math |
146 | ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency |
147 | ['IsSo', '$cat eq "So"', ''], # Symbol, Other |
148 | |
149 | # Combining class |
150 | ['CombiningClass', '$comb', '$comb'], |
151 | |
152 | # BIDIRECTIONAL PROPERTIES |
153 | |
154 | ['Bidirectional', '$bid', '$bid'], |
155 | |
156 | # Strong types: |
157 | |
158 | ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic, |
159 | # syllabic, and logographic |
160 | # characters (e.g., CJK |
161 | # ideographs) |
162 | ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew, |
163 | # and punctuation specific to |
164 | # those scripts |
165 | |
166 | # Weak types: |
167 | |
168 | ['IsBidiEN','$bid eq "EN"', ''], # European Number |
169 | ['IsBidiES','$bid eq "ES"', ''], # European Number Separator |
170 | ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator |
171 | ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number |
172 | ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator |
173 | |
174 | # Separators: |
175 | |
176 | ['IsBidiB', '$bid eq "B"', ''], # Block Separator |
177 | ['IsBidiS', '$bid eq "S"', ''], # Segment Separator |
178 | |
179 | # Neutrals: |
180 | |
181 | ['IsBidiWS','$bid eq "WS"', ''], # Whitespace |
182 | ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other |
183 | # characters: punctuation, |
184 | # symbols |
185 | |
186 | # Decomposition |
187 | |
188 | ['Decomposition', '$decomp', '$decomp'], |
189 | ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''], |
190 | ['IsDecoCompat', '$decomp =~ /^</', ''], |
191 | ['IsDCfont', '$decomp =~ /^<font>/', ''], |
192 | ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''], |
193 | ['IsDCinitial', '$decomp =~ /^<initial>/', ''], |
194 | ['IsDCinital', '$decomp =~ /^<medial>/', ''], |
195 | ['IsDCfinal', '$decomp =~ /^<final>/', ''], |
196 | ['IsDCisolated', '$decomp =~ /^<isolated>/', ''], |
197 | ['IsDCcircle', '$decomp =~ /^<circle>/', ''], |
198 | ['IsDCsuper', '$decomp =~ /^<super>/', ''], |
199 | ['IsDCsub', '$decomp =~ /^<sub>/', ''], |
200 | ['IsDCvertical', '$decomp =~ /^<vertical>/', ''], |
201 | ['IsDCwide', '$decomp =~ /^<wide>/', ''], |
202 | ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''], |
203 | ['IsDCsmall', '$decomp =~ /^<small>/', ''], |
204 | ['IsDCsquare', '$decomp =~ /^<square>/', ''], |
205 | ['IsDCcompat', '$decomp =~ /^<compat>/', ''], |
206 | |
207 | # Number |
208 | |
209 | ['Number', '$num', '$num'], |
210 | |
211 | # Mirrored |
212 | |
213 | ['IsMirrored', '$mir eq "Y"', ''], |
214 | |
215 | # Arabic |
216 | |
217 | ['ArabLink', '1', '$link'], |
218 | ['ArabLnkGrp', '1', '$linkgroup'], |
219 | |
220 | # Jamo |
221 | |
222 | ['JamoShort', '1', '$short'], |
223 | ); |
224 | |
225 | # This is not written for speed... |
226 | |
227 | foreach $file (@todo) { |
228 | my ($table, $wanted, $val) = @$file; |
229 | next if @ARGV and not grep { $_ eq $table } @ARGV; |
230 | print $table,"\n"; |
231 | if ($table =~ /^(Is|In|To)(.*)/) { |
232 | open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n"; |
233 | } |
234 | else { |
235 | open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n"; |
236 | } |
237 | print OUT <<"END"; |
238 | return <<'END'; |
239 | END |
240 | print OUT proplist($table, $wanted, $val); |
241 | print OUT "END\n"; |
242 | close OUT; |
243 | } |
244 | |
245 | # Must treat blocks specially. |
246 | |
247 | exit if @ARGV and not grep { $_ eq Block } @ARGV; |
248 | print "Block\n"; |
249 | open(UD, 'blocks.txt') or die "Can't open blocks.txt: $!\n"; |
250 | open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n"; |
251 | print OUT <<"END"; |
252 | return <<'END'; |
253 | END |
254 | |
255 | while (<UD>) { |
256 | next if /^#/; |
257 | next if /^$/; |
258 | chomp; |
259 | ($code, $last, $name) = split(/; */); |
260 | if ($name) { |
261 | print OUT "$code $last $name\n"; |
262 | $name =~ s/\s+//g; |
263 | open(BLOCK, ">In/$name.pl"); |
264 | print BLOCK <<"END2"; |
265 | return <<'END'; |
266 | $code $last |
267 | END |
268 | END2 |
269 | close BLOCK; |
270 | } |
271 | } |
272 | |
273 | print OUT "END\n"; |
274 | close OUT; |
275 | |
276 | ################################################## |
277 | |
278 | sub proplist { |
279 | my ($table, $wanted, $val) = @_; |
280 | my @wanted; |
281 | my $out; |
282 | my $split; |
283 | |
284 | if ($table =~ /^Arab/) { |
285 | open(UD, "arabshp.txt") or warn "Can't open $table: $!"; |
286 | |
287 | $split = '($code, $name, $link, $linkgroup) = split(/; */);'; |
288 | } |
289 | elsif ($table =~ /^Jamo/) { |
290 | open(UD, "jamo2.txt") or warn "Can't open $table: $!"; |
291 | |
292 | $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;'; |
293 | } |
294 | else { |
11695a73 |
295 | open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!"; |
a0ed51b3 |
296 | |
297 | $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1, |
298 | $comment, $up, $down, $title) = split(/;/);'; |
299 | } |
300 | |
301 | if ($table =~ /^(?:To|Is)[A-Z]/) { |
302 | eval <<"END"; |
303 | while (<UD>) { |
304 | next if /^#/; |
305 | next if /^\s/; |
306 | chop; |
307 | $split |
308 | if ($wanted) { |
309 | push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]); |
310 | } |
311 | } |
312 | END |
313 | die $@ if $@; |
314 | |
315 | while (@wanted) { |
316 | $beg = shift @wanted; |
317 | $last = $beg; |
318 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
319 | (not $val or $wanted[0]->[1] == $last->[1] + 1)) { |
320 | $last = shift @wanted; |
321 | } |
322 | $out .= sprintf "%04x", $beg->[0]; |
323 | if ($beg->[2]) { |
324 | $last = shift @wanted; |
325 | } |
326 | if ($beg == $last) { |
327 | $out .= "\t"; |
328 | } |
329 | else { |
330 | $out .= sprintf "\t%04x", $last->[0]; |
331 | } |
332 | $out .= sprintf "\t%04x", $beg->[1] if $val; |
333 | $out .= "\n"; |
334 | } |
335 | } |
336 | else { |
337 | eval <<"END"; |
338 | while (<UD>) { |
339 | next if /^#/; |
340 | next if /^\s*\$/; |
341 | chop; |
342 | $split |
343 | if ($wanted) { |
344 | push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]); |
345 | } |
346 | } |
347 | END |
348 | die $@ if $@; |
349 | |
350 | while (@wanted) { |
351 | $beg = shift @wanted; |
352 | $last = $beg; |
353 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
354 | ($wanted[0]->[1] eq $last->[1])) { |
355 | $last = shift @wanted; |
356 | } |
357 | $out .= sprintf "%04x", $beg->[0]; |
358 | if ($beg->[2]) { |
359 | $last = shift @wanted; |
360 | } |
361 | if ($beg == $last) { |
362 | $out .= "\t"; |
363 | } |
364 | else { |
365 | $out .= sprintf "\t%04x", $last->[0]; |
366 | } |
367 | $out .= sprintf "\t%s\n", $beg->[1]; |
368 | } |
369 | } |
370 | $out; |
371 | } |
11695a73 |
372 | |
11eeea96 |
373 | # Create the equivalence mappings. |
374 | |
11695a73 |
375 | |