Commit | Line | Data |
ac5ea531 |
1 | #!perl |
2 | # |
628bbff0 |
3 | # This auxiliary script makes five header files |
4 | # used for building XSUB of Unicode::Normalize. |
ac5ea531 |
5 | # |
628bbff0 |
6 | # Usage: |
7 | # <do 'mkheader'> in perl, or <perl mkheader> in command line |
8 | # |
9 | # Input files: |
10 | # unicore/CombiningClass.pl (or unicode/CombiningClass.pl) |
11 | # unicore/Decomposition.pl (or unicode/Decomposition.pl) |
12 | # unicore/CompositionExclusions.txt (or unicode/CompExcl.txt) |
13 | # |
14 | # Output files: |
15 | # unfcan.h |
16 | # unfcpt.h |
17 | # unfcmb.h |
18 | # unfcmp.h |
19 | # unfexc.h |
f027f502 |
20 | # |
ac5ea531 |
21 | use 5.006; |
22 | use strict; |
23 | use warnings; |
24 | use Carp; |
6c941e0c |
25 | use File::Spec; |
26 | |
9f1f04a1 |
27 | BEGIN { |
1efaba7f |
28 | unless ("A" eq pack('U', 0x41)) { |
9f1f04a1 |
29 | die "Unicode::Normalize cannot stringify a Unicode code point\n"; |
30 | } |
31 | } |
ac5ea531 |
32 | |
33 | our $PACKAGE = 'Unicode::Normalize, mkheader'; |
34 | |
35 | our $Combin = do "unicore/CombiningClass.pl" |
8f118dcd |
36 | || do "unicode/CombiningClass.pl" |
37 | || croak "$PACKAGE: CombiningClass.pl not found"; |
ac5ea531 |
38 | |
39 | our $Decomp = do "unicore/Decomposition.pl" |
8f118dcd |
40 | || do "unicode/Decomposition.pl" |
41 | || croak "$PACKAGE: Decomposition.pl not found"; |
ac5ea531 |
42 | |
48287974 |
43 | our %Combin; # $codepoint => $number : combination class |
6c941e0c |
44 | our %Canon; # $codepoint => \@codepoints : canonical decomp. |
45 | our %Compat; # $codepoint => \@codepoints : compat. decomp. |
46 | # after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat |
48287974 |
47 | our %Exclus; # $codepoint => 1 : composition exclusions |
48 | our %Single; # $codepoint => 1 : singletons |
49 | our %NonStD; # $codepoint => 1 : non-starter decompositions |
50 | |
51 | our %Comp1st; # $codepoint => $listname : may be composed with a next char. |
52 | our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. |
53 | our %CompList; # $listname,$2nd => $codepoint : composite |
54 | |
55 | our $prefix = "UNF_"; |
56 | our $structname = "${prefix}complist"; |
ac5ea531 |
57 | |
6c941e0c |
58 | ########## definition of Hangul constants ########## |
59 | use constant SBase => 0xAC00; |
60 | use constant SFinal => 0xD7A3; # SBase -1 + SCount |
61 | use constant SCount => 11172; # LCount * NCount |
62 | use constant NCount => 588; # VCount * TCount |
63 | use constant LBase => 0x1100; |
64 | use constant LFinal => 0x1112; |
65 | use constant LCount => 19; |
66 | use constant VBase => 0x1161; |
67 | use constant VFinal => 0x1175; |
68 | use constant VCount => 21; |
69 | use constant TBase => 0x11A7; |
70 | use constant TFinal => 0x11C2; |
71 | use constant TCount => 28; |
72 | |
73 | sub decomposeHangul { |
74 | my $SIndex = $_[0] - SBase; |
75 | my $LIndex = int( $SIndex / NCount); |
76 | my $VIndex = int(($SIndex % NCount) / TCount); |
77 | my $TIndex = $SIndex % TCount; |
78 | my @ret = ( |
79 | LBase + $LIndex, |
80 | VBase + $VIndex, |
81 | $TIndex ? (TBase + $TIndex) : (), |
82 | ); |
2b8d773d |
83 | return @ret; |
6c941e0c |
84 | } |
85 | |
86 | ########## getting full decomposion ########## |
ac5ea531 |
87 | { |
8f118dcd |
88 | my($f, $fh); |
89 | foreach my $d (@INC) { |
8f118dcd |
90 | $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt"); |
91 | last if open($fh, $f); |
92 | $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); |
93 | last if open($fh, $f); |
94 | $f = undef; |
95 | } |
48287974 |
96 | croak "$PACKAGE: neither unicore/CompositionExclusions.txt " |
97 | . "nor unicode/CompExcl.txt is found in @INC" unless defined $f; |
98 | |
99 | while (<$fh>) { |
100 | next if /^#/ or /^$/; |
101 | s/#.*//; |
102 | $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/; |
103 | } |
8f118dcd |
104 | close $fh; |
ac5ea531 |
105 | } |
106 | |
48287974 |
107 | ## |
108 | ## converts string "hhhh hhhh hhhh" to a numeric list |
109 | ## |
110 | sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g } |
111 | |
8f118dcd |
112 | while ($Combin =~ /(.+)/g) { |
113 | my @tab = split /\t/, $1; |
114 | my $ini = hex $tab[0]; |
115 | if ($tab[1] eq '') { |
116 | $Combin{ $ini } = $tab[2]; |
117 | } else { |
118 | $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]); |
119 | } |
ac5ea531 |
120 | } |
121 | |
8f118dcd |
122 | while ($Decomp =~ /(.+)/g) { |
123 | my @tab = split /\t/, $1; |
124 | my $compat = $tab[2] =~ s/<[^>]+>//; |
125 | my $dec = [ _getHexArray($tab[2]) ]; # decomposition |
8f118dcd |
126 | my $ini = hex($tab[0]); # initial decomposable character |
48287974 |
127 | |
128 | my $listname = |
129 | @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS'; |
130 | # %04x is bad since it'd place _3046 after _1d157. |
131 | |
8f118dcd |
132 | if ($tab[1] eq '') { |
133 | $Compat{ $ini } = $dec; |
134 | |
135 | if (! $compat) { |
48287974 |
136 | $Canon{ $ini } = $dec; |
8f118dcd |
137 | |
f027f502 |
138 | if (@$dec == 2) { |
8f118dcd |
139 | if ($Combin{ $dec->[0] }) { |
140 | $NonStD{ $ini } = 1; |
141 | } else { |
48287974 |
142 | $CompList{ $listname }{ $dec->[1] } = $ini; |
143 | $Comp1st{ $dec->[0] } = $listname; |
144 | $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini}; |
8f118dcd |
145 | } |
f027f502 |
146 | } elsif (@$dec == 1) { |
8f118dcd |
147 | $Single{ $ini } = 1; |
f027f502 |
148 | } else { |
149 | croak("Weird Canonical Decomposition of U+$tab[0]"); |
8f118dcd |
150 | } |
151 | } |
152 | } else { |
48287974 |
153 | foreach my $u ($ini .. hex($tab[1])) { |
8f118dcd |
154 | $Compat{ $u } = $dec; |
48287974 |
155 | |
8f118dcd |
156 | if (! $compat) { |
48287974 |
157 | $Canon{ $u } = $dec; |
8f118dcd |
158 | |
f027f502 |
159 | if (@$dec == 2) { |
8f118dcd |
160 | if ($Combin{ $dec->[0] }) { |
161 | $NonStD{ $u } = 1; |
162 | } else { |
48287974 |
163 | $CompList{ $listname }{ $dec->[1] } = $u; |
164 | $Comp1st{ $dec->[0] } = $listname; |
165 | $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; |
8f118dcd |
166 | } |
f027f502 |
167 | } elsif (@$dec == 1) { |
8f118dcd |
168 | $Single{ $u } = 1; |
f027f502 |
169 | } else { |
170 | croak("Weird Canonical Decomposition of U+$tab[0]"); |
8f118dcd |
171 | } |
172 | } |
173 | } |
ac5ea531 |
174 | } |
ac5ea531 |
175 | } |
176 | |
48287974 |
177 | # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo |
178 | foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { |
179 | $Comp2nd{$j} = 1; |
ac5ea531 |
180 | } |
181 | |
182 | sub getCanonList { |
8f118dcd |
183 | my @src = @_; |
6c941e0c |
184 | my @dec = map { |
185 | (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) |
186 | : $Canon{$_} ? @{ $Canon{$_} } : $_ |
187 | } @src; |
8f118dcd |
188 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); |
189 | # condition @src == @dec is not ok. |
ac5ea531 |
190 | } |
191 | |
192 | sub getCompatList { |
8f118dcd |
193 | my @src = @_; |
6c941e0c |
194 | my @dec = map { |
195 | (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) |
196 | : $Compat{$_} ? @{ $Compat{$_} } : $_ |
197 | } @src; |
8f118dcd |
198 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); |
199 | # condition @src == @dec is not ok. |
ac5ea531 |
200 | } |
201 | |
48287974 |
202 | # exhaustive decomposition |
203 | foreach my $key (keys %Canon) { |
204 | $Canon{$key} = [ getCanonList($key) ]; |
205 | } |
206 | |
207 | # exhaustive decomposition |
628bbff0 |
208 | foreach my $key (keys %Compat) { |
48287974 |
209 | $Compat{$key} = [ getCompatList($key) ]; |
210 | } |
ac5ea531 |
211 | |
9f1f04a1 |
212 | sub _pack_U { |
1efaba7f |
213 | return pack('U*', @_); |
9f1f04a1 |
214 | } |
215 | |
2b8d773d |
216 | sub split_into_char { |
217 | use bytes; |
218 | my $uni = shift; |
219 | my $len = length($uni); |
220 | my @ary; |
221 | for(my $i = 0; $i < $len; ++$i) { |
222 | push @ary, ord(substr($uni,$i,1)); |
223 | } |
224 | return @ary; |
225 | } |
226 | |
ac5ea531 |
227 | sub _U_stringify { |
8f118dcd |
228 | sprintf '"%s"', join '', |
2b8d773d |
229 | map sprintf("\\x%02x", $_), split_into_char(_pack_U(@_)); |
ac5ea531 |
230 | } |
231 | |
232 | foreach my $hash (\%Canon, \%Compat) { |
8f118dcd |
233 | foreach my $key (keys %$hash) { |
234 | $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); |
235 | } |
ac5ea531 |
236 | } |
237 | |
6c941e0c |
238 | ########## writing header files ########## |
ac5ea531 |
239 | |
8f118dcd |
240 | my @boolfunc = ( |
241 | { |
242 | name => "Exclusion", |
243 | type => "bool", |
244 | hash => \%Exclus, |
245 | }, |
246 | { |
247 | name => "Singleton", |
248 | type => "bool", |
249 | hash => \%Single, |
250 | }, |
251 | { |
252 | name => "NonStDecomp", |
253 | type => "bool", |
254 | hash => \%NonStD, |
255 | }, |
256 | { |
257 | name => "Comp2nd", |
258 | type => "bool", |
259 | hash => \%Comp2nd, |
260 | }, |
261 | ); |
ac5ea531 |
262 | |
263 | my $file = "unfexc.h"; |
264 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; |
265 | binmode FH; select FH; |
266 | |
8f118dcd |
267 | print << 'EOF'; |
268 | /* |
269 | * This file is auto-generated by mkheader. |
270 | * Any changes here will be lost! |
271 | */ |
272 | EOF |
ac5ea531 |
273 | |
8f118dcd |
274 | foreach my $tbl (@boolfunc) { |
275 | my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; |
276 | my $type = $tbl->{type}; |
277 | my $name = $tbl->{name}; |
278 | print "$type is$name (UV uv)\n{\nreturn\n\t"; |
279 | |
280 | while (@temp) { |
281 | my $cur = shift @temp; |
282 | if (@temp && $cur + 1 == $temp[0]) { |
283 | print "($cur <= uv && uv <= "; |
284 | while (@temp && $cur + 1 == $temp[0]) { |
285 | $cur = shift @temp; |
286 | } |
287 | print "$cur)"; |
288 | print "\n\t|| " if @temp; |
289 | } else { |
290 | print "uv == $cur"; |
291 | print "\n\t|| " if @temp; |
292 | } |
ac5ea531 |
293 | } |
8f118dcd |
294 | print "\n\t? TRUE : FALSE;\n}\n\n"; |
ac5ea531 |
295 | } |
296 | |
ac5ea531 |
297 | close FH; |
298 | |
299 | #################################### |
300 | |
48287974 |
301 | my $compinit = |
302 | "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; |
303 | |
304 | foreach my $i (sort keys %CompList) { |
305 | $compinit .= "$structname $i [] = {\n"; |
306 | $compinit .= join ",\n", |
307 | map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), |
308 | sort {$a <=> $b } keys %{ $CompList{$i} }; |
309 | $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel |
310 | } |
311 | |
ac5ea531 |
312 | my @tripletable = ( |
8f118dcd |
313 | { |
314 | file => "unfcmb", |
315 | name => "combin", |
316 | type => "STDCHAR", |
317 | hash => \%Combin, |
318 | null => 0, |
319 | }, |
320 | { |
321 | file => "unfcan", |
322 | name => "canon", |
323 | type => "char*", |
324 | hash => \%Canon, |
325 | null => "NULL", |
326 | }, |
327 | { |
328 | file => "unfcpt", |
329 | name => "compat", |
330 | type => "char*", |
331 | hash => \%Compat, |
332 | null => "NULL", |
333 | }, |
334 | { |
335 | file => "unfcmp", |
336 | name => "compos", |
337 | type => "$structname *", |
338 | hash => \%Comp1st, |
339 | null => "NULL", |
340 | init => $compinit, |
341 | }, |
ac5ea531 |
342 | ); |
343 | |
344 | foreach my $tbl (@tripletable) { |
8f118dcd |
345 | my $file = "$tbl->{file}.h"; |
346 | my $head = "${prefix}$tbl->{name}"; |
347 | my $type = $tbl->{type}; |
348 | my $hash = $tbl->{hash}; |
349 | my $null = $tbl->{null}; |
350 | my $init = $tbl->{init}; |
351 | |
352 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; |
353 | binmode FH; select FH; |
354 | my %val; |
355 | |
356 | print FH << 'EOF'; |
ac5ea531 |
357 | /* |
358 | * This file is auto-generated by mkheader. |
359 | * Any changes here will be lost! |
360 | */ |
361 | EOF |
362 | |
8f118dcd |
363 | print $init if defined $init; |
364 | |
365 | foreach my $uv (keys %$hash) { |
f027f502 |
366 | croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) |
367 | unless $uv <= 0x10FFFF; |
8f118dcd |
368 | my @c = unpack 'CCCC', pack 'N', $uv; |
369 | $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; |
370 | } |
371 | |
372 | foreach my $p (sort { $a <=> $b } keys %val) { |
373 | next if ! $val{ $p }; |
374 | for (my $r = 0; $r < 256; $r++) { |
375 | next if ! $val{ $p }{ $r }; |
fe067ad9 |
376 | printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; |
8f118dcd |
377 | for (my $c = 0; $c < 256; $c++) { |
378 | print "\t", defined $val{$p}{$r}{$c} |
379 | ? "($type)".$val{$p}{$r}{$c} |
380 | : $null; |
381 | print ',' if $c != 255; |
382 | print "\n" if $c % 8 == 7; |
383 | } |
384 | print "};\n\n"; |
385 | } |
386 | } |
387 | foreach my $p (sort { $a <=> $b } keys %val) { |
388 | next if ! $val{ $p }; |
fe067ad9 |
389 | printf "static $type* ${head}_%02x [256] = {\n", $p; |
8f118dcd |
390 | for (my $r = 0; $r < 256; $r++) { |
391 | print $val{ $p }{ $r } |
392 | ? sprintf("${head}_%02x_%02x", $p, $r) |
393 | : "NULL"; |
394 | print ',' if $r != 255; |
395 | print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; |
396 | } |
397 | print "};\n\n"; |
ac5ea531 |
398 | } |
fe067ad9 |
399 | print "static $type** $head [] = {\n"; |
8f118dcd |
400 | for (my $p = 0; $p <= 0x10; $p++) { |
401 | print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; |
402 | print ',' if $p != 0x10; |
403 | print "\n"; |
ac5ea531 |
404 | } |
405 | print "};\n\n"; |
8f118dcd |
406 | close FH; |
ac5ea531 |
407 | } |
408 | |
628bbff0 |
409 | 1; |
ac5ea531 |
410 | __END__ |