Commit | Line | Data |
ac5ea531 |
1 | #!perl |
2 | # |
3 | # This script generates "unfcan.h", "unfcpt.h", "unfcmb.h", |
f027f502 |
4 | # "unfcmp.h", and "unfexc.h" |
ac5ea531 |
5 | # from CombiningClass.pl, Decomposition.pl, CompExcl.txt |
6 | # in lib/unicore or unicode directory |
7 | # for Unicode::Normalize.xs. (cf. Makefile.PL) |
8 | # |
f027f502 |
9 | # Usage: <perl mkheader> in command line |
10 | # or <do 'mkheader'> in perl |
11 | # |
ac5ea531 |
12 | use 5.006; |
13 | use strict; |
14 | use warnings; |
15 | use Carp; |
16 | |
17 | our $PACKAGE = 'Unicode::Normalize, mkheader'; |
18 | |
19 | our $Combin = do "unicore/CombiningClass.pl" |
8f118dcd |
20 | || do "unicode/CombiningClass.pl" |
21 | || croak "$PACKAGE: CombiningClass.pl not found"; |
ac5ea531 |
22 | |
23 | our $Decomp = do "unicore/Decomposition.pl" |
8f118dcd |
24 | || do "unicode/Decomposition.pl" |
25 | || croak "$PACKAGE: Decomposition.pl not found"; |
ac5ea531 |
26 | |
27 | our %Combin; # $codepoint => $number : combination class |
28 | our %Canon; # $codepoint => $hexstring : canonical decomp. |
29 | our %Compat; # $codepoint => $hexstring : compat. decomp. |
f027f502 |
30 | our %Compos; # $1st,$2nd => $codepoint : composite |
ac5ea531 |
31 | our %Exclus; # $codepoint => 1 : composition exclusions |
8f118dcd |
32 | our %Single; # $codepoint => 1 : singletons |
33 | our %NonStD; # $codepoint => 1 : non-starter decompositions |
ac5ea531 |
34 | |
35 | { |
8f118dcd |
36 | my($f, $fh); |
37 | foreach my $d (@INC) { |
38 | use File::Spec; |
39 | $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt"); |
40 | last if open($fh, $f); |
41 | $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); |
42 | last if open($fh, $f); |
43 | $f = undef; |
44 | } |
45 | croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f; |
46 | while (<$fh>) { |
47 | next if /^#/ or /^$/; |
48 | s/#.*//; |
49 | $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/; |
50 | } |
51 | close $fh; |
ac5ea531 |
52 | } |
53 | |
8f118dcd |
54 | while ($Combin =~ /(.+)/g) { |
55 | my @tab = split /\t/, $1; |
56 | my $ini = hex $tab[0]; |
57 | if ($tab[1] eq '') { |
58 | $Combin{ $ini } = $tab[2]; |
59 | } else { |
60 | $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]); |
61 | } |
ac5ea531 |
62 | } |
63 | |
8f118dcd |
64 | while ($Decomp =~ /(.+)/g) { |
65 | my @tab = split /\t/, $1; |
66 | my $compat = $tab[2] =~ s/<[^>]+>//; |
67 | my $dec = [ _getHexArray($tab[2]) ]; # decomposition |
8f118dcd |
68 | my $ini = hex($tab[0]); # initial decomposable character |
69 | if ($tab[1] eq '') { |
70 | $Compat{ $ini } = $dec; |
71 | |
72 | if (! $compat) { |
73 | $Canon{ $ini } = $dec; |
74 | |
f027f502 |
75 | if (@$dec == 2) { |
8f118dcd |
76 | if ($Combin{ $dec->[0] }) { |
77 | $NonStD{ $ini } = 1; |
78 | } else { |
f027f502 |
79 | $Compos{ $dec->[0] }{ $dec->[1] } = $ini; |
8f118dcd |
80 | } |
f027f502 |
81 | } elsif (@$dec == 1) { |
8f118dcd |
82 | $Single{ $ini } = 1; |
f027f502 |
83 | } else { |
84 | croak("Weird Canonical Decomposition of U+$tab[0]"); |
8f118dcd |
85 | } |
86 | } |
87 | } else { |
88 | foreach my $u ($ini .. hex($tab[1])){ |
89 | $Compat{ $u } = $dec; |
90 | if (! $compat) { |
91 | $Canon{ $u } = $dec; |
92 | |
f027f502 |
93 | if (@$dec == 2) { |
8f118dcd |
94 | if ($Combin{ $dec->[0] }) { |
95 | $NonStD{ $u } = 1; |
96 | } else { |
f027f502 |
97 | $Compos{ $dec->[0] }{ $dec->[1] } = $u; |
8f118dcd |
98 | } |
f027f502 |
99 | } elsif (@$dec == 1) { |
8f118dcd |
100 | $Single{ $u } = 1; |
f027f502 |
101 | } else { |
102 | croak("Weird Canonical Decomposition of U+$tab[0]"); |
8f118dcd |
103 | } |
104 | } |
105 | } |
ac5ea531 |
106 | } |
ac5ea531 |
107 | } |
108 | |
109 | # exhaustive decomposition |
110 | foreach my $key (keys %Canon) { |
8f118dcd |
111 | $Canon{$key} = [ getCanonList($key) ]; |
ac5ea531 |
112 | } |
113 | |
114 | # exhaustive decomposition |
115 | foreach my $key (keys %Compat) { |
8f118dcd |
116 | $Compat{$key} = [ getCompatList($key) ]; |
ac5ea531 |
117 | } |
118 | |
119 | sub getCanonList { |
8f118dcd |
120 | my @src = @_; |
121 | my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src; |
122 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); |
123 | # condition @src == @dec is not ok. |
ac5ea531 |
124 | } |
125 | |
126 | sub getCompatList { |
8f118dcd |
127 | my @src = @_; |
128 | my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src; |
129 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); |
130 | # condition @src == @dec is not ok. |
ac5ea531 |
131 | } |
132 | |
8f118dcd |
133 | sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g } |
ac5ea531 |
134 | |
135 | sub _U_stringify { |
8f118dcd |
136 | sprintf '"%s"', join '', |
137 | map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_; |
ac5ea531 |
138 | } |
f027f502 |
139 | # Do we need say <pack 'U*', map utf8::unicode_to_native($_),> |
140 | # instead of <pack 'U*',> for EBCDIC? |
ac5ea531 |
141 | |
142 | foreach my $hash (\%Canon, \%Compat) { |
8f118dcd |
143 | foreach my $key (keys %$hash) { |
144 | $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); |
145 | } |
ac5ea531 |
146 | } |
147 | |
ac5ea531 |
148 | my $prefix = "UNF_"; |
ac5ea531 |
149 | my $structname = "${prefix}complist"; |
150 | |
8f118dcd |
151 | our (%Comp1st, %Comp2nd, %CompList); |
ac5ea531 |
152 | |
f027f502 |
153 | foreach my $c1 (keys %Compos) { |
154 | my $name = sprintf "${structname}_%06x", $c1; |
155 | $Comp1st{$c1} = $name; |
156 | |
157 | foreach my $c2 (keys %{ $Compos{$c1} }) { |
158 | my $composite = $Compos{$c1}{$c2}; |
159 | $Comp2nd{$c2} = 1 if ! $Exclus{$composite} && ! $Combin{$c1}; |
160 | $CompList{$name}{$c2} = $composite; |
161 | } |
8f118dcd |
162 | } |
163 | |
164 | # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo |
f027f502 |
165 | foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { |
166 | $Comp2nd{$j} = 1; |
ac5ea531 |
167 | } |
168 | |
169 | my $compinit = |
8f118dcd |
170 | "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; |
ac5ea531 |
171 | |
172 | foreach my $i (sort keys %CompList) { |
8f118dcd |
173 | $compinit .= "$structname $i [] = {\n"; |
174 | $compinit .= join ",\n", |
175 | map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), |
176 | sort {$a <=> $b } keys %{ $CompList{$i} }; |
177 | $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel |
ac5ea531 |
178 | } |
179 | |
180 | #################################### |
181 | |
8f118dcd |
182 | my @boolfunc = ( |
183 | { |
184 | name => "Exclusion", |
185 | type => "bool", |
186 | hash => \%Exclus, |
187 | }, |
188 | { |
189 | name => "Singleton", |
190 | type => "bool", |
191 | hash => \%Single, |
192 | }, |
193 | { |
194 | name => "NonStDecomp", |
195 | type => "bool", |
196 | hash => \%NonStD, |
197 | }, |
198 | { |
199 | name => "Comp2nd", |
200 | type => "bool", |
201 | hash => \%Comp2nd, |
202 | }, |
203 | ); |
ac5ea531 |
204 | |
205 | my $file = "unfexc.h"; |
206 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; |
207 | binmode FH; select FH; |
208 | |
8f118dcd |
209 | print << 'EOF'; |
210 | /* |
211 | * This file is auto-generated by mkheader. |
212 | * Any changes here will be lost! |
213 | */ |
214 | EOF |
ac5ea531 |
215 | |
8f118dcd |
216 | foreach my $tbl (@boolfunc) { |
217 | my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; |
218 | my $type = $tbl->{type}; |
219 | my $name = $tbl->{name}; |
220 | print "$type is$name (UV uv)\n{\nreturn\n\t"; |
221 | |
222 | while (@temp) { |
223 | my $cur = shift @temp; |
224 | if (@temp && $cur + 1 == $temp[0]) { |
225 | print "($cur <= uv && uv <= "; |
226 | while (@temp && $cur + 1 == $temp[0]) { |
227 | $cur = shift @temp; |
228 | } |
229 | print "$cur)"; |
230 | print "\n\t|| " if @temp; |
231 | } else { |
232 | print "uv == $cur"; |
233 | print "\n\t|| " if @temp; |
234 | } |
ac5ea531 |
235 | } |
8f118dcd |
236 | print "\n\t? TRUE : FALSE;\n}\n\n"; |
ac5ea531 |
237 | } |
238 | |
ac5ea531 |
239 | close FH; |
240 | |
8f118dcd |
241 | |
ac5ea531 |
242 | #################################### |
243 | |
244 | my @tripletable = ( |
8f118dcd |
245 | { |
246 | file => "unfcmb", |
247 | name => "combin", |
248 | type => "STDCHAR", |
249 | hash => \%Combin, |
250 | null => 0, |
251 | }, |
252 | { |
253 | file => "unfcan", |
254 | name => "canon", |
255 | type => "char*", |
256 | hash => \%Canon, |
257 | null => "NULL", |
258 | }, |
259 | { |
260 | file => "unfcpt", |
261 | name => "compat", |
262 | type => "char*", |
263 | hash => \%Compat, |
264 | null => "NULL", |
265 | }, |
266 | { |
267 | file => "unfcmp", |
268 | name => "compos", |
269 | type => "$structname *", |
270 | hash => \%Comp1st, |
271 | null => "NULL", |
272 | init => $compinit, |
273 | }, |
ac5ea531 |
274 | ); |
275 | |
276 | foreach my $tbl (@tripletable) { |
8f118dcd |
277 | my $file = "$tbl->{file}.h"; |
278 | my $head = "${prefix}$tbl->{name}"; |
279 | my $type = $tbl->{type}; |
280 | my $hash = $tbl->{hash}; |
281 | my $null = $tbl->{null}; |
282 | my $init = $tbl->{init}; |
283 | |
284 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; |
285 | binmode FH; select FH; |
286 | my %val; |
287 | |
288 | print FH << 'EOF'; |
ac5ea531 |
289 | /* |
290 | * This file is auto-generated by mkheader. |
291 | * Any changes here will be lost! |
292 | */ |
293 | EOF |
294 | |
8f118dcd |
295 | print $init if defined $init; |
296 | |
297 | foreach my $uv (keys %$hash) { |
f027f502 |
298 | croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) |
299 | unless $uv <= 0x10FFFF; |
8f118dcd |
300 | my @c = unpack 'CCCC', pack 'N', $uv; |
301 | $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; |
302 | } |
303 | |
304 | foreach my $p (sort { $a <=> $b } keys %val) { |
305 | next if ! $val{ $p }; |
306 | for (my $r = 0; $r < 256; $r++) { |
307 | next if ! $val{ $p }{ $r }; |
308 | printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r; |
309 | for (my $c = 0; $c < 256; $c++) { |
310 | print "\t", defined $val{$p}{$r}{$c} |
311 | ? "($type)".$val{$p}{$r}{$c} |
312 | : $null; |
313 | print ',' if $c != 255; |
314 | print "\n" if $c % 8 == 7; |
315 | } |
316 | print "};\n\n"; |
317 | } |
318 | } |
319 | foreach my $p (sort { $a <=> $b } keys %val) { |
320 | next if ! $val{ $p }; |
321 | printf "$type* ${head}_%02x [256] = {\n", $p; |
322 | for (my $r = 0; $r < 256; $r++) { |
323 | print $val{ $p }{ $r } |
324 | ? sprintf("${head}_%02x_%02x", $p, $r) |
325 | : "NULL"; |
326 | print ',' if $r != 255; |
327 | print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; |
328 | } |
329 | print "};\n\n"; |
ac5ea531 |
330 | } |
8f118dcd |
331 | print "$type** $head [] = {\n"; |
332 | for (my $p = 0; $p <= 0x10; $p++) { |
333 | print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; |
334 | print ',' if $p != 0x10; |
335 | print "\n"; |
ac5ea531 |
336 | } |
337 | print "};\n\n"; |
8f118dcd |
338 | close FH; |
ac5ea531 |
339 | } |
340 | |
341 | __END__ |