Commit | Line | Data |
ac5ea531 |
1 | #!perl |
2 | # |
3 | # This script generates "unfcan.h", "unfcpt.h", "unfcmb.h", |
4 | # "unfcmp.h", and "unfexc.h" |
5 | # from CombiningClass.pl, Decomposition.pl, CompExcl.txt |
6 | # in lib/unicore or unicode directory |
7 | # for Unicode::Normalize.xs. (cf. Makefile.PL) |
8 | # |
9 | use 5.006; |
10 | use strict; |
11 | use warnings; |
12 | use Carp; |
13 | |
14 | our $PACKAGE = 'Unicode::Normalize, mkheader'; |
15 | |
16 | our $Combin = do "unicore/CombiningClass.pl" |
17 | || do "unicode/CombiningClass.pl" |
18 | || croak "$PACKAGE: CombiningClass.pl not found"; |
19 | |
20 | our $Decomp = do "unicore/Decomposition.pl" |
21 | || do "unicode/Decomposition.pl" |
22 | || croak "$PACKAGE: Decomposition.pl not found"; |
23 | |
24 | our %Combin; # $codepoint => $number : combination class |
25 | our %Canon; # $codepoint => $hexstring : canonical decomp. |
26 | our %Compat; # $codepoint => $hexstring : compat. decomp. |
27 | our %Compos; # $string => $codepoint : composite |
28 | |
29 | our %Exclus; # $codepoint => 1 : composition exclusions |
30 | |
31 | { |
32 | my($f, $fh); |
33 | foreach my $d (@INC) { |
34 | use File::Spec; |
35 | $f = File::Spec->catfile($d, "unicore", "CompExcl.txt"); |
36 | last if open($fh, $f); |
37 | $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); |
38 | last if open($fh, $f); |
39 | $f = undef; |
40 | } |
41 | croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f; |
42 | while(<$fh>) { |
43 | next if /^#/ or /^$/; |
44 | s/#.*//; |
45 | $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/; |
46 | } |
47 | close $fh; |
48 | } |
49 | |
50 | while($Combin =~ /(.+)/g) { |
51 | my @tab = split /\t/, $1; |
52 | my $ini = hex $tab[0]; |
53 | if($tab[1] eq '') { |
54 | $Combin{ $ini } = $tab[2]; |
55 | } else { |
56 | $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]); |
57 | } |
58 | } |
59 | |
60 | while($Decomp =~ /(.+)/g) { |
61 | my @tab = split /\t/, $1; |
62 | my $compat = $tab[2] =~ s/<[^>]+>//; |
63 | my $dec = [ _getHexArray($tab[2]) ]; # decomposition |
64 | my $com = pack('U*', @$dec); # composable sequence |
65 | my $ini = hex($tab[0]); |
66 | if($tab[1] eq '') { |
67 | $Compat{ $ini } = $dec; |
68 | if(! $compat) { |
69 | $Canon{ $ini } = $dec; |
70 | $Compos{ $com } = $ini if @$dec > 1; |
71 | } |
72 | } else { |
73 | foreach my $u ($ini .. hex($tab[1])){ |
74 | $Compat{ $u } = $dec; |
75 | if(! $compat){ |
76 | $Canon{ $u } = $dec; |
77 | $Compos{ $com } = $ini if @$dec > 1; |
78 | } |
79 | } |
80 | } |
81 | } |
82 | |
83 | # exhaustive decomposition |
84 | foreach my $key (keys %Canon) { |
85 | $Canon{$key} = [ getCanonList($key) ]; |
86 | } |
87 | |
88 | # exhaustive decomposition |
89 | foreach my $key (keys %Compat) { |
90 | $Compat{$key} = [ getCompatList($key) ]; |
91 | } |
92 | |
93 | sub getCanonList { |
94 | my @src = @_; |
95 | my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src; |
96 | join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); |
97 | # condition @src == @dec is not ok. |
98 | } |
99 | |
100 | sub getCompatList { |
101 | my @src = @_; |
102 | my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src; |
103 | join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); |
104 | # condition @src == @dec is not ok. |
105 | } |
106 | |
107 | sub _getHexArray { |
108 | my $str = shift; |
109 | map hex(), $str =~ /([0-9A-Fa-f]+)/g; |
110 | } |
111 | |
112 | sub _U_stringify { |
113 | sprintf '"%s"', join '', |
114 | map sprintf("\\x%2x", $_), unpack 'C*', pack 'U*', @_; |
115 | } |
116 | |
117 | foreach my $hash (\%Canon, \%Compat) { |
118 | foreach my $key (keys %$hash) { |
119 | $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); |
120 | } |
121 | } |
122 | |
123 | sub utf8len { |
124 | my $uv = shift; |
125 | return $uv < 0x80 ? 1 : |
126 | $uv < 0x800 ? 2 : |
127 | $uv < 0x10000 ? 3 : |
128 | $uv < 0x110000 ? 4 : |
129 | croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff."; |
130 | } |
131 | |
2a204b45 |
132 | sub utfebcdiclen { |
133 | my $uv = shift; |
134 | return $uv < 0xA0 ? 1 : |
135 | $uv < 0x400 ? 2 : |
136 | $uv < 0x4000 ? 3 : |
137 | $uv < 0x40000 ? 4 : |
138 | $uv < 0x110000 ? 5 : |
139 | croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff."; |
140 | } |
141 | |
ac5ea531 |
142 | my $prefix = "UNF_"; |
143 | |
144 | my $structname = "${prefix}complist"; |
145 | |
146 | our (%Comp1st, %CompList); |
147 | |
2a204b45 |
148 | my $errExpand = "$PACKAGE: A composable pair in %s " |
149 | . "is longer than the composite in bytes!\n" |
150 | . "%d + %d => %d\nQuit. Please inform the author..."; |
151 | |
ac5ea531 |
152 | foreach(sort keys %Compos) { |
153 | my @a = unpack('U*', $_); |
154 | my $val = $Compos{$_}; |
155 | my $name = sprintf "${structname}_%06x", $a[0]; |
156 | $Comp1st{ $a[0] } = $name; |
157 | $CompList{ $name }{ $a[1] } = $val; |
158 | |
159 | if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) { |
2a204b45 |
160 | croak sprintf($errExpand, "utf-8", $a[0], $a[1], $val); |
161 | } |
162 | if( utfebcdiclen($a[0]) + utfebcdiclen($a[1]) < utfebcdiclen($val)) { |
163 | croak sprintf($errExpand, "utf-ebcdic", $a[0], $a[1], $val); |
ac5ea531 |
164 | } |
165 | } |
166 | |
167 | my $compinit = |
168 | "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; |
169 | |
170 | foreach my $i (sort keys %CompList) { |
171 | $compinit .= "$structname $i [] = {\n"; |
172 | $compinit .= join ",\n", |
173 | map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), |
174 | sort {$a <=> $b } keys %{ $CompList{$i} }; |
175 | $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel |
176 | } |
177 | |
178 | #################################### |
179 | |
180 | my @Exclus = sort {$a <=> $b} keys %Exclus; |
181 | |
182 | my $file = "unfexc.h"; |
183 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; |
184 | binmode FH; select FH; |
185 | |
2a204b45 |
186 | print "bool isExclusion (UV uv) \n{\nreturn\n\t"; |
ac5ea531 |
187 | |
188 | while(@Exclus) { |
189 | my $cur = shift @Exclus; |
190 | if(@Exclus && $cur + 1 == $Exclus[0]) { |
191 | print "$cur <= uv && uv <= "; |
192 | while(@Exclus && $cur + 1 == $Exclus[0]) { |
193 | $cur = shift @Exclus; |
194 | } |
195 | print $cur; |
196 | print "\n\t|| " if @Exclus; |
197 | } else { |
198 | print "uv == $cur"; |
199 | print "\n\t|| " if @Exclus; |
200 | } |
201 | } |
202 | |
203 | print "\n\t? TRUE : FALSE;\n}\n\n"; |
204 | close FH; |
205 | |
206 | #################################### |
207 | |
208 | my @tripletable = ( |
209 | { |
210 | file => "unfcmb", |
211 | name => "combin", |
3164a1ca |
212 | type => "STDCHAR", |
ac5ea531 |
213 | hash => \%Combin, |
214 | null => 0, |
215 | }, |
216 | { |
217 | file => "unfcan", |
218 | name => "canon", |
219 | type => "char*", |
220 | hash => \%Canon, |
221 | null => "NULL", |
222 | }, |
223 | { |
224 | file => "unfcpt", |
225 | name => "compat", |
226 | type => "char*", |
227 | hash => \%Compat, |
228 | null => "NULL", |
229 | }, |
230 | { |
231 | file => "unfcmp", |
232 | name => "compos", |
233 | type => "$structname *", |
234 | hash => \%Comp1st, |
235 | null => "NULL", |
236 | init => $compinit, |
237 | }, |
238 | ); |
239 | |
240 | foreach my $tbl (@tripletable) { |
241 | my $file = "$tbl->{file}.h"; |
242 | my $head = "${prefix}$tbl->{name}"; |
243 | my $type = $tbl->{type}; |
244 | my $hash = $tbl->{hash}; |
245 | my $null = $tbl->{null}; |
246 | my $init = $tbl->{init}; |
247 | |
248 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; |
249 | binmode FH; select FH; |
250 | my %val; |
251 | |
252 | print FH << 'EOF'; |
253 | /* |
254 | * This file is auto-generated by mkheader. |
255 | * Any changes here will be lost! |
256 | */ |
257 | EOF |
258 | |
259 | print $init if defined $init; |
260 | |
261 | foreach my $uv (keys %$hash) { |
262 | my @c = unpack 'CCCC', pack 'N', $uv; |
263 | $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; |
264 | } |
265 | |
266 | foreach my $p (sort { $a <=> $b } keys %val) { |
267 | next if ! $val{ $p }; |
268 | for(my $r = 0; $r < 256; $r++){ |
269 | next if ! $val{ $p }{ $r }; |
270 | printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r; |
271 | for(my $c = 0; $c < 256; $c++){ |
272 | print "\t", defined $val{$p}{$r}{$c} ? $val{$p}{$r}{$c} : $null; |
273 | print ',' if $c != 255; |
274 | print "\n" if $c % 8 == 7; |
275 | } |
276 | print "};\n\n"; |
277 | } |
278 | } |
279 | foreach my $p (sort { $a <=> $b } keys %val) { |
280 | next if ! $val{ $p }; |
281 | printf "$type* ${head}_%02x [256] = {\n", $p; |
282 | for(my $r = 0; $r < 256; $r++){ |
283 | print $val{ $p }{ $r } ? sprintf("${head}_%02x_%02x", $p, $r) : "NULL"; |
284 | print ',' if $r != 255; |
285 | print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; |
286 | } |
287 | print "};\n\n"; |
288 | } |
289 | print "$type** $head [] = {\n"; |
290 | for(my $p = 0; $p <= 0x10; $p++){ |
291 | print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; |
292 | print ',' if $p != 0x10; |
293 | print "\n"; |
294 | } |
295 | print "};\n\n"; |
296 | close FH; |
297 | } |
298 | |
299 | __END__ |