Upgrade to Unicode::Normalize 0.76,
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / mkheader
CommitLineData
ac5ea531 1#!perl
2#
3# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h",
f027f502 4# "unfcmp.h", and "unfexc.h"
ac5ea531 5# from CombiningClass.pl, Decomposition.pl, CompExcl.txt
6# in lib/unicore or unicode directory
7# for Unicode::Normalize.xs. (cf. Makefile.PL)
8#
f027f502 9# Usage: <perl mkheader> in command line
10# or <do 'mkheader'> in perl
11#
ac5ea531 12use 5.006;
13use strict;
14use warnings;
15use Carp;
16
17our $PACKAGE = 'Unicode::Normalize, mkheader';
18
19our $Combin = do "unicore/CombiningClass.pl"
8f118dcd 20 || do "unicode/CombiningClass.pl"
21 || croak "$PACKAGE: CombiningClass.pl not found";
ac5ea531 22
23our $Decomp = do "unicore/Decomposition.pl"
8f118dcd 24 || do "unicode/Decomposition.pl"
25 || croak "$PACKAGE: Decomposition.pl not found";
ac5ea531 26
27our %Combin; # $codepoint => $number : combination class
28our %Canon; # $codepoint => $hexstring : canonical decomp.
29our %Compat; # $codepoint => $hexstring : compat. decomp.
f027f502 30our %Compos; # $1st,$2nd => $codepoint : composite
ac5ea531 31our %Exclus; # $codepoint => 1 : composition exclusions
8f118dcd 32our %Single; # $codepoint => 1 : singletons
33our %NonStD; # $codepoint => 1 : non-starter decompositions
ac5ea531 34
35{
8f118dcd 36 my($f, $fh);
37 foreach my $d (@INC) {
38 use File::Spec;
39 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
40 last if open($fh, $f);
41 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
42 last if open($fh, $f);
43 $f = undef;
44 }
45 croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f;
46 while (<$fh>) {
47 next if /^#/ or /^$/;
48 s/#.*//;
49 $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/;
50 }
51 close $fh;
ac5ea531 52}
53
8f118dcd 54while ($Combin =~ /(.+)/g) {
55 my @tab = split /\t/, $1;
56 my $ini = hex $tab[0];
57 if ($tab[1] eq '') {
58 $Combin{ $ini } = $tab[2];
59 } else {
60 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
61 }
ac5ea531 62}
63
8f118dcd 64while ($Decomp =~ /(.+)/g) {
65 my @tab = split /\t/, $1;
66 my $compat = $tab[2] =~ s/<[^>]+>//;
67 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
8f118dcd 68 my $ini = hex($tab[0]); # initial decomposable character
69 if ($tab[1] eq '') {
70 $Compat{ $ini } = $dec;
71
72 if (! $compat) {
73 $Canon{ $ini } = $dec;
74
f027f502 75 if (@$dec == 2) {
8f118dcd 76 if ($Combin{ $dec->[0] }) {
77 $NonStD{ $ini } = 1;
78 } else {
f027f502 79 $Compos{ $dec->[0] }{ $dec->[1] } = $ini;
8f118dcd 80 }
f027f502 81 } elsif (@$dec == 1) {
8f118dcd 82 $Single{ $ini } = 1;
f027f502 83 } else {
84 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 85 }
86 }
87 } else {
88 foreach my $u ($ini .. hex($tab[1])){
89 $Compat{ $u } = $dec;
90 if (! $compat) {
91 $Canon{ $u } = $dec;
92
f027f502 93 if (@$dec == 2) {
8f118dcd 94 if ($Combin{ $dec->[0] }) {
95 $NonStD{ $u } = 1;
96 } else {
f027f502 97 $Compos{ $dec->[0] }{ $dec->[1] } = $u;
8f118dcd 98 }
f027f502 99 } elsif (@$dec == 1) {
8f118dcd 100 $Single{ $u } = 1;
f027f502 101 } else {
102 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 103 }
104 }
105 }
ac5ea531 106 }
ac5ea531 107}
108
109# exhaustive decomposition
110foreach my $key (keys %Canon) {
8f118dcd 111 $Canon{$key} = [ getCanonList($key) ];
ac5ea531 112}
113
114# exhaustive decomposition
115foreach my $key (keys %Compat) {
8f118dcd 116 $Compat{$key} = [ getCompatList($key) ];
ac5ea531 117}
118
119sub getCanonList {
8f118dcd 120 my @src = @_;
121 my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
122 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
123 # condition @src == @dec is not ok.
ac5ea531 124}
125
126sub getCompatList {
8f118dcd 127 my @src = @_;
128 my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
129 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
130 # condition @src == @dec is not ok.
ac5ea531 131}
132
8f118dcd 133sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
ac5ea531 134
135sub _U_stringify {
8f118dcd 136 sprintf '"%s"', join '',
137 map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_;
ac5ea531 138}
f027f502 139# Do we need say <pack 'U*', map utf8::unicode_to_native($_),>
140# instead of <pack 'U*',> for EBCDIC?
ac5ea531 141
142foreach my $hash (\%Canon, \%Compat) {
8f118dcd 143 foreach my $key (keys %$hash) {
144 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
145 }
ac5ea531 146}
147
ac5ea531 148my $prefix = "UNF_";
ac5ea531 149my $structname = "${prefix}complist";
150
8f118dcd 151our (%Comp1st, %Comp2nd, %CompList);
ac5ea531 152
f027f502 153foreach my $c1 (keys %Compos) {
154 my $name = sprintf "${structname}_%06x", $c1;
155 $Comp1st{$c1} = $name;
156
157 foreach my $c2 (keys %{ $Compos{$c1} }) {
158 my $composite = $Compos{$c1}{$c2};
159 $Comp2nd{$c2} = 1 if ! $Exclus{$composite} && ! $Combin{$c1};
160 $CompList{$name}{$c2} = $composite;
161 }
8f118dcd 162}
163
164# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
f027f502 165foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
166 $Comp2nd{$j} = 1;
ac5ea531 167}
168
169my $compinit =
8f118dcd 170 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
ac5ea531 171
172foreach my $i (sort keys %CompList) {
8f118dcd 173 $compinit .= "$structname $i [] = {\n";
174 $compinit .= join ",\n",
175 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
176 sort {$a <=> $b } keys %{ $CompList{$i} };
177 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
ac5ea531 178}
179
180####################################
181
8f118dcd 182my @boolfunc = (
183 {
184 name => "Exclusion",
185 type => "bool",
186 hash => \%Exclus,
187 },
188 {
189 name => "Singleton",
190 type => "bool",
191 hash => \%Single,
192 },
193 {
194 name => "NonStDecomp",
195 type => "bool",
196 hash => \%NonStD,
197 },
198 {
199 name => "Comp2nd",
200 type => "bool",
201 hash => \%Comp2nd,
202 },
203);
ac5ea531 204
205my $file = "unfexc.h";
206open FH, ">$file" or croak "$PACKAGE: $file can't be made";
207binmode FH; select FH;
208
8f118dcd 209 print << 'EOF';
210/*
211 * This file is auto-generated by mkheader.
212 * Any changes here will be lost!
213 */
214EOF
ac5ea531 215
8f118dcd 216foreach my $tbl (@boolfunc) {
217 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
218 my $type = $tbl->{type};
219 my $name = $tbl->{name};
220 print "$type is$name (UV uv)\n{\nreturn\n\t";
221
222 while (@temp) {
223 my $cur = shift @temp;
224 if (@temp && $cur + 1 == $temp[0]) {
225 print "($cur <= uv && uv <= ";
226 while (@temp && $cur + 1 == $temp[0]) {
227 $cur = shift @temp;
228 }
229 print "$cur)";
230 print "\n\t|| " if @temp;
231 } else {
232 print "uv == $cur";
233 print "\n\t|| " if @temp;
234 }
ac5ea531 235 }
8f118dcd 236 print "\n\t? TRUE : FALSE;\n}\n\n";
ac5ea531 237}
238
ac5ea531 239close FH;
240
8f118dcd 241
ac5ea531 242####################################
243
244my @tripletable = (
8f118dcd 245 {
246 file => "unfcmb",
247 name => "combin",
248 type => "STDCHAR",
249 hash => \%Combin,
250 null => 0,
251 },
252 {
253 file => "unfcan",
254 name => "canon",
255 type => "char*",
256 hash => \%Canon,
257 null => "NULL",
258 },
259 {
260 file => "unfcpt",
261 name => "compat",
262 type => "char*",
263 hash => \%Compat,
264 null => "NULL",
265 },
266 {
267 file => "unfcmp",
268 name => "compos",
269 type => "$structname *",
270 hash => \%Comp1st,
271 null => "NULL",
272 init => $compinit,
273 },
ac5ea531 274);
275
276foreach my $tbl (@tripletable) {
8f118dcd 277 my $file = "$tbl->{file}.h";
278 my $head = "${prefix}$tbl->{name}";
279 my $type = $tbl->{type};
280 my $hash = $tbl->{hash};
281 my $null = $tbl->{null};
282 my $init = $tbl->{init};
283
284 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
285 binmode FH; select FH;
286 my %val;
287
288 print FH << 'EOF';
ac5ea531 289/*
290 * This file is auto-generated by mkheader.
291 * Any changes here will be lost!
292 */
293EOF
294
8f118dcd 295 print $init if defined $init;
296
297 foreach my $uv (keys %$hash) {
f027f502 298 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
299 unless $uv <= 0x10FFFF;
8f118dcd 300 my @c = unpack 'CCCC', pack 'N', $uv;
301 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
302 }
303
304 foreach my $p (sort { $a <=> $b } keys %val) {
305 next if ! $val{ $p };
306 for (my $r = 0; $r < 256; $r++) {
307 next if ! $val{ $p }{ $r };
308 printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
309 for (my $c = 0; $c < 256; $c++) {
310 print "\t", defined $val{$p}{$r}{$c}
311 ? "($type)".$val{$p}{$r}{$c}
312 : $null;
313 print ',' if $c != 255;
314 print "\n" if $c % 8 == 7;
315 }
316 print "};\n\n";
317 }
318 }
319 foreach my $p (sort { $a <=> $b } keys %val) {
320 next if ! $val{ $p };
321 printf "$type* ${head}_%02x [256] = {\n", $p;
322 for (my $r = 0; $r < 256; $r++) {
323 print $val{ $p }{ $r }
324 ? sprintf("${head}_%02x_%02x", $p, $r)
325 : "NULL";
326 print ',' if $r != 255;
327 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
328 }
329 print "};\n\n";
ac5ea531 330 }
8f118dcd 331 print "$type** $head [] = {\n";
332 for (my $p = 0; $p <= 0x10; $p++) {
333 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
334 print ',' if $p != 0x10;
335 print "\n";
ac5ea531 336 }
337 print "};\n\n";
8f118dcd 338 close FH;
ac5ea531 339}
340
341__END__