[Unicode::Normalize] mkheader tweak
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / mkheader
CommitLineData
ac5ea531 1#!perl
2#
3# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h",
f027f502 4# "unfcmp.h", and "unfexc.h"
48287974 5# from CombiningClass.pl, Decomposition.pl, CompositionExclusions.txt
ac5ea531 6# in lib/unicore or unicode directory
7# for Unicode::Normalize.xs. (cf. Makefile.PL)
8#
f027f502 9# Usage: <perl mkheader> in command line
10# or <do 'mkheader'> in perl
11#
ac5ea531 12use 5.006;
13use strict;
14use warnings;
15use Carp;
16
17our $PACKAGE = 'Unicode::Normalize, mkheader';
18
19our $Combin = do "unicore/CombiningClass.pl"
8f118dcd 20 || do "unicode/CombiningClass.pl"
21 || croak "$PACKAGE: CombiningClass.pl not found";
ac5ea531 22
23our $Decomp = do "unicore/Decomposition.pl"
8f118dcd 24 || do "unicode/Decomposition.pl"
25 || croak "$PACKAGE: Decomposition.pl not found";
ac5ea531 26
48287974 27our %Combin; # $codepoint => $number : combination class
28our %Canon; # $codepoint => $hexstring : canonical decomp.
29our %Compat; # $codepoint => $hexstring : compat. decomp.
30our %Exclus; # $codepoint => 1 : composition exclusions
31our %Single; # $codepoint => 1 : singletons
32our %NonStD; # $codepoint => 1 : non-starter decompositions
33
34our %Comp1st; # $codepoint => $listname : may be composed with a next char.
35our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
36our %CompList; # $listname,$2nd => $codepoint : composite
37
38our $prefix = "UNF_";
39our $structname = "${prefix}complist";
ac5ea531 40
41{
8f118dcd 42 my($f, $fh);
43 foreach my $d (@INC) {
44 use File::Spec;
45 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
46 last if open($fh, $f);
47 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
48 last if open($fh, $f);
49 $f = undef;
50 }
48287974 51 croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
52 . "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
53
54 while (<$fh>) {
55 next if /^#/ or /^$/;
56 s/#.*//;
57 $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
58 }
8f118dcd 59 close $fh;
ac5ea531 60}
61
48287974 62##
63## converts string "hhhh hhhh hhhh" to a numeric list
64##
65sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
66
8f118dcd 67while ($Combin =~ /(.+)/g) {
68 my @tab = split /\t/, $1;
69 my $ini = hex $tab[0];
70 if ($tab[1] eq '') {
71 $Combin{ $ini } = $tab[2];
72 } else {
73 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
74 }
ac5ea531 75}
76
8f118dcd 77while ($Decomp =~ /(.+)/g) {
78 my @tab = split /\t/, $1;
79 my $compat = $tab[2] =~ s/<[^>]+>//;
80 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
8f118dcd 81 my $ini = hex($tab[0]); # initial decomposable character
48287974 82
83 my $listname =
84 @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
85 # %04x is bad since it'd place _3046 after _1d157.
86
8f118dcd 87 if ($tab[1] eq '') {
88 $Compat{ $ini } = $dec;
89
90 if (! $compat) {
48287974 91 $Canon{ $ini } = $dec;
8f118dcd 92
f027f502 93 if (@$dec == 2) {
8f118dcd 94 if ($Combin{ $dec->[0] }) {
95 $NonStD{ $ini } = 1;
96 } else {
48287974 97 $CompList{ $listname }{ $dec->[1] } = $ini;
98 $Comp1st{ $dec->[0] } = $listname;
99 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
8f118dcd 100 }
f027f502 101 } elsif (@$dec == 1) {
8f118dcd 102 $Single{ $ini } = 1;
f027f502 103 } else {
104 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 105 }
106 }
107 } else {
48287974 108 foreach my $u ($ini .. hex($tab[1])) {
8f118dcd 109 $Compat{ $u } = $dec;
48287974 110
8f118dcd 111 if (! $compat) {
48287974 112 $Canon{ $u } = $dec;
8f118dcd 113
f027f502 114 if (@$dec == 2) {
8f118dcd 115 if ($Combin{ $dec->[0] }) {
116 $NonStD{ $u } = 1;
117 } else {
48287974 118 $CompList{ $listname }{ $dec->[1] } = $u;
119 $Comp1st{ $dec->[0] } = $listname;
120 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
8f118dcd 121 }
f027f502 122 } elsif (@$dec == 1) {
8f118dcd 123 $Single{ $u } = 1;
f027f502 124 } else {
125 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 126 }
127 }
128 }
ac5ea531 129 }
ac5ea531 130}
131
48287974 132# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
133foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
134 $Comp2nd{$j} = 1;
ac5ea531 135}
136
137sub getCanonList {
8f118dcd 138 my @src = @_;
139 my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
140 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
141 # condition @src == @dec is not ok.
ac5ea531 142}
143
144sub getCompatList {
8f118dcd 145 my @src = @_;
146 my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
147 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
148 # condition @src == @dec is not ok.
ac5ea531 149}
150
48287974 151# exhaustive decomposition
152foreach my $key (keys %Canon) {
153 $Canon{$key} = [ getCanonList($key) ];
154}
155
156# exhaustive decomposition
157foreach my $key (keys %Compat) {
158 $Compat{$key} = [ getCompatList($key) ];
159}
ac5ea531 160
161sub _U_stringify {
8f118dcd 162 sprintf '"%s"', join '',
163 map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_;
ac5ea531 164}
f027f502 165# Do we need say <pack 'U*', map utf8::unicode_to_native($_),>
166# instead of <pack 'U*',> for EBCDIC?
ac5ea531 167
168foreach my $hash (\%Canon, \%Compat) {
8f118dcd 169 foreach my $key (keys %$hash) {
170 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
171 }
ac5ea531 172}
173
ac5ea531 174####################################
175
8f118dcd 176my @boolfunc = (
177 {
178 name => "Exclusion",
179 type => "bool",
180 hash => \%Exclus,
181 },
182 {
183 name => "Singleton",
184 type => "bool",
185 hash => \%Single,
186 },
187 {
188 name => "NonStDecomp",
189 type => "bool",
190 hash => \%NonStD,
191 },
192 {
193 name => "Comp2nd",
194 type => "bool",
195 hash => \%Comp2nd,
196 },
197);
ac5ea531 198
199my $file = "unfexc.h";
200open FH, ">$file" or croak "$PACKAGE: $file can't be made";
201binmode FH; select FH;
202
8f118dcd 203 print << 'EOF';
204/*
205 * This file is auto-generated by mkheader.
206 * Any changes here will be lost!
207 */
208EOF
ac5ea531 209
8f118dcd 210foreach my $tbl (@boolfunc) {
211 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
212 my $type = $tbl->{type};
213 my $name = $tbl->{name};
214 print "$type is$name (UV uv)\n{\nreturn\n\t";
215
216 while (@temp) {
217 my $cur = shift @temp;
218 if (@temp && $cur + 1 == $temp[0]) {
219 print "($cur <= uv && uv <= ";
220 while (@temp && $cur + 1 == $temp[0]) {
221 $cur = shift @temp;
222 }
223 print "$cur)";
224 print "\n\t|| " if @temp;
225 } else {
226 print "uv == $cur";
227 print "\n\t|| " if @temp;
228 }
ac5ea531 229 }
8f118dcd 230 print "\n\t? TRUE : FALSE;\n}\n\n";
ac5ea531 231}
232
ac5ea531 233close FH;
234
235####################################
236
48287974 237my $compinit =
238 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
239
240foreach my $i (sort keys %CompList) {
241 $compinit .= "$structname $i [] = {\n";
242 $compinit .= join ",\n",
243 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
244 sort {$a <=> $b } keys %{ $CompList{$i} };
245 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
246}
247
ac5ea531 248my @tripletable = (
8f118dcd 249 {
250 file => "unfcmb",
251 name => "combin",
252 type => "STDCHAR",
253 hash => \%Combin,
254 null => 0,
255 },
256 {
257 file => "unfcan",
258 name => "canon",
259 type => "char*",
260 hash => \%Canon,
261 null => "NULL",
262 },
263 {
264 file => "unfcpt",
265 name => "compat",
266 type => "char*",
267 hash => \%Compat,
268 null => "NULL",
269 },
270 {
271 file => "unfcmp",
272 name => "compos",
273 type => "$structname *",
274 hash => \%Comp1st,
275 null => "NULL",
276 init => $compinit,
277 },
ac5ea531 278);
279
280foreach my $tbl (@tripletable) {
8f118dcd 281 my $file = "$tbl->{file}.h";
282 my $head = "${prefix}$tbl->{name}";
283 my $type = $tbl->{type};
284 my $hash = $tbl->{hash};
285 my $null = $tbl->{null};
286 my $init = $tbl->{init};
287
288 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
289 binmode FH; select FH;
290 my %val;
291
292 print FH << 'EOF';
ac5ea531 293/*
294 * This file is auto-generated by mkheader.
295 * Any changes here will be lost!
296 */
297EOF
298
8f118dcd 299 print $init if defined $init;
300
301 foreach my $uv (keys %$hash) {
f027f502 302 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
303 unless $uv <= 0x10FFFF;
8f118dcd 304 my @c = unpack 'CCCC', pack 'N', $uv;
305 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
306 }
307
308 foreach my $p (sort { $a <=> $b } keys %val) {
309 next if ! $val{ $p };
310 for (my $r = 0; $r < 256; $r++) {
311 next if ! $val{ $p }{ $r };
312 printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
313 for (my $c = 0; $c < 256; $c++) {
314 print "\t", defined $val{$p}{$r}{$c}
315 ? "($type)".$val{$p}{$r}{$c}
316 : $null;
317 print ',' if $c != 255;
318 print "\n" if $c % 8 == 7;
319 }
320 print "};\n\n";
321 }
322 }
323 foreach my $p (sort { $a <=> $b } keys %val) {
324 next if ! $val{ $p };
325 printf "$type* ${head}_%02x [256] = {\n", $p;
326 for (my $r = 0; $r < 256; $r++) {
327 print $val{ $p }{ $r }
328 ? sprintf("${head}_%02x_%02x", $p, $r)
329 : "NULL";
330 print ',' if $r != 255;
331 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
332 }
333 print "};\n\n";
ac5ea531 334 }
8f118dcd 335 print "$type** $head [] = {\n";
336 for (my $p = 0; $p <= 0x10; $p++) {
337 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
338 print ',' if $p != 0x10;
339 print "\n";
ac5ea531 340 }
341 print "};\n\n";
8f118dcd 342 close FH;
ac5ea531 343}
344
345__END__