Down with C++ reserved names
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / mkheader
CommitLineData
ac5ea531 1#!perl
2#
3# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h",
f027f502 4# "unfcmp.h", and "unfexc.h"
48287974 5# from CombiningClass.pl, Decomposition.pl, CompositionExclusions.txt
ac5ea531 6# in lib/unicore or unicode directory
7# for Unicode::Normalize.xs. (cf. Makefile.PL)
8#
f027f502 9# Usage: <perl mkheader> in command line
10# or <do 'mkheader'> in perl
11#
ac5ea531 12use 5.006;
13use strict;
14use warnings;
15use Carp;
6c941e0c 16use File::Spec;
17
9f1f04a1 18BEGIN {
1efaba7f 19 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 20 die "Unicode::Normalize cannot stringify a Unicode code point\n";
21 }
22}
ac5ea531 23
24our $PACKAGE = 'Unicode::Normalize, mkheader';
25
26our $Combin = do "unicore/CombiningClass.pl"
8f118dcd 27 || do "unicode/CombiningClass.pl"
28 || croak "$PACKAGE: CombiningClass.pl not found";
ac5ea531 29
30our $Decomp = do "unicore/Decomposition.pl"
8f118dcd 31 || do "unicode/Decomposition.pl"
32 || croak "$PACKAGE: Decomposition.pl not found";
ac5ea531 33
48287974 34our %Combin; # $codepoint => $number : combination class
6c941e0c 35our %Canon; # $codepoint => \@codepoints : canonical decomp.
36our %Compat; # $codepoint => \@codepoints : compat. decomp.
37# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
48287974 38our %Exclus; # $codepoint => 1 : composition exclusions
39our %Single; # $codepoint => 1 : singletons
40our %NonStD; # $codepoint => 1 : non-starter decompositions
41
42our %Comp1st; # $codepoint => $listname : may be composed with a next char.
43our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
44our %CompList; # $listname,$2nd => $codepoint : composite
45
46our $prefix = "UNF_";
47our $structname = "${prefix}complist";
ac5ea531 48
6c941e0c 49########## definition of Hangul constants ##########
50use constant SBase => 0xAC00;
51use constant SFinal => 0xD7A3; # SBase -1 + SCount
52use constant SCount => 11172; # LCount * NCount
53use constant NCount => 588; # VCount * TCount
54use constant LBase => 0x1100;
55use constant LFinal => 0x1112;
56use constant LCount => 19;
57use constant VBase => 0x1161;
58use constant VFinal => 0x1175;
59use constant VCount => 21;
60use constant TBase => 0x11A7;
61use constant TFinal => 0x11C2;
62use constant TCount => 28;
63
64sub decomposeHangul {
65 my $SIndex = $_[0] - SBase;
66 my $LIndex = int( $SIndex / NCount);
67 my $VIndex = int(($SIndex % NCount) / TCount);
68 my $TIndex = $SIndex % TCount;
69 my @ret = (
70 LBase + $LIndex,
71 VBase + $VIndex,
72 $TIndex ? (TBase + $TIndex) : (),
73 );
74 wantarray ? @ret : pack('U*', @ret);
75 # any element in @ret greater than 0xFF, so no need of u2n conversion.
76}
77
78########## getting full decomposion ##########
ac5ea531 79{
8f118dcd 80 my($f, $fh);
81 foreach my $d (@INC) {
8f118dcd 82 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
83 last if open($fh, $f);
84 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
85 last if open($fh, $f);
86 $f = undef;
87 }
48287974 88 croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
89 . "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
90
91 while (<$fh>) {
92 next if /^#/ or /^$/;
93 s/#.*//;
94 $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
95 }
8f118dcd 96 close $fh;
ac5ea531 97}
98
48287974 99##
100## converts string "hhhh hhhh hhhh" to a numeric list
101##
102sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
103
8f118dcd 104while ($Combin =~ /(.+)/g) {
105 my @tab = split /\t/, $1;
106 my $ini = hex $tab[0];
107 if ($tab[1] eq '') {
108 $Combin{ $ini } = $tab[2];
109 } else {
110 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
111 }
ac5ea531 112}
113
8f118dcd 114while ($Decomp =~ /(.+)/g) {
115 my @tab = split /\t/, $1;
116 my $compat = $tab[2] =~ s/<[^>]+>//;
117 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
8f118dcd 118 my $ini = hex($tab[0]); # initial decomposable character
48287974 119
120 my $listname =
121 @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
122 # %04x is bad since it'd place _3046 after _1d157.
123
8f118dcd 124 if ($tab[1] eq '') {
125 $Compat{ $ini } = $dec;
126
127 if (! $compat) {
48287974 128 $Canon{ $ini } = $dec;
8f118dcd 129
f027f502 130 if (@$dec == 2) {
8f118dcd 131 if ($Combin{ $dec->[0] }) {
132 $NonStD{ $ini } = 1;
133 } else {
48287974 134 $CompList{ $listname }{ $dec->[1] } = $ini;
135 $Comp1st{ $dec->[0] } = $listname;
136 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
8f118dcd 137 }
f027f502 138 } elsif (@$dec == 1) {
8f118dcd 139 $Single{ $ini } = 1;
f027f502 140 } else {
141 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 142 }
143 }
144 } else {
48287974 145 foreach my $u ($ini .. hex($tab[1])) {
8f118dcd 146 $Compat{ $u } = $dec;
48287974 147
8f118dcd 148 if (! $compat) {
48287974 149 $Canon{ $u } = $dec;
8f118dcd 150
f027f502 151 if (@$dec == 2) {
8f118dcd 152 if ($Combin{ $dec->[0] }) {
153 $NonStD{ $u } = 1;
154 } else {
48287974 155 $CompList{ $listname }{ $dec->[1] } = $u;
156 $Comp1st{ $dec->[0] } = $listname;
157 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
8f118dcd 158 }
f027f502 159 } elsif (@$dec == 1) {
8f118dcd 160 $Single{ $u } = 1;
f027f502 161 } else {
162 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 163 }
164 }
165 }
ac5ea531 166 }
ac5ea531 167}
168
48287974 169# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
170foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
171 $Comp2nd{$j} = 1;
ac5ea531 172}
173
174sub getCanonList {
8f118dcd 175 my @src = @_;
6c941e0c 176 my @dec = map {
177 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
178 : $Canon{$_} ? @{ $Canon{$_} } : $_
179 } @src;
8f118dcd 180 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
181 # condition @src == @dec is not ok.
ac5ea531 182}
183
184sub getCompatList {
8f118dcd 185 my @src = @_;
6c941e0c 186 my @dec = map {
187 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
188 : $Compat{$_} ? @{ $Compat{$_} } : $_
189 } @src;
8f118dcd 190 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
191 # condition @src == @dec is not ok.
ac5ea531 192}
193
48287974 194# exhaustive decomposition
195foreach my $key (keys %Canon) {
196 $Canon{$key} = [ getCanonList($key) ];
197}
198
199# exhaustive decomposition
200foreach my $key (keys %Compat) {
201 $Compat{$key} = [ getCompatList($key) ];
202}
ac5ea531 203
9f1f04a1 204sub _pack_U {
1efaba7f 205 return pack('U*', @_);
9f1f04a1 206}
207
ac5ea531 208sub _U_stringify {
8f118dcd 209 sprintf '"%s"', join '',
9f1f04a1 210 map sprintf("\\x%02x", $_), unpack 'C*', _pack_U(@_);
ac5ea531 211}
212
213foreach my $hash (\%Canon, \%Compat) {
8f118dcd 214 foreach my $key (keys %$hash) {
215 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
216 }
ac5ea531 217}
218
6c941e0c 219########## writing header files ##########
ac5ea531 220
8f118dcd 221my @boolfunc = (
222 {
223 name => "Exclusion",
224 type => "bool",
225 hash => \%Exclus,
226 },
227 {
228 name => "Singleton",
229 type => "bool",
230 hash => \%Single,
231 },
232 {
233 name => "NonStDecomp",
234 type => "bool",
235 hash => \%NonStD,
236 },
237 {
238 name => "Comp2nd",
239 type => "bool",
240 hash => \%Comp2nd,
241 },
242);
ac5ea531 243
244my $file = "unfexc.h";
245open FH, ">$file" or croak "$PACKAGE: $file can't be made";
246binmode FH; select FH;
247
8f118dcd 248 print << 'EOF';
249/*
250 * This file is auto-generated by mkheader.
251 * Any changes here will be lost!
252 */
253EOF
ac5ea531 254
8f118dcd 255foreach my $tbl (@boolfunc) {
256 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
257 my $type = $tbl->{type};
258 my $name = $tbl->{name};
259 print "$type is$name (UV uv)\n{\nreturn\n\t";
260
261 while (@temp) {
262 my $cur = shift @temp;
263 if (@temp && $cur + 1 == $temp[0]) {
264 print "($cur <= uv && uv <= ";
265 while (@temp && $cur + 1 == $temp[0]) {
266 $cur = shift @temp;
267 }
268 print "$cur)";
269 print "\n\t|| " if @temp;
270 } else {
271 print "uv == $cur";
272 print "\n\t|| " if @temp;
273 }
ac5ea531 274 }
8f118dcd 275 print "\n\t? TRUE : FALSE;\n}\n\n";
ac5ea531 276}
277
ac5ea531 278close FH;
279
280####################################
281
48287974 282my $compinit =
283 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
284
285foreach my $i (sort keys %CompList) {
286 $compinit .= "$structname $i [] = {\n";
287 $compinit .= join ",\n",
288 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
289 sort {$a <=> $b } keys %{ $CompList{$i} };
290 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
291}
292
ac5ea531 293my @tripletable = (
8f118dcd 294 {
295 file => "unfcmb",
296 name => "combin",
297 type => "STDCHAR",
298 hash => \%Combin,
299 null => 0,
300 },
301 {
302 file => "unfcan",
303 name => "canon",
304 type => "char*",
305 hash => \%Canon,
306 null => "NULL",
307 },
308 {
309 file => "unfcpt",
310 name => "compat",
311 type => "char*",
312 hash => \%Compat,
313 null => "NULL",
314 },
315 {
316 file => "unfcmp",
317 name => "compos",
318 type => "$structname *",
319 hash => \%Comp1st,
320 null => "NULL",
321 init => $compinit,
322 },
ac5ea531 323);
324
325foreach my $tbl (@tripletable) {
8f118dcd 326 my $file = "$tbl->{file}.h";
327 my $head = "${prefix}$tbl->{name}";
328 my $type = $tbl->{type};
329 my $hash = $tbl->{hash};
330 my $null = $tbl->{null};
331 my $init = $tbl->{init};
332
333 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
334 binmode FH; select FH;
335 my %val;
336
337 print FH << 'EOF';
ac5ea531 338/*
339 * This file is auto-generated by mkheader.
340 * Any changes here will be lost!
341 */
342EOF
343
8f118dcd 344 print $init if defined $init;
345
346 foreach my $uv (keys %$hash) {
f027f502 347 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
348 unless $uv <= 0x10FFFF;
8f118dcd 349 my @c = unpack 'CCCC', pack 'N', $uv;
350 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
351 }
352
353 foreach my $p (sort { $a <=> $b } keys %val) {
354 next if ! $val{ $p };
355 for (my $r = 0; $r < 256; $r++) {
356 next if ! $val{ $p }{ $r };
357 printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
358 for (my $c = 0; $c < 256; $c++) {
359 print "\t", defined $val{$p}{$r}{$c}
360 ? "($type)".$val{$p}{$r}{$c}
361 : $null;
362 print ',' if $c != 255;
363 print "\n" if $c % 8 == 7;
364 }
365 print "};\n\n";
366 }
367 }
368 foreach my $p (sort { $a <=> $b } keys %val) {
369 next if ! $val{ $p };
370 printf "$type* ${head}_%02x [256] = {\n", $p;
371 for (my $r = 0; $r < 256; $r++) {
372 print $val{ $p }{ $r }
373 ? sprintf("${head}_%02x_%02x", $p, $r)
374 : "NULL";
375 print ',' if $r != 255;
376 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
377 }
378 print "};\n\n";
ac5ea531 379 }
8f118dcd 380 print "$type** $head [] = {\n";
381 for (my $p = 0; $p <= 0x10; $p++) {
382 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
383 print ',' if $p != 0x10;
384 print "\n";
ac5ea531 385 }
386 print "};\n\n";
8f118dcd 387 close FH;
ac5ea531 388}
389
390__END__