Upgrade to Unicode-Normalize-1.00
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / mkheader
CommitLineData
ac5ea531 1#!perl
2#
628bbff0 3# This auxiliary script makes five header files
4# used for building XSUB of Unicode::Normalize.
ac5ea531 5#
628bbff0 6# Usage:
7# <do 'mkheader'> in perl, or <perl mkheader> in command line
8#
9# Input files:
10# unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11# unicore/Decomposition.pl (or unicode/Decomposition.pl)
12# unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
13#
14# Output files:
15# unfcan.h
16# unfcpt.h
17# unfcmb.h
18# unfcmp.h
19# unfexc.h
f027f502 20#
ac5ea531 21use 5.006;
22use strict;
23use warnings;
24use Carp;
6c941e0c 25use File::Spec;
26
9f1f04a1 27BEGIN {
1efaba7f 28 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 29 die "Unicode::Normalize cannot stringify a Unicode code point\n";
30 }
31}
ac5ea531 32
33our $PACKAGE = 'Unicode::Normalize, mkheader';
34
35our $Combin = do "unicore/CombiningClass.pl"
8f118dcd 36 || do "unicode/CombiningClass.pl"
37 || croak "$PACKAGE: CombiningClass.pl not found";
ac5ea531 38
39our $Decomp = do "unicore/Decomposition.pl"
8f118dcd 40 || do "unicode/Decomposition.pl"
41 || croak "$PACKAGE: Decomposition.pl not found";
ac5ea531 42
48287974 43our %Combin; # $codepoint => $number : combination class
6c941e0c 44our %Canon; # $codepoint => \@codepoints : canonical decomp.
45our %Compat; # $codepoint => \@codepoints : compat. decomp.
46# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
48287974 47our %Exclus; # $codepoint => 1 : composition exclusions
48our %Single; # $codepoint => 1 : singletons
49our %NonStD; # $codepoint => 1 : non-starter decompositions
50
51our %Comp1st; # $codepoint => $listname : may be composed with a next char.
52our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
53our %CompList; # $listname,$2nd => $codepoint : composite
54
55our $prefix = "UNF_";
56our $structname = "${prefix}complist";
ac5ea531 57
6c941e0c 58########## definition of Hangul constants ##########
59use constant SBase => 0xAC00;
60use constant SFinal => 0xD7A3; # SBase -1 + SCount
61use constant SCount => 11172; # LCount * NCount
62use constant NCount => 588; # VCount * TCount
63use constant LBase => 0x1100;
64use constant LFinal => 0x1112;
65use constant LCount => 19;
66use constant VBase => 0x1161;
67use constant VFinal => 0x1175;
68use constant VCount => 21;
69use constant TBase => 0x11A7;
70use constant TFinal => 0x11C2;
71use constant TCount => 28;
72
73sub decomposeHangul {
74 my $SIndex = $_[0] - SBase;
75 my $LIndex = int( $SIndex / NCount);
76 my $VIndex = int(($SIndex % NCount) / TCount);
77 my $TIndex = $SIndex % TCount;
78 my @ret = (
79 LBase + $LIndex,
80 VBase + $VIndex,
81 $TIndex ? (TBase + $TIndex) : (),
82 );
83 wantarray ? @ret : pack('U*', @ret);
84 # any element in @ret greater than 0xFF, so no need of u2n conversion.
85}
86
87########## getting full decomposion ##########
ac5ea531 88{
8f118dcd 89 my($f, $fh);
90 foreach my $d (@INC) {
8f118dcd 91 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
92 last if open($fh, $f);
93 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
94 last if open($fh, $f);
95 $f = undef;
96 }
48287974 97 croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
98 . "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
99
100 while (<$fh>) {
101 next if /^#/ or /^$/;
102 s/#.*//;
103 $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
104 }
8f118dcd 105 close $fh;
ac5ea531 106}
107
48287974 108##
109## converts string "hhhh hhhh hhhh" to a numeric list
110##
111sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
112
8f118dcd 113while ($Combin =~ /(.+)/g) {
114 my @tab = split /\t/, $1;
115 my $ini = hex $tab[0];
116 if ($tab[1] eq '') {
117 $Combin{ $ini } = $tab[2];
118 } else {
119 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
120 }
ac5ea531 121}
122
8f118dcd 123while ($Decomp =~ /(.+)/g) {
124 my @tab = split /\t/, $1;
125 my $compat = $tab[2] =~ s/<[^>]+>//;
126 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
8f118dcd 127 my $ini = hex($tab[0]); # initial decomposable character
48287974 128
129 my $listname =
130 @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
131 # %04x is bad since it'd place _3046 after _1d157.
132
8f118dcd 133 if ($tab[1] eq '') {
134 $Compat{ $ini } = $dec;
135
136 if (! $compat) {
48287974 137 $Canon{ $ini } = $dec;
8f118dcd 138
f027f502 139 if (@$dec == 2) {
8f118dcd 140 if ($Combin{ $dec->[0] }) {
141 $NonStD{ $ini } = 1;
142 } else {
48287974 143 $CompList{ $listname }{ $dec->[1] } = $ini;
144 $Comp1st{ $dec->[0] } = $listname;
145 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
8f118dcd 146 }
f027f502 147 } elsif (@$dec == 1) {
8f118dcd 148 $Single{ $ini } = 1;
f027f502 149 } else {
150 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 151 }
152 }
153 } else {
48287974 154 foreach my $u ($ini .. hex($tab[1])) {
8f118dcd 155 $Compat{ $u } = $dec;
48287974 156
8f118dcd 157 if (! $compat) {
48287974 158 $Canon{ $u } = $dec;
8f118dcd 159
f027f502 160 if (@$dec == 2) {
8f118dcd 161 if ($Combin{ $dec->[0] }) {
162 $NonStD{ $u } = 1;
163 } else {
48287974 164 $CompList{ $listname }{ $dec->[1] } = $u;
165 $Comp1st{ $dec->[0] } = $listname;
166 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
8f118dcd 167 }
f027f502 168 } elsif (@$dec == 1) {
8f118dcd 169 $Single{ $u } = 1;
f027f502 170 } else {
171 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 172 }
173 }
174 }
ac5ea531 175 }
ac5ea531 176}
177
48287974 178# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
179foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
180 $Comp2nd{$j} = 1;
ac5ea531 181}
182
183sub getCanonList {
8f118dcd 184 my @src = @_;
6c941e0c 185 my @dec = map {
186 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
187 : $Canon{$_} ? @{ $Canon{$_} } : $_
188 } @src;
8f118dcd 189 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
190 # condition @src == @dec is not ok.
ac5ea531 191}
192
193sub getCompatList {
8f118dcd 194 my @src = @_;
6c941e0c 195 my @dec = map {
196 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
197 : $Compat{$_} ? @{ $Compat{$_} } : $_
198 } @src;
8f118dcd 199 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
200 # condition @src == @dec is not ok.
ac5ea531 201}
202
48287974 203# exhaustive decomposition
204foreach my $key (keys %Canon) {
205 $Canon{$key} = [ getCanonList($key) ];
206}
207
208# exhaustive decomposition
628bbff0 209foreach my $key (keys %Compat) {
48287974 210 $Compat{$key} = [ getCompatList($key) ];
211}
ac5ea531 212
9f1f04a1 213sub _pack_U {
1efaba7f 214 return pack('U*', @_);
9f1f04a1 215}
216
ac5ea531 217sub _U_stringify {
8f118dcd 218 sprintf '"%s"', join '',
9f1f04a1 219 map sprintf("\\x%02x", $_), unpack 'C*', _pack_U(@_);
ac5ea531 220}
221
222foreach my $hash (\%Canon, \%Compat) {
8f118dcd 223 foreach my $key (keys %$hash) {
224 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
225 }
ac5ea531 226}
227
6c941e0c 228########## writing header files ##########
ac5ea531 229
8f118dcd 230my @boolfunc = (
231 {
232 name => "Exclusion",
233 type => "bool",
234 hash => \%Exclus,
235 },
236 {
237 name => "Singleton",
238 type => "bool",
239 hash => \%Single,
240 },
241 {
242 name => "NonStDecomp",
243 type => "bool",
244 hash => \%NonStD,
245 },
246 {
247 name => "Comp2nd",
248 type => "bool",
249 hash => \%Comp2nd,
250 },
251);
ac5ea531 252
253my $file = "unfexc.h";
254open FH, ">$file" or croak "$PACKAGE: $file can't be made";
255binmode FH; select FH;
256
8f118dcd 257 print << 'EOF';
258/*
259 * This file is auto-generated by mkheader.
260 * Any changes here will be lost!
261 */
262EOF
ac5ea531 263
8f118dcd 264foreach my $tbl (@boolfunc) {
265 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
266 my $type = $tbl->{type};
267 my $name = $tbl->{name};
268 print "$type is$name (UV uv)\n{\nreturn\n\t";
269
270 while (@temp) {
271 my $cur = shift @temp;
272 if (@temp && $cur + 1 == $temp[0]) {
273 print "($cur <= uv && uv <= ";
274 while (@temp && $cur + 1 == $temp[0]) {
275 $cur = shift @temp;
276 }
277 print "$cur)";
278 print "\n\t|| " if @temp;
279 } else {
280 print "uv == $cur";
281 print "\n\t|| " if @temp;
282 }
ac5ea531 283 }
8f118dcd 284 print "\n\t? TRUE : FALSE;\n}\n\n";
ac5ea531 285}
286
ac5ea531 287close FH;
288
289####################################
290
48287974 291my $compinit =
292 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
293
294foreach my $i (sort keys %CompList) {
295 $compinit .= "$structname $i [] = {\n";
296 $compinit .= join ",\n",
297 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
298 sort {$a <=> $b } keys %{ $CompList{$i} };
299 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
300}
301
ac5ea531 302my @tripletable = (
8f118dcd 303 {
304 file => "unfcmb",
305 name => "combin",
306 type => "STDCHAR",
307 hash => \%Combin,
308 null => 0,
309 },
310 {
311 file => "unfcan",
312 name => "canon",
313 type => "char*",
314 hash => \%Canon,
315 null => "NULL",
316 },
317 {
318 file => "unfcpt",
319 name => "compat",
320 type => "char*",
321 hash => \%Compat,
322 null => "NULL",
323 },
324 {
325 file => "unfcmp",
326 name => "compos",
327 type => "$structname *",
328 hash => \%Comp1st,
329 null => "NULL",
330 init => $compinit,
331 },
ac5ea531 332);
333
334foreach my $tbl (@tripletable) {
8f118dcd 335 my $file = "$tbl->{file}.h";
336 my $head = "${prefix}$tbl->{name}";
337 my $type = $tbl->{type};
338 my $hash = $tbl->{hash};
339 my $null = $tbl->{null};
340 my $init = $tbl->{init};
341
342 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
343 binmode FH; select FH;
344 my %val;
345
346 print FH << 'EOF';
ac5ea531 347/*
348 * This file is auto-generated by mkheader.
349 * Any changes here will be lost!
350 */
351EOF
352
8f118dcd 353 print $init if defined $init;
354
355 foreach my $uv (keys %$hash) {
f027f502 356 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
357 unless $uv <= 0x10FFFF;
8f118dcd 358 my @c = unpack 'CCCC', pack 'N', $uv;
359 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
360 }
361
362 foreach my $p (sort { $a <=> $b } keys %val) {
363 next if ! $val{ $p };
364 for (my $r = 0; $r < 256; $r++) {
365 next if ! $val{ $p }{ $r };
fe067ad9 366 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
8f118dcd 367 for (my $c = 0; $c < 256; $c++) {
368 print "\t", defined $val{$p}{$r}{$c}
369 ? "($type)".$val{$p}{$r}{$c}
370 : $null;
371 print ',' if $c != 255;
372 print "\n" if $c % 8 == 7;
373 }
374 print "};\n\n";
375 }
376 }
377 foreach my $p (sort { $a <=> $b } keys %val) {
378 next if ! $val{ $p };
fe067ad9 379 printf "static $type* ${head}_%02x [256] = {\n", $p;
8f118dcd 380 for (my $r = 0; $r < 256; $r++) {
381 print $val{ $p }{ $r }
382 ? sprintf("${head}_%02x_%02x", $p, $r)
383 : "NULL";
384 print ',' if $r != 255;
385 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
386 }
387 print "};\n\n";
ac5ea531 388 }
fe067ad9 389 print "static $type** $head [] = {\n";
8f118dcd 390 for (my $p = 0; $p <= 0x10; $p++) {
391 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
392 print ',' if $p != 0x10;
393 print "\n";
ac5ea531 394 }
395 print "};\n\n";
8f118dcd 396 close FH;
ac5ea531 397}
398
628bbff0 3991;
ac5ea531 400__END__