Rename ext/Unicode/Normalize to ext/Unicode-Normalize
[p5sagit/p5-mst-13.2.git] / ext / Unicode-Normalize / mkheader
CommitLineData
ac5ea531 1#!perl
2#
628bbff0 3# This auxiliary script makes five header files
4# used for building XSUB of Unicode::Normalize.
ac5ea531 5#
628bbff0 6# Usage:
7# <do 'mkheader'> in perl, or <perl mkheader> in command line
8#
9# Input files:
10# unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11# unicore/Decomposition.pl (or unicode/Decomposition.pl)
12# unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
13#
14# Output files:
15# unfcan.h
16# unfcpt.h
17# unfcmb.h
18# unfcmp.h
19# unfexc.h
f027f502 20#
ac5ea531 21use 5.006;
22use strict;
23use warnings;
24use Carp;
6c941e0c 25use File::Spec;
26
9f1f04a1 27BEGIN {
1efaba7f 28 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 29 die "Unicode::Normalize cannot stringify a Unicode code point\n";
30 }
31}
ac5ea531 32
33our $PACKAGE = 'Unicode::Normalize, mkheader';
34
35our $Combin = do "unicore/CombiningClass.pl"
8f118dcd 36 || do "unicode/CombiningClass.pl"
37 || croak "$PACKAGE: CombiningClass.pl not found";
ac5ea531 38
39our $Decomp = do "unicore/Decomposition.pl"
8f118dcd 40 || do "unicode/Decomposition.pl"
41 || croak "$PACKAGE: Decomposition.pl not found";
ac5ea531 42
48287974 43our %Combin; # $codepoint => $number : combination class
6c941e0c 44our %Canon; # $codepoint => \@codepoints : canonical decomp.
45our %Compat; # $codepoint => \@codepoints : compat. decomp.
46# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
48287974 47our %Exclus; # $codepoint => 1 : composition exclusions
48our %Single; # $codepoint => 1 : singletons
49our %NonStD; # $codepoint => 1 : non-starter decompositions
50
51our %Comp1st; # $codepoint => $listname : may be composed with a next char.
52our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
53our %CompList; # $listname,$2nd => $codepoint : composite
54
55our $prefix = "UNF_";
56our $structname = "${prefix}complist";
ac5ea531 57
6c941e0c 58########## definition of Hangul constants ##########
59use constant SBase => 0xAC00;
60use constant SFinal => 0xD7A3; # SBase -1 + SCount
61use constant SCount => 11172; # LCount * NCount
62use constant NCount => 588; # VCount * TCount
63use constant LBase => 0x1100;
64use constant LFinal => 0x1112;
65use constant LCount => 19;
66use constant VBase => 0x1161;
67use constant VFinal => 0x1175;
68use constant VCount => 21;
69use constant TBase => 0x11A7;
70use constant TFinal => 0x11C2;
71use constant TCount => 28;
72
73sub decomposeHangul {
74 my $SIndex = $_[0] - SBase;
75 my $LIndex = int( $SIndex / NCount);
76 my $VIndex = int(($SIndex % NCount) / TCount);
77 my $TIndex = $SIndex % TCount;
78 my @ret = (
79 LBase + $LIndex,
80 VBase + $VIndex,
81 $TIndex ? (TBase + $TIndex) : (),
82 );
2b8d773d 83 return @ret;
6c941e0c 84}
85
86########## getting full decomposion ##########
ac5ea531 87{
8f118dcd 88 my($f, $fh);
89 foreach my $d (@INC) {
8f118dcd 90 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
91 last if open($fh, $f);
92 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
93 last if open($fh, $f);
94 $f = undef;
95 }
48287974 96 croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
97 . "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
98
99 while (<$fh>) {
100 next if /^#/ or /^$/;
101 s/#.*//;
102 $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
103 }
8f118dcd 104 close $fh;
ac5ea531 105}
106
48287974 107##
108## converts string "hhhh hhhh hhhh" to a numeric list
109##
110sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
111
8f118dcd 112while ($Combin =~ /(.+)/g) {
113 my @tab = split /\t/, $1;
114 my $ini = hex $tab[0];
115 if ($tab[1] eq '') {
116 $Combin{ $ini } = $tab[2];
117 } else {
118 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
119 }
ac5ea531 120}
121
8f118dcd 122while ($Decomp =~ /(.+)/g) {
123 my @tab = split /\t/, $1;
124 my $compat = $tab[2] =~ s/<[^>]+>//;
125 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
8f118dcd 126 my $ini = hex($tab[0]); # initial decomposable character
48287974 127
128 my $listname =
129 @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
130 # %04x is bad since it'd place _3046 after _1d157.
131
8f118dcd 132 if ($tab[1] eq '') {
133 $Compat{ $ini } = $dec;
134
135 if (! $compat) {
48287974 136 $Canon{ $ini } = $dec;
8f118dcd 137
f027f502 138 if (@$dec == 2) {
8f118dcd 139 if ($Combin{ $dec->[0] }) {
140 $NonStD{ $ini } = 1;
141 } else {
48287974 142 $CompList{ $listname }{ $dec->[1] } = $ini;
143 $Comp1st{ $dec->[0] } = $listname;
144 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
8f118dcd 145 }
f027f502 146 } elsif (@$dec == 1) {
8f118dcd 147 $Single{ $ini } = 1;
f027f502 148 } else {
149 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 150 }
151 }
152 } else {
48287974 153 foreach my $u ($ini .. hex($tab[1])) {
8f118dcd 154 $Compat{ $u } = $dec;
48287974 155
8f118dcd 156 if (! $compat) {
48287974 157 $Canon{ $u } = $dec;
8f118dcd 158
f027f502 159 if (@$dec == 2) {
8f118dcd 160 if ($Combin{ $dec->[0] }) {
161 $NonStD{ $u } = 1;
162 } else {
48287974 163 $CompList{ $listname }{ $dec->[1] } = $u;
164 $Comp1st{ $dec->[0] } = $listname;
165 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
8f118dcd 166 }
f027f502 167 } elsif (@$dec == 1) {
8f118dcd 168 $Single{ $u } = 1;
f027f502 169 } else {
170 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 171 }
172 }
173 }
ac5ea531 174 }
ac5ea531 175}
176
48287974 177# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
178foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
179 $Comp2nd{$j} = 1;
ac5ea531 180}
181
182sub getCanonList {
8f118dcd 183 my @src = @_;
6c941e0c 184 my @dec = map {
185 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
186 : $Canon{$_} ? @{ $Canon{$_} } : $_
187 } @src;
8f118dcd 188 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
189 # condition @src == @dec is not ok.
ac5ea531 190}
191
192sub getCompatList {
8f118dcd 193 my @src = @_;
6c941e0c 194 my @dec = map {
195 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
196 : $Compat{$_} ? @{ $Compat{$_} } : $_
197 } @src;
8f118dcd 198 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
199 # condition @src == @dec is not ok.
ac5ea531 200}
201
48287974 202# exhaustive decomposition
203foreach my $key (keys %Canon) {
204 $Canon{$key} = [ getCanonList($key) ];
205}
206
207# exhaustive decomposition
628bbff0 208foreach my $key (keys %Compat) {
48287974 209 $Compat{$key} = [ getCompatList($key) ];
210}
ac5ea531 211
9f1f04a1 212sub _pack_U {
1efaba7f 213 return pack('U*', @_);
9f1f04a1 214}
215
2b8d773d 216sub split_into_char {
217 use bytes;
218 my $uni = shift;
219 my $len = length($uni);
220 my @ary;
221 for(my $i = 0; $i < $len; ++$i) {
222 push @ary, ord(substr($uni,$i,1));
223 }
224 return @ary;
225}
226
ac5ea531 227sub _U_stringify {
8f118dcd 228 sprintf '"%s"', join '',
2b8d773d 229 map sprintf("\\x%02x", $_), split_into_char(_pack_U(@_));
ac5ea531 230}
231
232foreach my $hash (\%Canon, \%Compat) {
8f118dcd 233 foreach my $key (keys %$hash) {
234 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
235 }
ac5ea531 236}
237
6c941e0c 238########## writing header files ##########
ac5ea531 239
8f118dcd 240my @boolfunc = (
241 {
242 name => "Exclusion",
243 type => "bool",
244 hash => \%Exclus,
245 },
246 {
247 name => "Singleton",
248 type => "bool",
249 hash => \%Single,
250 },
251 {
252 name => "NonStDecomp",
253 type => "bool",
254 hash => \%NonStD,
255 },
256 {
257 name => "Comp2nd",
258 type => "bool",
259 hash => \%Comp2nd,
260 },
261);
ac5ea531 262
263my $file = "unfexc.h";
264open FH, ">$file" or croak "$PACKAGE: $file can't be made";
265binmode FH; select FH;
266
8f118dcd 267 print << 'EOF';
268/*
269 * This file is auto-generated by mkheader.
270 * Any changes here will be lost!
271 */
272EOF
ac5ea531 273
8f118dcd 274foreach my $tbl (@boolfunc) {
275 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
276 my $type = $tbl->{type};
277 my $name = $tbl->{name};
278 print "$type is$name (UV uv)\n{\nreturn\n\t";
279
280 while (@temp) {
281 my $cur = shift @temp;
282 if (@temp && $cur + 1 == $temp[0]) {
283 print "($cur <= uv && uv <= ";
284 while (@temp && $cur + 1 == $temp[0]) {
285 $cur = shift @temp;
286 }
287 print "$cur)";
288 print "\n\t|| " if @temp;
289 } else {
290 print "uv == $cur";
291 print "\n\t|| " if @temp;
292 }
ac5ea531 293 }
8f118dcd 294 print "\n\t? TRUE : FALSE;\n}\n\n";
ac5ea531 295}
296
ac5ea531 297close FH;
298
299####################################
300
48287974 301my $compinit =
302 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
303
304foreach my $i (sort keys %CompList) {
305 $compinit .= "$structname $i [] = {\n";
306 $compinit .= join ",\n",
307 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
308 sort {$a <=> $b } keys %{ $CompList{$i} };
309 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
310}
311
ac5ea531 312my @tripletable = (
8f118dcd 313 {
314 file => "unfcmb",
315 name => "combin",
316 type => "STDCHAR",
317 hash => \%Combin,
318 null => 0,
319 },
320 {
321 file => "unfcan",
322 name => "canon",
323 type => "char*",
324 hash => \%Canon,
325 null => "NULL",
326 },
327 {
328 file => "unfcpt",
329 name => "compat",
330 type => "char*",
331 hash => \%Compat,
332 null => "NULL",
333 },
334 {
335 file => "unfcmp",
336 name => "compos",
337 type => "$structname *",
338 hash => \%Comp1st,
339 null => "NULL",
340 init => $compinit,
341 },
ac5ea531 342);
343
344foreach my $tbl (@tripletable) {
8f118dcd 345 my $file = "$tbl->{file}.h";
346 my $head = "${prefix}$tbl->{name}";
347 my $type = $tbl->{type};
348 my $hash = $tbl->{hash};
349 my $null = $tbl->{null};
350 my $init = $tbl->{init};
351
352 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
353 binmode FH; select FH;
354 my %val;
355
356 print FH << 'EOF';
ac5ea531 357/*
358 * This file is auto-generated by mkheader.
359 * Any changes here will be lost!
360 */
361EOF
362
8f118dcd 363 print $init if defined $init;
364
365 foreach my $uv (keys %$hash) {
f027f502 366 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
367 unless $uv <= 0x10FFFF;
8f118dcd 368 my @c = unpack 'CCCC', pack 'N', $uv;
369 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
370 }
371
372 foreach my $p (sort { $a <=> $b } keys %val) {
373 next if ! $val{ $p };
374 for (my $r = 0; $r < 256; $r++) {
375 next if ! $val{ $p }{ $r };
fe067ad9 376 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
8f118dcd 377 for (my $c = 0; $c < 256; $c++) {
378 print "\t", defined $val{$p}{$r}{$c}
379 ? "($type)".$val{$p}{$r}{$c}
380 : $null;
381 print ',' if $c != 255;
382 print "\n" if $c % 8 == 7;
383 }
384 print "};\n\n";
385 }
386 }
387 foreach my $p (sort { $a <=> $b } keys %val) {
388 next if ! $val{ $p };
fe067ad9 389 printf "static $type* ${head}_%02x [256] = {\n", $p;
8f118dcd 390 for (my $r = 0; $r < 256; $r++) {
391 print $val{ $p }{ $r }
392 ? sprintf("${head}_%02x_%02x", $p, $r)
393 : "NULL";
394 print ',' if $r != 255;
395 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
396 }
397 print "};\n\n";
ac5ea531 398 }
fe067ad9 399 print "static $type** $head [] = {\n";
8f118dcd 400 for (my $p = 0; $p <= 0x10; $p++) {
401 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
402 print ',' if $p != 0x10;
403 print "\n";
ac5ea531 404 }
405 print "};\n\n";
8f118dcd 406 close FH;
ac5ea531 407}
408
628bbff0 4091;
ac5ea531 410__END__