Upgrade to Encode 1.92.
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / mkheader
CommitLineData
ac5ea531 1#!perl
2#
3# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h",
f027f502 4# "unfcmp.h", and "unfexc.h"
48287974 5# from CombiningClass.pl, Decomposition.pl, CompositionExclusions.txt
ac5ea531 6# in lib/unicore or unicode directory
7# for Unicode::Normalize.xs. (cf. Makefile.PL)
8#
f027f502 9# Usage: <perl mkheader> in command line
10# or <do 'mkheader'> in perl
11#
ac5ea531 12use 5.006;
13use strict;
14use warnings;
15use Carp;
6c941e0c 16use File::Spec;
17
18our $IsEBCDIC = ord("A") != 0x41;
ac5ea531 19
20our $PACKAGE = 'Unicode::Normalize, mkheader';
21
22our $Combin = do "unicore/CombiningClass.pl"
8f118dcd 23 || do "unicode/CombiningClass.pl"
24 || croak "$PACKAGE: CombiningClass.pl not found";
ac5ea531 25
26our $Decomp = do "unicore/Decomposition.pl"
8f118dcd 27 || do "unicode/Decomposition.pl"
28 || croak "$PACKAGE: Decomposition.pl not found";
ac5ea531 29
48287974 30our %Combin; # $codepoint => $number : combination class
6c941e0c 31our %Canon; # $codepoint => \@codepoints : canonical decomp.
32our %Compat; # $codepoint => \@codepoints : compat. decomp.
33# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
48287974 34our %Exclus; # $codepoint => 1 : composition exclusions
35our %Single; # $codepoint => 1 : singletons
36our %NonStD; # $codepoint => 1 : non-starter decompositions
37
38our %Comp1st; # $codepoint => $listname : may be composed with a next char.
39our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
40our %CompList; # $listname,$2nd => $codepoint : composite
41
42our $prefix = "UNF_";
43our $structname = "${prefix}complist";
ac5ea531 44
6c941e0c 45########## definition of Hangul constants ##########
46use constant SBase => 0xAC00;
47use constant SFinal => 0xD7A3; # SBase -1 + SCount
48use constant SCount => 11172; # LCount * NCount
49use constant NCount => 588; # VCount * TCount
50use constant LBase => 0x1100;
51use constant LFinal => 0x1112;
52use constant LCount => 19;
53use constant VBase => 0x1161;
54use constant VFinal => 0x1175;
55use constant VCount => 21;
56use constant TBase => 0x11A7;
57use constant TFinal => 0x11C2;
58use constant TCount => 28;
59
60sub decomposeHangul {
61 my $SIndex = $_[0] - SBase;
62 my $LIndex = int( $SIndex / NCount);
63 my $VIndex = int(($SIndex % NCount) / TCount);
64 my $TIndex = $SIndex % TCount;
65 my @ret = (
66 LBase + $LIndex,
67 VBase + $VIndex,
68 $TIndex ? (TBase + $TIndex) : (),
69 );
70 wantarray ? @ret : pack('U*', @ret);
71 # any element in @ret greater than 0xFF, so no need of u2n conversion.
72}
73
74########## getting full decomposion ##########
ac5ea531 75{
8f118dcd 76 my($f, $fh);
77 foreach my $d (@INC) {
8f118dcd 78 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
79 last if open($fh, $f);
80 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
81 last if open($fh, $f);
82 $f = undef;
83 }
48287974 84 croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
85 . "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
86
87 while (<$fh>) {
88 next if /^#/ or /^$/;
89 s/#.*//;
90 $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
91 }
8f118dcd 92 close $fh;
ac5ea531 93}
94
48287974 95##
96## converts string "hhhh hhhh hhhh" to a numeric list
97##
98sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
99
8f118dcd 100while ($Combin =~ /(.+)/g) {
101 my @tab = split /\t/, $1;
102 my $ini = hex $tab[0];
103 if ($tab[1] eq '') {
104 $Combin{ $ini } = $tab[2];
105 } else {
106 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
107 }
ac5ea531 108}
109
8f118dcd 110while ($Decomp =~ /(.+)/g) {
111 my @tab = split /\t/, $1;
112 my $compat = $tab[2] =~ s/<[^>]+>//;
113 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
8f118dcd 114 my $ini = hex($tab[0]); # initial decomposable character
48287974 115
116 my $listname =
117 @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
118 # %04x is bad since it'd place _3046 after _1d157.
119
8f118dcd 120 if ($tab[1] eq '') {
121 $Compat{ $ini } = $dec;
122
123 if (! $compat) {
48287974 124 $Canon{ $ini } = $dec;
8f118dcd 125
f027f502 126 if (@$dec == 2) {
8f118dcd 127 if ($Combin{ $dec->[0] }) {
128 $NonStD{ $ini } = 1;
129 } else {
48287974 130 $CompList{ $listname }{ $dec->[1] } = $ini;
131 $Comp1st{ $dec->[0] } = $listname;
132 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
8f118dcd 133 }
f027f502 134 } elsif (@$dec == 1) {
8f118dcd 135 $Single{ $ini } = 1;
f027f502 136 } else {
137 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 138 }
139 }
140 } else {
48287974 141 foreach my $u ($ini .. hex($tab[1])) {
8f118dcd 142 $Compat{ $u } = $dec;
48287974 143
8f118dcd 144 if (! $compat) {
48287974 145 $Canon{ $u } = $dec;
8f118dcd 146
f027f502 147 if (@$dec == 2) {
8f118dcd 148 if ($Combin{ $dec->[0] }) {
149 $NonStD{ $u } = 1;
150 } else {
48287974 151 $CompList{ $listname }{ $dec->[1] } = $u;
152 $Comp1st{ $dec->[0] } = $listname;
153 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
8f118dcd 154 }
f027f502 155 } elsif (@$dec == 1) {
8f118dcd 156 $Single{ $u } = 1;
f027f502 157 } else {
158 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 159 }
160 }
161 }
ac5ea531 162 }
ac5ea531 163}
164
48287974 165# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
166foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
167 $Comp2nd{$j} = 1;
ac5ea531 168}
169
170sub getCanonList {
8f118dcd 171 my @src = @_;
6c941e0c 172 my @dec = map {
173 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
174 : $Canon{$_} ? @{ $Canon{$_} } : $_
175 } @src;
8f118dcd 176 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
177 # condition @src == @dec is not ok.
ac5ea531 178}
179
180sub getCompatList {
8f118dcd 181 my @src = @_;
6c941e0c 182 my @dec = map {
183 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
184 : $Compat{$_} ? @{ $Compat{$_} } : $_
185 } @src;
8f118dcd 186 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
187 # condition @src == @dec is not ok.
ac5ea531 188}
189
48287974 190# exhaustive decomposition
191foreach my $key (keys %Canon) {
192 $Canon{$key} = [ getCanonList($key) ];
193}
194
195# exhaustive decomposition
196foreach my $key (keys %Compat) {
197 $Compat{$key} = [ getCompatList($key) ];
198}
ac5ea531 199
200sub _U_stringify {
8f118dcd 201 sprintf '"%s"', join '',
6c941e0c 202 map sprintf("\\x%02x", $_), unpack 'C*',
203 $IsEBCDIC
204 ? pack('U*', map utf8::unicode_to_native($_), @_)
205 : pack('U*', @_);
ac5ea531 206}
207
208foreach my $hash (\%Canon, \%Compat) {
8f118dcd 209 foreach my $key (keys %$hash) {
210 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
211 }
ac5ea531 212}
213
6c941e0c 214########## writing header files ##########
ac5ea531 215
8f118dcd 216my @boolfunc = (
217 {
218 name => "Exclusion",
219 type => "bool",
220 hash => \%Exclus,
221 },
222 {
223 name => "Singleton",
224 type => "bool",
225 hash => \%Single,
226 },
227 {
228 name => "NonStDecomp",
229 type => "bool",
230 hash => \%NonStD,
231 },
232 {
233 name => "Comp2nd",
234 type => "bool",
235 hash => \%Comp2nd,
236 },
237);
ac5ea531 238
239my $file = "unfexc.h";
240open FH, ">$file" or croak "$PACKAGE: $file can't be made";
241binmode FH; select FH;
242
8f118dcd 243 print << 'EOF';
244/*
245 * This file is auto-generated by mkheader.
246 * Any changes here will be lost!
247 */
248EOF
ac5ea531 249
8f118dcd 250foreach my $tbl (@boolfunc) {
251 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
252 my $type = $tbl->{type};
253 my $name = $tbl->{name};
254 print "$type is$name (UV uv)\n{\nreturn\n\t";
255
256 while (@temp) {
257 my $cur = shift @temp;
258 if (@temp && $cur + 1 == $temp[0]) {
259 print "($cur <= uv && uv <= ";
260 while (@temp && $cur + 1 == $temp[0]) {
261 $cur = shift @temp;
262 }
263 print "$cur)";
264 print "\n\t|| " if @temp;
265 } else {
266 print "uv == $cur";
267 print "\n\t|| " if @temp;
268 }
ac5ea531 269 }
8f118dcd 270 print "\n\t? TRUE : FALSE;\n}\n\n";
ac5ea531 271}
272
ac5ea531 273close FH;
274
275####################################
276
48287974 277my $compinit =
278 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
279
280foreach my $i (sort keys %CompList) {
281 $compinit .= "$structname $i [] = {\n";
282 $compinit .= join ",\n",
283 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
284 sort {$a <=> $b } keys %{ $CompList{$i} };
285 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
286}
287
ac5ea531 288my @tripletable = (
8f118dcd 289 {
290 file => "unfcmb",
291 name => "combin",
292 type => "STDCHAR",
293 hash => \%Combin,
294 null => 0,
295 },
296 {
297 file => "unfcan",
298 name => "canon",
299 type => "char*",
300 hash => \%Canon,
301 null => "NULL",
302 },
303 {
304 file => "unfcpt",
305 name => "compat",
306 type => "char*",
307 hash => \%Compat,
308 null => "NULL",
309 },
310 {
311 file => "unfcmp",
312 name => "compos",
313 type => "$structname *",
314 hash => \%Comp1st,
315 null => "NULL",
316 init => $compinit,
317 },
ac5ea531 318);
319
320foreach my $tbl (@tripletable) {
8f118dcd 321 my $file = "$tbl->{file}.h";
322 my $head = "${prefix}$tbl->{name}";
323 my $type = $tbl->{type};
324 my $hash = $tbl->{hash};
325 my $null = $tbl->{null};
326 my $init = $tbl->{init};
327
328 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
329 binmode FH; select FH;
330 my %val;
331
332 print FH << 'EOF';
ac5ea531 333/*
334 * This file is auto-generated by mkheader.
335 * Any changes here will be lost!
336 */
337EOF
338
8f118dcd 339 print $init if defined $init;
340
341 foreach my $uv (keys %$hash) {
f027f502 342 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
343 unless $uv <= 0x10FFFF;
8f118dcd 344 my @c = unpack 'CCCC', pack 'N', $uv;
345 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
346 }
347
348 foreach my $p (sort { $a <=> $b } keys %val) {
349 next if ! $val{ $p };
350 for (my $r = 0; $r < 256; $r++) {
351 next if ! $val{ $p }{ $r };
352 printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
353 for (my $c = 0; $c < 256; $c++) {
354 print "\t", defined $val{$p}{$r}{$c}
355 ? "($type)".$val{$p}{$r}{$c}
356 : $null;
357 print ',' if $c != 255;
358 print "\n" if $c % 8 == 7;
359 }
360 print "};\n\n";
361 }
362 }
363 foreach my $p (sort { $a <=> $b } keys %val) {
364 next if ! $val{ $p };
365 printf "$type* ${head}_%02x [256] = {\n", $p;
366 for (my $r = 0; $r < 256; $r++) {
367 print $val{ $p }{ $r }
368 ? sprintf("${head}_%02x_%02x", $p, $r)
369 : "NULL";
370 print ',' if $r != 255;
371 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
372 }
373 print "};\n\n";
ac5ea531 374 }
8f118dcd 375 print "$type** $head [] = {\n";
376 for (my $p = 0; $p <= 0x10; $p++) {
377 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
378 print ',' if $p != 0x10;
379 print "\n";
ac5ea531 380 }
381 print "};\n\n";
8f118dcd 382 close FH;
ac5ea531 383}
384
385__END__