3 # This auxiliary script makes five header files
4 # used for building XSUB of Unicode::Normalize.
7 # <do 'mkheader'> in perl, or <perl mkheader> in command line
10 # unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11 # unicore/Decomposition.pl (or unicode/Decomposition.pl)
12 # unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
28 unless ("A" eq pack('U', 0x41)) {
29 die "Unicode::Normalize cannot stringify a Unicode code point\n";
33 our $PACKAGE = 'Unicode::Normalize, mkheader';
35 our $Combin = do "unicore/CombiningClass.pl"
36 || do "unicode/CombiningClass.pl"
37 || croak "$PACKAGE: CombiningClass.pl not found";
39 our $Decomp = do "unicore/Decomposition.pl"
40 || do "unicode/Decomposition.pl"
41 || croak "$PACKAGE: Decomposition.pl not found";
43 our %Combin; # $codepoint => $number : combination class
44 our %Canon; # $codepoint => \@codepoints : canonical decomp.
45 our %Compat; # $codepoint => \@codepoints : compat. decomp.
46 # after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
47 our %Exclus; # $codepoint => 1 : composition exclusions
48 our %Single; # $codepoint => 1 : singletons
49 our %NonStD; # $codepoint => 1 : non-starter decompositions
51 our %Comp1st; # $codepoint => $listname : may be composed with a next char.
52 our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
53 our %CompList; # $listname,$2nd => $codepoint : composite
56 our $structname = "${prefix}complist";
58 ########## definition of Hangul constants ##########
59 use constant SBase => 0xAC00;
60 use constant SFinal => 0xD7A3; # SBase -1 + SCount
61 use constant SCount => 11172; # LCount * NCount
62 use constant NCount => 588; # VCount * TCount
63 use constant LBase => 0x1100;
64 use constant LFinal => 0x1112;
65 use constant LCount => 19;
66 use constant VBase => 0x1161;
67 use constant VFinal => 0x1175;
68 use constant VCount => 21;
69 use constant TBase => 0x11A7;
70 use constant TFinal => 0x11C2;
71 use constant TCount => 28;
74 my $SIndex = $_[0] - SBase;
75 my $LIndex = int( $SIndex / NCount);
76 my $VIndex = int(($SIndex % NCount) / TCount);
77 my $TIndex = $SIndex % TCount;
81 $TIndex ? (TBase + $TIndex) : (),
86 ########## getting full decomposion ##########
89 foreach my $d (@INC) {
90 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
91 last if open($fh, $f);
92 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
93 last if open($fh, $f);
96 croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
97 . "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
100 next if /^#/ or /^$/;
102 $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
108 ## converts string "hhhh hhhh hhhh" to a numeric list
110 sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
112 while ($Combin =~ /(.+)/g) {
113 my @tab = split /\t/, $1;
114 my $ini = hex $tab[0];
116 $Combin{ $ini } = $tab[2];
118 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
122 while ($Decomp =~ /(.+)/g) {
123 my @tab = split /\t/, $1;
124 my $compat = $tab[2] =~ s/<[^>]+>//;
125 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
126 my $ini = hex($tab[0]); # initial decomposable character
129 @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
130 # %04x is bad since it'd place _3046 after _1d157.
133 $Compat{ $ini } = $dec;
136 $Canon{ $ini } = $dec;
139 if ($Combin{ $dec->[0] }) {
142 $CompList{ $listname }{ $dec->[1] } = $ini;
143 $Comp1st{ $dec->[0] } = $listname;
144 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
146 } elsif (@$dec == 1) {
149 croak("Weird Canonical Decomposition of U+$tab[0]");
153 foreach my $u ($ini .. hex($tab[1])) {
154 $Compat{ $u } = $dec;
160 if ($Combin{ $dec->[0] }) {
163 $CompList{ $listname }{ $dec->[1] } = $u;
164 $Comp1st{ $dec->[0] } = $listname;
165 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
167 } elsif (@$dec == 1) {
170 croak("Weird Canonical Decomposition of U+$tab[0]");
177 # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
178 foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
185 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
186 : $Canon{$_} ? @{ $Canon{$_} } : $_
188 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
189 # condition @src == @dec is not ok.
195 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
196 : $Compat{$_} ? @{ $Compat{$_} } : $_
198 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
199 # condition @src == @dec is not ok.
202 # exhaustive decomposition
203 foreach my $key (keys %Canon) {
204 $Canon{$key} = [ getCanonList($key) ];
207 # exhaustive decomposition
208 foreach my $key (keys %Compat) {
209 $Compat{$key} = [ getCompatList($key) ];
213 return pack('U*', @_);
216 sub split_into_char {
219 my $len = length($uni);
221 for(my $i = 0; $i < $len; ++$i) {
222 push @ary, ord(substr($uni,$i,1));
228 sprintf '"%s"', join '',
229 map sprintf("\\x%02x", $_), split_into_char(_pack_U(@_));
232 foreach my $hash (\%Canon, \%Compat) {
233 foreach my $key (keys %$hash) {
234 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
238 ########## writing header files ##########
252 name => "NonStDecomp",
263 my $file = "unfexc.h";
264 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
265 binmode FH; select FH;
269 * This file is auto-generated by mkheader.
270 * Any changes here will be lost!
274 foreach my $tbl (@boolfunc) {
275 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
276 my $type = $tbl->{type};
277 my $name = $tbl->{name};
278 print "$type is$name (UV uv)\n{\nreturn\n\t";
281 my $cur = shift @temp;
282 if (@temp && $cur + 1 == $temp[0]) {
283 print "($cur <= uv && uv <= ";
284 while (@temp && $cur + 1 == $temp[0]) {
288 print "\n\t|| " if @temp;
291 print "\n\t|| " if @temp;
294 print "\n\t? TRUE : FALSE;\n}\n\n";
299 ####################################
302 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
304 foreach my $i (sort keys %CompList) {
305 $compinit .= "$structname $i [] = {\n";
306 $compinit .= join ",\n",
307 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
308 sort {$a <=> $b } keys %{ $CompList{$i} };
309 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
337 type => "$structname *",
344 foreach my $tbl (@tripletable) {
345 my $file = "$tbl->{file}.h";
346 my $head = "${prefix}$tbl->{name}";
347 my $type = $tbl->{type};
348 my $hash = $tbl->{hash};
349 my $null = $tbl->{null};
350 my $init = $tbl->{init};
352 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
353 binmode FH; select FH;
358 * This file is auto-generated by mkheader.
359 * Any changes here will be lost!
363 print $init if defined $init;
365 foreach my $uv (keys %$hash) {
366 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
367 unless $uv <= 0x10FFFF;
368 my @c = unpack 'CCCC', pack 'N', $uv;
369 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
372 foreach my $p (sort { $a <=> $b } keys %val) {
373 next if ! $val{ $p };
374 for (my $r = 0; $r < 256; $r++) {
375 next if ! $val{ $p }{ $r };
376 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
377 for (my $c = 0; $c < 256; $c++) {
378 print "\t", defined $val{$p}{$r}{$c}
379 ? "($type)".$val{$p}{$r}{$c}
381 print ',' if $c != 255;
382 print "\n" if $c % 8 == 7;
387 foreach my $p (sort { $a <=> $b } keys %val) {
388 next if ! $val{ $p };
389 printf "static $type* ${head}_%02x [256] = {\n", $p;
390 for (my $r = 0; $r < 256; $r++) {
391 print $val{ $p }{ $r }
392 ? sprintf("${head}_%02x_%02x", $p, $r)
394 print ',' if $r != 255;
395 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
399 print "static $type** $head [] = {\n";
400 for (my $p = 0; $p <= 0x10; $p++) {
401 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
402 print ',' if $p != 0x10;