Upgrade to Unicode::Normalize 0.12.
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / mkheader
CommitLineData
ac5ea531 1#!perl
2#
3# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h",
4# "unfcmp.h", and "unfexc.h"
5# from CombiningClass.pl, Decomposition.pl, CompExcl.txt
6# in lib/unicore or unicode directory
7# for Unicode::Normalize.xs. (cf. Makefile.PL)
8#
9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
14our $PACKAGE = 'Unicode::Normalize, mkheader';
15
16our $Combin = do "unicore/CombiningClass.pl"
17 || do "unicode/CombiningClass.pl"
18 || croak "$PACKAGE: CombiningClass.pl not found";
19
20our $Decomp = do "unicore/Decomposition.pl"
21 || do "unicode/Decomposition.pl"
22 || croak "$PACKAGE: Decomposition.pl not found";
23
24our %Combin; # $codepoint => $number : combination class
25our %Canon; # $codepoint => $hexstring : canonical decomp.
26our %Compat; # $codepoint => $hexstring : compat. decomp.
27our %Compos; # $string => $codepoint : composite
28
29our %Exclus; # $codepoint => 1 : composition exclusions
30
31{
32 my($f, $fh);
33 foreach my $d (@INC) {
34 use File::Spec;
35 $f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
36 last if open($fh, $f);
37 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
38 last if open($fh, $f);
39 $f = undef;
40 }
41 croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f;
42 while(<$fh>) {
43 next if /^#/ or /^$/;
44 s/#.*//;
45 $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/;
46 }
47 close $fh;
48}
49
50while($Combin =~ /(.+)/g) {
51 my @tab = split /\t/, $1;
52 my $ini = hex $tab[0];
53 if($tab[1] eq '') {
54 $Combin{ $ini } = $tab[2];
55 } else {
56 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
57 }
58}
59
60while($Decomp =~ /(.+)/g) {
61 my @tab = split /\t/, $1;
62 my $compat = $tab[2] =~ s/<[^>]+>//;
63 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
64 my $com = pack('U*', @$dec); # composable sequence
65 my $ini = hex($tab[0]);
66 if($tab[1] eq '') {
67 $Compat{ $ini } = $dec;
68 if(! $compat) {
69 $Canon{ $ini } = $dec;
70 $Compos{ $com } = $ini if @$dec > 1;
71 }
72 } else {
73 foreach my $u ($ini .. hex($tab[1])){
74 $Compat{ $u } = $dec;
75 if(! $compat){
76 $Canon{ $u } = $dec;
77 $Compos{ $com } = $ini if @$dec > 1;
78 }
79 }
80 }
81}
82
83# exhaustive decomposition
84foreach my $key (keys %Canon) {
85 $Canon{$key} = [ getCanonList($key) ];
86}
87
88# exhaustive decomposition
89foreach my $key (keys %Compat) {
90 $Compat{$key} = [ getCompatList($key) ];
91}
92
93sub getCanonList {
94 my @src = @_;
95 my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
96 join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
97 # condition @src == @dec is not ok.
98}
99
100sub getCompatList {
101 my @src = @_;
102 my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
103 join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
104 # condition @src == @dec is not ok.
105}
106
107sub _getHexArray {
108 my $str = shift;
109 map hex(), $str =~ /([0-9A-Fa-f]+)/g;
110}
111
112sub _U_stringify {
113 sprintf '"%s"', join '',
114 map sprintf("\\x%2x", $_), unpack 'C*', pack 'U*', @_;
115}
116
117foreach my $hash (\%Canon, \%Compat) {
118 foreach my $key (keys %$hash) {
119 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
120 }
121}
122
123sub utf8len {
124 my $uv = shift;
125 return $uv < 0x80 ? 1 :
126 $uv < 0x800 ? 2 :
127 $uv < 0x10000 ? 3 :
128 $uv < 0x110000 ? 4 :
129 croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
130}
131
2a204b45 132sub utfebcdiclen {
133 my $uv = shift;
134 return $uv < 0xA0 ? 1 :
135 $uv < 0x400 ? 2 :
136 $uv < 0x4000 ? 3 :
137 $uv < 0x40000 ? 4 :
138 $uv < 0x110000 ? 5 :
139 croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
140}
141
ac5ea531 142my $prefix = "UNF_";
143
144my $structname = "${prefix}complist";
145
146our (%Comp1st, %CompList);
147
2a204b45 148my $errExpand = "$PACKAGE: A composable pair in %s "
149 . "is longer than the composite in bytes!\n"
150 . "%d + %d => %d\nQuit. Please inform the author...";
151
ac5ea531 152foreach(sort keys %Compos) {
153 my @a = unpack('U*', $_);
154 my $val = $Compos{$_};
155 my $name = sprintf "${structname}_%06x", $a[0];
156 $Comp1st{ $a[0] } = $name;
157 $CompList{ $name }{ $a[1] } = $val;
158
159 if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) {
2a204b45 160 croak sprintf($errExpand, "utf-8", $a[0], $a[1], $val);
161 }
162 if( utfebcdiclen($a[0]) + utfebcdiclen($a[1]) < utfebcdiclen($val)) {
163 croak sprintf($errExpand, "utf-ebcdic", $a[0], $a[1], $val);
ac5ea531 164 }
165}
166
167my $compinit =
168 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
169
170foreach my $i (sort keys %CompList) {
171 $compinit .= "$structname $i [] = {\n";
172 $compinit .= join ",\n",
173 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
174 sort {$a <=> $b } keys %{ $CompList{$i} };
175 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
176}
177
178####################################
179
180my @Exclus = sort {$a <=> $b} keys %Exclus;
181
182my $file = "unfexc.h";
183open FH, ">$file" or croak "$PACKAGE: $file can't be made";
184binmode FH; select FH;
185
2a204b45 186print "bool isExclusion (UV uv) \n{\nreturn\n\t";
ac5ea531 187
188while(@Exclus) {
189 my $cur = shift @Exclus;
190 if(@Exclus && $cur + 1 == $Exclus[0]) {
191 print "$cur <= uv && uv <= ";
192 while(@Exclus && $cur + 1 == $Exclus[0]) {
193 $cur = shift @Exclus;
194 }
195 print $cur;
196 print "\n\t|| " if @Exclus;
197 } else {
198 print "uv == $cur";
199 print "\n\t|| " if @Exclus;
200 }
201}
202
203print "\n\t? TRUE : FALSE;\n}\n\n";
204close FH;
205
206####################################
207
208my @tripletable = (
209 {
210 file => "unfcmb",
211 name => "combin",
3164a1ca 212 type => "STDCHAR",
ac5ea531 213 hash => \%Combin,
214 null => 0,
215 },
216 {
217 file => "unfcan",
218 name => "canon",
219 type => "char*",
220 hash => \%Canon,
221 null => "NULL",
222 },
223 {
224 file => "unfcpt",
225 name => "compat",
226 type => "char*",
227 hash => \%Compat,
228 null => "NULL",
229 },
230 {
231 file => "unfcmp",
232 name => "compos",
233 type => "$structname *",
234 hash => \%Comp1st,
235 null => "NULL",
236 init => $compinit,
237 },
238);
239
240foreach my $tbl (@tripletable) {
241 my $file = "$tbl->{file}.h";
242 my $head = "${prefix}$tbl->{name}";
243 my $type = $tbl->{type};
244 my $hash = $tbl->{hash};
245 my $null = $tbl->{null};
246 my $init = $tbl->{init};
247
248 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
249 binmode FH; select FH;
250 my %val;
251
252 print FH << 'EOF';
253/*
254 * This file is auto-generated by mkheader.
255 * Any changes here will be lost!
256 */
257EOF
258
259 print $init if defined $init;
260
261 foreach my $uv (keys %$hash) {
262 my @c = unpack 'CCCC', pack 'N', $uv;
263 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
264 }
265
266 foreach my $p (sort { $a <=> $b } keys %val) {
267 next if ! $val{ $p };
268 for(my $r = 0; $r < 256; $r++){
269 next if ! $val{ $p }{ $r };
270 printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
271 for(my $c = 0; $c < 256; $c++){
272 print "\t", defined $val{$p}{$r}{$c} ? $val{$p}{$r}{$c} : $null;
273 print ',' if $c != 255;
274 print "\n" if $c % 8 == 7;
275 }
276 print "};\n\n";
277 }
278 }
279 foreach my $p (sort { $a <=> $b } keys %val) {
280 next if ! $val{ $p };
281 printf "$type* ${head}_%02x [256] = {\n", $p;
282 for(my $r = 0; $r < 256; $r++){
283 print $val{ $p }{ $r } ? sprintf("${head}_%02x_%02x", $p, $r) : "NULL";
284 print ',' if $r != 255;
285 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
286 }
287 print "};\n\n";
288 }
289 print "$type** $head [] = {\n";
290 for(my $p = 0; $p <= 0x10; $p++){
291 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
292 print ',' if $p != 0x10;
293 print "\n";
294 }
295 print "};\n\n";
296 close FH;
297}
298
299__END__