Integrate mainline
[p5sagit/p5-mst-13.2.git] / ext / Encode / compile
CommitLineData
017e2add 1#!../../perl -w
2f2b4ff2 2BEGIN { @INC = '../../lib' };
017e2add 3use strict;
4
5sub encode_U
6{
9b37254d 7 # UTF-8 encode long hand - only covers part of perl's range
017e2add 8 my $uv = shift;
9 if ($uv < 0x80)
10 {
11 return chr($uv)
12 }
13 if ($uv < 0x800)
14 {
15 return chr(($uv >> 6) | 0xC0).
16 chr(($uv & 0x3F) | 0x80);
17 }
18 return chr(($uv >> 12) | 0xE0).
19 chr((($uv >> 6) & 0x3F) | 0x80).
20 chr(($uv & 0x3F) | 0x80);
21}
22
23sub encode_S
24{
14a8264b 25 # encode single byte
017e2add 26 my ($ch,$page) = @_;
27 return chr($ch);
28}
29
30sub encode_D
31{
14a8264b 32 # encode double byte MS byte first
017e2add 33 my ($ch,$page) = @_;
34 return chr($page).chr($ch);
35}
36
37sub encode_M
38{
14a8264b 39 # encode Multi-byte - single for 0..255 otherwise double
017e2add 40 my ($ch,$page) = @_;
41 return &encode_D if $page;
42 return &encode_S;
43}
44
14a8264b 45# Win32 does not expand globs on command line
252a8565 46eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
18b7339f 47
017e2add 48my $cname = shift(@ARGV);
2f2b4ff2 49chmod(0666,$cname) if -f $cname && !-w $cname;
017e2add 50open(C,">$cname") || die "Cannot open $cname:$!";
2f2b4ff2 51my $dname = $cname;
52$dname =~ s/(\.[^\.]*)?$/.def/;
e0c49a6b 53
54my ($doC,$doEnc,$doUcm);
55
56if ($cname =~ /\.(c|xs)$/)
57 {
58 $doC = 1;
59 chmod(0666,$dname) if -f $cname && !-w $dname;
60 open(D,">$dname") || die "Cannot open $dname:$!";
61 my $hname = $cname;
62 $hname =~ s/(\.[^\.]*)?$/.h/;
63 chmod(0666,$hname) if -f $cname && !-w $hname;
64 open(H,">$hname") || die "Cannot open $hname:$!";
65
66 foreach my $fh (\*C,\*D,\*H)
67 {
68 print $fh <<"END";
14a8264b 69/*
70 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
71 This file was autogenerated by:
72 $^X $0 $cname @ARGV
73*/
74END
e0c49a6b 75 }
14a8264b 76
e0c49a6b 77 if ($cname =~ /(\w+)\.xs$/)
78 {
79 print C "#include <EXTERN.h>\n";
80 print C "#include <perl.h>\n";
81 print C "#include <XSUB.h>\n";
82 print C "#define U8 U8\n";
83 }
84 print C "#include \"encode.h\"\n";
85 }
86elsif ($cname =~ /\.enc$/)
87 {
88 $doEnc = 1;
89 }
90elsif ($cname =~ /\.ucm$/)
2f2b4ff2 91 {
e0c49a6b 92 $doUcm = 1;
2f2b4ff2 93 }
017e2add 94
95my %encoding;
96my %strings;
97
2f2b4ff2 98sub cmp_name
99{
100 if ($a =~ /^.*-(\d+)/)
101 {
102 my $an = $1;
103 if ($b =~ /^.*-(\d+)/)
104 {
105 my $r = $an <=> $1;
106 return $r if $r;
107 }
108 }
109 return $a cmp $b;
110}
111
112foreach my $enc (sort cmp_name @ARGV)
017e2add 113 {
9b37254d 114 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
017e2add 115 if (open(E,$enc))
116 {
9b37254d 117 if ($sfx eq 'enc')
118 {
119 compile_enc(\*E,lc($name),\*C);
120 }
121 else
122 {
123 compile_ucm(\*E,lc($name),\*C);
124 }
017e2add 125 }
126 else
127 {
128 warn "Cannot open $enc for $name:$!";
129 }
130 }
131
e0c49a6b 132if ($doC)
2f2b4ff2 133 {
e0c49a6b 134 foreach my $enc (sort cmp_name keys %encoding)
135 {
136 my $sym = "${enc}_encoding";
137 $sym =~ s/\W+/_/g;
138 print C "encode_t $sym = \n";
139 print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n\n";
140 }
2f2b4ff2 141
e0c49a6b 142 foreach my $enc (sort cmp_name keys %encoding)
143 {
144 my $sym = "${enc}_encoding";
145 $sym =~ s/\W+/_/g;
146 print H "extern encode_t $sym;\n";
147 print D " Encode_Define(aTHX_ &$sym);\n";
148 }
017e2add 149
e0c49a6b 150 if ($cname =~ /(\w+)\.xs$/)
151 {
152 my $mod = $1;
153 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
154 print C "BOOT:\n{\n";
155 print C "#include \"$dname\"\n";
156 print C "}\n";
157 }
158 close(D);
159 close(H);
2f2b4ff2 160 }
017e2add 161close(C);
162
9b37254d 163
164sub compile_ucm
165{
166 my ($fh,$name,$ch) = @_;
167 my $e2u = {};
168 my $u2e = {};
169 my $cs;
170 my %attr;
171 while (<$fh>)
172 {
173 s/#.*$//;
174 last if /^\s*CHARMAP\s*$/i;
175 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i)
176 {
177 $attr{$1} = $2;
178 }
179 }
180 if (!defined($cs = $attr{'code_set_name'}))
181 {
182 warn "No <code_set_name> in $name\n";
183 }
184 else
185 {
e0c49a6b 186 # $name = lc($cs);
9b37254d 187 }
188 my $erep;
189 my $urep;
190 if (exists $attr{'subchar'})
191 {
192 my @byte = $attr{'subchar'} =~ /^\s*(?:\\x([0-9a-f]+))+\s*$/;
193 $erep = join('',map(hex($_),@byte));
194 }
e0c49a6b 195 warn "Scanning $name ($cs)\n";
9b37254d 196 my $nfb = 0;
197 my $hfb = 0;
198 while (<$fh>)
199 {
200 s/#.*$//;
201 last if /^\s*END\s+CHARMAP\s*$/i;
202 next if /^\s*$/;
203 my ($u,@byte) = /^<U([0-9a-f]+)>\s+(?:\\x([0-9a-f]+))+\s*(\|[0-3]|)\s*$/i;
204 my $fb = pop(@byte);
205 if (defined($u))
206 {
207 my $uch = encode_U(hex($u));
e0c49a6b 208 my $ech = join('',map(chr(hex($_)),@byte));
9b37254d 209 if (length($fb))
210 {
211 $fb = substr($fb,1);
212 $hfb++;
213 }
214 else
215 {
216 $nfb++;
217 $fb = '0';
218 }
219 # $fb is fallback flag
220 # 0 - round trip safe
221 # 1 - fallback for unicode -> enc
222 # 2 - skip sub-char mapping
223 # 3 - fallback enc -> unicode
224 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
225 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
226 }
227 else
228 {
229 warn $_;
230 }
231
232 }
233 if ($nfb && $hfb)
234 {
235 die "$nfb entries without fallback, $hfb entries with\n";
236 }
e0c49a6b 237 if ($doC)
238 {
239 output($ch,$name.'_utf8',$e2u);
240 output($ch,'utf8_'.$name,$u2e);
241 $encoding{$name} = [$e2u->{Cname},$u2e->{Cname},
242 outstring($ch,$e2u->{Cname}.'_def',$erep),length($erep)];
243 }
244 elsif ($doEnc)
245 {
246 output_enc($ch,$name,$e2u);
247 }
248 elsif ($doUcm)
249 {
250 output_ucm($ch,$name,$u2e);
251 }
9b37254d 252}
253
14a8264b 254sub compile_enc
017e2add 255{
256 my ($fh,$name,$ch) = @_;
257 my $e2u = {};
258 my $u2e = {};
259
260 my $type;
261 while ($type = <$fh>)
262 {
263 last if $type !~ /^\s*#/;
264 }
265 chomp($type);
266 return if $type eq 'E';
267 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
14a8264b 268 warn "$type encoded $name\n";
017e2add 269 my $rep = '';
270 {
271 my $v = hex($def);
272 no strict 'refs';
273 $rep = &{"encode_$type"}($v & 0xFF, ($v >> 8) & 0xffe);
274 }
275 while ($pages--)
276 {
277 my $line = <$fh>;
278 chomp($line);
279 my $page = hex($line);
280 my $ch = 0;
281 for (my $i = 0; $i < 16; $i++)
282 {
283 my $line = <$fh>;
284 for (my $j = 0; $j < 16; $j++)
285 {
286 no strict 'refs';
287 my $ech = &{"encode_$type"}($ch,$page);
288 my $val = hex(substr($line,0,4,''));
289 if ($val || (!$ch && !$page))
290 {
291 my $uch = encode_U($val);
9b37254d 292 enter($e2u,$ech,$uch,$e2u,0);
293 enter($u2e,$uch,$ech,$u2e,0);
017e2add 294 }
295 else
296 {
297 # No character at this position
298 # enter($e2u,$ech,undef,$e2u);
299 }
300 $ch++;
301 }
302 }
303 }
e0c49a6b 304 if ($doC)
305 {
306 output($ch,$name.'_utf8',$e2u);
307 output($ch,'utf8_'.$name,$u2e);
308 $encoding{$name} = [$e2u->{Cname},$u2e->{Cname},
309 outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)];
310 }
311 elsif ($doEnc)
312 {
313 output_enc($ch,$name,$e2u);
314 }
315 elsif ($doUcm)
316 {
317 output_ucm($ch,$name,$u2e);
318 }
017e2add 319}
320
321sub enter
322{
9b37254d 323 my ($a,$s,$d,$t,$fb) = @_;
017e2add 324 $t = $a if @_ < 4;
325 my $b = substr($s,0,1);
326 my $e = $a->{$b};
327 unless ($e)
328 { # 0 1 2 3 4 5
9b37254d 329 $e = [$b,$b,'',{},length($s),0,$fb];
017e2add 330 $a->{$b} = $e;
331 }
332 if (length($s) > 1)
333 {
9b37254d 334 enter($e->[3],substr($s,1),$d,$t,$fb);
017e2add 335 }
336 else
337 {
338 $e->[2] = $d;
339 $e->[3] = $t;
340 $e->[5] = length($d);
341 }
342}
343
017e2add 344sub outstring
345{
346 my ($fh,$name,$s) = @_;
347 my $sym = $strings{$s};
348 unless ($sym)
349 {
2f2b4ff2 350 foreach my $o (keys %strings)
017e2add 351 {
2f2b4ff2 352 my $i = index($o,$s);
353 if ($i >= 0)
017e2add 354 {
2f2b4ff2 355 $sym = $strings{$o};
356 $sym .= sprintf("+0x%02x",$i) if ($i);
357 return $sym;
017e2add 358 }
359 }
360 $strings{$s} = $sym = $name;
14a8264b 361 printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s);
2f2b4ff2 362 # Do in chunks of 16 chars to constrain line length
363 # Assumes ANSI C adjacent string litteral concatenation
017e2add 364 while (length($s))
365 {
366 my $c = substr($s,0,16,'');
367 print $fh '"',join('',map(sprintf('\x%02x',ord($_)),split(//,$c))),'"';
368 print $fh "\n" if length($s);
369 }
14a8264b 370 printf $fh ";\n";
017e2add 371 }
372 return $sym;
373}
374
14a8264b 375sub process
017e2add 376{
14a8264b 377 my ($name,$a) = @_;
017e2add 378 $name =~ s/\W+/_/g;
379 $a->{Cname} = $name;
380 my @keys = grep(ref($a->{$_}),sort keys %$a);
017e2add 381 my $l;
382 my @ent;
383 foreach my $b (@keys)
384 {
385 my ($s,$f,$out,$t,$end) = @{$a->{$b}};
386 if (defined($l) &&
387 ord($b) == ord($a->{$l}[1])+1 &&
388 $a->{$l}[3] == $a->{$b}[3] &&
389 $a->{$l}[4] == $a->{$b}[4] &&
9b37254d 390 $a->{$l}[5] == $a->{$b}[5] &&
391 $a->{$l}[6] == $a->{$b}[6]
2f2b4ff2 392 # && length($a->{$l}[2]) < 16
393 )
017e2add 394 {
395 my $i = ord($b)-ord($a->{$l}[0]);
396 $a->{$l}[1] = $b;
397 $a->{$l}[2] .= $a->{$b}[2];
398 }
399 else
400 {
401 $l = $b;
402 push(@ent,$b);
403 }
14a8264b 404 if (exists $t->{Cname})
017e2add 405 {
14a8264b 406 $t->{'Forward'} = 1 if $t != $a;
407 }
408 else
409 {
410 process(sprintf("%s_%02x",$name,ord($s)),$t);
017e2add 411 }
412 }
413 if (ord($keys[-1]) < 255)
414 {
415 my $t = chr(ord($keys[-1])+1);
416 $a->{$t} = [$t,chr(255),undef,$a,0,0];
417 push(@ent,$t);
418 }
14a8264b 419 $a->{'Entries'} = \@ent;
420}
421
422sub outtable
423{
424 my ($fh,$a) = @_;
425 my $name = $a->{'Cname'};
017e2add 426 # String tables
14a8264b 427 foreach my $b (@{$a->{'Entries'}})
017e2add 428 {
429 next unless $a->{$b}[5];
430 my $s = ord($a->{$b}[0]);
431 my $e = ord($a->{$b}[1]);
432 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$a->{$b}[2]);
433 }
14a8264b 434 if ($a->{'Forward'})
435 {
436 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
437 }
438 $a->{'Done'} = 1;
439 foreach my $b (@{$a->{'Entries'}})
440 {
441 my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}};
442 outtable($fh,$t) unless $t->{'Done'};
443 }
444 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
445 foreach my $b (@{$a->{'Entries'}})
017e2add 446 {
9b37254d 447 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
017e2add 448 my $sc = ord($s);
449 my $ec = ord($e);
e0c49a6b 450 $end |= 0x80 if $fb;
017e2add 451 print $fh "{";
452 if ($l)
453 {
454 printf $fh outstring($fh,'',$out);
455 }
456 else
457 {
458 print $fh "0";
459 }
460 print $fh ",",$t->{Cname};
2f2b4ff2 461 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
017e2add 462 }
14a8264b 463 print $fh "};\n";
464}
465
466sub output
467{
468 my ($fh,$name,$a) = @_;
469 process($name,$a);
470 # Sub-tables
471 outtable($fh,$a);
017e2add 472}
473
e0c49a6b 474sub output_enc
475{
476 my ($fh,$name,$a) = @_;
477 foreach my $b (sort keys %$a)
478 {
479 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
480 }
481}
482
483sub decode_U
484{
485 my $s = shift;
486
487}
488
489
490sub output_ucm_page
491{
492 my ($fh,$a,$t,$pre) = @_;
493 # warn sprintf("Page %x\n",$pre);
494 foreach my $b (sort keys %$t)
495 {
496 my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}};
497 die "oops $s $e" unless $s eq $e;
498 my $u = ord($s);
499 if ($n != $a && $n != $t)
500 {
501 output_ucm_page($fh,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF);
502 }
503 elsif (length($out))
504 {
505 if ($pre)
506 {
507 $u = $pre|($u &0x3f);
508 }
509 printf $fh "<U%04X> ",$u;
510 foreach my $c (split(//,$out))
511 {
512 printf $fh "\\x%02X",ord($c);
513 }
514 printf $fh " |%d\n",($fb ? 1 : 0);
515 }
516 else
517 {
518 warn join(',',@{$t->{$b}},$a,$t);
519 }
520 }
521}
522
523sub output_ucm
524{
525 my ($fh,$name,$a) = @_;
526 print $fh "CHARMAP\n";
527 output_ucm_page($fh,$a,$a,0);
528 print $fh "END CHARMAP\n";
529}
017e2add 530