Warn and set errno when open(...,":encoding(xxxxx)",...) cannot find xxxxx.
[p5sagit/p5-mst-13.2.git] / ext / Encode / compile
CommitLineData
017e2add 1#!../../perl -w
2f2b4ff2 2BEGIN { @INC = '../../lib' };
017e2add 3use strict;
4
5sub encode_U
6{
9b37254d 7 # UTF-8 encode long hand - only covers part of perl's range
017e2add 8 my $uv = shift;
9 if ($uv < 0x80)
10 {
11 return chr($uv)
12 }
13 if ($uv < 0x800)
14 {
15 return chr(($uv >> 6) | 0xC0).
16 chr(($uv & 0x3F) | 0x80);
17 }
18 return chr(($uv >> 12) | 0xE0).
19 chr((($uv >> 6) & 0x3F) | 0x80).
20 chr(($uv & 0x3F) | 0x80);
21}
22
23sub encode_S
24{
14a8264b 25 # encode single byte
017e2add 26 my ($ch,$page) = @_;
27 return chr($ch);
28}
29
30sub encode_D
31{
14a8264b 32 # encode double byte MS byte first
017e2add 33 my ($ch,$page) = @_;
34 return chr($page).chr($ch);
35}
36
37sub encode_M
38{
14a8264b 39 # encode Multi-byte - single for 0..255 otherwise double
017e2add 40 my ($ch,$page) = @_;
41 return &encode_D if $page;
42 return &encode_S;
43}
44
14a8264b 45# Win32 does not expand globs on command line
252a8565 46eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
18b7339f 47
017e2add 48my $cname = shift(@ARGV);
2f2b4ff2 49chmod(0666,$cname) if -f $cname && !-w $cname;
017e2add 50open(C,">$cname") || die "Cannot open $cname:$!";
2f2b4ff2 51my $dname = $cname;
52$dname =~ s/(\.[^\.]*)?$/.def/;
e0c49a6b 53
54my ($doC,$doEnc,$doUcm);
55
56if ($cname =~ /\.(c|xs)$/)
57 {
58 $doC = 1;
59 chmod(0666,$dname) if -f $cname && !-w $dname;
60 open(D,">$dname") || die "Cannot open $dname:$!";
61 my $hname = $cname;
62 $hname =~ s/(\.[^\.]*)?$/.h/;
63 chmod(0666,$hname) if -f $cname && !-w $hname;
64 open(H,">$hname") || die "Cannot open $hname:$!";
65
66 foreach my $fh (\*C,\*D,\*H)
67 {
68 print $fh <<"END";
14a8264b 69/*
70 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
71 This file was autogenerated by:
72 $^X $0 $cname @ARGV
73*/
74END
e0c49a6b 75 }
14a8264b 76
e0c49a6b 77 if ($cname =~ /(\w+)\.xs$/)
78 {
79 print C "#include <EXTERN.h>\n";
80 print C "#include <perl.h>\n";
81 print C "#include <XSUB.h>\n";
82 print C "#define U8 U8\n";
83 }
84 print C "#include \"encode.h\"\n";
85 }
86elsif ($cname =~ /\.enc$/)
87 {
88 $doEnc = 1;
89 }
90elsif ($cname =~ /\.ucm$/)
2f2b4ff2 91 {
e0c49a6b 92 $doUcm = 1;
2f2b4ff2 93 }
017e2add 94
c6fdb90a 95# 2nd argument is file containing list of filenames
96my $flist = shift(@ARGV);
97open(FLIST,$flist) || die "Cannot open $flist:$!";
98chomp(my @encfiles = <FLIST>);
99close(FLIST);
100
017e2add 101my %encoding;
102my %strings;
103
2f2b4ff2 104sub cmp_name
105{
106 if ($a =~ /^.*-(\d+)/)
107 {
108 my $an = $1;
109 if ($b =~ /^.*-(\d+)/)
110 {
111 my $r = $an <=> $1;
112 return $r if $r;
113 }
114 }
115 return $a cmp $b;
116}
117
c6fdb90a 118foreach my $enc (sort cmp_name @encfiles)
017e2add 119 {
9b37254d 120 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
017e2add 121 if (open(E,$enc))
122 {
9b37254d 123 if ($sfx eq 'enc')
124 {
125 compile_enc(\*E,lc($name),\*C);
126 }
127 else
128 {
129 compile_ucm(\*E,lc($name),\*C);
130 }
017e2add 131 }
132 else
133 {
134 warn "Cannot open $enc for $name:$!";
135 }
136 }
137
e0c49a6b 138if ($doC)
2f2b4ff2 139 {
e0c49a6b 140 foreach my $enc (sort cmp_name keys %encoding)
141 {
142 my $sym = "${enc}_encoding";
143 $sym =~ s/\W+/_/g;
144 print C "encode_t $sym = \n";
145 print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n\n";
146 }
2f2b4ff2 147
e0c49a6b 148 foreach my $enc (sort cmp_name keys %encoding)
149 {
150 my $sym = "${enc}_encoding";
151 $sym =~ s/\W+/_/g;
152 print H "extern encode_t $sym;\n";
153 print D " Encode_Define(aTHX_ &$sym);\n";
154 }
017e2add 155
e0c49a6b 156 if ($cname =~ /(\w+)\.xs$/)
157 {
158 my $mod = $1;
159 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
160 print C "BOOT:\n{\n";
161 print C "#include \"$dname\"\n";
162 print C "}\n";
163 }
164 close(D);
165 close(H);
2f2b4ff2 166 }
017e2add 167close(C);
168
9b37254d 169
170sub compile_ucm
171{
172 my ($fh,$name,$ch) = @_;
173 my $e2u = {};
174 my $u2e = {};
175 my $cs;
176 my %attr;
177 while (<$fh>)
178 {
179 s/#.*$//;
180 last if /^\s*CHARMAP\s*$/i;
181 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i)
182 {
183 $attr{$1} = $2;
184 }
185 }
186 if (!defined($cs = $attr{'code_set_name'}))
187 {
188 warn "No <code_set_name> in $name\n";
189 }
190 else
191 {
e0c49a6b 192 # $name = lc($cs);
9b37254d 193 }
194 my $erep;
195 my $urep;
196 if (exists $attr{'subchar'})
197 {
198 my @byte = $attr{'subchar'} =~ /^\s*(?:\\x([0-9a-f]+))+\s*$/;
199 $erep = join('',map(hex($_),@byte));
200 }
e0c49a6b 201 warn "Scanning $name ($cs)\n";
9b37254d 202 my $nfb = 0;
203 my $hfb = 0;
204 while (<$fh>)
205 {
206 s/#.*$//;
207 last if /^\s*END\s+CHARMAP\s*$/i;
208 next if /^\s*$/;
209 my ($u,@byte) = /^<U([0-9a-f]+)>\s+(?:\\x([0-9a-f]+))+\s*(\|[0-3]|)\s*$/i;
210 my $fb = pop(@byte);
211 if (defined($u))
212 {
213 my $uch = encode_U(hex($u));
e0c49a6b 214 my $ech = join('',map(chr(hex($_)),@byte));
9b37254d 215 if (length($fb))
216 {
217 $fb = substr($fb,1);
218 $hfb++;
219 }
220 else
221 {
222 $nfb++;
223 $fb = '0';
224 }
225 # $fb is fallback flag
226 # 0 - round trip safe
227 # 1 - fallback for unicode -> enc
228 # 2 - skip sub-char mapping
229 # 3 - fallback enc -> unicode
230 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
231 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
232 }
233 else
234 {
235 warn $_;
236 }
237
238 }
239 if ($nfb && $hfb)
240 {
241 die "$nfb entries without fallback, $hfb entries with\n";
242 }
e0c49a6b 243 if ($doC)
244 {
245 output($ch,$name.'_utf8',$e2u);
246 output($ch,'utf8_'.$name,$u2e);
247 $encoding{$name} = [$e2u->{Cname},$u2e->{Cname},
248 outstring($ch,$e2u->{Cname}.'_def',$erep),length($erep)];
249 }
250 elsif ($doEnc)
251 {
252 output_enc($ch,$name,$e2u);
253 }
254 elsif ($doUcm)
255 {
256 output_ucm($ch,$name,$u2e);
257 }
9b37254d 258}
259
14a8264b 260sub compile_enc
017e2add 261{
262 my ($fh,$name,$ch) = @_;
263 my $e2u = {};
264 my $u2e = {};
265
266 my $type;
267 while ($type = <$fh>)
268 {
269 last if $type !~ /^\s*#/;
270 }
271 chomp($type);
272 return if $type eq 'E';
273 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
14a8264b 274 warn "$type encoded $name\n";
017e2add 275 my $rep = '';
276 {
277 my $v = hex($def);
278 no strict 'refs';
279 $rep = &{"encode_$type"}($v & 0xFF, ($v >> 8) & 0xffe);
280 }
281 while ($pages--)
282 {
283 my $line = <$fh>;
284 chomp($line);
285 my $page = hex($line);
286 my $ch = 0;
287 for (my $i = 0; $i < 16; $i++)
288 {
289 my $line = <$fh>;
290 for (my $j = 0; $j < 16; $j++)
291 {
292 no strict 'refs';
293 my $ech = &{"encode_$type"}($ch,$page);
294 my $val = hex(substr($line,0,4,''));
295 if ($val || (!$ch && !$page))
296 {
297 my $uch = encode_U($val);
9b37254d 298 enter($e2u,$ech,$uch,$e2u,0);
299 enter($u2e,$uch,$ech,$u2e,0);
017e2add 300 }
301 else
302 {
303 # No character at this position
304 # enter($e2u,$ech,undef,$e2u);
305 }
306 $ch++;
307 }
308 }
309 }
e0c49a6b 310 if ($doC)
311 {
312 output($ch,$name.'_utf8',$e2u);
313 output($ch,'utf8_'.$name,$u2e);
314 $encoding{$name} = [$e2u->{Cname},$u2e->{Cname},
315 outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)];
316 }
317 elsif ($doEnc)
318 {
319 output_enc($ch,$name,$e2u);
320 }
321 elsif ($doUcm)
322 {
323 output_ucm($ch,$name,$u2e);
324 }
017e2add 325}
326
327sub enter
328{
9b37254d 329 my ($a,$s,$d,$t,$fb) = @_;
017e2add 330 $t = $a if @_ < 4;
331 my $b = substr($s,0,1);
332 my $e = $a->{$b};
333 unless ($e)
334 { # 0 1 2 3 4 5
9b37254d 335 $e = [$b,$b,'',{},length($s),0,$fb];
017e2add 336 $a->{$b} = $e;
337 }
338 if (length($s) > 1)
339 {
9b37254d 340 enter($e->[3],substr($s,1),$d,$t,$fb);
017e2add 341 }
342 else
343 {
344 $e->[2] = $d;
345 $e->[3] = $t;
346 $e->[5] = length($d);
347 }
348}
349
017e2add 350sub outstring
351{
352 my ($fh,$name,$s) = @_;
353 my $sym = $strings{$s};
354 unless ($sym)
355 {
2f2b4ff2 356 foreach my $o (keys %strings)
017e2add 357 {
2f2b4ff2 358 my $i = index($o,$s);
359 if ($i >= 0)
017e2add 360 {
2f2b4ff2 361 $sym = $strings{$o};
362 $sym .= sprintf("+0x%02x",$i) if ($i);
363 return $sym;
017e2add 364 }
365 }
366 $strings{$s} = $sym = $name;
14a8264b 367 printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s);
2f2b4ff2 368 # Do in chunks of 16 chars to constrain line length
369 # Assumes ANSI C adjacent string litteral concatenation
017e2add 370 while (length($s))
371 {
372 my $c = substr($s,0,16,'');
373 print $fh '"',join('',map(sprintf('\x%02x',ord($_)),split(//,$c))),'"';
374 print $fh "\n" if length($s);
375 }
14a8264b 376 printf $fh ";\n";
017e2add 377 }
378 return $sym;
379}
380
14a8264b 381sub process
017e2add 382{
14a8264b 383 my ($name,$a) = @_;
017e2add 384 $name =~ s/\W+/_/g;
385 $a->{Cname} = $name;
386 my @keys = grep(ref($a->{$_}),sort keys %$a);
017e2add 387 my $l;
388 my @ent;
389 foreach my $b (@keys)
390 {
391 my ($s,$f,$out,$t,$end) = @{$a->{$b}};
392 if (defined($l) &&
393 ord($b) == ord($a->{$l}[1])+1 &&
394 $a->{$l}[3] == $a->{$b}[3] &&
395 $a->{$l}[4] == $a->{$b}[4] &&
9b37254d 396 $a->{$l}[5] == $a->{$b}[5] &&
397 $a->{$l}[6] == $a->{$b}[6]
2f2b4ff2 398 # && length($a->{$l}[2]) < 16
399 )
017e2add 400 {
401 my $i = ord($b)-ord($a->{$l}[0]);
402 $a->{$l}[1] = $b;
403 $a->{$l}[2] .= $a->{$b}[2];
404 }
405 else
406 {
407 $l = $b;
408 push(@ent,$b);
409 }
14a8264b 410 if (exists $t->{Cname})
017e2add 411 {
14a8264b 412 $t->{'Forward'} = 1 if $t != $a;
413 }
414 else
415 {
416 process(sprintf("%s_%02x",$name,ord($s)),$t);
017e2add 417 }
418 }
419 if (ord($keys[-1]) < 255)
420 {
421 my $t = chr(ord($keys[-1])+1);
422 $a->{$t} = [$t,chr(255),undef,$a,0,0];
423 push(@ent,$t);
424 }
14a8264b 425 $a->{'Entries'} = \@ent;
426}
427
428sub outtable
429{
430 my ($fh,$a) = @_;
431 my $name = $a->{'Cname'};
017e2add 432 # String tables
14a8264b 433 foreach my $b (@{$a->{'Entries'}})
017e2add 434 {
435 next unless $a->{$b}[5];
436 my $s = ord($a->{$b}[0]);
437 my $e = ord($a->{$b}[1]);
438 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$a->{$b}[2]);
439 }
14a8264b 440 if ($a->{'Forward'})
441 {
442 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
443 }
444 $a->{'Done'} = 1;
445 foreach my $b (@{$a->{'Entries'}})
446 {
447 my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}};
448 outtable($fh,$t) unless $t->{'Done'};
449 }
450 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
451 foreach my $b (@{$a->{'Entries'}})
017e2add 452 {
9b37254d 453 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
017e2add 454 my $sc = ord($s);
455 my $ec = ord($e);
e0c49a6b 456 $end |= 0x80 if $fb;
017e2add 457 print $fh "{";
458 if ($l)
459 {
460 printf $fh outstring($fh,'',$out);
461 }
462 else
463 {
464 print $fh "0";
465 }
466 print $fh ",",$t->{Cname};
2f2b4ff2 467 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
017e2add 468 }
14a8264b 469 print $fh "};\n";
470}
471
472sub output
473{
474 my ($fh,$name,$a) = @_;
475 process($name,$a);
476 # Sub-tables
477 outtable($fh,$a);
017e2add 478}
479
e0c49a6b 480sub output_enc
481{
482 my ($fh,$name,$a) = @_;
483 foreach my $b (sort keys %$a)
484 {
485 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
486 }
487}
488
489sub decode_U
490{
491 my $s = shift;
492
493}
494
495
496sub output_ucm_page
497{
498 my ($fh,$a,$t,$pre) = @_;
499 # warn sprintf("Page %x\n",$pre);
500 foreach my $b (sort keys %$t)
501 {
502 my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}};
503 die "oops $s $e" unless $s eq $e;
504 my $u = ord($s);
505 if ($n != $a && $n != $t)
506 {
507 output_ucm_page($fh,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF);
508 }
509 elsif (length($out))
510 {
511 if ($pre)
512 {
513 $u = $pre|($u &0x3f);
514 }
515 printf $fh "<U%04X> ",$u;
516 foreach my $c (split(//,$out))
517 {
518 printf $fh "\\x%02X",ord($c);
519 }
520 printf $fh " |%d\n",($fb ? 1 : 0);
521 }
522 else
523 {
524 warn join(',',@{$t->{$b}},$a,$t);
525 }
526 }
527}
528
529sub output_ucm
530{
531 my ($fh,$name,$a) = @_;
532 print $fh "CHARMAP\n";
533 output_ucm_page($fh,$a,$a,0);
534 print $fh "END CHARMAP\n";
535}
017e2add 536