NetWare change from Ananth Kesari.
[p5sagit/p5-mst-13.2.git] / ext / Encode / compile
CommitLineData
017e2add 1#!../../perl -w
508a30f1 2BEGIN {
b0bf7e4c 3 unshift @INC, qw(../../lib ../../../lib);
4 $ENV{PATH} .= ';../..;../../..' if $^O eq 'MSWin32';
508a30f1 5}
017e2add 6use strict;
afdae191 7use Getopt::Std;
8my @orig_ARGV = @ARGV;
dcda1f94 9
eb560316 10# These may get re-ordered.
11# RAW is a do_now as inserted by &enter
12# AGG is an aggreagated do_now, as built up by &process
13use constant {
14 RAW_NEXT => 0,
15 RAW_IN_LEN => 1,
16 RAW_OUT_BYTES => 2,
17 RAW_FALLBACK => 3,
18
19 AGG_MIN_IN => 0,
20 AGG_MAX_IN => 1,
21 AGG_OUT_BYTES => 2,
22 AGG_NEXT => 3,
23 AGG_IN_LEN => 4,
24 AGG_OUT_LEN => 5,
25 AGG_FALLBACK => 6,
26};
27# (See the algorithm in encengine.c - we're building structures for it)
28
29# There are two sorts of structures.
30# "do_now" (an array, two variants of what needs storing) is whatever we need
31# to do now we've read an input byte.
32# It's housed in a "do_next" (which is how we got to it), and in turn points
33# to a "do_next" which contains all the "do_now"s for the next input byte.
34
35# There will be a "do_next" which is the start state.
36# For a single byte encoding it's the only "do_next" - each "do_now" points
37# back to it, and each "do_now" will cause bytes. There is no state.
38
39# For a multi-byte encoding where all characters in the input are the same
40# length, then there will be a tree of "do_now"->"do_next"->"do_now"
41# branching out from the start state, one step for each input byte.
42# The leaf "do_now"s will all be at the same distance from the start state,
43# only the leaf "do_now"s cause output bytes, and they in turn point back to
44# the start state.
45
46# For an encoding where there are varaible length input byte sequences, you
47# will encounter a leaf "do_now" sooner for the shorter input sequences, but
48# as before the leaves will point back to the start state.
49
50# The system will cope with escape encodings (imagine them as a mostly
51# self-contained tree for each escape state, and cross links between trees
52# at the state-switching characters) but so far no input format defines these.
53
54# The system will also cope with having output "leaves" in the middle of
55# the bifurcating branches, not just at the extremities, but again no
56# input format does this yet.
57
58# There are two variants of the "do_now" structure. The first, smaller variant
59# is generated by &enter as the input file is read. There is one structure
60# for each input byte. Say we are mapping a single byte encoding to a
61# single byte encoding, with "ABCD" going "abcd". There will be
62# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
63
64# &process then walks the tree, building aggregate "do_now" structres for
65# adjacent bytes where possible. The aggregate is for a contiguous range of
66# bytes which each produce the same length of output, each move to the
67# same next state, and each have the same fallback flag.
68# So our 4 RAW "do_now"s above become replaced by a single structure
69# containing:
70# ["A", "D", "abcd", 1, ...]
71# ie, for an input byte $_ in "A".."D", output 1 byte, found as
72# substr ("abcd", (ord $_ - ord "A") * 1, 1)
73# which maps very nicely into pointer arithmetic in C for encengine.c
74
017e2add 75sub encode_U
76{
9b37254d 77 # UTF-8 encode long hand - only covers part of perl's range
7ba7f87b 78 ## my $uv = shift;
1b026014 79 # chr() works in native space so convert value from table
80 # into that space before using chr().
7ba7f87b 81 my $ch = chr(utf8::unicode_to_native($_[0]));
1b026014 82 # Now get core perl to encode that the way it likes.
83 utf8::encode($ch);
84 return $ch;
017e2add 85}
86
87sub encode_S
88{
14a8264b 89 # encode single byte
7ba7f87b 90 ## my ($ch,$page) = @_; return chr($ch);
91 return chr $_[0];
017e2add 92}
93
94sub encode_D
95{
14a8264b 96 # encode double byte MS byte first
7ba7f87b 97 ## my ($ch,$page) = @_; return chr($page).chr($ch);
98 return chr ($_[1]) . chr $_[0];
017e2add 99}
100
101sub encode_M
102{
14a8264b 103 # encode Multi-byte - single for 0..255 otherwise double
7ba7f87b 104 ## my ($ch,$page) = @_;
105 ## return &encode_D if $page;
106 ## return &encode_S;
107 return chr ($_[1]) . chr $_[0] if $_[1];
108 return chr $_[0];
017e2add 109}
110
7ba7f87b 111my %encode_types = (U => \&encode_U,
112 S => \&encode_S,
113 D => \&encode_D,
114 M => \&encode_M,
115 );
116
14a8264b 117# Win32 does not expand globs on command line
252a8565 118eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
18b7339f 119
afdae191 120my %opt;
4cfc977c 121getopts('qOo:f:n:',\%opt);
afdae191 122my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
2f2b4ff2 123chmod(0666,$cname) if -f $cname && !-w $cname;
017e2add 124open(C,">$cname") || die "Cannot open $cname:$!";
afdae191 125
126
2f2b4ff2 127my $dname = $cname;
023d8852 128$dname =~ s/(\.[^\.]*)?$/_def.h/;
e0c49a6b 129
dcda1f94 130my ($doC,$doEnc,$doUcm,$doPet);
e0c49a6b 131
132if ($cname =~ /\.(c|xs)$/)
133 {
134 $doC = 1;
135 chmod(0666,$dname) if -f $cname && !-w $dname;
136 open(D,">$dname") || die "Cannot open $dname:$!";
137 my $hname = $cname;
138 $hname =~ s/(\.[^\.]*)?$/.h/;
139 chmod(0666,$hname) if -f $cname && !-w $hname;
140 open(H,">$hname") || die "Cannot open $hname:$!";
141
142 foreach my $fh (\*C,\*D,\*H)
143 {
afdae191 144 print $fh <<"END" unless $opt{'q'};
14a8264b 145/*
146 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
147 This file was autogenerated by:
023d8852 148 $^X $0 @orig_ARGV
14a8264b 149*/
150END
e0c49a6b 151 }
14a8264b 152
e0c49a6b 153 if ($cname =~ /(\w+)\.xs$/)
154 {
155 print C "#include <EXTERN.h>\n";
156 print C "#include <perl.h>\n";
157 print C "#include <XSUB.h>\n";
158 print C "#define U8 U8\n";
159 }
160 print C "#include \"encode.h\"\n";
d811239c 161
e0c49a6b 162 }
163elsif ($cname =~ /\.enc$/)
164 {
165 $doEnc = 1;
166 }
167elsif ($cname =~ /\.ucm$/)
2f2b4ff2 168 {
e0c49a6b 169 $doUcm = 1;
2f2b4ff2 170 }
dcda1f94 171elsif ($cname =~ /\.pet$/)
172 {
173 $doPet = 1;
174 }
017e2add 175
afdae191 176my @encfiles;
3865e8e1 177if (exists $opt{'f'})
afdae191 178 {
179 # -F is followed by name of file containing list of filenames
3865e8e1 180 my $flist = $opt{'f'};
afdae191 181 open(FLIST,$flist) || die "Cannot open $flist:$!";
182 chomp(@encfiles = <FLIST>);
183 close(FLIST);
184 }
185else
186 {
187 @encfiles = @ARGV;
188 }
c6fdb90a 189
017e2add 190my %encoding;
191my %strings;
b1e7e56f 192my $saved = 0;
193my $subsave = 0;
194my $strings = 0;
017e2add 195
2f2b4ff2 196sub cmp_name
197{
198 if ($a =~ /^.*-(\d+)/)
199 {
200 my $an = $1;
201 if ($b =~ /^.*-(\d+)/)
202 {
203 my $r = $an <=> $1;
204 return $r if $r;
205 }
206 }
207 return $a cmp $b;
208}
209
dcda1f94 210
c6fdb90a 211foreach my $enc (sort cmp_name @encfiles)
017e2add 212 {
9b37254d 213 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
51ef4e11 214 $name = $opt{'n'} if exists $opt{'n'};
017e2add 215 if (open(E,$enc))
216 {
9b37254d 217 if ($sfx eq 'enc')
218 {
dcda1f94 219 compile_enc(\*E,lc($name));
9b37254d 220 }
221 else
222 {
dcda1f94 223 compile_ucm(\*E,lc($name));
9b37254d 224 }
017e2add 225 }
226 else
227 {
228 warn "Cannot open $enc for $name:$!";
229 }
230 }
231
e0c49a6b 232if ($doC)
2f2b4ff2 233 {
b1e7e56f 234 print STDERR "Writing compiled form\n";
dcda1f94 235 foreach my $name (sort cmp_name keys %encoding)
236 {
237 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
238 output(\*C,$name.'_utf8',$e2u);
239 output(\*C,'utf8_'.$name,$u2e);
240 push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
241 }
e0c49a6b 242 foreach my $enc (sort cmp_name keys %encoding)
243 {
dcda1f94 244 my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
245 my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
e0c49a6b 246 my $sym = "${enc}_encoding";
247 $sym =~ s/\W+/_/g;
248 print C "encode_t $sym = \n";
dcda1f94 249 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
e0c49a6b 250 }
2f2b4ff2 251
e0c49a6b 252 foreach my $enc (sort cmp_name keys %encoding)
253 {
254 my $sym = "${enc}_encoding";
255 $sym =~ s/\W+/_/g;
256 print H "extern encode_t $sym;\n";
d811239c 257 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
e0c49a6b 258 }
017e2add 259
e0c49a6b 260 if ($cname =~ /(\w+)\.xs$/)
261 {
262 my $mod = $1;
d811239c 263 print C <<'END';
264
023d8852 265static void
d811239c 266Encode_XSEncoding(pTHX_ encode_t *enc)
267{
268 dSP;
269 HV *stash = gv_stashpv("Encode::XS", TRUE);
270 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
271 int i = 0;
272 PUSHMARK(sp);
273 XPUSHs(sv);
274 while (enc->name[i])
275 {
276 const char *name = enc->name[i++];
277 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
278 }
279 PUTBACK;
280 call_pv("Encode::define_encoding",G_DISCARD);
281 SvREFCNT_dec(sv);
282}
283
284END
285
e0c49a6b 286 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
287 print C "BOOT:\n{\n";
288 print C "#include \"$dname\"\n";
289 print C "}\n";
290 }
291 close(D);
292 close(H);
b1e7e56f 293 printf STDERR "%d bytes in string tables\n",$strings;
294 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",$saved,100*$saved/$strings if $saved;
295 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",$subsave,100*$subsave/$strings if $subsave;
2f2b4ff2 296 }
dcda1f94 297elsif ($doEnc)
298 {
299 foreach my $name (sort cmp_name keys %encoding)
300 {
301 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
302 output_enc(\*C,$name,$e2u);
303 }
304 }
305elsif ($doUcm)
306 {
307 foreach my $name (sort cmp_name keys %encoding)
308 {
309 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
310 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
311 }
312 }
313
017e2add 314close(C);
315
9b37254d 316
b1e7e56f 317
9b37254d 318sub compile_ucm
319{
dcda1f94 320 my ($fh,$name) = @_;
9b37254d 321 my $e2u = {};
322 my $u2e = {};
323 my $cs;
324 my %attr;
325 while (<$fh>)
326 {
327 s/#.*$//;
328 last if /^\s*CHARMAP\s*$/i;
7ba7f87b 329 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
9b37254d 330 {
331 $attr{$1} = $2;
332 }
333 }
334 if (!defined($cs = $attr{'code_set_name'}))
335 {
336 warn "No <code_set_name> in $name\n";
337 }
338 else
339 {
51ef4e11 340 $name = $cs unless exists $opt{'n'};
9b37254d 341 }
342 my $erep;
343 my $urep;
afdae191 344 my $max_el;
345 my $min_el;
9b37254d 346 if (exists $attr{'subchar'})
347 {
afdae191 348 my @byte;
349 $attr{'subchar'} =~ /^\s*/cg;
350 push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
351 $erep = join('',map(chr(hex($_)),@byte));
9b37254d 352 }
b1e7e56f 353 print "Reading $name ($cs)\n";
9b37254d 354 my $nfb = 0;
355 my $hfb = 0;
356 while (<$fh>)
357 {
358 s/#.*$//;
359 last if /^\s*END\s+CHARMAP\s*$/i;
360 next if /^\s*$/;
afdae191 361 my ($u,@byte);
362 my $fb = '';
363 $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
364 push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
365 $fb = $1 if /\G\s*(\|[0-3])/gc;
366 # warn "$_: $u @byte | $fb\n";
367 die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
9b37254d 368 if (defined($u))
369 {
370 my $uch = encode_U(hex($u));
e0c49a6b 371 my $ech = join('',map(chr(hex($_)),@byte));
afdae191 372 my $el = length($ech);
373 $max_el = $el if (!defined($max_el) || $el > $max_el);
374 $min_el = $el if (!defined($min_el) || $el < $min_el);
9b37254d 375 if (length($fb))
376 {
377 $fb = substr($fb,1);
378 $hfb++;
379 }
380 else
381 {
382 $nfb++;
383 $fb = '0';
384 }
385 # $fb is fallback flag
386 # 0 - round trip safe
387 # 1 - fallback for unicode -> enc
388 # 2 - skip sub-char mapping
389 # 3 - fallback enc -> unicode
390 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
391 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
392 }
393 else
394 {
395 warn $_;
396 }
9b37254d 397 }
398 if ($nfb && $hfb)
399 {
400 die "$nfb entries without fallback, $hfb entries with\n";
401 }
dcda1f94 402 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
9b37254d 403}
404
14a8264b 405sub compile_enc
017e2add 406{
dcda1f94 407 my ($fh,$name) = @_;
017e2add 408 my $e2u = {};
409 my $u2e = {};
410
411 my $type;
412 while ($type = <$fh>)
413 {
414 last if $type !~ /^\s*#/;
415 }
416 chomp($type);
417 return if $type eq 'E';
445e6c97 418 # Do the hash lookup once, rather than once per function call. 4% speedup.
419 my $type_func = $encode_types{$type};
017e2add 420 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
14a8264b 421 warn "$type encoded $name\n";
017e2add 422 my $rep = '';
7ba7f87b 423 # Save a defined test by setting these to defined values.
424 my $min_el = ~0; # A very big integer
425 my $max_el = 0; # Anything must be longer than 0
017e2add 426 {
427 my $v = hex($def);
445e6c97 428 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
017e2add 429 }
e03ac092 430 my %seen;
445e6c97 431 do
017e2add 432 {
433 my $line = <$fh>;
434 chomp($line);
435 my $page = hex($line);
436 my $ch = 0;
445e6c97 437 my $i = 16;
438 do
017e2add 439 {
445e6c97 440 # So why is it 1% faster to leave the my here?
017e2add 441 my $line = <$fh>;
7ba7f87b 442 die "Line should be exactly 65 characters long including newline"
443 unless length ($line) == 65;
444 # Split line into groups of 4 hex digits, convert groups to ints
445e6c97 445 # This takes 65.35
446 # map {hex $_} $line =~ /(....)/g
447 # This takes 63.75 (2.5% less time)
448 # unpack "n*", pack "H*", $line
449 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
450 # Doing it as while ($line =~ /(....)/g) took 74.63
451 foreach my $val (unpack "n*", pack "H*", $line)
017e2add 452 {
e03ac092 453 next if $val == 0xFFFD;
445e6c97 454 my $ech = &$type_func($ch,$page);
017e2add 455 if ($val || (!$ch && !$page))
456 {
afdae191 457 my $el = length($ech);
7ba7f87b 458 $max_el = $el if $el > $max_el;
459 $min_el = $el if $el < $min_el;
017e2add 460 my $uch = encode_U($val);
445e6c97 461 # We don't need to read this quickly, so storing it as a scalar,
462 # rather than 3 (anon array, plus the 2 scalars it holds) saves
463 # RAM and may make us faster on low RAM systems. [see __END__]
e03ac092 464 if (exists $seen{$uch})
465 {
445e6c97 466 warn sprintf("U%04X is %02X%02X and %04X\n",
467 $val,$page,$ch,$seen{$uch});
e03ac092 468 }
469 else
470 {
445e6c97 471 $seen{$uch} = $page << 16 | $ch;
e03ac092 472 }
eb560316 473 # Passing 2 extra args each time is 3.6% slower!
445e6c97 474 # Even with having to add $fallback ||= 0 later
475 enter_fb0($e2u,$ech,$uch);
476 enter_fb0($u2e,$uch,$ech);
017e2add 477 }
478 else
479 {
480 # No character at this position
481 # enter($e2u,$ech,undef,$e2u);
482 }
483 $ch++;
484 }
445e6c97 485 } while --$i;
486 } while --$pages;
7ba7f87b 487 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
488 if $min_el > $max_el;
dcda1f94 489 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
017e2add 490}
491
eb560316 492# my ($a,$s,$d,$t,$fb) = @_;
493sub enter {
494 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
495 # state we shift to after this (multibyte) input character defaults to same
496 # as current state.
497 $next ||= $current;
498 # Making sure it is defined seems to be faster than {no warnings;} in
499 # &process, or passing it in as 0 explicity.
445e6c97 500 # XXX $fallback ||= 0;
eb560316 501
502 # Start at the beginning and work forwards through the string to zero.
503 # effectively we are removing 1 character from the front each time
504 # but we don't actually edit the string. [this alone seems to be 14% speedup]
505 # Hence -$pos is the length of the remaining string.
506 my $pos = -length $inbytes;
507 while (1) {
508 my $byte = substr $inbytes, $pos, 1;
509 # RAW_NEXT => 0,
510 # RAW_IN_LEN => 1,
511 # RAW_OUT_BYTES => 2,
512 # RAW_FALLBACK => 3,
513 # to unicode an array would seem to be better, because the pages are dense.
514 # from unicode can be very sparse, favouring a hash.
515 # hash using the bytes (all length 1) as keys rather than ord value,
516 # as it's easier to sort these in &process.
517
518 # It's faster to always add $fallback even if it's undef, rather than
519 # choosing between 3 and 4 element array. (hence why we set it defined
520 # above)
521 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
522 # When $pos was -1 we were at the last input character.
523 unless (++$pos) {
524 $do_now->[RAW_OUT_BYTES] = $outbytes;
525 $do_now->[RAW_NEXT] = $next;
526 return;
527 }
528 # Tail recursion. The intermdiate state may not have a name yet.
529 $current = $do_now->[RAW_NEXT];
017e2add 530 }
531}
532
445e6c97 533# This is purely for optimistation. It's just &enter hard coded for $fallback
534# of 0, using only a 3 entry array ref to save memory for every entry.
535sub enter_fb0 {
536 my ($current,$inbytes,$outbytes,$next) = @_;
537 $next ||= $current;
538
539 my $pos = -length $inbytes;
540 while (1) {
541 my $byte = substr $inbytes, $pos, 1;
542 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
543 unless (++$pos) {
544 $do_now->[RAW_OUT_BYTES] = $outbytes;
545 $do_now->[RAW_NEXT] = $next;
546 return;
547 }
548 $current = $do_now->[RAW_NEXT];
549 }
550}
b1e7e56f 551
552
017e2add 553sub outstring
554{
555 my ($fh,$name,$s) = @_;
556 my $sym = $strings{$s};
b1e7e56f 557 if ($sym)
558 {
559 $saved += length($s);
560 }
561 else
017e2add 562 {
4cfc977c 563 if ($opt{'O'}) {
564 foreach my $o (keys %strings)
565 {
445e6c97 566 next unless (my $i = index($o,$s)) >= 0;
567 $sym = $strings{$o};
568 $sym .= sprintf("+0x%02x",$i) if ($i);
569 $subsave += length($s);
570 return $strings{$s} = $sym;
571 }
4cfc977c 572 }
017e2add 573 $strings{$s} = $sym = $name;
b1e7e56f 574 $strings += length($s);
de745a2e 575 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
576 # Maybe we should assert that these are all <256.
577 $definition .= join(',',unpack "C*",$s);
578 # We have a single long line. Split it at convenient commas.
579 $definition =~ s/(.{74,77},)/$1\n/g;
580 print $fh "$definition };\n\n";
017e2add 581 }
582 return $sym;
583}
584
14a8264b 585sub process
017e2add 586{
eb560316 587 my ($name,$a) = @_;
588 $name =~ s/\W+/_/g;
589 $a->{Cname} = $name;
445e6c97 590 my $raw = $a->{Raw};
eb560316 591 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
592 my @ent;
445e6c97 593 $agg_max_in = 0;
594 foreach my $key (sort keys %$raw) {
eb560316 595 # RAW_NEXT => 0,
596 # RAW_IN_LEN => 1,
597 # RAW_OUT_BYTES => 2,
598 # RAW_FALLBACK => 3,
445e6c97 599 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
eb560316 600 # Now we are converting from raw to aggregate, switch from 1 byte strings
601 # to numbers
602 my $b = ord $key;
445e6c97 603 $fallback ||= 0;
eb560316 604 if ($l &&
605 # If this == fails, we're going to reset $agg_max_in below anyway.
606 $b == ++$agg_max_in &&
607 # References in numeric context give the pointer as an int.
608 $agg_next == $next &&
609 $agg_in_len == $in_len &&
610 $agg_out_len == length $out_bytes &&
611 $agg_fallback == $fallback
612 # && length($l->[AGG_OUT_BYTES]) < 16
613 ) {
614 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
615 # we can aggregate this byte onto the end.
616 $l->[AGG_MAX_IN] = $b;
617 $l->[AGG_OUT_BYTES] .= $out_bytes;
618 } else {
619 # AGG_MIN_IN => 0,
620 # AGG_MAX_IN => 1,
621 # AGG_OUT_BYTES => 2,
622 # AGG_NEXT => 3,
623 # AGG_IN_LEN => 4,
624 # AGG_OUT_LEN => 5,
625 # AGG_FALLBACK => 6,
626 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
627 # (only gains .6% on euc-jp -- is it worth it?)
628 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
629 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
630 $agg_fallback = $fallback];
017e2add 631 }
eb560316 632 if (exists $next->{Cname}) {
633 $next->{'Forward'} = 1 if $next != $a;
634 } else {
635 process(sprintf("%s_%02x",$name,$b),$next);
017e2add 636 }
637 }
eb560316 638 # encengine.c rules say that last entry must be for 255
445e6c97 639 if ($agg_max_in < 255) {
640 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
017e2add 641 }
eb560316 642 $a->{'Entries'} = \@ent;
14a8264b 643}
644
645sub outtable
646{
647 my ($fh,$a) = @_;
648 my $name = $a->{'Cname'};
017e2add 649 # String tables
14a8264b 650 foreach my $b (@{$a->{'Entries'}})
017e2add 651 {
eb560316 652 next unless $b->[AGG_OUT_LEN];
653 my $s = $b->[AGG_MIN_IN];
654 my $e = $b->[AGG_MAX_IN];
655 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
017e2add 656 }
14a8264b 657 if ($a->{'Forward'})
658 {
659 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
660 }
661 $a->{'Done'} = 1;
662 foreach my $b (@{$a->{'Entries'}})
663 {
eb560316 664 my ($s,$e,$out,$t,$end,$l) = @$b;
14a8264b 665 outtable($fh,$t) unless $t->{'Done'};
666 }
667 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
668 foreach my $b (@{$a->{'Entries'}})
017e2add 669 {
eb560316 670 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
e0c49a6b 671 $end |= 0x80 if $fb;
017e2add 672 print $fh "{";
673 if ($l)
674 {
675 printf $fh outstring($fh,'',$out);
676 }
677 else
678 {
679 print $fh "0";
680 }
681 print $fh ",",$t->{Cname};
2f2b4ff2 682 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
017e2add 683 }
14a8264b 684 print $fh "};\n";
685}
686
687sub output
688{
689 my ($fh,$name,$a) = @_;
690 process($name,$a);
691 # Sub-tables
692 outtable($fh,$a);
017e2add 693}
694
e0c49a6b 695sub output_enc
696{
697 my ($fh,$name,$a) = @_;
eb560316 698 die "Changed - fix me for new structure";
e0c49a6b 699 foreach my $b (sort keys %$a)
700 {
701 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
702 }
703}
704
705sub decode_U
706{
707 my $s = shift;
e0c49a6b 708}
709
f39fb8cc 710my @uname;
711sub char_names
712{
9fb51cbc 713 my $s = do "unicore/Name.pl";
714 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
f39fb8cc 715 pos($s) = 0;
716 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
717 {
718 my $name = $3;
719 my $s = hex($1);
720 last if $s >= 0x10000;
721 my $e = length($2) ? hex($2) : $s;
722 for (my $i = $s; $i <= $e; $i++)
723 {
724 $uname[$i] = $name;
725# print sprintf("U%04X $name\n",$i);
726 }
727 }
728}
729
e0c49a6b 730sub output_ucm_page
731{
f39fb8cc 732 my ($cmap,$a,$t,$pre) = @_;
e0c49a6b 733 # warn sprintf("Page %x\n",$pre);
734 foreach my $b (sort keys %$t)
735 {
eb560316 736 die "Changed - fix me for new structure";
e0c49a6b 737 my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}};
738 die "oops $s $e" unless $s eq $e;
739 my $u = ord($s);
740 if ($n != $a && $n != $t)
741 {
f39fb8cc 742 output_ucm_page($cmap,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF);
e0c49a6b 743 }
744 elsif (length($out))
745 {
746 if ($pre)
747 {
748 $u = $pre|($u &0x3f);
749 }
f39fb8cc 750 my $s = sprintf "<U%04X> ",$u;
e0c49a6b 751 foreach my $c (split(//,$out))
752 {
f39fb8cc 753 $s .= sprintf "\\x%02X",ord($c);
e0c49a6b 754 }
f39fb8cc 755 $s .= sprintf " |%d # %s\n",($fb ? 1 : 0),$uname[$u];
756 push(@$cmap,$s);
e0c49a6b 757 }
758 else
759 {
760 warn join(',',@{$t->{$b}},$a,$t);
761 }
762 }
763}
764
765sub output_ucm
766{
f39fb8cc 767 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
df1df145 768 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
afdae191 769 print $fh "<code_set_name> \"$name\"\n";
f39fb8cc 770 char_names();
afdae191 771 if (defined $min_el)
772 {
773 print $fh "<mb_cur_min> $min_el\n";
774 }
775 if (defined $max_el)
776 {
777 print $fh "<mb_cur_max> $max_el\n";
778 }
779 if (defined $rep)
780 {
781 print $fh "<subchar> ";
782 foreach my $c (split(//,$rep))
783 {
784 printf $fh "\\x%02X",ord($c);
785 }
786 print $fh "\n";
787 }
f39fb8cc 788 my @cmap;
789 output_ucm_page(\@cmap,$h,$h,0);
afdae191 790 print $fh "#\nCHARMAP\n";
f39fb8cc 791 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
792 {
793 print $fh $line;
794 }
e0c49a6b 795 print $fh "END CHARMAP\n";
796}
017e2add 797
445e6c97 798
799__END__
800With %seen holding array refs:
801
802 865.66 real 28.80 user 8.79 sys
803 7904 maximum resident set size
804 1356 average shared memory size
805 18566 average unshared data size
806 229 average unshared stack size
807 46080 page reclaims
808 33373 page faults
809
810With %seen holding simple scalars:
811
812 342.16 real 27.11 user 3.54 sys
813 8388 maximum resident set size
814 1394 average shared memory size
815 14969 average unshared data size
816 236 average unshared stack size
817 28159 page reclaims
818 9839 page faults
819
820Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
821how %seen is storing things its seen. So it is pathalogically bad on a 16M
822RAM machine, but it's going to help even on modern machines.
823Swapping is bad, m'kay :-)