EBCDIC: document using Encode for character conversions.
[p5sagit/p5-mst-13.2.git] / ext / Encode / compile
CommitLineData
017e2add 1#!../../perl -w
508a30f1 2BEGIN {
b0bf7e4c 3 unshift @INC, qw(../../lib ../../../lib);
4 $ENV{PATH} .= ';../..;../../..' if $^O eq 'MSWin32';
508a30f1 5}
017e2add 6use strict;
afdae191 7use Getopt::Std;
8my @orig_ARGV = @ARGV;
dcda1f94 9
eb560316 10# These may get re-ordered.
11# RAW is a do_now as inserted by &enter
12# AGG is an aggreagated do_now, as built up by &process
13use constant {
14 RAW_NEXT => 0,
15 RAW_IN_LEN => 1,
16 RAW_OUT_BYTES => 2,
17 RAW_FALLBACK => 3,
18
19 AGG_MIN_IN => 0,
20 AGG_MAX_IN => 1,
21 AGG_OUT_BYTES => 2,
22 AGG_NEXT => 3,
23 AGG_IN_LEN => 4,
24 AGG_OUT_LEN => 5,
25 AGG_FALLBACK => 6,
26};
27# (See the algorithm in encengine.c - we're building structures for it)
28
29# There are two sorts of structures.
30# "do_now" (an array, two variants of what needs storing) is whatever we need
31# to do now we've read an input byte.
32# It's housed in a "do_next" (which is how we got to it), and in turn points
33# to a "do_next" which contains all the "do_now"s for the next input byte.
34
35# There will be a "do_next" which is the start state.
36# For a single byte encoding it's the only "do_next" - each "do_now" points
37# back to it, and each "do_now" will cause bytes. There is no state.
38
39# For a multi-byte encoding where all characters in the input are the same
40# length, then there will be a tree of "do_now"->"do_next"->"do_now"
41# branching out from the start state, one step for each input byte.
42# The leaf "do_now"s will all be at the same distance from the start state,
43# only the leaf "do_now"s cause output bytes, and they in turn point back to
44# the start state.
45
46# For an encoding where there are varaible length input byte sequences, you
47# will encounter a leaf "do_now" sooner for the shorter input sequences, but
48# as before the leaves will point back to the start state.
49
50# The system will cope with escape encodings (imagine them as a mostly
51# self-contained tree for each escape state, and cross links between trees
52# at the state-switching characters) but so far no input format defines these.
53
54# The system will also cope with having output "leaves" in the middle of
55# the bifurcating branches, not just at the extremities, but again no
56# input format does this yet.
57
58# There are two variants of the "do_now" structure. The first, smaller variant
59# is generated by &enter as the input file is read. There is one structure
60# for each input byte. Say we are mapping a single byte encoding to a
61# single byte encoding, with "ABCD" going "abcd". There will be
62# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
63
64# &process then walks the tree, building aggregate "do_now" structres for
65# adjacent bytes where possible. The aggregate is for a contiguous range of
66# bytes which each produce the same length of output, each move to the
67# same next state, and each have the same fallback flag.
68# So our 4 RAW "do_now"s above become replaced by a single structure
69# containing:
70# ["A", "D", "abcd", 1, ...]
71# ie, for an input byte $_ in "A".."D", output 1 byte, found as
72# substr ("abcd", (ord $_ - ord "A") * 1, 1)
73# which maps very nicely into pointer arithmetic in C for encengine.c
74
017e2add 75sub encode_U
76{
9b37254d 77 # UTF-8 encode long hand - only covers part of perl's range
7ba7f87b 78 ## my $uv = shift;
1b026014 79 # chr() works in native space so convert value from table
80 # into that space before using chr().
7ba7f87b 81 my $ch = chr(utf8::unicode_to_native($_[0]));
1b026014 82 # Now get core perl to encode that the way it likes.
83 utf8::encode($ch);
84 return $ch;
017e2add 85}
86
87sub encode_S
88{
14a8264b 89 # encode single byte
7ba7f87b 90 ## my ($ch,$page) = @_; return chr($ch);
91 return chr $_[0];
017e2add 92}
93
94sub encode_D
95{
14a8264b 96 # encode double byte MS byte first
7ba7f87b 97 ## my ($ch,$page) = @_; return chr($page).chr($ch);
98 return chr ($_[1]) . chr $_[0];
017e2add 99}
100
101sub encode_M
102{
14a8264b 103 # encode Multi-byte - single for 0..255 otherwise double
7ba7f87b 104 ## my ($ch,$page) = @_;
105 ## return &encode_D if $page;
106 ## return &encode_S;
107 return chr ($_[1]) . chr $_[0] if $_[1];
108 return chr $_[0];
017e2add 109}
110
7ba7f87b 111my %encode_types = (U => \&encode_U,
112 S => \&encode_S,
113 D => \&encode_D,
114 M => \&encode_M,
115 );
116
14a8264b 117# Win32 does not expand globs on command line
252a8565 118eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
18b7339f 119
afdae191 120my %opt;
4cfc977c 121getopts('qOo:f:n:',\%opt);
afdae191 122my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
2f2b4ff2 123chmod(0666,$cname) if -f $cname && !-w $cname;
017e2add 124open(C,">$cname") || die "Cannot open $cname:$!";
afdae191 125
126
2f2b4ff2 127my $dname = $cname;
023d8852 128$dname =~ s/(\.[^\.]*)?$/_def.h/;
e0c49a6b 129
dcda1f94 130my ($doC,$doEnc,$doUcm,$doPet);
e0c49a6b 131
132if ($cname =~ /\.(c|xs)$/)
133 {
134 $doC = 1;
135 chmod(0666,$dname) if -f $cname && !-w $dname;
136 open(D,">$dname") || die "Cannot open $dname:$!";
137 my $hname = $cname;
138 $hname =~ s/(\.[^\.]*)?$/.h/;
139 chmod(0666,$hname) if -f $cname && !-w $hname;
140 open(H,">$hname") || die "Cannot open $hname:$!";
141
142 foreach my $fh (\*C,\*D,\*H)
143 {
afdae191 144 print $fh <<"END" unless $opt{'q'};
14a8264b 145/*
146 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
147 This file was autogenerated by:
023d8852 148 $^X $0 @orig_ARGV
14a8264b 149*/
150END
e0c49a6b 151 }
14a8264b 152
e0c49a6b 153 if ($cname =~ /(\w+)\.xs$/)
154 {
155 print C "#include <EXTERN.h>\n";
156 print C "#include <perl.h>\n";
157 print C "#include <XSUB.h>\n";
158 print C "#define U8 U8\n";
159 }
160 print C "#include \"encode.h\"\n";
d811239c 161
e0c49a6b 162 }
163elsif ($cname =~ /\.enc$/)
164 {
165 $doEnc = 1;
166 }
167elsif ($cname =~ /\.ucm$/)
2f2b4ff2 168 {
e0c49a6b 169 $doUcm = 1;
2f2b4ff2 170 }
dcda1f94 171elsif ($cname =~ /\.pet$/)
172 {
173 $doPet = 1;
174 }
017e2add 175
afdae191 176my @encfiles;
3865e8e1 177if (exists $opt{'f'})
afdae191 178 {
179 # -F is followed by name of file containing list of filenames
3865e8e1 180 my $flist = $opt{'f'};
afdae191 181 open(FLIST,$flist) || die "Cannot open $flist:$!";
182 chomp(@encfiles = <FLIST>);
183 close(FLIST);
184 }
185else
186 {
187 @encfiles = @ARGV;
188 }
c6fdb90a 189
017e2add 190my %encoding;
191my %strings;
b1e7e56f 192my $saved = 0;
193my $subsave = 0;
194my $strings = 0;
017e2add 195
2f2b4ff2 196sub cmp_name
197{
198 if ($a =~ /^.*-(\d+)/)
199 {
200 my $an = $1;
201 if ($b =~ /^.*-(\d+)/)
202 {
203 my $r = $an <=> $1;
204 return $r if $r;
205 }
206 }
207 return $a cmp $b;
208}
209
dcda1f94 210
c6fdb90a 211foreach my $enc (sort cmp_name @encfiles)
017e2add 212 {
9b37254d 213 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
51ef4e11 214 $name = $opt{'n'} if exists $opt{'n'};
017e2add 215 if (open(E,$enc))
216 {
9b37254d 217 if ($sfx eq 'enc')
218 {
dcda1f94 219 compile_enc(\*E,lc($name));
9b37254d 220 }
221 else
222 {
dcda1f94 223 compile_ucm(\*E,lc($name));
9b37254d 224 }
017e2add 225 }
226 else
227 {
228 warn "Cannot open $enc for $name:$!";
229 }
230 }
231
e0c49a6b 232if ($doC)
2f2b4ff2 233 {
b1e7e56f 234 print STDERR "Writing compiled form\n";
dcda1f94 235 foreach my $name (sort cmp_name keys %encoding)
236 {
237 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
238 output(\*C,$name.'_utf8',$e2u);
239 output(\*C,'utf8_'.$name,$u2e);
240 push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
241 }
e0c49a6b 242 foreach my $enc (sort cmp_name keys %encoding)
243 {
dcda1f94 244 my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
245 my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
e0c49a6b 246 my $sym = "${enc}_encoding";
247 $sym =~ s/\W+/_/g;
248 print C "encode_t $sym = \n";
dcda1f94 249 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
e0c49a6b 250 }
2f2b4ff2 251
e0c49a6b 252 foreach my $enc (sort cmp_name keys %encoding)
253 {
254 my $sym = "${enc}_encoding";
255 $sym =~ s/\W+/_/g;
256 print H "extern encode_t $sym;\n";
d811239c 257 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
e0c49a6b 258 }
017e2add 259
e0c49a6b 260 if ($cname =~ /(\w+)\.xs$/)
261 {
262 my $mod = $1;
d811239c 263 print C <<'END';
264
023d8852 265static void
d811239c 266Encode_XSEncoding(pTHX_ encode_t *enc)
267{
268 dSP;
269 HV *stash = gv_stashpv("Encode::XS", TRUE);
270 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
271 int i = 0;
272 PUSHMARK(sp);
273 XPUSHs(sv);
274 while (enc->name[i])
275 {
276 const char *name = enc->name[i++];
277 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
278 }
279 PUTBACK;
280 call_pv("Encode::define_encoding",G_DISCARD);
281 SvREFCNT_dec(sv);
282}
283
284END
285
e0c49a6b 286 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
287 print C "BOOT:\n{\n";
288 print C "#include \"$dname\"\n";
289 print C "}\n";
290 }
291 close(D);
292 close(H);
f20503bb 293
294 my $perc_saved = $strings/($strings + $saved) * 100;
295 my $perc_subsaved = $strings/($strings + $subsave) * 100;
b1e7e56f 296 printf STDERR "%d bytes in string tables\n",$strings;
f20503bb 297 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
298 $saved, $perc_saved if $saved;
299 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
300 $subsave, $perc_subsaved if $subsave;
2f2b4ff2 301 }
dcda1f94 302elsif ($doEnc)
303 {
304 foreach my $name (sort cmp_name keys %encoding)
305 {
306 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
307 output_enc(\*C,$name,$e2u);
308 }
309 }
310elsif ($doUcm)
311 {
312 foreach my $name (sort cmp_name keys %encoding)
313 {
314 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
315 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
316 }
317 }
318
017e2add 319close(C);
320
9b37254d 321
b1e7e56f 322
9b37254d 323sub compile_ucm
324{
dcda1f94 325 my ($fh,$name) = @_;
9b37254d 326 my $e2u = {};
327 my $u2e = {};
328 my $cs;
329 my %attr;
330 while (<$fh>)
331 {
332 s/#.*$//;
333 last if /^\s*CHARMAP\s*$/i;
7ba7f87b 334 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
9b37254d 335 {
336 $attr{$1} = $2;
337 }
338 }
339 if (!defined($cs = $attr{'code_set_name'}))
340 {
341 warn "No <code_set_name> in $name\n";
342 }
343 else
344 {
51ef4e11 345 $name = $cs unless exists $opt{'n'};
9b37254d 346 }
347 my $erep;
348 my $urep;
afdae191 349 my $max_el;
350 my $min_el;
9b37254d 351 if (exists $attr{'subchar'})
352 {
afdae191 353 my @byte;
354 $attr{'subchar'} =~ /^\s*/cg;
355 push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
356 $erep = join('',map(chr(hex($_)),@byte));
9b37254d 357 }
b1e7e56f 358 print "Reading $name ($cs)\n";
9b37254d 359 my $nfb = 0;
360 my $hfb = 0;
361 while (<$fh>)
362 {
363 s/#.*$//;
364 last if /^\s*END\s+CHARMAP\s*$/i;
365 next if /^\s*$/;
afdae191 366 my ($u,@byte);
367 my $fb = '';
368 $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
369 push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
370 $fb = $1 if /\G\s*(\|[0-3])/gc;
371 # warn "$_: $u @byte | $fb\n";
372 die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
9b37254d 373 if (defined($u))
374 {
375 my $uch = encode_U(hex($u));
e0c49a6b 376 my $ech = join('',map(chr(hex($_)),@byte));
afdae191 377 my $el = length($ech);
378 $max_el = $el if (!defined($max_el) || $el > $max_el);
379 $min_el = $el if (!defined($min_el) || $el < $min_el);
9b37254d 380 if (length($fb))
381 {
382 $fb = substr($fb,1);
383 $hfb++;
384 }
385 else
386 {
387 $nfb++;
388 $fb = '0';
389 }
390 # $fb is fallback flag
391 # 0 - round trip safe
392 # 1 - fallback for unicode -> enc
393 # 2 - skip sub-char mapping
394 # 3 - fallback enc -> unicode
395 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
396 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
397 }
398 else
399 {
400 warn $_;
401 }
9b37254d 402 }
403 if ($nfb && $hfb)
404 {
405 die "$nfb entries without fallback, $hfb entries with\n";
406 }
dcda1f94 407 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
9b37254d 408}
409
14a8264b 410sub compile_enc
017e2add 411{
dcda1f94 412 my ($fh,$name) = @_;
017e2add 413 my $e2u = {};
414 my $u2e = {};
415
416 my $type;
417 while ($type = <$fh>)
418 {
419 last if $type !~ /^\s*#/;
420 }
421 chomp($type);
422 return if $type eq 'E';
445e6c97 423 # Do the hash lookup once, rather than once per function call. 4% speedup.
424 my $type_func = $encode_types{$type};
017e2add 425 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
14a8264b 426 warn "$type encoded $name\n";
017e2add 427 my $rep = '';
7ba7f87b 428 # Save a defined test by setting these to defined values.
429 my $min_el = ~0; # A very big integer
430 my $max_el = 0; # Anything must be longer than 0
017e2add 431 {
432 my $v = hex($def);
445e6c97 433 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
017e2add 434 }
e03ac092 435 my %seen;
445e6c97 436 do
017e2add 437 {
438 my $line = <$fh>;
439 chomp($line);
440 my $page = hex($line);
441 my $ch = 0;
445e6c97 442 my $i = 16;
443 do
017e2add 444 {
445e6c97 445 # So why is it 1% faster to leave the my here?
017e2add 446 my $line = <$fh>;
7ba7f87b 447 die "Line should be exactly 65 characters long including newline"
448 unless length ($line) == 65;
449 # Split line into groups of 4 hex digits, convert groups to ints
445e6c97 450 # This takes 65.35
451 # map {hex $_} $line =~ /(....)/g
452 # This takes 63.75 (2.5% less time)
453 # unpack "n*", pack "H*", $line
454 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
455 # Doing it as while ($line =~ /(....)/g) took 74.63
456 foreach my $val (unpack "n*", pack "H*", $line)
017e2add 457 {
e03ac092 458 next if $val == 0xFFFD;
445e6c97 459 my $ech = &$type_func($ch,$page);
017e2add 460 if ($val || (!$ch && !$page))
461 {
afdae191 462 my $el = length($ech);
7ba7f87b 463 $max_el = $el if $el > $max_el;
464 $min_el = $el if $el < $min_el;
017e2add 465 my $uch = encode_U($val);
445e6c97 466 # We don't need to read this quickly, so storing it as a scalar,
467 # rather than 3 (anon array, plus the 2 scalars it holds) saves
468 # RAM and may make us faster on low RAM systems. [see __END__]
e03ac092 469 if (exists $seen{$uch})
470 {
445e6c97 471 warn sprintf("U%04X is %02X%02X and %04X\n",
472 $val,$page,$ch,$seen{$uch});
e03ac092 473 }
474 else
475 {
a8373f85 476 $seen{$uch} = $page << 8 | $ch;
e03ac092 477 }
eb560316 478 # Passing 2 extra args each time is 3.6% slower!
445e6c97 479 # Even with having to add $fallback ||= 0 later
480 enter_fb0($e2u,$ech,$uch);
481 enter_fb0($u2e,$uch,$ech);
017e2add 482 }
483 else
484 {
485 # No character at this position
486 # enter($e2u,$ech,undef,$e2u);
487 }
488 $ch++;
489 }
445e6c97 490 } while --$i;
491 } while --$pages;
7ba7f87b 492 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
493 if $min_el > $max_el;
dcda1f94 494 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
017e2add 495}
496
eb560316 497# my ($a,$s,$d,$t,$fb) = @_;
498sub enter {
499 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
500 # state we shift to after this (multibyte) input character defaults to same
501 # as current state.
502 $next ||= $current;
503 # Making sure it is defined seems to be faster than {no warnings;} in
504 # &process, or passing it in as 0 explicity.
445e6c97 505 # XXX $fallback ||= 0;
eb560316 506
507 # Start at the beginning and work forwards through the string to zero.
508 # effectively we are removing 1 character from the front each time
509 # but we don't actually edit the string. [this alone seems to be 14% speedup]
510 # Hence -$pos is the length of the remaining string.
511 my $pos = -length $inbytes;
512 while (1) {
513 my $byte = substr $inbytes, $pos, 1;
514 # RAW_NEXT => 0,
515 # RAW_IN_LEN => 1,
516 # RAW_OUT_BYTES => 2,
517 # RAW_FALLBACK => 3,
518 # to unicode an array would seem to be better, because the pages are dense.
519 # from unicode can be very sparse, favouring a hash.
520 # hash using the bytes (all length 1) as keys rather than ord value,
521 # as it's easier to sort these in &process.
522
523 # It's faster to always add $fallback even if it's undef, rather than
524 # choosing between 3 and 4 element array. (hence why we set it defined
525 # above)
526 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
527 # When $pos was -1 we were at the last input character.
528 unless (++$pos) {
529 $do_now->[RAW_OUT_BYTES] = $outbytes;
530 $do_now->[RAW_NEXT] = $next;
531 return;
532 }
533 # Tail recursion. The intermdiate state may not have a name yet.
534 $current = $do_now->[RAW_NEXT];
017e2add 535 }
536}
537
445e6c97 538# This is purely for optimistation. It's just &enter hard coded for $fallback
539# of 0, using only a 3 entry array ref to save memory for every entry.
540sub enter_fb0 {
541 my ($current,$inbytes,$outbytes,$next) = @_;
542 $next ||= $current;
543
544 my $pos = -length $inbytes;
545 while (1) {
546 my $byte = substr $inbytes, $pos, 1;
547 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
548 unless (++$pos) {
549 $do_now->[RAW_OUT_BYTES] = $outbytes;
550 $do_now->[RAW_NEXT] = $next;
551 return;
552 }
553 $current = $do_now->[RAW_NEXT];
554 }
555}
b1e7e56f 556
557
017e2add 558sub outstring
559{
560 my ($fh,$name,$s) = @_;
561 my $sym = $strings{$s};
b1e7e56f 562 if ($sym)
563 {
564 $saved += length($s);
565 }
566 else
017e2add 567 {
4cfc977c 568 if ($opt{'O'}) {
569 foreach my $o (keys %strings)
570 {
445e6c97 571 next unless (my $i = index($o,$s)) >= 0;
572 $sym = $strings{$o};
573 $sym .= sprintf("+0x%02x",$i) if ($i);
574 $subsave += length($s);
575 return $strings{$s} = $sym;
576 }
4cfc977c 577 }
017e2add 578 $strings{$s} = $sym = $name;
b1e7e56f 579 $strings += length($s);
de745a2e 580 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
581 # Maybe we should assert that these are all <256.
582 $definition .= join(',',unpack "C*",$s);
583 # We have a single long line. Split it at convenient commas.
584 $definition =~ s/(.{74,77},)/$1\n/g;
585 print $fh "$definition };\n\n";
017e2add 586 }
587 return $sym;
588}
589
14a8264b 590sub process
017e2add 591{
eb560316 592 my ($name,$a) = @_;
593 $name =~ s/\W+/_/g;
594 $a->{Cname} = $name;
445e6c97 595 my $raw = $a->{Raw};
eb560316 596 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
597 my @ent;
445e6c97 598 $agg_max_in = 0;
599 foreach my $key (sort keys %$raw) {
eb560316 600 # RAW_NEXT => 0,
601 # RAW_IN_LEN => 1,
602 # RAW_OUT_BYTES => 2,
603 # RAW_FALLBACK => 3,
445e6c97 604 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
eb560316 605 # Now we are converting from raw to aggregate, switch from 1 byte strings
606 # to numbers
607 my $b = ord $key;
445e6c97 608 $fallback ||= 0;
eb560316 609 if ($l &&
610 # If this == fails, we're going to reset $agg_max_in below anyway.
611 $b == ++$agg_max_in &&
612 # References in numeric context give the pointer as an int.
613 $agg_next == $next &&
614 $agg_in_len == $in_len &&
615 $agg_out_len == length $out_bytes &&
616 $agg_fallback == $fallback
617 # && length($l->[AGG_OUT_BYTES]) < 16
618 ) {
619 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
620 # we can aggregate this byte onto the end.
621 $l->[AGG_MAX_IN] = $b;
622 $l->[AGG_OUT_BYTES] .= $out_bytes;
623 } else {
624 # AGG_MIN_IN => 0,
625 # AGG_MAX_IN => 1,
626 # AGG_OUT_BYTES => 2,
627 # AGG_NEXT => 3,
628 # AGG_IN_LEN => 4,
629 # AGG_OUT_LEN => 5,
630 # AGG_FALLBACK => 6,
631 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
632 # (only gains .6% on euc-jp -- is it worth it?)
633 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
634 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
635 $agg_fallback = $fallback];
017e2add 636 }
eb560316 637 if (exists $next->{Cname}) {
638 $next->{'Forward'} = 1 if $next != $a;
639 } else {
640 process(sprintf("%s_%02x",$name,$b),$next);
017e2add 641 }
642 }
eb560316 643 # encengine.c rules say that last entry must be for 255
445e6c97 644 if ($agg_max_in < 255) {
645 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
017e2add 646 }
eb560316 647 $a->{'Entries'} = \@ent;
14a8264b 648}
649
650sub outtable
651{
652 my ($fh,$a) = @_;
653 my $name = $a->{'Cname'};
017e2add 654 # String tables
14a8264b 655 foreach my $b (@{$a->{'Entries'}})
017e2add 656 {
eb560316 657 next unless $b->[AGG_OUT_LEN];
658 my $s = $b->[AGG_MIN_IN];
659 my $e = $b->[AGG_MAX_IN];
660 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
017e2add 661 }
14a8264b 662 if ($a->{'Forward'})
663 {
664 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
665 }
666 $a->{'Done'} = 1;
667 foreach my $b (@{$a->{'Entries'}})
668 {
eb560316 669 my ($s,$e,$out,$t,$end,$l) = @$b;
14a8264b 670 outtable($fh,$t) unless $t->{'Done'};
671 }
672 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
673 foreach my $b (@{$a->{'Entries'}})
017e2add 674 {
eb560316 675 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
e0c49a6b 676 $end |= 0x80 if $fb;
017e2add 677 print $fh "{";
678 if ($l)
679 {
680 printf $fh outstring($fh,'',$out);
681 }
682 else
683 {
684 print $fh "0";
685 }
686 print $fh ",",$t->{Cname};
2f2b4ff2 687 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
017e2add 688 }
14a8264b 689 print $fh "};\n";
690}
691
692sub output
693{
694 my ($fh,$name,$a) = @_;
695 process($name,$a);
696 # Sub-tables
697 outtable($fh,$a);
017e2add 698}
699
e0c49a6b 700sub output_enc
701{
702 my ($fh,$name,$a) = @_;
eb560316 703 die "Changed - fix me for new structure";
e0c49a6b 704 foreach my $b (sort keys %$a)
705 {
706 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
707 }
708}
709
710sub decode_U
711{
712 my $s = shift;
e0c49a6b 713}
714
f39fb8cc 715my @uname;
716sub char_names
717{
9fb51cbc 718 my $s = do "unicore/Name.pl";
719 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
f39fb8cc 720 pos($s) = 0;
721 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
722 {
723 my $name = $3;
724 my $s = hex($1);
725 last if $s >= 0x10000;
726 my $e = length($2) ? hex($2) : $s;
727 for (my $i = $s; $i <= $e; $i++)
728 {
729 $uname[$i] = $name;
730# print sprintf("U%04X $name\n",$i);
731 }
732 }
733}
734
e0c49a6b 735sub output_ucm_page
736{
a8373f85 737 my ($cmap,$a,$t,$pre) = @_;
738 # warn sprintf("Page %x\n",$pre);
739 my $raw = $t->{Raw};
740 foreach my $key (sort keys %$raw) {
741 # RAW_NEXT => 0,
742 # RAW_IN_LEN => 1,
743 # RAW_OUT_BYTES => 2,
744 # RAW_FALLBACK => 3,
745 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
746 my $u = ord $key;
747 $fallback ||= 0;
748
749 if ($next != $a && $next != $t) {
750 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
751 } elsif (length $out_bytes) {
752 if ($pre) {
753 $u = $pre|($u &0x3f);
e0c49a6b 754 }
a8373f85 755 my $s = sprintf "<U%04X> ",$u;
756 #foreach my $c (split(//,$out_bytes)) {
757 # $s .= sprintf "\\x%02X",ord($c);
758 #}
759 # 9.5% faster changing that lloop to this:
760 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
761 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
762 push(@$cmap,$s);
763 } else {
764 warn join(',',$u, @{$raw->{$key}},$a,$t);
e0c49a6b 765 }
766 }
767}
768
769sub output_ucm
770{
f39fb8cc 771 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
df1df145 772 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
afdae191 773 print $fh "<code_set_name> \"$name\"\n";
f39fb8cc 774 char_names();
afdae191 775 if (defined $min_el)
776 {
777 print $fh "<mb_cur_min> $min_el\n";
778 }
779 if (defined $max_el)
780 {
781 print $fh "<mb_cur_max> $max_el\n";
782 }
783 if (defined $rep)
784 {
785 print $fh "<subchar> ";
786 foreach my $c (split(//,$rep))
787 {
788 printf $fh "\\x%02X",ord($c);
789 }
790 print $fh "\n";
791 }
f39fb8cc 792 my @cmap;
793 output_ucm_page(\@cmap,$h,$h,0);
afdae191 794 print $fh "#\nCHARMAP\n";
f39fb8cc 795 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
796 {
797 print $fh $line;
798 }
e0c49a6b 799 print $fh "END CHARMAP\n";
800}
017e2add 801
445e6c97 802
803__END__
804With %seen holding array refs:
805
806 865.66 real 28.80 user 8.79 sys
807 7904 maximum resident set size
808 1356 average shared memory size
809 18566 average unshared data size
810 229 average unshared stack size
811 46080 page reclaims
812 33373 page faults
813
814With %seen holding simple scalars:
815
816 342.16 real 27.11 user 3.54 sys
817 8388 maximum resident set size
818 1394 average shared memory size
819 14969 average unshared data size
820 236 average unshared stack size
821 28159 page reclaims
822 9839 page faults
823
824Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
825how %seen is storing things its seen. So it is pathalogically bad on a 16M
826RAM machine, but it's going to help even on modern machines.
827Swapping is bad, m'kay :-)