Better fix from Autrijus.
[p5sagit/p5-mst-13.2.git] / ext / Encode / compile
CommitLineData
017e2add 1#!../../perl -w
508a30f1 2BEGIN {
e472a8fe 3 unshift @INC, qw(../../lib ../../../lib);
4 $ENV{PATH} .= ';../..;../../..' if $^O eq 'MSWin32';
508a30f1 5}
017e2add 6use strict;
afdae191 7use Getopt::Std;
8my @orig_ARGV = @ARGV;
b6504073 9our $VERSION = '0.30';
dcda1f94 10
eb560316 11# These may get re-ordered.
12# RAW is a do_now as inserted by &enter
13# AGG is an aggreagated do_now, as built up by &process
14use constant {
15 RAW_NEXT => 0,
16 RAW_IN_LEN => 1,
17 RAW_OUT_BYTES => 2,
18 RAW_FALLBACK => 3,
19
20 AGG_MIN_IN => 0,
21 AGG_MAX_IN => 1,
22 AGG_OUT_BYTES => 2,
23 AGG_NEXT => 3,
24 AGG_IN_LEN => 4,
25 AGG_OUT_LEN => 5,
26 AGG_FALLBACK => 6,
27};
28# (See the algorithm in encengine.c - we're building structures for it)
29
30# There are two sorts of structures.
31# "do_now" (an array, two variants of what needs storing) is whatever we need
32# to do now we've read an input byte.
33# It's housed in a "do_next" (which is how we got to it), and in turn points
34# to a "do_next" which contains all the "do_now"s for the next input byte.
35
36# There will be a "do_next" which is the start state.
37# For a single byte encoding it's the only "do_next" - each "do_now" points
38# back to it, and each "do_now" will cause bytes. There is no state.
39
40# For a multi-byte encoding where all characters in the input are the same
41# length, then there will be a tree of "do_now"->"do_next"->"do_now"
42# branching out from the start state, one step for each input byte.
43# The leaf "do_now"s will all be at the same distance from the start state,
44# only the leaf "do_now"s cause output bytes, and they in turn point back to
45# the start state.
46
47# For an encoding where there are varaible length input byte sequences, you
48# will encounter a leaf "do_now" sooner for the shorter input sequences, but
49# as before the leaves will point back to the start state.
50
51# The system will cope with escape encodings (imagine them as a mostly
52# self-contained tree for each escape state, and cross links between trees
53# at the state-switching characters) but so far no input format defines these.
54
55# The system will also cope with having output "leaves" in the middle of
56# the bifurcating branches, not just at the extremities, but again no
57# input format does this yet.
58
59# There are two variants of the "do_now" structure. The first, smaller variant
60# is generated by &enter as the input file is read. There is one structure
61# for each input byte. Say we are mapping a single byte encoding to a
62# single byte encoding, with "ABCD" going "abcd". There will be
63# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
64
65# &process then walks the tree, building aggregate "do_now" structres for
66# adjacent bytes where possible. The aggregate is for a contiguous range of
67# bytes which each produce the same length of output, each move to the
68# same next state, and each have the same fallback flag.
69# So our 4 RAW "do_now"s above become replaced by a single structure
70# containing:
71# ["A", "D", "abcd", 1, ...]
72# ie, for an input byte $_ in "A".."D", output 1 byte, found as
73# substr ("abcd", (ord $_ - ord "A") * 1, 1)
74# which maps very nicely into pointer arithmetic in C for encengine.c
75
017e2add 76sub encode_U
77{
9b37254d 78 # UTF-8 encode long hand - only covers part of perl's range
7ba7f87b 79 ## my $uv = shift;
1b026014 80 # chr() works in native space so convert value from table
81 # into that space before using chr().
7ba7f87b 82 my $ch = chr(utf8::unicode_to_native($_[0]));
1b026014 83 # Now get core perl to encode that the way it likes.
84 utf8::encode($ch);
85 return $ch;
017e2add 86}
87
88sub encode_S
89{
14a8264b 90 # encode single byte
7ba7f87b 91 ## my ($ch,$page) = @_; return chr($ch);
92 return chr $_[0];
017e2add 93}
94
95sub encode_D
96{
14a8264b 97 # encode double byte MS byte first
7ba7f87b 98 ## my ($ch,$page) = @_; return chr($page).chr($ch);
99 return chr ($_[1]) . chr $_[0];
017e2add 100}
101
102sub encode_M
103{
14a8264b 104 # encode Multi-byte - single for 0..255 otherwise double
7ba7f87b 105 ## my ($ch,$page) = @_;
106 ## return &encode_D if $page;
107 ## return &encode_S;
108 return chr ($_[1]) . chr $_[0] if $_[1];
109 return chr $_[0];
017e2add 110}
111
7ba7f87b 112my %encode_types = (U => \&encode_U,
113 S => \&encode_S,
114 D => \&encode_D,
115 M => \&encode_M,
116 );
117
14a8264b 118# Win32 does not expand globs on command line
252a8565 119eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
18b7339f 120
afdae191 121my %opt;
3964a085 122# I think these are:
123# -Q to disable the duplicate codepoint test
16efc830 124# -S make mapping errors fatal
3964a085 125# -q to remove comments written to output files
126# -O to enable the (brute force) substring optimiser
127# -o <output> to specify the output file name (else it's the first arg)
128# -f <inlist> to give a file with a list of input files (else use the args)
129# -n <name> to name the encoding (else use the basename of the input file.
16efc830 130getopts('SQqOo:f:n:',\%opt);
3964a085 131
132# This really should go first, else the die here causes empty (non-erroneous)
133# output files to be written.
134my @encfiles;
135if (exists $opt{'f'}) {
136 # -F is followed by name of file containing list of filenames
137 my $flist = $opt{'f'};
138 open(FLIST,$flist) || die "Cannot open $flist:$!";
139 chomp(@encfiles = <FLIST>);
140 close(FLIST);
141} else {
142 @encfiles = @ARGV;
143}
144
afdae191 145my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
2f2b4ff2 146chmod(0666,$cname) if -f $cname && !-w $cname;
017e2add 147open(C,">$cname") || die "Cannot open $cname:$!";
afdae191 148
2f2b4ff2 149my $dname = $cname;
3964a085 150my $hname = $cname;
e0c49a6b 151
dcda1f94 152my ($doC,$doEnc,$doUcm,$doPet);
e0c49a6b 153
154if ($cname =~ /\.(c|xs)$/)
155 {
156 $doC = 1;
3964a085 157 $dname =~ s/(\.[^\.]*)?$/_def.h/;
e0c49a6b 158 chmod(0666,$dname) if -f $cname && !-w $dname;
159 open(D,">$dname") || die "Cannot open $dname:$!";
e0c49a6b 160 $hname =~ s/(\.[^\.]*)?$/.h/;
161 chmod(0666,$hname) if -f $cname && !-w $hname;
162 open(H,">$hname") || die "Cannot open $hname:$!";
163
164 foreach my $fh (\*C,\*D,\*H)
165 {
afdae191 166 print $fh <<"END" unless $opt{'q'};
14a8264b 167/*
168 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
169 This file was autogenerated by:
023d8852 170 $^X $0 @orig_ARGV
14a8264b 171*/
172END
e0c49a6b 173 }
14a8264b 174
e0c49a6b 175 if ($cname =~ /(\w+)\.xs$/)
176 {
177 print C "#include <EXTERN.h>\n";
178 print C "#include <perl.h>\n";
179 print C "#include <XSUB.h>\n";
180 print C "#define U8 U8\n";
181 }
182 print C "#include \"encode.h\"\n";
d811239c 183
e0c49a6b 184 }
185elsif ($cname =~ /\.enc$/)
186 {
187 $doEnc = 1;
188 }
189elsif ($cname =~ /\.ucm$/)
2f2b4ff2 190 {
e0c49a6b 191 $doUcm = 1;
2f2b4ff2 192 }
dcda1f94 193elsif ($cname =~ /\.pet$/)
194 {
195 $doPet = 1;
196 }
017e2add 197
198my %encoding;
199my %strings;
b1e7e56f 200my $saved = 0;
201my $subsave = 0;
202my $strings = 0;
017e2add 203
2f2b4ff2 204sub cmp_name
205{
206 if ($a =~ /^.*-(\d+)/)
207 {
208 my $an = $1;
209 if ($b =~ /^.*-(\d+)/)
210 {
211 my $r = $an <=> $1;
212 return $r if $r;
213 }
214 }
215 return $a cmp $b;
216}
217
dcda1f94 218
c6fdb90a 219foreach my $enc (sort cmp_name @encfiles)
017e2add 220 {
9b37254d 221 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
51ef4e11 222 $name = $opt{'n'} if exists $opt{'n'};
017e2add 223 if (open(E,$enc))
224 {
9b37254d 225 if ($sfx eq 'enc')
226 {
dcda1f94 227 compile_enc(\*E,lc($name));
9b37254d 228 }
229 else
230 {
dcda1f94 231 compile_ucm(\*E,lc($name));
9b37254d 232 }
017e2add 233 }
234 else
235 {
236 warn "Cannot open $enc for $name:$!";
237 }
238 }
239
e0c49a6b 240if ($doC)
2f2b4ff2 241 {
b1e7e56f 242 print STDERR "Writing compiled form\n";
dcda1f94 243 foreach my $name (sort cmp_name keys %encoding)
244 {
245 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
246 output(\*C,$name.'_utf8',$e2u);
247 output(\*C,'utf8_'.$name,$u2e);
248 push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
249 }
e0c49a6b 250 foreach my $enc (sort cmp_name keys %encoding)
251 {
dcda1f94 252 my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
253 my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
e0c49a6b 254 my $sym = "${enc}_encoding";
255 $sym =~ s/\W+/_/g;
256 print C "encode_t $sym = \n";
dcda1f94 257 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
e0c49a6b 258 }
2f2b4ff2 259
e0c49a6b 260 foreach my $enc (sort cmp_name keys %encoding)
261 {
262 my $sym = "${enc}_encoding";
263 $sym =~ s/\W+/_/g;
264 print H "extern encode_t $sym;\n";
d811239c 265 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
e0c49a6b 266 }
017e2add 267
e0c49a6b 268 if ($cname =~ /(\w+)\.xs$/)
269 {
270 my $mod = $1;
d811239c 271 print C <<'END';
272
023d8852 273static void
d811239c 274Encode_XSEncoding(pTHX_ encode_t *enc)
275{
276 dSP;
277 HV *stash = gv_stashpv("Encode::XS", TRUE);
278 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
279 int i = 0;
280 PUSHMARK(sp);
281 XPUSHs(sv);
282 while (enc->name[i])
283 {
284 const char *name = enc->name[i++];
285 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
286 }
287 PUTBACK;
288 call_pv("Encode::define_encoding",G_DISCARD);
289 SvREFCNT_dec(sv);
290}
291
292END
293
e0c49a6b 294 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
295 print C "BOOT:\n{\n";
296 print C "#include \"$dname\"\n";
297 print C "}\n";
298 }
3964a085 299 # Close in void context is bad, m'kay
300 close(D) or warn "Error closing '$dname': $!";
301 close(H) or warn "Error closing '$hname': $!";
f20503bb 302
303 my $perc_saved = $strings/($strings + $saved) * 100;
304 my $perc_subsaved = $strings/($strings + $subsave) * 100;
b1e7e56f 305 printf STDERR "%d bytes in string tables\n",$strings;
f20503bb 306 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
307 $saved, $perc_saved if $saved;
308 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
309 $subsave, $perc_subsaved if $subsave;
2f2b4ff2 310 }
dcda1f94 311elsif ($doEnc)
312 {
313 foreach my $name (sort cmp_name keys %encoding)
314 {
315 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
316 output_enc(\*C,$name,$e2u);
317 }
318 }
319elsif ($doUcm)
320 {
321 foreach my $name (sort cmp_name keys %encoding)
322 {
323 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
324 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
325 }
326 }
327
3964a085 328# writing half meg files and then not checking to see if you just filled the
329# disk is bad, m'kay
330close(C) or die "Error closing '$cname': $!";
9b37254d 331
3964a085 332# End of the main program.
b1e7e56f 333
9b37254d 334sub compile_ucm
335{
dcda1f94 336 my ($fh,$name) = @_;
9b37254d 337 my $e2u = {};
338 my $u2e = {};
339 my $cs;
340 my %attr;
341 while (<$fh>)
342 {
343 s/#.*$//;
344 last if /^\s*CHARMAP\s*$/i;
7ba7f87b 345 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
9b37254d 346 {
347 $attr{$1} = $2;
348 }
349 }
350 if (!defined($cs = $attr{'code_set_name'}))
351 {
352 warn "No <code_set_name> in $name\n";
353 }
354 else
355 {
51ef4e11 356 $name = $cs unless exists $opt{'n'};
9b37254d 357 }
358 my $erep;
359 my $urep;
afdae191 360 my $max_el;
361 my $min_el;
9b37254d 362 if (exists $attr{'subchar'})
363 {
afdae191 364 my @byte;
365 $attr{'subchar'} =~ /^\s*/cg;
366 push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
367 $erep = join('',map(chr(hex($_)),@byte));
9b37254d 368 }
b1e7e56f 369 print "Reading $name ($cs)\n";
9b37254d 370 my $nfb = 0;
371 my $hfb = 0;
372 while (<$fh>)
373 {
374 s/#.*$//;
375 last if /^\s*END\s+CHARMAP\s*$/i;
376 next if /^\s*$/;
afdae191 377 my ($u,@byte);
378 my $fb = '';
379 $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
380 push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
381 $fb = $1 if /\G\s*(\|[0-3])/gc;
382 # warn "$_: $u @byte | $fb\n";
383 die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
9b37254d 384 if (defined($u))
385 {
386 my $uch = encode_U(hex($u));
e0c49a6b 387 my $ech = join('',map(chr(hex($_)),@byte));
afdae191 388 my $el = length($ech);
389 $max_el = $el if (!defined($max_el) || $el > $max_el);
390 $min_el = $el if (!defined($min_el) || $el < $min_el);
9b37254d 391 if (length($fb))
392 {
393 $fb = substr($fb,1);
394 $hfb++;
395 }
396 else
397 {
398 $nfb++;
399 $fb = '0';
400 }
401 # $fb is fallback flag
402 # 0 - round trip safe
403 # 1 - fallback for unicode -> enc
404 # 2 - skip sub-char mapping
405 # 3 - fallback enc -> unicode
406 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
407 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
408 }
409 else
410 {
411 warn $_;
412 }
9b37254d 413 }
414 if ($nfb && $hfb)
415 {
416 die "$nfb entries without fallback, $hfb entries with\n";
417 }
dcda1f94 418 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
9b37254d 419}
420
aa095549 421
422
14a8264b 423sub compile_enc
017e2add 424{
dcda1f94 425 my ($fh,$name) = @_;
017e2add 426 my $e2u = {};
427 my $u2e = {};
428
429 my $type;
430 while ($type = <$fh>)
431 {
432 last if $type !~ /^\s*#/;
433 }
434 chomp($type);
435 return if $type eq 'E';
445e6c97 436 # Do the hash lookup once, rather than once per function call. 4% speedup.
437 my $type_func = $encode_types{$type};
017e2add 438 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
14a8264b 439 warn "$type encoded $name\n";
017e2add 440 my $rep = '';
7ba7f87b 441 # Save a defined test by setting these to defined values.
442 my $min_el = ~0; # A very big integer
443 my $max_el = 0; # Anything must be longer than 0
017e2add 444 {
445 my $v = hex($def);
445e6c97 446 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
017e2add 447 }
16efc830 448 my $errors;
3964a085 449 my $seen;
450 # use -Q to silence the seen test. Makefile.PL uses this by default.
451 $seen = {} unless $opt{Q};
445e6c97 452 do
017e2add 453 {
454 my $line = <$fh>;
455 chomp($line);
456 my $page = hex($line);
457 my $ch = 0;
445e6c97 458 my $i = 16;
459 do
017e2add 460 {
445e6c97 461 # So why is it 1% faster to leave the my here?
017e2add 462 my $line = <$fh>;
aa095549 463 $line =~ s/\r\n$/\n/;
464 die "$.:${line}Line should be exactly 65 characters long including
465 newline (".length($line).")" unless length ($line) == 65;
7ba7f87b 466 # Split line into groups of 4 hex digits, convert groups to ints
445e6c97 467 # This takes 65.35
468 # map {hex $_} $line =~ /(....)/g
469 # This takes 63.75 (2.5% less time)
470 # unpack "n*", pack "H*", $line
471 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
472 # Doing it as while ($line =~ /(....)/g) took 74.63
473 foreach my $val (unpack "n*", pack "H*", $line)
017e2add 474 {
e03ac092 475 next if $val == 0xFFFD;
445e6c97 476 my $ech = &$type_func($ch,$page);
017e2add 477 if ($val || (!$ch && !$page))
478 {
afdae191 479 my $el = length($ech);
7ba7f87b 480 $max_el = $el if $el > $max_el;
481 $min_el = $el if $el < $min_el;
017e2add 482 my $uch = encode_U($val);
3964a085 483 if ($seen) {
484 # We're doing the test.
485 # We don't need to read this quickly, so storing it as a scalar,
486 # rather than 3 (anon array, plus the 2 scalars it holds) saves
487 # RAM and may make us faster on low RAM systems. [see __END__]
488 if (exists $seen->{$uch})
489 {
490 warn sprintf("U%04X is %02X%02X and %04X\n",
491 $val,$page,$ch,$seen->{$uch});
16efc830 492 $errors++;
3964a085 493 }
494 else
495 {
496 $seen->{$uch} = $page << 8 | $ch;
497 }
498 }
eb560316 499 # Passing 2 extra args each time is 3.6% slower!
445e6c97 500 # Even with having to add $fallback ||= 0 later
501 enter_fb0($e2u,$ech,$uch);
502 enter_fb0($u2e,$uch,$ech);
017e2add 503 }
504 else
505 {
506 # No character at this position
507 # enter($e2u,$ech,undef,$e2u);
508 }
509 $ch++;
510 }
445e6c97 511 } while --$i;
512 } while --$pages;
7ba7f87b 513 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
514 if $min_el > $max_el;
16efc830 515 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
dcda1f94 516 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
017e2add 517}
518
eb560316 519# my ($a,$s,$d,$t,$fb) = @_;
520sub enter {
521 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
522 # state we shift to after this (multibyte) input character defaults to same
523 # as current state.
524 $next ||= $current;
525 # Making sure it is defined seems to be faster than {no warnings;} in
526 # &process, or passing it in as 0 explicity.
445e6c97 527 # XXX $fallback ||= 0;
eb560316 528
529 # Start at the beginning and work forwards through the string to zero.
530 # effectively we are removing 1 character from the front each time
531 # but we don't actually edit the string. [this alone seems to be 14% speedup]
532 # Hence -$pos is the length of the remaining string.
533 my $pos = -length $inbytes;
534 while (1) {
535 my $byte = substr $inbytes, $pos, 1;
536 # RAW_NEXT => 0,
537 # RAW_IN_LEN => 1,
538 # RAW_OUT_BYTES => 2,
539 # RAW_FALLBACK => 3,
540 # to unicode an array would seem to be better, because the pages are dense.
541 # from unicode can be very sparse, favouring a hash.
542 # hash using the bytes (all length 1) as keys rather than ord value,
543 # as it's easier to sort these in &process.
544
545 # It's faster to always add $fallback even if it's undef, rather than
546 # choosing between 3 and 4 element array. (hence why we set it defined
547 # above)
548 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
549 # When $pos was -1 we were at the last input character.
550 unless (++$pos) {
551 $do_now->[RAW_OUT_BYTES] = $outbytes;
552 $do_now->[RAW_NEXT] = $next;
553 return;
554 }
555 # Tail recursion. The intermdiate state may not have a name yet.
556 $current = $do_now->[RAW_NEXT];
017e2add 557 }
558}
559
445e6c97 560# This is purely for optimistation. It's just &enter hard coded for $fallback
561# of 0, using only a 3 entry array ref to save memory for every entry.
562sub enter_fb0 {
563 my ($current,$inbytes,$outbytes,$next) = @_;
564 $next ||= $current;
565
566 my $pos = -length $inbytes;
567 while (1) {
568 my $byte = substr $inbytes, $pos, 1;
569 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
570 unless (++$pos) {
571 $do_now->[RAW_OUT_BYTES] = $outbytes;
572 $do_now->[RAW_NEXT] = $next;
573 return;
574 }
575 $current = $do_now->[RAW_NEXT];
576 }
577}
b1e7e56f 578
579
017e2add 580sub outstring
581{
582 my ($fh,$name,$s) = @_;
583 my $sym = $strings{$s};
b1e7e56f 584 if ($sym)
585 {
586 $saved += length($s);
587 }
588 else
017e2add 589 {
4cfc977c 590 if ($opt{'O'}) {
591 foreach my $o (keys %strings)
592 {
445e6c97 593 next unless (my $i = index($o,$s)) >= 0;
594 $sym = $strings{$o};
595 $sym .= sprintf("+0x%02x",$i) if ($i);
596 $subsave += length($s);
597 return $strings{$s} = $sym;
598 }
4cfc977c 599 }
017e2add 600 $strings{$s} = $sym = $name;
b1e7e56f 601 $strings += length($s);
de745a2e 602 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
603 # Maybe we should assert that these are all <256.
604 $definition .= join(',',unpack "C*",$s);
605 # We have a single long line. Split it at convenient commas.
606 $definition =~ s/(.{74,77},)/$1\n/g;
607 print $fh "$definition };\n\n";
017e2add 608 }
609 return $sym;
610}
611
14a8264b 612sub process
017e2add 613{
eb560316 614 my ($name,$a) = @_;
615 $name =~ s/\W+/_/g;
616 $a->{Cname} = $name;
445e6c97 617 my $raw = $a->{Raw};
eb560316 618 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
619 my @ent;
445e6c97 620 $agg_max_in = 0;
621 foreach my $key (sort keys %$raw) {
eb560316 622 # RAW_NEXT => 0,
623 # RAW_IN_LEN => 1,
624 # RAW_OUT_BYTES => 2,
625 # RAW_FALLBACK => 3,
445e6c97 626 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
eb560316 627 # Now we are converting from raw to aggregate, switch from 1 byte strings
628 # to numbers
629 my $b = ord $key;
445e6c97 630 $fallback ||= 0;
eb560316 631 if ($l &&
632 # If this == fails, we're going to reset $agg_max_in below anyway.
633 $b == ++$agg_max_in &&
634 # References in numeric context give the pointer as an int.
635 $agg_next == $next &&
636 $agg_in_len == $in_len &&
637 $agg_out_len == length $out_bytes &&
638 $agg_fallback == $fallback
639 # && length($l->[AGG_OUT_BYTES]) < 16
640 ) {
641 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
642 # we can aggregate this byte onto the end.
643 $l->[AGG_MAX_IN] = $b;
644 $l->[AGG_OUT_BYTES] .= $out_bytes;
645 } else {
646 # AGG_MIN_IN => 0,
647 # AGG_MAX_IN => 1,
648 # AGG_OUT_BYTES => 2,
649 # AGG_NEXT => 3,
650 # AGG_IN_LEN => 4,
651 # AGG_OUT_LEN => 5,
652 # AGG_FALLBACK => 6,
653 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
654 # (only gains .6% on euc-jp -- is it worth it?)
655 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
656 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
657 $agg_fallback = $fallback];
017e2add 658 }
eb560316 659 if (exists $next->{Cname}) {
660 $next->{'Forward'} = 1 if $next != $a;
661 } else {
662 process(sprintf("%s_%02x",$name,$b),$next);
017e2add 663 }
664 }
eb560316 665 # encengine.c rules say that last entry must be for 255
445e6c97 666 if ($agg_max_in < 255) {
667 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
017e2add 668 }
eb560316 669 $a->{'Entries'} = \@ent;
14a8264b 670}
671
672sub outtable
673{
674 my ($fh,$a) = @_;
675 my $name = $a->{'Cname'};
017e2add 676 # String tables
14a8264b 677 foreach my $b (@{$a->{'Entries'}})
017e2add 678 {
eb560316 679 next unless $b->[AGG_OUT_LEN];
680 my $s = $b->[AGG_MIN_IN];
681 my $e = $b->[AGG_MAX_IN];
682 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
017e2add 683 }
14a8264b 684 if ($a->{'Forward'})
685 {
686 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
687 }
688 $a->{'Done'} = 1;
689 foreach my $b (@{$a->{'Entries'}})
690 {
eb560316 691 my ($s,$e,$out,$t,$end,$l) = @$b;
14a8264b 692 outtable($fh,$t) unless $t->{'Done'};
693 }
694 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
695 foreach my $b (@{$a->{'Entries'}})
017e2add 696 {
eb560316 697 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
e0c49a6b 698 $end |= 0x80 if $fb;
017e2add 699 print $fh "{";
700 if ($l)
701 {
702 printf $fh outstring($fh,'',$out);
703 }
704 else
705 {
706 print $fh "0";
707 }
708 print $fh ",",$t->{Cname};
2f2b4ff2 709 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
017e2add 710 }
14a8264b 711 print $fh "};\n";
712}
713
714sub output
715{
716 my ($fh,$name,$a) = @_;
717 process($name,$a);
718 # Sub-tables
719 outtable($fh,$a);
017e2add 720}
721
e0c49a6b 722sub output_enc
723{
724 my ($fh,$name,$a) = @_;
eb560316 725 die "Changed - fix me for new structure";
e0c49a6b 726 foreach my $b (sort keys %$a)
727 {
728 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
729 }
730}
731
732sub decode_U
733{
734 my $s = shift;
e0c49a6b 735}
736
f39fb8cc 737my @uname;
738sub char_names
739{
9fb51cbc 740 my $s = do "unicore/Name.pl";
741 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
f39fb8cc 742 pos($s) = 0;
743 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
744 {
745 my $name = $3;
746 my $s = hex($1);
747 last if $s >= 0x10000;
748 my $e = length($2) ? hex($2) : $s;
749 for (my $i = $s; $i <= $e; $i++)
750 {
751 $uname[$i] = $name;
752# print sprintf("U%04X $name\n",$i);
753 }
754 }
755}
756
e0c49a6b 757sub output_ucm_page
758{
a8373f85 759 my ($cmap,$a,$t,$pre) = @_;
760 # warn sprintf("Page %x\n",$pre);
761 my $raw = $t->{Raw};
762 foreach my $key (sort keys %$raw) {
763 # RAW_NEXT => 0,
764 # RAW_IN_LEN => 1,
765 # RAW_OUT_BYTES => 2,
766 # RAW_FALLBACK => 3,
767 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
768 my $u = ord $key;
769 $fallback ||= 0;
770
771 if ($next != $a && $next != $t) {
772 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
773 } elsif (length $out_bytes) {
774 if ($pre) {
775 $u = $pre|($u &0x3f);
e0c49a6b 776 }
a8373f85 777 my $s = sprintf "<U%04X> ",$u;
778 #foreach my $c (split(//,$out_bytes)) {
779 # $s .= sprintf "\\x%02X",ord($c);
780 #}
3964a085 781 # 9.5% faster changing that loop to this:
a8373f85 782 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
783 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
784 push(@$cmap,$s);
785 } else {
786 warn join(',',$u, @{$raw->{$key}},$a,$t);
e0c49a6b 787 }
788 }
789}
790
791sub output_ucm
792{
f39fb8cc 793 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
18586f54 794 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
afdae191 795 print $fh "<code_set_name> \"$name\"\n";
f39fb8cc 796 char_names();
afdae191 797 if (defined $min_el)
798 {
799 print $fh "<mb_cur_min> $min_el\n";
800 }
801 if (defined $max_el)
802 {
803 print $fh "<mb_cur_max> $max_el\n";
804 }
805 if (defined $rep)
806 {
807 print $fh "<subchar> ";
808 foreach my $c (split(//,$rep))
809 {
810 printf $fh "\\x%02X",ord($c);
811 }
812 print $fh "\n";
813 }
f39fb8cc 814 my @cmap;
815 output_ucm_page(\@cmap,$h,$h,0);
afdae191 816 print $fh "#\nCHARMAP\n";
f39fb8cc 817 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
818 {
819 print $fh $line;
820 }
e0c49a6b 821 print $fh "END CHARMAP\n";
822}
017e2add 823
445e6c97 824
825__END__
826With %seen holding array refs:
827
828 865.66 real 28.80 user 8.79 sys
829 7904 maximum resident set size
830 1356 average shared memory size
831 18566 average unshared data size
832 229 average unshared stack size
833 46080 page reclaims
834 33373 page faults
835
836With %seen holding simple scalars:
837
838 342.16 real 27.11 user 3.54 sys
839 8388 maximum resident set size
840 1394 average shared memory size
841 14969 average unshared data size
842 236 average unshared stack size
843 28159 page reclaims
844 9839 page faults
845
846Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
847how %seen is storing things its seen. So it is pathalogically bad on a 16M
848RAM machine, but it's going to help even on modern machines.
849Swapping is bad, m'kay :-)