Unicode::Normalize 0.14 synch, from SADAHIRO Tomoyuki.
[p5sagit/p5-mst-13.2.git] / ext / Encode / compile
CommitLineData
017e2add 1#!../../perl -w
508a30f1 2BEGIN {
b0bf7e4c 3 unshift @INC, qw(../../lib ../../../lib);
4 $ENV{PATH} .= ';../..;../../..' if $^O eq 'MSWin32';
508a30f1 5}
017e2add 6use strict;
afdae191 7use Getopt::Std;
8my @orig_ARGV = @ARGV;
dcda1f94 9
eb560316 10# These may get re-ordered.
11# RAW is a do_now as inserted by &enter
12# AGG is an aggreagated do_now, as built up by &process
13use constant {
14 RAW_NEXT => 0,
15 RAW_IN_LEN => 1,
16 RAW_OUT_BYTES => 2,
17 RAW_FALLBACK => 3,
18
19 AGG_MIN_IN => 0,
20 AGG_MAX_IN => 1,
21 AGG_OUT_BYTES => 2,
22 AGG_NEXT => 3,
23 AGG_IN_LEN => 4,
24 AGG_OUT_LEN => 5,
25 AGG_FALLBACK => 6,
26};
27# (See the algorithm in encengine.c - we're building structures for it)
28
29# There are two sorts of structures.
30# "do_now" (an array, two variants of what needs storing) is whatever we need
31# to do now we've read an input byte.
32# It's housed in a "do_next" (which is how we got to it), and in turn points
33# to a "do_next" which contains all the "do_now"s for the next input byte.
34
35# There will be a "do_next" which is the start state.
36# For a single byte encoding it's the only "do_next" - each "do_now" points
37# back to it, and each "do_now" will cause bytes. There is no state.
38
39# For a multi-byte encoding where all characters in the input are the same
40# length, then there will be a tree of "do_now"->"do_next"->"do_now"
41# branching out from the start state, one step for each input byte.
42# The leaf "do_now"s will all be at the same distance from the start state,
43# only the leaf "do_now"s cause output bytes, and they in turn point back to
44# the start state.
45
46# For an encoding where there are varaible length input byte sequences, you
47# will encounter a leaf "do_now" sooner for the shorter input sequences, but
48# as before the leaves will point back to the start state.
49
50# The system will cope with escape encodings (imagine them as a mostly
51# self-contained tree for each escape state, and cross links between trees
52# at the state-switching characters) but so far no input format defines these.
53
54# The system will also cope with having output "leaves" in the middle of
55# the bifurcating branches, not just at the extremities, but again no
56# input format does this yet.
57
58# There are two variants of the "do_now" structure. The first, smaller variant
59# is generated by &enter as the input file is read. There is one structure
60# for each input byte. Say we are mapping a single byte encoding to a
61# single byte encoding, with "ABCD" going "abcd". There will be
62# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
63
64# &process then walks the tree, building aggregate "do_now" structres for
65# adjacent bytes where possible. The aggregate is for a contiguous range of
66# bytes which each produce the same length of output, each move to the
67# same next state, and each have the same fallback flag.
68# So our 4 RAW "do_now"s above become replaced by a single structure
69# containing:
70# ["A", "D", "abcd", 1, ...]
71# ie, for an input byte $_ in "A".."D", output 1 byte, found as
72# substr ("abcd", (ord $_ - ord "A") * 1, 1)
73# which maps very nicely into pointer arithmetic in C for encengine.c
74
017e2add 75sub encode_U
76{
9b37254d 77 # UTF-8 encode long hand - only covers part of perl's range
7ba7f87b 78 ## my $uv = shift;
1b026014 79 # chr() works in native space so convert value from table
80 # into that space before using chr().
7ba7f87b 81 my $ch = chr(utf8::unicode_to_native($_[0]));
1b026014 82 # Now get core perl to encode that the way it likes.
83 utf8::encode($ch);
84 return $ch;
017e2add 85}
86
87sub encode_S
88{
14a8264b 89 # encode single byte
7ba7f87b 90 ## my ($ch,$page) = @_; return chr($ch);
91 return chr $_[0];
017e2add 92}
93
94sub encode_D
95{
14a8264b 96 # encode double byte MS byte first
7ba7f87b 97 ## my ($ch,$page) = @_; return chr($page).chr($ch);
98 return chr ($_[1]) . chr $_[0];
017e2add 99}
100
101sub encode_M
102{
14a8264b 103 # encode Multi-byte - single for 0..255 otherwise double
7ba7f87b 104 ## my ($ch,$page) = @_;
105 ## return &encode_D if $page;
106 ## return &encode_S;
107 return chr ($_[1]) . chr $_[0] if $_[1];
108 return chr $_[0];
017e2add 109}
110
7ba7f87b 111my %encode_types = (U => \&encode_U,
112 S => \&encode_S,
113 D => \&encode_D,
114 M => \&encode_M,
115 );
116
14a8264b 117# Win32 does not expand globs on command line
252a8565 118eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
18b7339f 119
afdae191 120my %opt;
3964a085 121# I think these are:
122# -Q to disable the duplicate codepoint test
123# -q to remove comments written to output files
124# -O to enable the (brute force) substring optimiser
125# -o <output> to specify the output file name (else it's the first arg)
126# -f <inlist> to give a file with a list of input files (else use the args)
127# -n <name> to name the encoding (else use the basename of the input file.
128getopts('QqOo:f:n:',\%opt);
129
130# This really should go first, else the die here causes empty (non-erroneous)
131# output files to be written.
132my @encfiles;
133if (exists $opt{'f'}) {
134 # -F is followed by name of file containing list of filenames
135 my $flist = $opt{'f'};
136 open(FLIST,$flist) || die "Cannot open $flist:$!";
137 chomp(@encfiles = <FLIST>);
138 close(FLIST);
139} else {
140 @encfiles = @ARGV;
141}
142
afdae191 143my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
2f2b4ff2 144chmod(0666,$cname) if -f $cname && !-w $cname;
017e2add 145open(C,">$cname") || die "Cannot open $cname:$!";
afdae191 146
2f2b4ff2 147my $dname = $cname;
3964a085 148my $hname = $cname;
e0c49a6b 149
dcda1f94 150my ($doC,$doEnc,$doUcm,$doPet);
e0c49a6b 151
152if ($cname =~ /\.(c|xs)$/)
153 {
154 $doC = 1;
3964a085 155 $dname =~ s/(\.[^\.]*)?$/_def.h/;
e0c49a6b 156 chmod(0666,$dname) if -f $cname && !-w $dname;
157 open(D,">$dname") || die "Cannot open $dname:$!";
e0c49a6b 158 $hname =~ s/(\.[^\.]*)?$/.h/;
159 chmod(0666,$hname) if -f $cname && !-w $hname;
160 open(H,">$hname") || die "Cannot open $hname:$!";
161
162 foreach my $fh (\*C,\*D,\*H)
163 {
afdae191 164 print $fh <<"END" unless $opt{'q'};
14a8264b 165/*
166 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
167 This file was autogenerated by:
023d8852 168 $^X $0 @orig_ARGV
14a8264b 169*/
170END
e0c49a6b 171 }
14a8264b 172
e0c49a6b 173 if ($cname =~ /(\w+)\.xs$/)
174 {
175 print C "#include <EXTERN.h>\n";
176 print C "#include <perl.h>\n";
177 print C "#include <XSUB.h>\n";
178 print C "#define U8 U8\n";
179 }
180 print C "#include \"encode.h\"\n";
d811239c 181
e0c49a6b 182 }
183elsif ($cname =~ /\.enc$/)
184 {
185 $doEnc = 1;
186 }
187elsif ($cname =~ /\.ucm$/)
2f2b4ff2 188 {
e0c49a6b 189 $doUcm = 1;
2f2b4ff2 190 }
dcda1f94 191elsif ($cname =~ /\.pet$/)
192 {
193 $doPet = 1;
194 }
017e2add 195
196my %encoding;
197my %strings;
b1e7e56f 198my $saved = 0;
199my $subsave = 0;
200my $strings = 0;
017e2add 201
2f2b4ff2 202sub cmp_name
203{
204 if ($a =~ /^.*-(\d+)/)
205 {
206 my $an = $1;
207 if ($b =~ /^.*-(\d+)/)
208 {
209 my $r = $an <=> $1;
210 return $r if $r;
211 }
212 }
213 return $a cmp $b;
214}
215
dcda1f94 216
c6fdb90a 217foreach my $enc (sort cmp_name @encfiles)
017e2add 218 {
9b37254d 219 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
51ef4e11 220 $name = $opt{'n'} if exists $opt{'n'};
017e2add 221 if (open(E,$enc))
222 {
9b37254d 223 if ($sfx eq 'enc')
224 {
dcda1f94 225 compile_enc(\*E,lc($name));
9b37254d 226 }
227 else
228 {
dcda1f94 229 compile_ucm(\*E,lc($name));
9b37254d 230 }
017e2add 231 }
232 else
233 {
234 warn "Cannot open $enc for $name:$!";
235 }
236 }
237
e0c49a6b 238if ($doC)
2f2b4ff2 239 {
b1e7e56f 240 print STDERR "Writing compiled form\n";
dcda1f94 241 foreach my $name (sort cmp_name keys %encoding)
242 {
243 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
244 output(\*C,$name.'_utf8',$e2u);
245 output(\*C,'utf8_'.$name,$u2e);
246 push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
247 }
e0c49a6b 248 foreach my $enc (sort cmp_name keys %encoding)
249 {
dcda1f94 250 my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
251 my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
e0c49a6b 252 my $sym = "${enc}_encoding";
253 $sym =~ s/\W+/_/g;
254 print C "encode_t $sym = \n";
dcda1f94 255 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
e0c49a6b 256 }
2f2b4ff2 257
e0c49a6b 258 foreach my $enc (sort cmp_name keys %encoding)
259 {
260 my $sym = "${enc}_encoding";
261 $sym =~ s/\W+/_/g;
262 print H "extern encode_t $sym;\n";
d811239c 263 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
e0c49a6b 264 }
017e2add 265
e0c49a6b 266 if ($cname =~ /(\w+)\.xs$/)
267 {
268 my $mod = $1;
d811239c 269 print C <<'END';
270
023d8852 271static void
d811239c 272Encode_XSEncoding(pTHX_ encode_t *enc)
273{
274 dSP;
275 HV *stash = gv_stashpv("Encode::XS", TRUE);
276 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
277 int i = 0;
278 PUSHMARK(sp);
279 XPUSHs(sv);
280 while (enc->name[i])
281 {
282 const char *name = enc->name[i++];
283 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
284 }
285 PUTBACK;
286 call_pv("Encode::define_encoding",G_DISCARD);
287 SvREFCNT_dec(sv);
288}
289
290END
291
e0c49a6b 292 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
293 print C "BOOT:\n{\n";
294 print C "#include \"$dname\"\n";
295 print C "}\n";
296 }
3964a085 297 # Close in void context is bad, m'kay
298 close(D) or warn "Error closing '$dname': $!";
299 close(H) or warn "Error closing '$hname': $!";
f20503bb 300
301 my $perc_saved = $strings/($strings + $saved) * 100;
302 my $perc_subsaved = $strings/($strings + $subsave) * 100;
b1e7e56f 303 printf STDERR "%d bytes in string tables\n",$strings;
f20503bb 304 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
305 $saved, $perc_saved if $saved;
306 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
307 $subsave, $perc_subsaved if $subsave;
2f2b4ff2 308 }
dcda1f94 309elsif ($doEnc)
310 {
311 foreach my $name (sort cmp_name keys %encoding)
312 {
313 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
314 output_enc(\*C,$name,$e2u);
315 }
316 }
317elsif ($doUcm)
318 {
319 foreach my $name (sort cmp_name keys %encoding)
320 {
321 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
322 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
323 }
324 }
325
3964a085 326# writing half meg files and then not checking to see if you just filled the
327# disk is bad, m'kay
328close(C) or die "Error closing '$cname': $!";
9b37254d 329
3964a085 330# End of the main program.
b1e7e56f 331
9b37254d 332sub compile_ucm
333{
dcda1f94 334 my ($fh,$name) = @_;
9b37254d 335 my $e2u = {};
336 my $u2e = {};
337 my $cs;
338 my %attr;
339 while (<$fh>)
340 {
341 s/#.*$//;
342 last if /^\s*CHARMAP\s*$/i;
7ba7f87b 343 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
9b37254d 344 {
345 $attr{$1} = $2;
346 }
347 }
348 if (!defined($cs = $attr{'code_set_name'}))
349 {
350 warn "No <code_set_name> in $name\n";
351 }
352 else
353 {
51ef4e11 354 $name = $cs unless exists $opt{'n'};
9b37254d 355 }
356 my $erep;
357 my $urep;
afdae191 358 my $max_el;
359 my $min_el;
9b37254d 360 if (exists $attr{'subchar'})
361 {
afdae191 362 my @byte;
363 $attr{'subchar'} =~ /^\s*/cg;
364 push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
365 $erep = join('',map(chr(hex($_)),@byte));
9b37254d 366 }
b1e7e56f 367 print "Reading $name ($cs)\n";
9b37254d 368 my $nfb = 0;
369 my $hfb = 0;
370 while (<$fh>)
371 {
372 s/#.*$//;
373 last if /^\s*END\s+CHARMAP\s*$/i;
374 next if /^\s*$/;
afdae191 375 my ($u,@byte);
376 my $fb = '';
377 $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
378 push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
379 $fb = $1 if /\G\s*(\|[0-3])/gc;
380 # warn "$_: $u @byte | $fb\n";
381 die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
9b37254d 382 if (defined($u))
383 {
384 my $uch = encode_U(hex($u));
e0c49a6b 385 my $ech = join('',map(chr(hex($_)),@byte));
afdae191 386 my $el = length($ech);
387 $max_el = $el if (!defined($max_el) || $el > $max_el);
388 $min_el = $el if (!defined($min_el) || $el < $min_el);
9b37254d 389 if (length($fb))
390 {
391 $fb = substr($fb,1);
392 $hfb++;
393 }
394 else
395 {
396 $nfb++;
397 $fb = '0';
398 }
399 # $fb is fallback flag
400 # 0 - round trip safe
401 # 1 - fallback for unicode -> enc
402 # 2 - skip sub-char mapping
403 # 3 - fallback enc -> unicode
404 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
405 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
406 }
407 else
408 {
409 warn $_;
410 }
9b37254d 411 }
412 if ($nfb && $hfb)
413 {
414 die "$nfb entries without fallback, $hfb entries with\n";
415 }
dcda1f94 416 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
9b37254d 417}
418
14a8264b 419sub compile_enc
017e2add 420{
dcda1f94 421 my ($fh,$name) = @_;
017e2add 422 my $e2u = {};
423 my $u2e = {};
424
425 my $type;
426 while ($type = <$fh>)
427 {
428 last if $type !~ /^\s*#/;
429 }
430 chomp($type);
431 return if $type eq 'E';
445e6c97 432 # Do the hash lookup once, rather than once per function call. 4% speedup.
433 my $type_func = $encode_types{$type};
017e2add 434 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
14a8264b 435 warn "$type encoded $name\n";
017e2add 436 my $rep = '';
7ba7f87b 437 # Save a defined test by setting these to defined values.
438 my $min_el = ~0; # A very big integer
439 my $max_el = 0; # Anything must be longer than 0
017e2add 440 {
441 my $v = hex($def);
445e6c97 442 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
017e2add 443 }
3964a085 444 my $seen;
445 # use -Q to silence the seen test. Makefile.PL uses this by default.
446 $seen = {} unless $opt{Q};
445e6c97 447 do
017e2add 448 {
449 my $line = <$fh>;
450 chomp($line);
451 my $page = hex($line);
452 my $ch = 0;
445e6c97 453 my $i = 16;
454 do
017e2add 455 {
445e6c97 456 # So why is it 1% faster to leave the my here?
017e2add 457 my $line = <$fh>;
7ba7f87b 458 die "Line should be exactly 65 characters long including newline"
459 unless length ($line) == 65;
460 # Split line into groups of 4 hex digits, convert groups to ints
445e6c97 461 # This takes 65.35
462 # map {hex $_} $line =~ /(....)/g
463 # This takes 63.75 (2.5% less time)
464 # unpack "n*", pack "H*", $line
465 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
466 # Doing it as while ($line =~ /(....)/g) took 74.63
467 foreach my $val (unpack "n*", pack "H*", $line)
017e2add 468 {
e03ac092 469 next if $val == 0xFFFD;
445e6c97 470 my $ech = &$type_func($ch,$page);
017e2add 471 if ($val || (!$ch && !$page))
472 {
afdae191 473 my $el = length($ech);
7ba7f87b 474 $max_el = $el if $el > $max_el;
475 $min_el = $el if $el < $min_el;
017e2add 476 my $uch = encode_U($val);
3964a085 477 if ($seen) {
478 # We're doing the test.
479 # We don't need to read this quickly, so storing it as a scalar,
480 # rather than 3 (anon array, plus the 2 scalars it holds) saves
481 # RAM and may make us faster on low RAM systems. [see __END__]
482 if (exists $seen->{$uch})
483 {
484 warn sprintf("U%04X is %02X%02X and %04X\n",
485 $val,$page,$ch,$seen->{$uch});
486 }
487 else
488 {
489 $seen->{$uch} = $page << 8 | $ch;
490 }
491 }
eb560316 492 # Passing 2 extra args each time is 3.6% slower!
445e6c97 493 # Even with having to add $fallback ||= 0 later
494 enter_fb0($e2u,$ech,$uch);
495 enter_fb0($u2e,$uch,$ech);
017e2add 496 }
497 else
498 {
499 # No character at this position
500 # enter($e2u,$ech,undef,$e2u);
501 }
502 $ch++;
503 }
445e6c97 504 } while --$i;
505 } while --$pages;
7ba7f87b 506 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
507 if $min_el > $max_el;
dcda1f94 508 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
017e2add 509}
510
eb560316 511# my ($a,$s,$d,$t,$fb) = @_;
512sub enter {
513 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
514 # state we shift to after this (multibyte) input character defaults to same
515 # as current state.
516 $next ||= $current;
517 # Making sure it is defined seems to be faster than {no warnings;} in
518 # &process, or passing it in as 0 explicity.
445e6c97 519 # XXX $fallback ||= 0;
eb560316 520
521 # Start at the beginning and work forwards through the string to zero.
522 # effectively we are removing 1 character from the front each time
523 # but we don't actually edit the string. [this alone seems to be 14% speedup]
524 # Hence -$pos is the length of the remaining string.
525 my $pos = -length $inbytes;
526 while (1) {
527 my $byte = substr $inbytes, $pos, 1;
528 # RAW_NEXT => 0,
529 # RAW_IN_LEN => 1,
530 # RAW_OUT_BYTES => 2,
531 # RAW_FALLBACK => 3,
532 # to unicode an array would seem to be better, because the pages are dense.
533 # from unicode can be very sparse, favouring a hash.
534 # hash using the bytes (all length 1) as keys rather than ord value,
535 # as it's easier to sort these in &process.
536
537 # It's faster to always add $fallback even if it's undef, rather than
538 # choosing between 3 and 4 element array. (hence why we set it defined
539 # above)
540 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
541 # When $pos was -1 we were at the last input character.
542 unless (++$pos) {
543 $do_now->[RAW_OUT_BYTES] = $outbytes;
544 $do_now->[RAW_NEXT] = $next;
545 return;
546 }
547 # Tail recursion. The intermdiate state may not have a name yet.
548 $current = $do_now->[RAW_NEXT];
017e2add 549 }
550}
551
445e6c97 552# This is purely for optimistation. It's just &enter hard coded for $fallback
553# of 0, using only a 3 entry array ref to save memory for every entry.
554sub enter_fb0 {
555 my ($current,$inbytes,$outbytes,$next) = @_;
556 $next ||= $current;
557
558 my $pos = -length $inbytes;
559 while (1) {
560 my $byte = substr $inbytes, $pos, 1;
561 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
562 unless (++$pos) {
563 $do_now->[RAW_OUT_BYTES] = $outbytes;
564 $do_now->[RAW_NEXT] = $next;
565 return;
566 }
567 $current = $do_now->[RAW_NEXT];
568 }
569}
b1e7e56f 570
571
017e2add 572sub outstring
573{
574 my ($fh,$name,$s) = @_;
575 my $sym = $strings{$s};
b1e7e56f 576 if ($sym)
577 {
578 $saved += length($s);
579 }
580 else
017e2add 581 {
4cfc977c 582 if ($opt{'O'}) {
583 foreach my $o (keys %strings)
584 {
445e6c97 585 next unless (my $i = index($o,$s)) >= 0;
586 $sym = $strings{$o};
587 $sym .= sprintf("+0x%02x",$i) if ($i);
588 $subsave += length($s);
589 return $strings{$s} = $sym;
590 }
4cfc977c 591 }
017e2add 592 $strings{$s} = $sym = $name;
b1e7e56f 593 $strings += length($s);
de745a2e 594 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
595 # Maybe we should assert that these are all <256.
596 $definition .= join(',',unpack "C*",$s);
597 # We have a single long line. Split it at convenient commas.
598 $definition =~ s/(.{74,77},)/$1\n/g;
599 print $fh "$definition };\n\n";
017e2add 600 }
601 return $sym;
602}
603
14a8264b 604sub process
017e2add 605{
eb560316 606 my ($name,$a) = @_;
607 $name =~ s/\W+/_/g;
608 $a->{Cname} = $name;
445e6c97 609 my $raw = $a->{Raw};
eb560316 610 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
611 my @ent;
445e6c97 612 $agg_max_in = 0;
613 foreach my $key (sort keys %$raw) {
eb560316 614 # RAW_NEXT => 0,
615 # RAW_IN_LEN => 1,
616 # RAW_OUT_BYTES => 2,
617 # RAW_FALLBACK => 3,
445e6c97 618 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
eb560316 619 # Now we are converting from raw to aggregate, switch from 1 byte strings
620 # to numbers
621 my $b = ord $key;
445e6c97 622 $fallback ||= 0;
eb560316 623 if ($l &&
624 # If this == fails, we're going to reset $agg_max_in below anyway.
625 $b == ++$agg_max_in &&
626 # References in numeric context give the pointer as an int.
627 $agg_next == $next &&
628 $agg_in_len == $in_len &&
629 $agg_out_len == length $out_bytes &&
630 $agg_fallback == $fallback
631 # && length($l->[AGG_OUT_BYTES]) < 16
632 ) {
633 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
634 # we can aggregate this byte onto the end.
635 $l->[AGG_MAX_IN] = $b;
636 $l->[AGG_OUT_BYTES] .= $out_bytes;
637 } else {
638 # AGG_MIN_IN => 0,
639 # AGG_MAX_IN => 1,
640 # AGG_OUT_BYTES => 2,
641 # AGG_NEXT => 3,
642 # AGG_IN_LEN => 4,
643 # AGG_OUT_LEN => 5,
644 # AGG_FALLBACK => 6,
645 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
646 # (only gains .6% on euc-jp -- is it worth it?)
647 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
648 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
649 $agg_fallback = $fallback];
017e2add 650 }
eb560316 651 if (exists $next->{Cname}) {
652 $next->{'Forward'} = 1 if $next != $a;
653 } else {
654 process(sprintf("%s_%02x",$name,$b),$next);
017e2add 655 }
656 }
eb560316 657 # encengine.c rules say that last entry must be for 255
445e6c97 658 if ($agg_max_in < 255) {
659 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
017e2add 660 }
eb560316 661 $a->{'Entries'} = \@ent;
14a8264b 662}
663
664sub outtable
665{
666 my ($fh,$a) = @_;
667 my $name = $a->{'Cname'};
017e2add 668 # String tables
14a8264b 669 foreach my $b (@{$a->{'Entries'}})
017e2add 670 {
eb560316 671 next unless $b->[AGG_OUT_LEN];
672 my $s = $b->[AGG_MIN_IN];
673 my $e = $b->[AGG_MAX_IN];
674 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
017e2add 675 }
14a8264b 676 if ($a->{'Forward'})
677 {
678 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
679 }
680 $a->{'Done'} = 1;
681 foreach my $b (@{$a->{'Entries'}})
682 {
eb560316 683 my ($s,$e,$out,$t,$end,$l) = @$b;
14a8264b 684 outtable($fh,$t) unless $t->{'Done'};
685 }
686 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
687 foreach my $b (@{$a->{'Entries'}})
017e2add 688 {
eb560316 689 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
e0c49a6b 690 $end |= 0x80 if $fb;
017e2add 691 print $fh "{";
692 if ($l)
693 {
694 printf $fh outstring($fh,'',$out);
695 }
696 else
697 {
698 print $fh "0";
699 }
700 print $fh ",",$t->{Cname};
2f2b4ff2 701 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
017e2add 702 }
14a8264b 703 print $fh "};\n";
704}
705
706sub output
707{
708 my ($fh,$name,$a) = @_;
709 process($name,$a);
710 # Sub-tables
711 outtable($fh,$a);
017e2add 712}
713
e0c49a6b 714sub output_enc
715{
716 my ($fh,$name,$a) = @_;
eb560316 717 die "Changed - fix me for new structure";
e0c49a6b 718 foreach my $b (sort keys %$a)
719 {
720 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
721 }
722}
723
724sub decode_U
725{
726 my $s = shift;
e0c49a6b 727}
728
f39fb8cc 729my @uname;
730sub char_names
731{
9fb51cbc 732 my $s = do "unicore/Name.pl";
733 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
f39fb8cc 734 pos($s) = 0;
735 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
736 {
737 my $name = $3;
738 my $s = hex($1);
739 last if $s >= 0x10000;
740 my $e = length($2) ? hex($2) : $s;
741 for (my $i = $s; $i <= $e; $i++)
742 {
743 $uname[$i] = $name;
744# print sprintf("U%04X $name\n",$i);
745 }
746 }
747}
748
e0c49a6b 749sub output_ucm_page
750{
a8373f85 751 my ($cmap,$a,$t,$pre) = @_;
752 # warn sprintf("Page %x\n",$pre);
753 my $raw = $t->{Raw};
754 foreach my $key (sort keys %$raw) {
755 # RAW_NEXT => 0,
756 # RAW_IN_LEN => 1,
757 # RAW_OUT_BYTES => 2,
758 # RAW_FALLBACK => 3,
759 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
760 my $u = ord $key;
761 $fallback ||= 0;
762
763 if ($next != $a && $next != $t) {
764 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
765 } elsif (length $out_bytes) {
766 if ($pre) {
767 $u = $pre|($u &0x3f);
e0c49a6b 768 }
a8373f85 769 my $s = sprintf "<U%04X> ",$u;
770 #foreach my $c (split(//,$out_bytes)) {
771 # $s .= sprintf "\\x%02X",ord($c);
772 #}
3964a085 773 # 9.5% faster changing that loop to this:
a8373f85 774 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
775 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
776 push(@$cmap,$s);
777 } else {
778 warn join(',',$u, @{$raw->{$key}},$a,$t);
e0c49a6b 779 }
780 }
781}
782
783sub output_ucm
784{
f39fb8cc 785 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
df1df145 786 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
afdae191 787 print $fh "<code_set_name> \"$name\"\n";
f39fb8cc 788 char_names();
afdae191 789 if (defined $min_el)
790 {
791 print $fh "<mb_cur_min> $min_el\n";
792 }
793 if (defined $max_el)
794 {
795 print $fh "<mb_cur_max> $max_el\n";
796 }
797 if (defined $rep)
798 {
799 print $fh "<subchar> ";
800 foreach my $c (split(//,$rep))
801 {
802 printf $fh "\\x%02X",ord($c);
803 }
804 print $fh "\n";
805 }
f39fb8cc 806 my @cmap;
807 output_ucm_page(\@cmap,$h,$h,0);
afdae191 808 print $fh "#\nCHARMAP\n";
f39fb8cc 809 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
810 {
811 print $fh $line;
812 }
e0c49a6b 813 print $fh "END CHARMAP\n";
814}
017e2add 815
445e6c97 816
817__END__
818With %seen holding array refs:
819
820 865.66 real 28.80 user 8.79 sys
821 7904 maximum resident set size
822 1356 average shared memory size
823 18566 average unshared data size
824 229 average unshared stack size
825 46080 page reclaims
826 33373 page faults
827
828With %seen holding simple scalars:
829
830 342.16 real 27.11 user 3.54 sys
831 8388 maximum resident set size
832 1394 average shared memory size
833 14969 average unshared data size
834 236 average unshared stack size
835 28159 page reclaims
836 9839 page faults
837
838Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
839how %seen is storing things its seen. So it is pathalogically bad on a 16M
840RAM machine, but it's going to help even on modern machines.
841Swapping is bad, m'kay :-)