3 unshift @INC, qw(../../lib ../../../lib);
4 $ENV{PATH} .= ';../..;../../..' if $^O eq 'MSWin32';
11 # These may get re-ordered.
12 # RAW is a do_now as inserted by &enter
13 # AGG is an aggreagated do_now, as built up by &process
28 # (See the algorithm in encengine.c - we're building structures for it)
30 # There are two sorts of structures.
31 # "do_now" (an array, two variants of what needs storing) is whatever we need
32 # to do now we've read an input byte.
33 # It's housed in a "do_next" (which is how we got to it), and in turn points
34 # to a "do_next" which contains all the "do_now"s for the next input byte.
36 # There will be a "do_next" which is the start state.
37 # For a single byte encoding it's the only "do_next" - each "do_now" points
38 # back to it, and each "do_now" will cause bytes. There is no state.
40 # For a multi-byte encoding where all characters in the input are the same
41 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
42 # branching out from the start state, one step for each input byte.
43 # The leaf "do_now"s will all be at the same distance from the start state,
44 # only the leaf "do_now"s cause output bytes, and they in turn point back to
47 # For an encoding where there are varaible length input byte sequences, you
48 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
49 # as before the leaves will point back to the start state.
51 # The system will cope with escape encodings (imagine them as a mostly
52 # self-contained tree for each escape state, and cross links between trees
53 # at the state-switching characters) but so far no input format defines these.
55 # The system will also cope with having output "leaves" in the middle of
56 # the bifurcating branches, not just at the extremities, but again no
57 # input format does this yet.
59 # There are two variants of the "do_now" structure. The first, smaller variant
60 # is generated by &enter as the input file is read. There is one structure
61 # for each input byte. Say we are mapping a single byte encoding to a
62 # single byte encoding, with "ABCD" going "abcd". There will be
63 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
65 # &process then walks the tree, building aggregate "do_now" structres for
66 # adjacent bytes where possible. The aggregate is for a contiguous range of
67 # bytes which each produce the same length of output, each move to the
68 # same next state, and each have the same fallback flag.
69 # So our 4 RAW "do_now"s above become replaced by a single structure
71 # ["A", "D", "abcd", 1, ...]
72 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
73 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
74 # which maps very nicely into pointer arithmetic in C for encengine.c
78 # UTF-8 encode long hand - only covers part of perl's range
80 # chr() works in native space so convert value from table
81 # into that space before using chr().
82 my $ch = chr(utf8::unicode_to_native($_[0]));
83 # Now get core perl to encode that the way it likes.
91 ## my ($ch,$page) = @_; return chr($ch);
97 # encode double byte MS byte first
98 ## my ($ch,$page) = @_; return chr($page).chr($ch);
99 return chr ($_[1]) . chr $_[0];
104 # encode Multi-byte - single for 0..255 otherwise double
105 ## my ($ch,$page) = @_;
106 ## return &encode_D if $page;
108 return chr ($_[1]) . chr $_[0] if $_[1];
112 my %encode_types = (U => \&encode_U,
118 # Win32 does not expand globs on command line
119 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
122 getopts('qOo:f:n:',\%opt);
123 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
124 chmod(0666,$cname) if -f $cname && !-w $cname;
125 open(C,">$cname") || die "Cannot open $cname:$!";
129 $dname =~ s/(\.[^\.]*)?$/_def.h/;
131 my ($doC,$doEnc,$doUcm,$doPet);
133 if ($cname =~ /\.(c|xs)$/)
136 chmod(0666,$dname) if -f $cname && !-w $dname;
137 open(D,">$dname") || die "Cannot open $dname:$!";
139 $hname =~ s/(\.[^\.]*)?$/.h/;
140 chmod(0666,$hname) if -f $cname && !-w $hname;
141 open(H,">$hname") || die "Cannot open $hname:$!";
143 foreach my $fh (\*C,\*D,\*H)
145 print $fh <<"END" unless $opt{'q'};
147 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
148 This file was autogenerated by:
154 if ($cname =~ /(\w+)\.xs$/)
156 print C "#include <EXTERN.h>\n";
157 print C "#include <perl.h>\n";
158 print C "#include <XSUB.h>\n";
159 print C "#define U8 U8\n";
161 print C "#include \"encode.h\"\n";
164 elsif ($cname =~ /\.enc$/)
168 elsif ($cname =~ /\.ucm$/)
172 elsif ($cname =~ /\.pet$/)
178 if (exists $opt{'f'})
180 # -F is followed by name of file containing list of filenames
181 my $flist = $opt{'f'};
182 open(FLIST,$flist) || die "Cannot open $flist:$!";
183 chomp(@encfiles = <FLIST>);
199 if ($a =~ /^.*-(\d+)/)
202 if ($b =~ /^.*-(\d+)/)
212 foreach my $enc (sort cmp_name @encfiles)
214 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
215 $name = $opt{'n'} if exists $opt{'n'};
220 compile_enc(\*E,lc($name));
224 compile_ucm(\*E,lc($name));
229 warn "Cannot open $enc for $name:$!";
235 print STDERR "Writing compiled form\n";
236 foreach my $name (sort cmp_name keys %encoding)
238 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
239 output(\*C,$name.'_utf8',$e2u);
240 output(\*C,'utf8_'.$name,$u2e);
241 push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
243 foreach my $enc (sort cmp_name keys %encoding)
245 my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
246 my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
247 my $sym = "${enc}_encoding";
249 print C "encode_t $sym = \n";
250 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
253 foreach my $enc (sort cmp_name keys %encoding)
255 my $sym = "${enc}_encoding";
257 print H "extern encode_t $sym;\n";
258 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
261 if ($cname =~ /(\w+)\.xs$/)
267 Encode_XSEncoding(pTHX_ encode_t *enc)
270 HV *stash = gv_stashpv("Encode::XS", TRUE);
271 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
277 const char *name = enc->name[i++];
278 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
281 call_pv("Encode::define_encoding",G_DISCARD);
287 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
288 print C "BOOT:\n{\n";
289 print C "#include \"$dname\"\n";
294 printf STDERR "%d bytes in string tables\n",$strings;
295 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",$saved,100*$saved/$strings if $saved;
296 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",$subsave,100*$subsave/$strings if $subsave;
300 foreach my $name (sort cmp_name keys %encoding)
302 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
303 output_enc(\*C,$name,$e2u);
308 foreach my $name (sort cmp_name keys %encoding)
310 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
311 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
329 last if /^\s*CHARMAP\s*$/i;
330 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
335 if (!defined($cs = $attr{'code_set_name'}))
337 warn "No <code_set_name> in $name\n";
341 $name = $cs unless exists $opt{'n'};
347 if (exists $attr{'subchar'})
350 $attr{'subchar'} =~ /^\s*/cg;
351 push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
352 $erep = join('',map(chr(hex($_)),@byte));
354 print "Reading $name ($cs)\n";
360 last if /^\s*END\s+CHARMAP\s*$/i;
364 $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
365 push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
366 $fb = $1 if /\G\s*(\|[0-3])/gc;
367 # warn "$_: $u @byte | $fb\n";
368 die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
371 my $uch = encode_U(hex($u));
372 my $ech = join('',map(chr(hex($_)),@byte));
373 my $el = length($ech);
374 $max_el = $el if (!defined($max_el) || $el > $max_el);
375 $min_el = $el if (!defined($min_el) || $el < $min_el);
386 # $fb is fallback flag
387 # 0 - round trip safe
388 # 1 - fallback for unicode -> enc
389 # 2 - skip sub-char mapping
390 # 3 - fallback enc -> unicode
391 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
392 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
401 die "$nfb entries without fallback, $hfb entries with\n";
403 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
413 while ($type = <$fh>)
415 last if $type !~ /^\s*#/;
418 return if $type eq 'E';
419 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
420 warn "$type encoded $name\n";
422 # Save a defined test by setting these to defined values.
423 my $min_el = ~0; # A very big integer
424 my $max_el = 0; # Anything must be longer than 0
427 $rep = &{$encode_types{$type}}($v & 0xFF, ($v >> 8) & 0xffe);
434 my $page = hex($line);
439 die "Line should be exactly 65 characters long including newline"
440 unless length ($line) == 65;
441 # Split line into groups of 4 hex digits, convert groups to ints
442 for my $val (map {hex $_} $line =~ /(....)/g)
444 next if $val == 0xFFFD;
445 my $ech = &{$encode_types{$type}}($ch,$page);
446 if ($val || (!$ch && !$page))
448 my $el = length($ech);
449 $max_el = $el if $el > $max_el;
450 $min_el = $el if $el < $min_el;
451 my $uch = encode_U($val);
452 if (exists $seen{$uch})
454 warn sprintf("U%04X is %02X%02X and %02X%02X\n",
455 $val,$page,$ch,@{$seen{$uch}});
459 $seen{$uch} = [$page,$ch];
461 # Passing 2 extra args each time is 3.6% slower!
462 # Even with having to add $fallback ||= 0 in &process
463 enter($e2u,$ech,$uch);
464 enter($u2e,$uch,$ech);
468 # No character at this position
469 # enter($e2u,$ech,undef,$e2u);
475 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
476 if $min_el > $max_el;
477 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
480 # my ($a,$s,$d,$t,$fb) = @_;
482 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
483 # state we shift to after this (multibyte) input character defaults to same
486 # Making sure it is defined seems to be faster than {no warnings;} in
487 # &process, or passing it in as 0 explicity.
490 # Start at the beginning and work forwards through the string to zero.
491 # effectively we are removing 1 character from the front each time
492 # but we don't actually edit the string. [this alone seems to be 14% speedup]
493 # Hence -$pos is the length of the remaining string.
494 my $pos = -length $inbytes;
496 my $byte = substr $inbytes, $pos, 1;
499 # RAW_OUT_BYTES => 2,
501 # to unicode an array would seem to be better, because the pages are dense.
502 # from unicode can be very sparse, favouring a hash.
503 # hash using the bytes (all length 1) as keys rather than ord value,
504 # as it's easier to sort these in &process.
506 # It's faster to always add $fallback even if it's undef, rather than
507 # choosing between 3 and 4 element array. (hence why we set it defined
509 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
510 # When $pos was -1 we were at the last input character.
512 $do_now->[RAW_OUT_BYTES] = $outbytes;
513 $do_now->[RAW_NEXT] = $next;
516 # Tail recursion. The intermdiate state may not have a name yet.
517 $current = $do_now->[RAW_NEXT];
525 my ($fh,$name,$s) = @_;
526 my $sym = $strings{$s};
529 $saved += length($s);
534 foreach my $o (keys %strings)
536 my $i = index($o,$s);
540 $sym .= sprintf("+0x%02x",$i) if ($i);
541 $subsave += length($s);
547 $strings{$s} = $sym = $name;
548 $strings += length($s);
549 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
550 # Maybe we should assert that these are all <256.
551 $definition .= join(',',unpack "C*",$s);
552 # We have a single long line. Split it at convenient commas.
553 $definition =~ s/(.{74,77},)/$1\n/g;
554 print $fh "$definition };\n\n";
564 my @raw = sort keys %{$a->{Raw}};
565 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
567 foreach my $key (@raw) {
570 # RAW_OUT_BYTES => 2,
572 my ($next, $in_len, $out_bytes, $fallback) = @{$a->{Raw}{$key}};
573 # Now we are converting from raw to aggregate, switch from 1 byte strings
577 # If this == fails, we're going to reset $agg_max_in below anyway.
578 $b == ++$agg_max_in &&
579 # References in numeric context give the pointer as an int.
580 $agg_next == $next &&
581 $agg_in_len == $in_len &&
582 $agg_out_len == length $out_bytes &&
583 $agg_fallback == $fallback
584 # && length($l->[AGG_OUT_BYTES]) < 16
586 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
587 # we can aggregate this byte onto the end.
588 $l->[AGG_MAX_IN] = $b;
589 $l->[AGG_OUT_BYTES] .= $out_bytes;
593 # AGG_OUT_BYTES => 2,
598 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
599 # (only gains .6% on euc-jp -- is it worth it?)
600 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
601 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
602 $agg_fallback = $fallback];
604 if (exists $next->{Cname}) {
605 $next->{'Forward'} = 1 if $next != $a;
607 process(sprintf("%s_%02x",$name,$b),$next);
610 # encengine.c rules say that last entry must be for 255
611 if (ord $raw[-1] < 255) {
612 push @ent, [1+ord $raw[-1], 255,undef,$a,0,0];
614 $a->{'Entries'} = \@ent;
620 my $name = $a->{'Cname'};
622 foreach my $b (@{$a->{'Entries'}})
624 next unless $b->[AGG_OUT_LEN];
625 my $s = $b->[AGG_MIN_IN];
626 my $e = $b->[AGG_MAX_IN];
627 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
631 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
634 foreach my $b (@{$a->{'Entries'}})
636 my ($s,$e,$out,$t,$end,$l) = @$b;
637 outtable($fh,$t) unless $t->{'Done'};
639 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
640 foreach my $b (@{$a->{'Entries'}})
642 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
647 printf $fh outstring($fh,'',$out);
653 print $fh ",",$t->{Cname};
654 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
661 my ($fh,$name,$a) = @_;
669 my ($fh,$name,$a) = @_;
670 die "Changed - fix me for new structure";
671 foreach my $b (sort keys %$a)
673 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
685 my $s = do "unicore/Name.pl";
686 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
688 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
692 last if $s >= 0x10000;
693 my $e = length($2) ? hex($2) : $s;
694 for (my $i = $s; $i <= $e; $i++)
697 # print sprintf("U%04X $name\n",$i);
704 my ($cmap,$a,$t,$pre) = @_;
705 # warn sprintf("Page %x\n",$pre);
706 foreach my $b (sort keys %$t)
708 die "Changed - fix me for new structure";
709 my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}};
710 die "oops $s $e" unless $s eq $e;
712 if ($n != $a && $n != $t)
714 output_ucm_page($cmap,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF);
720 $u = $pre|($u &0x3f);
722 my $s = sprintf "<U%04X> ",$u;
723 foreach my $c (split(//,$out))
725 $s .= sprintf "\\x%02X",ord($c);
727 $s .= sprintf " |%d # %s\n",($fb ? 1 : 0),$uname[$u];
732 warn join(',',@{$t->{$b}},$a,$t);
739 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
740 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
741 print $fh "<code_set_name> \"$name\"\n";
745 print $fh "<mb_cur_min> $min_el\n";
749 print $fh "<mb_cur_max> $max_el\n";
753 print $fh "<subchar> ";
754 foreach my $c (split(//,$rep))
756 printf $fh "\\x%02X",ord($c);
761 output_ucm_page(\@cmap,$h,$h,0);
762 print $fh "#\nCHARMAP\n";
763 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
767 print $fh "END CHARMAP\n";