X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2Fcompile;h=d0611f719f2639529084ba4c83d24073771bbd69;hb=0a1f2d144e4463451f8627bd1c6ca420a59b01b0;hp=03cea1918c5859bff7992d179df44e4ad294f140;hpb=c6fdb90a0a27837f63bec24b004f5841b85501e6;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/compile b/ext/Encode/compile index 03cea19..d0611f7 100755 --- a/ext/Encode/compile +++ b/ext/Encode/compile @@ -1,23 +1,23 @@ #!../../perl -w -BEGIN { @INC = '../../lib' }; +BEGIN { + @INC = '../../lib'; + $ENV{PATH} .= ';../..' if $^O eq 'MSWin32'; +} use strict; +use Getopt::Std; +my @orig_ARGV = @ARGV; +my $perforce = '$Id$'; sub encode_U { # UTF-8 encode long hand - only covers part of perl's range my $uv = shift; - if ($uv < 0x80) - { - return chr($uv) - } - if ($uv < 0x800) - { - return chr(($uv >> 6) | 0xC0). - chr(($uv & 0x3F) | 0x80); - } - return chr(($uv >> 12) | 0xE0). - chr((($uv >> 6) & 0x3F) | 0x80). - chr(($uv & 0x3F) | 0x80); + # chr() works in native space so convert value from table + # into that space before using chr(). + my $ch = chr(utf8::unicode_to_native($uv)); + # Now get core perl to encode that the way it likes. + utf8::encode($ch); + return $ch; } sub encode_S @@ -45,13 +45,17 @@ sub encode_M # Win32 does not expand globs on command line eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32'); -my $cname = shift(@ARGV); +my %opt; +getopts('qo:f:n:',\%opt); +my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV); chmod(0666,$cname) if -f $cname && !-w $cname; open(C,">$cname") || die "Cannot open $cname:$!"; + + my $dname = $cname; $dname =~ s/(\.[^\.]*)?$/.def/; -my ($doC,$doEnc,$doUcm); +my ($doC,$doEnc,$doUcm,$doPet); if ($cname =~ /\.(c|xs)$/) { @@ -65,11 +69,12 @@ if ($cname =~ /\.(c|xs)$/) foreach my $fh (\*C,\*D,\*H) { - print $fh <<"END"; + print $fh <<"END" unless $opt{'q'}; /* !!!!!!! DO NOT EDIT THIS FILE !!!!!!! This file was autogenerated by: - $^X $0 $cname @ARGV + $^X $0 $cname @orig_ARGV + (Repository $perforce) */ END } @@ -91,12 +96,24 @@ elsif ($cname =~ /\.ucm$/) { $doUcm = 1; } +elsif ($cname =~ /\.pet$/) + { + $doPet = 1; + } -# 2nd argument is file containing list of filenames -my $flist = shift(@ARGV); -open(FLIST,$flist) || die "Cannot open $flist:$!"; -chomp(my @encfiles = ); -close(FLIST); +my @encfiles; +if (exists $opt{'f'}) + { + # -F is followed by name of file containing list of filenames + my $flist = $opt{'f'}; + open(FLIST,$flist) || die "Cannot open $flist:$!"; + chomp(@encfiles = ); + close(FLIST); + } +else + { + @encfiles = @ARGV; + } my %encoding; my %strings; @@ -115,18 +132,20 @@ sub cmp_name return $a cmp $b; } + foreach my $enc (sort cmp_name @encfiles) { my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/; + $name = $opt{'n'} if exists $opt{'n'}; if (open(E,$enc)) { if ($sfx eq 'enc') { - compile_enc(\*E,lc($name),\*C); + compile_enc(\*E,lc($name)); } else { - compile_ucm(\*E,lc($name),\*C); + compile_ucm(\*E,lc($name)); } } else @@ -137,12 +156,21 @@ foreach my $enc (sort cmp_name @encfiles) if ($doC) { + foreach my $name (sort cmp_name keys %encoding) + { + my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; + output(\*C,$name.'_utf8',$e2u); + output(\*C,'utf8_'.$name,$u2e); + push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep)); + } foreach my $enc (sort cmp_name keys %encoding) { + my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}}; + my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el); my $sym = "${enc}_encoding"; $sym =~ s/\W+/_/g; print C "encode_t $sym = \n"; - print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n\n"; + print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n"; } foreach my $enc (sort cmp_name keys %encoding) @@ -164,12 +192,29 @@ if ($doC) close(D); close(H); } +elsif ($doEnc) + { + foreach my $name (sort cmp_name keys %encoding) + { + my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; + output_enc(\*C,$name,$e2u); + } + } +elsif ($doUcm) + { + foreach my $name (sort cmp_name keys %encoding) + { + my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; + output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el); + } + } + close(C); sub compile_ucm { - my ($fh,$name,$ch) = @_; + my ($fh,$name) = @_; my $e2u = {}; my $u2e = {}; my $cs; @@ -189,16 +234,20 @@ sub compile_ucm } else { - # $name = lc($cs); + $name = $cs unless exists $opt{'n'}; } my $erep; my $urep; + my $max_el; + my $min_el; if (exists $attr{'subchar'}) { - my @byte = $attr{'subchar'} =~ /^\s*(?:\\x([0-9a-f]+))+\s*$/; - $erep = join('',map(hex($_),@byte)); + my @byte; + $attr{'subchar'} =~ /^\s*/cg; + push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg; + $erep = join('',map(chr(hex($_)),@byte)); } - warn "Scanning $name ($cs)\n"; + print "Scanning $name ($cs)\n"; my $nfb = 0; my $hfb = 0; while (<$fh>) @@ -206,12 +255,20 @@ sub compile_ucm s/#.*$//; last if /^\s*END\s+CHARMAP\s*$/i; next if /^\s*$/; - my ($u,@byte) = /^\s+(?:\\x([0-9a-f]+))+\s*(\|[0-3]|)\s*$/i; - my $fb = pop(@byte); + my ($u,@byte); + my $fb = ''; + $u = $1 if (/^\s+/igc); + push(@byte,$1) while /\G\\x([0-9a-f]+)/igc; + $fb = $1 if /\G\s*(\|[0-3])/gc; + # warn "$_: $u @byte | $fb\n"; + die "Bad line:$_" unless /\G\s*(#.*)?$/gc; if (defined($u)) { my $uch = encode_U(hex($u)); my $ech = join('',map(chr(hex($_)),@byte)); + my $el = length($ech); + $max_el = $el if (!defined($max_el) || $el > $max_el); + $min_el = $el if (!defined($min_el) || $el < $min_el); if (length($fb)) { $fb = substr($fb,1); @@ -234,32 +291,17 @@ sub compile_ucm { warn $_; } - } if ($nfb && $hfb) { die "$nfb entries without fallback, $hfb entries with\n"; } - if ($doC) - { - output($ch,$name.'_utf8',$e2u); - output($ch,'utf8_'.$name,$u2e); - $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, - outstring($ch,$e2u->{Cname}.'_def',$erep),length($erep)]; - } - elsif ($doEnc) - { - output_enc($ch,$name,$e2u); - } - elsif ($doUcm) - { - output_ucm($ch,$name,$u2e); - } + $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el]; } sub compile_enc { - my ($fh,$name,$ch) = @_; + my ($fh,$name) = @_; my $e2u = {}; my $u2e = {}; @@ -273,11 +315,14 @@ sub compile_enc my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); warn "$type encoded $name\n"; my $rep = ''; + my $min_el; + my $max_el; { my $v = hex($def); no strict 'refs'; $rep = &{"encode_$type"}($v & 0xFF, ($v >> 8) & 0xffe); } + my %seen; while ($pages--) { my $line = <$fh>; @@ -292,9 +337,22 @@ sub compile_enc no strict 'refs'; my $ech = &{"encode_$type"}($ch,$page); my $val = hex(substr($line,0,4,'')); + next if $val == 0xFFFD; if ($val || (!$ch && !$page)) { + my $el = length($ech); + $max_el = $el if (!defined($max_el) || $el > $max_el); + $min_el = $el if (!defined($min_el) || $el < $min_el); my $uch = encode_U($val); + if (exists $seen{$uch}) + { + warn sprintf("U%04X is %02X%02X and %02X%02X\n", + $val,$page,$ch,@{$seen{$uch}}); + } + else + { + $seen{$uch} = [$page,$ch]; + } enter($e2u,$ech,$uch,$e2u,0); enter($u2e,$uch,$ech,$u2e,0); } @@ -307,21 +365,7 @@ sub compile_enc } } } - if ($doC) - { - output($ch,$name.'_utf8',$e2u); - output($ch,'utf8_'.$name,$u2e); - $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, - outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)]; - } - elsif ($doEnc) - { - output_enc($ch,$name,$e2u); - } - elsif ($doUcm) - { - output_ucm($ch,$name,$u2e); - } + $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el]; } sub enter @@ -489,13 +533,30 @@ sub output_enc sub decode_U { my $s = shift; - } +my @uname; +sub char_names +{ + my $s = do "unicode/Name.pl"; + pos($s) = 0; + while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc) + { + my $name = $3; + my $s = hex($1); + last if $s >= 0x10000; + my $e = length($2) ? hex($2) : $s; + for (my $i = $s; $i <= $e; $i++) + { + $uname[$i] = $name; +# print sprintf("U%04X $name\n",$i); + } + } +} sub output_ucm_page { - my ($fh,$a,$t,$pre) = @_; + my ($cmap,$a,$t,$pre) = @_; # warn sprintf("Page %x\n",$pre); foreach my $b (sort keys %$t) { @@ -504,7 +565,7 @@ sub output_ucm_page my $u = ord($s); if ($n != $a && $n != $t) { - output_ucm_page($fh,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF); + output_ucm_page($cmap,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF); } elsif (length($out)) { @@ -512,12 +573,13 @@ sub output_ucm_page { $u = $pre|($u &0x3f); } - printf $fh " ",$u; + my $s = sprintf " ",$u; foreach my $c (split(//,$out)) { - printf $fh "\\x%02X",ord($c); + $s .= sprintf "\\x%02X",ord($c); } - printf $fh " |%d\n",($fb ? 1 : 0); + $s .= sprintf " |%d # %s\n",($fb ? 1 : 0),$uname[$u]; + push(@$cmap,$s); } else { @@ -528,9 +590,34 @@ sub output_ucm_page sub output_ucm { - my ($fh,$name,$a) = @_; - print $fh "CHARMAP\n"; - output_ucm_page($fh,$a,$a,0); + my ($fh,$name,$h,$rep,$min_el,$max_el) = @_; + print $fh "# Written $perforce\n# $0 @orig_ARGV\n" unless $opt{'q'}; + print $fh " \"$name\"\n"; + char_names(); + if (defined $min_el) + { + print $fh " $min_el\n"; + } + if (defined $max_el) + { + print $fh " $max_el\n"; + } + if (defined $rep) + { + print $fh " "; + foreach my $c (split(//,$rep)) + { + printf $fh "\\x%02X",ord($c); + } + print $fh "\n"; + } + my @cmap; + output_ucm_page(\@cmap,$h,$h,0); + print $fh "#\nCHARMAP\n"; + foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap) + { + print $fh $line; + } print $fh "END CHARMAP\n"; }