From: Nick Ing-Simmons Date: Wed, 13 Dec 2000 23:16:13 +0000 (+0000) Subject: Beginings of compiled encodings - checked in as a snapshot of thoughts X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=017e2addf6da99b3f648d9518de5a848be394ab8;p=p5sagit%2Fp5-mst-13.2.git Beginings of compiled encodings - checked in as a snapshot of thoughts so far and so it does not get lost. p4raw-id: //depot/perlio@8103 --- diff --git a/ext/Encode/compile b/ext/Encode/compile new file mode 100755 index 0000000..e2db8ab --- /dev/null +++ b/ext/Encode/compile @@ -0,0 +1,257 @@ +#!../../perl -w +@INC = '../../lib'; +use strict; + +sub encode_U +{ + my $uv = shift; + if ($uv < 0x80) + { + return chr($uv) + } + if ($uv < 0x800) + { + return chr(($uv >> 6) | 0xC0). + chr(($uv & 0x3F) | 0x80); + } + return chr(($uv >> 12) | 0xE0). + chr((($uv >> 6) & 0x3F) | 0x80). + chr(($uv & 0x3F) | 0x80); +} + +sub encode_S +{ + my ($ch,$page) = @_; + return chr($ch); +} + +sub encode_D +{ + my ($ch,$page) = @_; + return chr($page).chr($ch); +} + +sub encode_M +{ + my ($ch,$page) = @_; + return &encode_D if $page; + return &encode_S; +} + +my $cname = shift(@ARGV); +open(C,">$cname") || die "Cannot open $cname:$!"; +print C "#include \"encode.h\"\n"; + +my %encoding; +my %strings; + +foreach my $enc (@ARGV) + { + my ($name) = $enc =~ /^.*?([\w-]+)(\.enc)$/; + if (open(E,$enc)) + { + compile(\*E,$name,\*C); + } + else + { + warn "Cannot open $enc for $name:$!"; + } + } + +print C "encode_t encodings[] = {\n"; +foreach my $enc (sort keys %encoding) + { + print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"},\n"; + } +print C " {0,0,0,0,0}\n};\n"; + +close(C); + + + +sub compile +{ + my ($fh,$name,$ch) = @_; + my $e2u = {}; + my $u2e = {}; + + my $type; + while ($type = <$fh>) + { + last if $type !~ /^\s*#/; + } + chomp($type); + return if $type eq 'E'; + my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); + warn "t=$type s=$sym d=$def p=$pages $name\n"; + my $rep = ''; + { + my $v = hex($def); + no strict 'refs'; + $rep = &{"encode_$type"}($v & 0xFF, ($v >> 8) & 0xffe); + } + while ($pages--) + { + my $line = <$fh>; + chomp($line); + my $page = hex($line); + my $ch = 0; + for (my $i = 0; $i < 16; $i++) + { + my $line = <$fh>; + for (my $j = 0; $j < 16; $j++) + { + no strict 'refs'; + my $ech = &{"encode_$type"}($ch,$page); + my $val = hex(substr($line,0,4,'')); + if ($val || (!$ch && !$page)) + { + my $uch = encode_U($val); + enter($e2u,$ech,$uch,$e2u); + enter($u2e,$uch,$ech,$u2e); + } + else + { + # No character at this position + # enter($e2u,$ech,undef,$e2u); + } + $ch++; + } + } + } + output($ch,$name.'_utf8',$e2u); + output($ch,'utf8_'.$name,$u2e); + $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, + outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)]; +} + +sub enter +{ + my ($a,$s,$d,$t) = @_; + $t = $a if @_ < 4; + my $b = substr($s,0,1); + my $e = $a->{$b}; + unless ($e) + { # 0 1 2 3 4 5 + $e = [$b,$b,'',{},length($s),0]; + $a->{$b} = $e; + } + if (length($s) > 1) + { + enter($e->[3],substr($s,1),$d,$t); + } + else + { + $e->[2] = $d; + $e->[3] = $t; + $e->[5] = length($d); + } +} + + +sub outstring +{ + my ($fh,$name,$s) = @_; + my $sym = $strings{$s}; + unless ($sym) + { + if (1) + { + foreach my $o (keys %strings) + { + my $i = index($o,$s); + if ($i >= 0) + { + $sym = $strings{$o}; + $sym .= sprintf("+0x%02x",$i) if ($i); + return $sym; + } + } + } + $strings{$s} = $sym = $name; + printf $fh "static const U8 %s[%d] =\n",$name,length($s); + while (length($s)) + { + my $c = substr($s,0,16,''); + print $fh '"',join('',map(sprintf('\x%02x',ord($_)),split(//,$c))),'"'; + print $fh "\n" if length($s); + } + printf $fh ";\n\n"; + } + return $sym; +} + + + +sub output +{ + my ($fh,$name,$a) = @_; + $name =~ s/\W+/_/g; + $a->{Cname} = $name; + my @keys = grep(ref($a->{$_}),sort keys %$a); + print $fh "\nstatic encpage_t $name\[\];\n"; + # Sub-tables + my %str; + my $l; + my @ent; + foreach my $b (@keys) + { + my ($s,$f,$out,$t,$end) = @{$a->{$b}}; + if (defined($l) && + ord($b) == ord($a->{$l}[1])+1 && + $a->{$l}[3] == $a->{$b}[3] && + $a->{$l}[4] == $a->{$b}[4] && + $a->{$l}[5] == $a->{$b}[5] ) + { + my $i = ord($b)-ord($a->{$l}[0]); + $a->{$l}[1] = $b; + $a->{$l}[2] .= $a->{$b}[2]; + } + else + { + $l = $b; + push(@ent,$b); + } + unless (exists $t->{Cname}) + { + output($fh,sprintf("%s_%02x",$name,ord($s)),$t); + } + } + if (ord($keys[-1]) < 255) + { + my $t = chr(ord($keys[-1])+1); + $a->{$t} = [$t,chr(255),undef,$a,0,0]; + push(@ent,$t); + } + # String tables + foreach my $b (@ent) + { + next unless $a->{$b}[5]; + my $s = ord($a->{$b}[0]); + my $e = ord($a->{$b}[1]); + outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$a->{$b}[2]); + } + + print $fh "\n"; + print $fh "static encpage_t $name\[",scalar(@ent),"] = {\n"; + foreach my $b (@ent) + { + my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}}; + my $sc = ord($s); + my $ec = ord($e); + print $fh "{"; + if ($l) + { + printf $fh outstring($fh,'',$out); + } + else + { + print $fh "0"; + } + print $fh ",",$t->{Cname}; + printf $fh ",0x%02x,0x%02x,$end},\n",$sc,$ec; + } + print $fh "};\n\n"; +} + + diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c new file mode 100644 index 0000000..a73be73 --- /dev/null +++ b/ext/Encode/encengine.c @@ -0,0 +1,134 @@ +/* +Data structures for encoding transformations. + +Perl works internally in either a native 'byte' encoding or +in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t" +representation. When we do we can use utf8_to_uv(). + +Most character encodings are either simple byte mappings or +variable length multi-byte encodings. UTF-8 can be viewed as a +rather extreme case of the latter. + +So to solve an important part of perl's encode needs we need to solve the +"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate +case. (Where one of multi-bytes will usually be UTF-8.) + +The other type of encoding is a shift encoding where a prefix sequence +determines what subsequent bytes mean. Such encodings have state. + +We also need to handle case where a character in one encoding has to be +represented as multiple characters in the other. e.g. letter+diacritic. + +The process can be considered as pseudo perl: + +my $dst = ''; +while (length($src)) + { + my $size = $count($src); + my $in_seq = substr($src,0,$size,''); + my $out_seq = $s2d_hash{$in_seq}; + if (defined $out_seq) + { + $dst .= $out_seq; + } + else + { + # an error condition + } + } +return $dst; + +That has the following components: + &src_count - a "rule" for how many bytes make up the next character in the + source. + %s2d_hash - a mapping from input sequences to output sequences + +The problem with that scheme is that it does not allow the output +character repertoire to affect the characters considered from the +input. + +So we use a "trie" representation which can also be considered +a state machine: + +my $dst = ''; +my $seq = \@s2d_seq; +my $next = \@s2d_next; +while (length($src)) + { + my $byte = $substr($src,0,1,''); + my $out_seq = $seq->[$byte]; + if (defined $out_seq) + { + $dst .= $out_seq; + } + else + { + # an error condition + } + ($next,$seq) = @$next->[$byte] if $next; + } +return $dst; + +There is now a pair of data structures to represent everything. +It is valid for output sequence at a particular point to +be defined but zero length, that just means "don't know yet". +For the single byte case there is no 'next' so new tables will be the same as +the original tables. For a multi-byte case a prefix byte will flip to the tables +for the next page (adding nothing to the output), then the tables for the page +will provide the actual output and set tables back to original base page. + +This scheme can also handle shift encodings. + +A slight enhancement to the scheme also allows for look-ahead - if +we add a flag to re-add the removed byte to the source we could handle + a" -> รค + ab -> a (and take b back please) + +*/ + +#include +#include +#define U8 U8 +#include "encode.h" + +STRLEN +translate(encpage_t *enc, const U8 *src, STRLEN slen, U8 *dst, STRLEN dlen) +{ + const U8 *send = src+slen; + U8 *dend = dst+dlen; + U8 *dptr = dst; + while (src < send) + { + encpage_t *e = enc; + U8 byte = *src++; + while (byte > e->max) + e++; + if (byte >= e->min) + { + STRLEN n = e->dlen; + if (n) + { + const U8 *out = e->seq+n*(byte - e->min); + STRLEN n = *out++; + if (dptr+n <= dend) + { + if (dst) + Copy(out,dptr,n,U8); + dptr += n; + } + else + { + /* No room */ + } + } + enc = e->next; + } + else + { + /* Cannot represent */ + } + } + return dptr-dst; +} + + diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h new file mode 100644 index 0000000..10f8386 --- /dev/null +++ b/ext/Encode/encode.h @@ -0,0 +1,26 @@ +#ifndef U8 +typedef unsigned char U8; +#endif + +typedef struct encpage_s encpage_t; + +struct encpage_s +{ + const U8 *seq; + encpage_t *next; + U8 min; + U8 max; + U8 dlen; + U8 slen; +}; + +typedef struct encode_s encode_t; +struct encode_s +{ + const char *name; + encpage_t *t_utf8; + encpage_t *f_utf8; + const U8 *rep; + int replen; +}; +