From: Nick Ing-Simmons Date: Sun, 31 Dec 2000 18:11:54 +0000 (+0000) Subject: Start of support of ICU-style .ucm files: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9b37254de3a0e643859aebea34267588f789f15f;p=p5sagit%2Fp5-mst-13.2.git Start of support of ICU-style .ucm files: - teach compile how to read a .ucm file - first guess at how to represent fallbacks in "tries". - use fallbacks if check == 0 - new return code to indicate we used one. p4raw-id: //depot/perlio@8285 --- diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index db471cb..dedb8e9 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -342,7 +342,7 @@ sub from_to # The global hash is declared in XS code $encoding{Unicode} = bless({},'Encode::Unicode'); -$encoding{iso10646-1} = bless({},'Encode::iso10646_1'); +$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1'); sub encodings { @@ -408,7 +408,8 @@ sub getEncoding package Encode::Unicode; -# Dummy package that provides the encode interface +# Dummy package that provides the encode interface but leaves data +# as UTF-8 encoded. It is here so that from_to() sub name { 'Unicode' } @@ -533,7 +534,9 @@ sub fromUnicode return $str; } -package Encode::iso10646_1;# +package Encode::iso10646_1; +# Encoding is 16-bit network order Unicode +# Used for X font encodings sub name { 'iso10646-1' } diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index cca1ddc..b61d89b 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -340,10 +340,14 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check) U8 *d = (U8 *) SvGROW(dst, 2*slen+1); STRLEN dlen = SvLEN(dst); int code; - while ((code = do_encode(dir,s,&slen,d,dlen,&dlen))) + while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check))) { SvCUR_set(dst,dlen); SvPOK_on(dst); + + if (code == ENCODE_FALLBACK) + break; + switch(code) { case ENCODE_NOSPACE: diff --git a/ext/Encode/compile b/ext/Encode/compile index fbb08cd..21478f8 100755 --- a/ext/Encode/compile +++ b/ext/Encode/compile @@ -4,7 +4,7 @@ use strict; sub encode_U { - # UTF-8 encocde long hand - only covers part of perl's range + # UTF-8 encode long hand - only covers part of perl's range my $uv = shift; if ($uv < 0x80) { @@ -96,10 +96,17 @@ sub cmp_name foreach my $enc (sort cmp_name @ARGV) { - my ($name) = $enc =~ /^.*?([\w-]+)(\.enc)$/; + my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/; if (open(E,$enc)) { - compile_enc(\*E,lc($name),\*C); + if ($sfx eq 'enc') + { + compile_enc(\*E,lc($name),\*C); + } + else + { + compile_ucm(\*E,lc($name),\*C); + } } else { @@ -135,6 +142,86 @@ close(C); close(D); close(H); + +sub compile_ucm +{ + my ($fh,$name,$ch) = @_; + my $e2u = {}; + my $u2e = {}; + my $cs; + my %attr; + while (<$fh>) + { + s/#.*$//; + last if /^\s*CHARMAP\s*$/i; + if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) + { + $attr{$1} = $2; + } + } + if (!defined($cs = $attr{'code_set_name'})) + { + warn "No in $name\n"; + } + else + { + $name = lc($cs); + } + my $erep; + my $urep; + if (exists $attr{'subchar'}) + { + my @byte = $attr{'subchar'} =~ /^\s*(?:\\x([0-9a-f]+))+\s*$/; + $erep = join('',map(hex($_),@byte)); + } + warn "Scanning $cs\n"; + my $nfb = 0; + my $hfb = 0; + while (<$fh>) + { + s/#.*$//; + last if /^\s*END\s+CHARMAP\s*$/i; + next if /^\s*$/; + my ($u,@byte) = /^\s+(?:\\x([0-9a-f]+))+\s*(\|[0-3]|)\s*$/i; + my $fb = pop(@byte); + if (defined($u)) + { + my $uch = encode_U(hex($u)); + my $ech = join('',map(hex($_),@byte)); + if (length($fb)) + { + $fb = substr($fb,1); + $hfb++; + } + else + { + $nfb++; + $fb = '0'; + } + # $fb is fallback flag + # 0 - round trip safe + # 1 - fallback for unicode -> enc + # 2 - skip sub-char mapping + # 3 - fallback enc -> unicode + enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/); + enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/); + } + else + { + warn $_; + } + + } + if ($nfb && $hfb) + { + die "$nfb entries without fallback, $hfb entries with\n"; + } + output($ch,$name.'_utf8',$e2u); + output($ch,'utf8_'.$name,$u2e); + $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, + outstring($ch,$e2u->{Cname}.'_def',$erep),length($erep)]; +} + sub compile_enc { my ($fh,$name,$ch) = @_; @@ -173,8 +260,8 @@ sub compile_enc if ($val || (!$ch && !$page)) { my $uch = encode_U($val); - enter($e2u,$ech,$uch,$e2u); - enter($u2e,$uch,$ech,$u2e); + enter($e2u,$ech,$uch,$e2u,0); + enter($u2e,$uch,$ech,$u2e,0); } else { @@ -193,18 +280,18 @@ sub compile_enc sub enter { - my ($a,$s,$d,$t) = @_; + my ($a,$s,$d,$t,$fb) = @_; $t = $a if @_ < 4; my $b = substr($s,0,1); my $e = $a->{$b}; unless ($e) { # 0 1 2 3 4 5 - $e = [$b,$b,'',{},length($s),0]; + $e = [$b,$b,'',{},length($s),0,$fb]; $a->{$b} = $e; } if (length($s) > 1) { - enter($e->[3],substr($s,1),$d,$t); + enter($e->[3],substr($s,1),$d,$t,$fb); } else { @@ -260,7 +347,8 @@ sub process ord($b) == ord($a->{$l}[1])+1 && $a->{$l}[3] == $a->{$b}[3] && $a->{$l}[4] == $a->{$b}[4] && - $a->{$l}[5] == $a->{$b}[5] + $a->{$l}[5] == $a->{$b}[5] && + $a->{$l}[6] == $a->{$b}[6] # && length($a->{$l}[2]) < 16 ) { @@ -316,7 +404,7 @@ sub outtable print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n"; foreach my $b (@{$a->{'Entries'}}) { - my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}}; + my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; my $sc = ord($s); my $ec = ord($e); print $fh "{"; diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c index f317250..4c68dd9 100644 --- a/ext/Encode/encengine.c +++ b/ext/Encode/encengine.c @@ -92,7 +92,7 @@ we add a flag to re-add the removed byte to the source we could handle #include "encode.h" int -do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout) +do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx) { const U8 *s = src; const U8 *send = s+*slen; @@ -106,9 +106,9 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR U8 byte = *s; while (byte > e->max) e++; - if (byte >= e->min && e->slen) + if (byte >= e->min && e->slen && (approx || !e->slen & 0x80)) { - const U8 *cend = s + e->slen; + const U8 *cend = s + (e->slen & 0x7f); if (cend <= send) { STRLEN n; @@ -136,7 +136,11 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR enc = e->next; s++; if (s == cend) - last = s; + { + if (approx && (e->slen & 0x80)) + code = ENCODE_FALLBACK; + last = s; + } } else { diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h index 604b97f..853ad04 100644 --- a/ext/Encode/encode.h +++ b/ext/Encode/encode.h @@ -28,12 +28,13 @@ struct encode_s #ifdef U8 extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, - U8 *dst, STRLEN dlen, STRLEN *dout); + U8 *dst, STRLEN dlen, STRLEN *dout, int approx); extern void Encode_DefineEncoding(encode_t *enc); #endif -#define ENCODE_NOSPACE 1 -#define ENCODE_PARTIAL 2 -#define ENCODE_NOREP 3 +#define ENCODE_NOSPACE 1 +#define ENCODE_PARTIAL 2 +#define ENCODE_NOREP 3 +#define ENCODE_FALLBACK 4 #endif