From: Dan Kogai Date: Tue, 19 Nov 2002 03:18:44 +0000 (+0900) Subject: [Encode] 1.83 + bleedperl patch released X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b536bf570905070470ba64c88c9fb4f28bfa54f3;p=p5sagit%2Fp5-mst-13.2.git [Encode] 1.83 + bleedperl patch released Message-Id: <2C132F6D-FB22-11D6-87FC-0003939A104C@dan.co.jp> p4raw-id: //depot/perl@18175 --- diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS index 01d3530..c559c84 100644 --- a/ext/Encode/AUTHORS +++ b/ext/Encode/AUTHORS @@ -9,7 +9,7 @@ # # This list is in alphabetical order. -- -Andreas J. Koenig +Andreas J. Koenig Anton Tagunov Autrijus Tang Benjamin Goldberg @@ -21,6 +21,7 @@ Gerrit P. Haase Graham Barr Gurusamy Sarathy H.Merijn Brand +Hugo van der Sanden Jarkko Hietaniemi Jungshik Shin Laszlo Molnar diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 52cbda3..60452d8 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,9 +1,35 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 1.80 2002/10/21 20:39:09 dankogai Exp dankogai $ +# $Id: Changes,v 1.83 2002/11/18 17:28:49 dankogai Exp dankogai $ # -$Revision: 1.80 $ $Date: 2002/10/21 20:39:09 $ +$Revision: 1.83 $ $Date: 2002/11/18 17:28:49 $ +! Encode.xs lib/Encode/JIS7.pm + Even more patches from Inaba-san has been applied. With this + patch t/uni/tr_7jis.t and t/uni/t_utf8.t of bleedperl will work. + Message-Id: <20021115105514D.inaba.hiroto@toshiba-it.co.jp> + +1.82 2002/11/14 23:06:12 +! Encode.xs + Encode::utf8 (XS Version) assertion botch first found in Cygwin, + later found in perls w/ -Dusemymalloc was fixed by NC. + Message-Id: <20021114210349.GA288@Bagpuss.unfortu.net> + +1.81 2002/11/08 18:29:27 +! Encode.pm Encode.xs + Non-XS version of Encode::utf8 is back (with XS being default). + Encode::predefine_encodings(0) to turn off XS. + This is primarily to cope w/ Cygwin smoke but Sadahiro-san has + found that it was Test::More causing the problem, not Encode. + But I have already made it configurable so it may be useful in + some rare cases.... + Message-Id: <20021107210110.2EE4.BQW10602@nifty.com>, et al. +! bin/enc2xs + The ingenious patch by Nicholas Clark that reduces shlib sizes by + 50% with no penalty and backward compatibility preserved, is in. + Message-Id: <20021103231324.GE288@Bagpuss.unfortu.net> + +1.80 2002/10/21 20:39:09 ! Encode.xs t/mime-header.t Even more patches from NI-XS regarding Encode::utf8->decode(). And one more test to t/mime-header.t to prove it @@ -774,7 +800,7 @@ $Revision: 1.80 $ $Date: 2002/10/21 20:39:09 $ Typo fixes and improvements by jhi Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al. -1.11 $Date: 2002/10/21 20:39:09 $ +1.11 2002/03/31 22:12:13 + t/encoding.t + t/jperl.t ! MANIFEST diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 62e2ae6..01dc8ff 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,9 +1,9 @@ # -# $Id: Encode.pm,v 1.80 2002/10/21 20:38:45 dankogai Exp $ +# $Id: Encode.pm,v 1.83 2002/11/18 17:28:29 dankogai Exp $ # package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.80 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.83 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; use XSLoader (); XSLoader::load(__PACKAGE__, $VERSION); @@ -191,7 +191,7 @@ sub decode_utf8($) return $str; } -predefine_encodings(); +predefine_encodings(1); # # This is to restore %Encoding if really needed; @@ -199,6 +199,8 @@ predefine_encodings(); sub predefine_encodings{ use Encode::Encoding; + no warnings 'redefine'; + my $use_xs = shift; if ($ON_EBCDIC) { # was in Encode::UTF_EBCDIC package Encode::UTF_EBCDIC; @@ -243,7 +245,29 @@ sub predefine_encodings{ # was in Encode::utf8 package Encode::utf8; push @Encode::utf8::ISA, 'Encode::Encoding'; - # encode and decode methods now in Encode.xs + # + if ($use_xs){ + $DEBUG and warn __PACKAGE__, " XS on"; + *decode = \&decode_xs; + *encode = \&encode_xs; + }else{ + $DEBUG and warn __PACKAGE__, " XS off"; + *decode = sub{ + my ($obj,$octets,$chk) = @_; + my $str = Encode::decode_utf8($octets); + if (defined $str) { + $_[1] = '' if $chk; + return $str; + } + return undef; + }; + *encode = sub { + my ($obj,$string,$chk) = @_; + my $octets = Encode::encode_utf8($string); + $_[1] = '' if $chk; + return $octets; + }; + } $Encode::Encoding{utf8} = bless {Name => "utf8"} => "Encode::utf8"; } diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index df77b7a..4d30914 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 1.49 2002/10/21 19:47:47 dankogai Exp $ + $Id: Encode.xs,v 1.52 2002/11/18 17:28:49 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -241,7 +241,7 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ void -Method_decode(obj,src,check = 0) +Method_decode_xs(obj,src,check = 0) SV * obj SV * src int check @@ -250,7 +250,7 @@ CODE: STRLEN slen; U8 *s = (U8 *) SvPV(src, slen); U8 *e = (U8 *) SvEND(src); - SV *dst = newSV(slen); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ SvPOK_only(dst); SvCUR_set(dst,0); if (SvUTF8(src)) { @@ -321,7 +321,7 @@ CODE: } void -Method_encode(obj,src,check = 0) +Method_encode_xs(obj,src,check = 0) SV * obj SV * src int check @@ -330,7 +330,7 @@ CODE: STRLEN slen; U8 *s = (U8 *) SvPV(src, slen); U8 *e = (U8 *) SvEND(src); - SV *dst = newSV(slen); + SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ if (SvUTF8(src)) { /* Already encoded - trust it and just copy the octets */ sv_setpvn(dst,(char *)s,(e-s)); @@ -338,7 +338,7 @@ CODE: } else { /* Native bytes - can always encode */ - U8 *d = (U8 *) SvGROW(dst,2*slen); + U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ while (s < e) { UV uv = NATIVE_TO_UNI((UV) *s++); if (UNI_IS_INVARIANT(uv)) diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index 45fd869..77c189e 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -33,6 +33,7 @@ bin/enc2xs Encode module generator bin/piconv iconv by perl bin/ucm2table Table Generator for testing bin/ucmlint A UCM Lint utility +bin/ucmsort Sorts UCM lines bin/unidump Unicode Dump like hexdump(1) encengine.c Encode extension encoding.pm Perl Pragmactic Module @@ -77,6 +78,7 @@ t/ksc5601.enc test data t/ksc5601.utf test data t/mime-header.t test script t/perlio.t test script +t/rt.pl even more test script t/unibench.pl benchmark script ucm/8859-1.ucm Unicode Character Map ucm/8859-10.ucm Unicode Character Map diff --git a/ext/Encode/bin/enc2xs b/ext/Encode/bin/enc2xs index 7100bab..ae44c79 100644 --- a/ext/Encode/bin/enc2xs +++ b/ext/Encode/bin/enc2xs @@ -6,9 +6,10 @@ BEGIN { require Config; import Config; } use strict; +use warnings; use Getopt::Std; my @orig_ARGV = @ARGV; -our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # These may get re-ordered. # RAW is a do_now as inserted by &enter @@ -186,7 +187,7 @@ END print C "#include \n"; print C "#define U8 U8\n"; } - print C "#include \"encode.h\"\n"; + print C "#include \"encode.h\"\n\n"; } elsif ($cname =~ /\.enc$/) @@ -204,6 +205,9 @@ elsif ($cname =~ /\.pet$/) my %encoding; my %strings; +my $string_acc; +my %strings_in_acc; + my $saved = 0; my $subsave = 0; my $strings = 0; @@ -250,8 +254,19 @@ if ($doC) foreach my $name (sort cmp_name keys %encoding) { my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; - output(\*C,$name.'_utf8',$e2u); - output(\*C,'utf8_'.$name,$u2e); + process($name.'_utf8',$e2u); + addstrings(\*C,$e2u); + + process('utf8_'.$name,$u2e); + addstrings(\*C,$u2e); + } + outbigstring(\*C,"enctable"); + foreach my $name (sort cmp_name keys %encoding) + { + my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; + outtable(\*C,$e2u, "enctable"); + outtable(\*C,$u2e, "enctable"); + # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep)); } foreach my $enc (sort cmp_name keys %encoding) @@ -596,43 +611,6 @@ sub enter_fb0 { } } - -sub outstring -{ - my ($fh,$name,$s) = @_; - my $sym = $strings{$s}; - if ($sym) - { - $saved += length($s); - } - else - { - if ($opt{'O'}) { - foreach my $o (keys %strings) - { - next unless (my $i = index($o,$s)) >= 0; - $sym = $strings{$o}; - # gcc things that 0x0e+0x10 (anything with e+) starts to look like - # a hexadecimal floating point constant. Silly gcc. Only p - # introduces a floating point constant. Put the space in to stop it - # getting confused. - $sym .= sprintf(" +0x%02x",$i) if ($i); - $subsave += length($s); - return $strings{$s} = $sym; - } - } - $strings{$s} = $sym = $name; - $strings += length($s); - my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s); - # Maybe we should assert that these are all <256. - $definition .= join(',',unpack "C*",$s); - # We have a single long line. Split it at convenient commas. - $definition =~ s/(.{74,77},)/$1\n/g; - print $fh "$definition };\n\n"; - } - return $sym; -} - sub process { my ($name,$a) = @_; @@ -693,7 +671,8 @@ sub process $a->{'Entries'} = \@ent; } -sub outtable + +sub addstrings { my ($fh,$a) = @_; my $name = $a->{'Cname'}; @@ -701,20 +680,98 @@ sub outtable foreach my $b (@{$a->{'Entries'}}) { next unless $b->[AGG_OUT_LEN]; - my $s = $b->[AGG_MIN_IN]; - my $e = $b->[AGG_MAX_IN]; - outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]); + $strings{$b->[AGG_OUT_BYTES]} = undef; } if ($a->{'Forward'}) { my $var = $^O eq 'MacOS' ? 'extern' : 'static'; - print $fh "\n$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n"; + print $fh "$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n"; + } + $a->{'DoneStrings'} = 1; + foreach my $b (@{$a->{'Entries'}}) + { + my ($s,$e,$out,$t,$end,$l) = @$b; + addstrings($fh,$t) unless $t->{'DoneStrings'}; } +} + +sub outbigstring +{ + my ($fh,$name) = @_; + + $string_acc = ''; + + # Make the big string in the string accumulator. Longest first, on the hope + # that this makes it more likely that we find the short strings later on. + # Not sure if it helps sorting strings of the same length lexcically. + foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) { + my $index = index $string_acc, $s; + if ($index >= 0) { + $saved += length($s); + $strings_in_acc{$s} = $index; + } else { + OPTIMISER: { + if ($opt{'O'}) { + my $sublength = length $s; + while (--$sublength > 0) { + # progressively lop characters off the end, to see if the start of + # the new string overlaps the end of the accumulator. + if (substr ($string_acc, -$sublength) + eq substr ($s, 0, $sublength)) { + $subsave += $sublength; + $strings_in_acc{$s} = length ($string_acc) - $sublength; + # append the last bit on the end. + $string_acc .= substr ($s, $sublength); + last OPTIMISER; + } + # or if the end of the new string overlaps the start of the + # accumulator + next unless substr ($string_acc, 0, $sublength) + eq substr ($s, -$sublength); + # well, the last $sublength characters of the accumulator match. + # so as we're prepending to the accumulator, need to shift all our + # existing offsets forwards + $_ += $sublength foreach values %strings_in_acc; + $subsave += $sublength; + $strings_in_acc{$s} = 0; + # append the first bit on the start. + $string_acc = substr ($s, 0, -$sublength) . $string_acc; + last OPTIMISER; + } + } + # Optimiser (if it ran) found nothing, so just going have to tack the + # whole thing on the end. + $strings_in_acc{$s} = length $string_acc; + $string_acc .= $s; + }; + } + } + + $strings = length $string_acc; + my $definition = "\nstatic const U8 $name\[$strings] = { " . + join(',',unpack "C*",$string_acc); + # We have a single long line. Split it at convenient commas. + print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs; + print $fh substr ($definition, pos $definition), " };\n"; +} + +sub findstring { + my ($name,$s) = @_; + my $offset = $strings_in_acc{$s}; + die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator" + unless defined $offset; + "$name + $offset"; +} + +sub outtable +{ + my ($fh,$a,$bigname) = @_; + my $name = $a->{'Cname'}; $a->{'Done'} = 1; foreach my $b (@{$a->{'Entries'}}) { my ($s,$e,$out,$t,$end,$l) = @$b; - outtable($fh,$t) unless $t->{'Done'}; + outtable($fh,$t,$bigname) unless $t->{'Done'}; } print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n"; foreach my $b (@{$a->{'Entries'}}) @@ -724,7 +781,7 @@ sub outtable print $fh "{"; if ($l) { - printf $fh outstring($fh,'',$out); + printf $fh findstring($bigname,$out); } else { @@ -736,14 +793,6 @@ sub outtable print $fh "};\n"; } -sub output -{ - my ($fh,$name,$a) = @_; - process($name,$a); - # Sub-tables - outtable($fh,$a); -} - sub output_enc { my ($fh,$name,$a) = @_; @@ -857,7 +906,7 @@ use vars qw( ); sub find_e2x{ - eval { require File::Find }; + eval { require File::Find; }; my (@inc, %e2x_dir); for my $inc (@INC){ push @inc, $inc unless $inc eq '.'; #skip current dir @@ -869,6 +918,7 @@ sub find_e2x{ = lstat($_) or return; -f _ or return; if (/^.*\.e2x$/o){ + no warnings 'once'; $e2x_dir{$File::Find::dir} ||= $mtime; } return; @@ -927,6 +977,7 @@ sub make_configlocal_pm eval { require "Encode/$f"; }; $@ and die "Can't require Encode/$f: $@\n"; for my $enc (Encode->encodings()){ + no warnings 'once'; $in_core{$enc} and next; $Encode::Config::ExtModule{$enc} and next; my $mod = "Encode/$f"; diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm index d1e69e6..d49ec6c 100644 --- a/ext/Encode/lib/Encode/JP/JIS7.pm +++ b/ext/Encode/lib/Encode/JP/JIS7.pm @@ -1,7 +1,7 @@ package Encode::JP::JIS7; use strict; -our $VERSION = do { my @r = (q$Revision: 1.8 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode qw(:fallbacks); @@ -62,21 +62,23 @@ sub encode($$;$) # JIS<->EUC +our $re_scan_jis = qr{ + (?:($RE{JIS_0212})|$RE{JIS_0208}|($RE{ISO_ASC})|($RE{JIS_KANA}))([^\e]*) +}x; sub jis_euc { + local ${^ENCODING}; my $r_str = shift; - $$r_str =~ s( - ($RE{JIS_0212}|$RE{JIS_0208}|$RE{ISO_ASC}|$RE{JIS_KANA}) - ([^\e]*) - ) + $$r_str =~ s($re_scan_jis) { - my ($esc, $chunk) = ($1, $2); - if ($esc !~ /$RE{ISO_ASC}/o) { + my ($esc_0212, $esc_asc, $esc_kana, $chunk) = + ($1, $2, $3, $4); + if (!$esc_asc) { $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; - if ($esc =~ /$RE{JIS_KANA}/o) { + if ($esc_kana) { $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; } - elsif ($esc =~ /$RE{JIS_0212}/o) { + elsif ($esc_0212) { $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; } } diff --git a/ext/Encode/t/rt.pl b/ext/Encode/t/rt.pl index cff5a3f..28924b2 100644 --- a/ext/Encode/t/rt.pl +++ b/ext/Encode/t/rt.pl @@ -1,12 +1,14 @@ #!/usr/local/bin/perl # -# $Id: rt.pl,v 1.1 2002/10/20 15:44:00 dankogai Exp $ +# $Id: rt.pl,v 1.2 2002/11/08 18:29:27 dankogai Exp $ # BEGIN { + my $ucmdir = "ucm"; if ($ENV{'PERL_CORE'}){ chdir 't'; unshift @INC, '../lib'; + $ucmdir = "../ext/Encode/ucm"; } require Config; import Config; if ($Config{'extensions'} !~ /\bEncode\b/) { @@ -19,7 +21,6 @@ BEGIN { } use strict; require Test::More; - my $ucmdir = "ucm"; our $DEBUG; our @ucm; unless(@ARGV){ diff --git a/t/uni/tr_7jis.t b/t/uni/tr_7jis.t index 292a01a..360ce1c 100644 --- a/t/uni/tr_7jis.t +++ b/t/uni/tr_7jis.t @@ -1,7 +1,8 @@ # # $Id$ # -# This script is written intentionally in EUC-JP +# This script is written intentionally in ISO-2022-JP +# requires Encode 1.83 or better to work # -- dankogai BEGIN { @@ -23,8 +24,6 @@ BEGIN { exit 0; } $| = 1; - print "1..0 # does not work with iso-2022-jp yet\n"; - exit 0; } use strict; @@ -49,10 +48,10 @@ is($str, $katakana, "tr// # hiragana -> katakana"); $str = $katakana; $str =~ tr/$B%!(B-$B%s(B/$B$!(B-$B$s(B/; is($str, $hiragana, "tr// # hiragana -> katakana"); -$str = $hiragana; eval qq(\$str =~ tr/$B$!(B-$B$s(B/$B%!(B-$B%s(B/); -is($str, $katakana, "eval qq(tr//) # hiragana -> katakana"); -$str = $katakana; eval qq(\$str =~ tr/$B%!(B-$B%s(B/$B$!(B-$B$s(B/); -is($str, $hiragana, "eval qq(tr//) # hiragana -> katakana"); +$str = $hiragana; eval qq{\$str =~ tr/$B$!(B-$B$s(B/$B%!(B-$B%s(B/}; +is($str, $katakana, "eval qq{tr//} # hiragana -> katakana"); +$str = $katakana; eval qq{\$str =~ tr/$B%!(B-$B%s(B/$B$!(B-$B$s(B/}; +is($str, $hiragana, "eval qq{tr//} # hiragana -> katakana"); $str = $hiragana; $str =~ s/([$B$!(B-$B$s(B])/$h2k{$1}/go; is($str, $katakana, "s/// # hiragana -> katakana"); diff --git a/t/uni/tr_sjis.t b/t/uni/tr_sjis.t index f5ad045..0f4a72b 100644 --- a/t/uni/tr_sjis.t +++ b/t/uni/tr_sjis.t @@ -1,7 +1,7 @@ # # $Id$ # -# This script is written intentionally in EUC-JP +# This script is written intentionally in Shift JIS # -- dankogai BEGIN { diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t index 54b9b4f..b67883a 100644 --- a/t/uni/tr_utf8.t +++ b/t/uni/tr_utf8.t @@ -1,7 +1,8 @@ # # $Id$ # -# This script is written intentionally in EUC-JP +# This script is written intentionally in UTF-8 +# Requires Encode 1.83 or better # -- dankogai BEGIN { @@ -29,10 +30,7 @@ use strict; #use Test::More qw(no_plan); use Test::More tests => 6; -# use encoding 'utf8'; # you can't uncomment this! -# if you uncomment above, you'll get the following (as of Encode 1.80) -# Assertion ((dst)->sv_flags & 0xff) >= SVt_PV failed: -# file "Encode.xs", line 255 at t/uni/tr_utf8.t line 35. +use encoding 'utf8'; my @hiragana = map {chr} ord("ぁ")..ord("ん"); my @katakana = map {chr} ord("ァ")..ord("ン");