From: Dan Kogai Date: Sat, 28 Jun 2003 01:20:59 +0000 (+0900) Subject: [Encode] pre-1.97 patches X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=23f3589e21445e9141901c2894bc97b457493332;p=p5sagit%2Fp5-mst-13.2.git [Encode] pre-1.97 patches Message-Id: <56D5BFEE-A8BB-11D7-9092-000393AE4244@dan.co.jp> p4raw-id: //depot/perl@19871 --- diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 18f5788..7251f5d 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -3,6 +3,16 @@ # $Id: Changes,v 1.96 2003/06/18 09:29:02 dankogai Exp $ # $Revision: 1.96 $ $Date: 2003/06/18 09:29:02 $ +! lib/Encode/Guess.pm + $Encode::Guess::NoUTFAutoGuess is added so you can turn off + automatic utf(8|16|32) guessing -- originally by Autrijus + Message-Id: <20030626162731.GA2077@not.autrijus.org> +! Encode.pm + Addressed the following; + Subject: [perl #22835] FB_QUIET doesn't work with Encode::encode + Message-Id: + +1.96 2003/06/18 09:29:02 ! lib/Encode/JP/JP.pm t/guess.t m/(...)/ in void context then $1 is considered a Bad Thing Message-Id: diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 57bcc2b..db74b6a 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -147,7 +147,7 @@ sub encode($$;$) Carp::croak("Unknown encoding '$name'"); } my $octets = $enc->encode($string,$check); - return undef if ($check && length($string)); + $_[1] = $string if $check; return $octets; } diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm index fc8d267..5858f81 100644 --- a/ext/Encode/lib/Encode/Guess.pm +++ b/ext/Encode/lib/Encode/Guess.pm @@ -18,6 +18,7 @@ sub needs_lines { 1 } sub perlio_ok { 0 } our @EXPORT = qw(guess_encoding); +our $NoUTFAutoGuess = 0; sub import { # Exporter not used so we do it on our own my $callpkg = caller; @@ -70,75 +71,80 @@ sub guess { return unless defined $octet and length $octet; # cheat 0: utf8 flag; - Encode::is_utf8($octet) and return find_encoding('utf8'); + if ( Encode::is_utf8($octet) ) { + return find_encoding('utf8') unless $NoUTFAutoGuess; + Encode::_utf8_off($octet); + } # cheat 1: BOM use Encode::Unicode; - my $BOM = unpack('n', $octet); - return find_encoding('UTF-16') - if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); - $BOM = unpack('N', $octet); - return find_encoding('UTF-32') - if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); + unless ($NoUTFAutoGuess) { + my $BOM = unpack('n', $octet); + return find_encoding('UTF-16') + if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); + $BOM = unpack('N', $octet); + return find_encoding('UTF-32') + if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); + if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) + my $utf; + my ($be, $le) = (0, 0); + if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed + $utf = "UTF-32"; + for my $char (unpack('N*', $octet)){ + $char & 0x0000ffff and $be++; + $char & 0xffff0000 and $le++; + } + }else{ # UTF-16(BE|LE) assumed + $utf = "UTF-16"; + for my $char (unpack('n*', $octet)){ + $char & 0x00ff and $be++; + $char & 0xff00 and $le++; + } + } + $DEBUG and warn "$utf, be == $be, le == $le"; + $be == $le + and return + "Encodings ambiguous between $utf BE and LE ($be, $le)"; + $utf .= ($be > $le) ? 'BE' : 'LE'; + return find_encoding($utf); + } + } my %try = %{$obj->{Suspects}}; for my $c (@_){ my $e = find_encoding($c) or die "Unknown encoding: $c"; $try{$e->name} = $e; $DEBUG and warn "Added: ", $e->name; } - if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) - my $utf; - my ($be, $le) = (0, 0); - if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed - $utf = "UTF-32"; - for my $char (unpack('N*', $octet)){ - $char & 0x0000ffff and $be++; - $char & 0xffff0000 and $le++; - } - }else{ # UTF-16(BE|LE) assumed - $utf = "UTF-16"; - for my $char (unpack('n*', $octet)){ - $char & 0x00ff and $be++; - $char & 0xff00 and $le++; + my $nline = 1; + for my $line (split /\r\n?|\n/, $octet){ + # cheat 2 -- \e in the string + if ($line =~ /\e/o){ + my @keys = keys %try; + delete @try{qw/utf8 ascii/}; + for my $k (@keys){ + ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; } } - $DEBUG and warn "$utf, be == $be, le == $le"; - $be == $le - and return "Encodings ambiguous between $utf BE and LE ($be, $le)"; - $utf .= ($be > $le) ? 'BE' : 'LE'; - return find_encoding($utf); - }else{ - my $nline = 1; - for my $line (split /\r\n?|\n/, $octet){ - # cheat 2 -- \e in the string - if ($line =~ /\e/o){ - my @keys = keys %try; - delete @try{qw/utf8 ascii/}; - for my $k (@keys){ - ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; - } - } - my %ok = %try; - # warn join(",", keys %try); - for my $k (keys %try){ - my $scratch = $line; - $try{$k}->decode($scratch, FB_QUIET); - if ($scratch eq ''){ - $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); - }else{ - use bytes (); - $DEBUG and - warn sprintf("%4d:%-24s not ok; %d bytes left\n", - $nline, $k, bytes::length($scratch)); - delete $ok{$k}; - } + my %ok = %try; + # warn join(",", keys %try); + for my $k (keys %try){ + my $scratch = $line; + $try{$k}->decode($scratch, FB_QUIET); + if ($scratch eq ''){ + $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); + }else{ + use bytes (); + $DEBUG and + warn sprintf("%4d:%-24s not ok; %d bytes left\n", + $nline, $k, bytes::length($scratch)); + delete $ok{$k}; } - %ok or return "No appropriate encodings found!"; - if (scalar(keys(%ok)) == 1){ - my ($retval) = values(%ok); - return $retval; - } - %try = %ok; $nline++; } + %ok or return "No appropriate encodings found!"; + if (scalar(keys(%ok)) == 1){ + my ($retval) = values(%ok); + return $retval; + } + %try = %ok; $nline++; } $try{ascii} or return "Encodings too ambiguous: ", join(" or ", keys %try); @@ -189,6 +195,10 @@ canonical names or aliases. # tries all major Japanese Encodings as well use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; +If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true +value, no heuristics will be applied to UTF8/16/32, and the result +will be limited to the suspects and C. + =over 4 =item Encode::Guess->set_suspects