From: Dan Kogai <dankogai@dan.co.jp>
Date: Sat, 28 Jun 2003 01:20:59 +0000 (+0900)
Subject: [Encode] pre-1.97 patches
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=23f3589e21445e9141901c2894bc97b457493332;p=p5sagit%2Fp5-mst-13.2.git

[Encode] pre-1.97 patches
Message-Id: <56D5BFEE-A8BB-11D7-9092-000393AE4244@dan.co.jp>

p4raw-id: //depot/perl@19871
---

diff --git a/ext/Encode/Changes b/ext/Encode/Changes
index 18f5788..7251f5d 100644
--- a/ext/Encode/Changes
+++ b/ext/Encode/Changes
@@ -3,6 +3,16 @@
 # $Id: Changes,v 1.96 2003/06/18 09:29:02 dankogai Exp $
 #
 $Revision: 1.96 $ $Date: 2003/06/18 09:29:02 $
+! lib/Encode/Guess.pm
+  $Encode::Guess::NoUTFAutoGuess is added so you can turn off
+  automatic  utf(8|16|32) guessing -- originally by Autrijus
+  Message-Id: <20030626162731.GA2077@not.autrijus.org>
+! Encode.pm
+  Addressed the following;
+  Subject: [perl #22835] FB_QUIET doesn't work with Encode::encode 
+  Message-Id: <rt-22835-59975.6.8650775354304@rt.perl.org>
+
+1.96 2003/06/18 09:29:02
 ! lib/Encode/JP/JP.pm t/guess.t
   m/(...)/ in void context then $1 is considered a Bad Thing
   Message-Id: <B5AB34D0-A019-11D7-AF03-000393AE4244@dan.co.jp>
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 57bcc2b..db74b6a 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -147,7 +147,7 @@ sub encode($$;$)
 	Carp::croak("Unknown encoding '$name'");
     }
     my $octets = $enc->encode($string,$check);
-    return undef if ($check && length($string));
+    $_[1] = $string if $check;
     return $octets;
 }
 
diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm
index fc8d267..5858f81 100644
--- a/ext/Encode/lib/Encode/Guess.pm
+++ b/ext/Encode/lib/Encode/Guess.pm
@@ -18,6 +18,7 @@ sub needs_lines { 1 }
 sub perlio_ok { 0 }
 
 our @EXPORT = qw(guess_encoding);
+our $NoUTFAutoGuess = 0;
 
 sub import { # Exporter not used so we do it on our own
     my $callpkg = caller;
@@ -70,75 +71,80 @@ sub guess {
     return unless defined $octet and length $octet;
 
     # cheat 0: utf8 flag;
-    Encode::is_utf8($octet) and return find_encoding('utf8');
+    if ( Encode::is_utf8($octet) ) {
+	return find_encoding('utf8') unless $NoUTFAutoGuess;
+	Encode::_utf8_off($octet);
+    }
     # cheat 1: BOM
     use Encode::Unicode;
-    my $BOM = unpack('n', $octet);
-    return find_encoding('UTF-16') 
-	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
-    $BOM = unpack('N', $octet);
-    return find_encoding('UTF-32') 
-	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
+    unless ($NoUTFAutoGuess) {
+	my $BOM = unpack('n', $octet);
+	return find_encoding('UTF-16')
+	    if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
+	$BOM = unpack('N', $octet);
+	return find_encoding('UTF-32')
+	    if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
+	if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
+	    my $utf;
+	    my ($be, $le) = (0, 0);
+	    if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
+		$utf = "UTF-32";
+		for my $char (unpack('N*', $octet)){
+		    $char & 0x0000ffff and $be++;
+		    $char & 0xffff0000 and $le++;
+		}
+	    }else{ # UTF-16(BE|LE) assumed
+		$utf = "UTF-16";
+		for my $char (unpack('n*', $octet)){
+		    $char & 0x00ff and $be++;
+		    $char & 0xff00 and $le++;
+		}
+	    }
+	    $DEBUG and warn "$utf, be == $be, le == $le";
+	    $be == $le 
+		and return
+		    "Encodings ambiguous between $utf BE and LE ($be, $le)";
+	    $utf .= ($be > $le) ? 'BE' : 'LE';
+	    return find_encoding($utf);
+	}
+    }
     my %try =  %{$obj->{Suspects}};
     for my $c (@_){
 	my $e = find_encoding($c) or die "Unknown encoding: $c";
 	$try{$e->name} = $e;
 	$DEBUG and warn "Added: ", $e->name;
     }
-    if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
-	my $utf;
-	my ($be, $le) = (0, 0);
-	if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
-	    $utf = "UTF-32";
-	    for my $char (unpack('N*', $octet)){
-		$char & 0x0000ffff and $be++;
-		$char & 0xffff0000 and $le++;
-	    }
-	}else{ # UTF-16(BE|LE) assumed
-	    $utf = "UTF-16";
-	    for my $char (unpack('n*', $octet)){
-		$char & 0x00ff and $be++;
-		$char & 0xff00 and $le++;
+    my $nline = 1;
+    for my $line (split /\r\n?|\n/, $octet){
+	# cheat 2 -- \e in the string
+	if ($line =~ /\e/o){
+	    my @keys = keys %try;
+	    delete @try{qw/utf8 ascii/};
+	    for my $k (@keys){
+		ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
 	    }
 	}
-	$DEBUG and warn "$utf, be == $be, le == $le";
-	$be == $le 
-	    and return "Encodings ambiguous between $utf BE and LE ($be, $le)";
-	$utf .= ($be > $le) ? 'BE' : 'LE';
-	return find_encoding($utf);
-    }else{
-	my $nline = 1;
-	for my $line (split /\r\n?|\n/, $octet){
-	    # cheat 2 -- \e in the string
-	    if ($line =~ /\e/o){
-		my @keys = keys %try;
-		delete @try{qw/utf8 ascii/};
-		for my $k (@keys){
-		    ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
-		}
-	    }
-	    my %ok = %try;
-	    # warn join(",", keys %try);
-	    for my $k (keys %try){
-		my $scratch = $line;
-		$try{$k}->decode($scratch, FB_QUIET);
-		if ($scratch eq ''){
-		    $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
-		}else{
-		    use bytes ();
-		    $DEBUG and 
-			warn sprintf("%4d:%-24s not ok; %d bytes left\n", 
-				     $nline, $k, bytes::length($scratch));
-		    delete $ok{$k};
-		}
+	my %ok = %try;
+	# warn join(",", keys %try);
+	for my $k (keys %try){
+	    my $scratch = $line;
+	    $try{$k}->decode($scratch, FB_QUIET);
+	    if ($scratch eq ''){
+		$DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
+	    }else{
+		use bytes ();
+		$DEBUG and 
+		    warn sprintf("%4d:%-24s not ok; %d bytes left\n", 
+				 $nline, $k, bytes::length($scratch));
+		delete $ok{$k};
 	    }
-	    %ok or return "No appropriate encodings found!";
-	    if (scalar(keys(%ok)) == 1){
-		my ($retval) = values(%ok);
-		return $retval;
-	    }
-	    %try = %ok; $nline++;
 	}
+	%ok or return "No appropriate encodings found!";
+	if (scalar(keys(%ok)) == 1){
+	    my ($retval) = values(%ok);
+	    return $retval;
+	}
+	%try = %ok; $nline++;
     }
     $try{ascii} or 
 	return  "Encodings too ambiguous: ", join(" or ", keys %try);
@@ -189,6 +195,10 @@ canonical names or aliases.
  # tries all major Japanese Encodings as well
   use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
 
+If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
+value, no heuristics will be applied to UTF8/16/32, and the result
+will be limited to the suspects and C<ascii>.
+
 =over 4
 
 =item Encode::Guess->set_suspects