use strict;
use Encode qw(:fallbacks find_encoding);
-our $VERSION = do { my @r = (q$Revision: 1.3 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
my $Canon = 'Guess';
-our $DEBUG = 0;
+sub DEBUG () { 0 }
our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
$Encode::Encoding{$Canon} =
bless {
sub perlio_ok { 0 }
our @EXPORT = qw(guess_encoding);
+our $NoUTFAutoGuess = 0;
+our $UTF8_BOM = pack("C3", 0xef, 0xbb, 0xbf);
sub import { # Exporter not used so we do it on our own
my $callpkg = caller;
for my $c (@_){
my $e = find_encoding($c) or die "Unknown encoding: $c";
$self->{Suspects}{$e->name} = $e;
- $DEBUG and warn "Added: ", $e->name;
+ DEBUG and warn "Added: ", $e->name;
}
}
my $class = shift;
my $obj = ref($class) ? $class : $Encode::Encoding{$Canon};
my $octet = shift;
+
+ # sanity check
+ return unless defined $octet and length $octet;
+
# cheat 0: utf8 flag;
- Encode::is_utf8($octet) and return find_encoding('utf8');
+ if ( Encode::is_utf8($octet) ) {
+ return find_encoding('utf8') unless $NoUTFAutoGuess;
+ Encode::_utf8_off($octet);
+ }
# cheat 1: BOM
use Encode::Unicode;
- my $BOM = unpack('n', $octet);
- return find_encoding('UTF-16')
- if ($BOM == 0xFeFF or $BOM == 0xFFFe);
- $BOM = unpack('N', $octet);
- return find_encoding('UTF-32')
- if ($BOM == 0xFeFF or $BOM == 0xFFFe0000);
-
+ unless ($NoUTFAutoGuess) {
+ my $BOM = pack('C3', unpack("C3", $octet));
+ return find_encoding('utf8')
+ if (defined $BOM and $BOM eq $UTF8_BOM);
+ $BOM = unpack('N', $octet);
+ return find_encoding('UTF-32')
+ if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
+ $BOM = unpack('n', $octet);
+ return find_encoding('UTF-16')
+ if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
+ if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
+ my $utf;
+ my ($be, $le) = (0, 0);
+ if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
+ $utf = "UTF-32";
+ for my $char (unpack('N*', $octet)){
+ $char & 0x0000ffff and $be++;
+ $char & 0xffff0000 and $le++;
+ }
+ }else{ # UTF-16(BE|LE) assumed
+ $utf = "UTF-16";
+ for my $char (unpack('n*', $octet)){
+ $char & 0x00ff and $be++;
+ $char & 0xff00 and $le++;
+ }
+ }
+ DEBUG and warn "$utf, be == $be, le == $le";
+ $be == $le
+ and return
+ "Encodings ambiguous between $utf BE and LE ($be, $le)";
+ $utf .= ($be > $le) ? 'BE' : 'LE';
+ return find_encoding($utf);
+ }
+ }
my %try = %{$obj->{Suspects}};
for my $c (@_){
my $e = find_encoding($c) or die "Unknown encoding: $c";
$try{$e->name} = $e;
- $DEBUG and warn "Added: ", $e->name;
+ DEBUG and warn "Added: ", $e->name;
}
my $nline = 1;
- for my $line (split /\r|\n|\r\n/, $octet){
+ for my $line (split /\r\n?|\n/, $octet){
# cheat 2 -- \e in the string
if ($line =~ /\e/o){
my @keys = keys %try;
my $scratch = $line;
$try{$k}->decode($scratch, FB_QUIET);
if ($scratch eq ''){
- $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
+ DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
}else{
use bytes ();
- $DEBUG and
+ DEBUG and
warn sprintf("%4d:%-24s not ok; %d bytes left\n",
$nline, $k, bytes::length($scratch));
delete $ok{$k};
-
}
}
%ok or return "No appropriate encodings found!";
# if you are sure $data won't contain anything bogus
+ use Encode;
use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
my $utf8 = decode("Guess", $data);
my $data = encode("Guess", $utf8); # this doesn't work!
# more elaborate way
- use Encode::Guess,
+ use Encode::Guess;
my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
ref($enc) or die "Can't guess: $enc"; # trap error this way
$utf8 = $enc->decode($data);
check (I<suspects> as follows). The name of suspects can either be
canonical names or aliases.
+CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.
+
# tries all major Japanese Encodings as well
use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
+If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
+value, no heuristics will be applied to UTF8/16/32, and the result
+will be limited to the suspects and C<ascii>.
+
=over 4
=item Encode::Guess->set_suspects
=item Encode::Guess->guess($data)
-But it will croak if Encode::Guess fails to eliminate all other
-suspects but the right one or no suspect was good. So you should
-instead try this;
+But it will croak if:
+
+=over
+
+=item *
+
+Two or more suspects remain
+
+=item *
+
+No suspects left
+
+=back
+
+So you should instead try this;
my $decoder = Encode::Guess->guess($data);
The reason is that Encode::Guess guesses encoding by trial and error.
It first splits $data into lines and tries to decode the line for each
-suspect. It keeps it going until all but one encoding was eliminated
+suspect. It keeps it going until all but one encoding is eliminated
out of suspects list. ISO-8859 series is just too successful for most
cases (because it fills almost all code points in \x00-\xff).
comes to encodings. But there are some, especially Japanese,
environment that guess-coding is a must. Use this module with care.
+=head1 TO DO
+
+Encode::Guess does not work on EBCDIC platforms.
+
=head1 SEE ALSO
L<Encode>, L<Encode::Encoding>