use strict;
use Encode qw(:fallbacks find_encoding);
-our $VERSION = do { my @r = (q$Revision: 1.8 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
my $Canon = 'Guess';
-our $DEBUG = 0;
+sub DEBUG () { 0 }
our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
$Encode::Encoding{$Canon} =
bless {
our @EXPORT = qw(guess_encoding);
our $NoUTFAutoGuess = 0;
+our $UTF8_BOM = pack("C3", 0xef, 0xbb, 0xbf);
sub import { # Exporter not used so we do it on our own
my $callpkg = caller;
for my $c (@_){
my $e = find_encoding($c) or die "Unknown encoding: $c";
$self->{Suspects}{$e->name} = $e;
- $DEBUG and warn "Added: ", $e->name;
+ DEBUG and warn "Added: ", $e->name;
}
}
# cheat 1: BOM
use Encode::Unicode;
unless ($NoUTFAutoGuess) {
- my $BOM = unpack('n', $octet);
- return find_encoding('UTF-16')
- if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
+ my $BOM = pack('C3', unpack("C3", $octet));
+ return find_encoding('utf8')
+ if (defined $BOM and $BOM eq $UTF8_BOM);
$BOM = unpack('N', $octet);
return find_encoding('UTF-32')
if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
+ $BOM = unpack('n', $octet);
+ return find_encoding('UTF-16')
+ if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
my $utf;
my ($be, $le) = (0, 0);
$char & 0xff00 and $le++;
}
}
- $DEBUG and warn "$utf, be == $be, le == $le";
+ DEBUG and warn "$utf, be == $be, le == $le";
$be == $le
and return
"Encodings ambiguous between $utf BE and LE ($be, $le)";
for my $c (@_){
my $e = find_encoding($c) or die "Unknown encoding: $c";
$try{$e->name} = $e;
- $DEBUG and warn "Added: ", $e->name;
+ DEBUG and warn "Added: ", $e->name;
}
my $nline = 1;
for my $line (split /\r\n?|\n/, $octet){
my $scratch = $line;
$try{$k}->decode($scratch, FB_QUIET);
if ($scratch eq ''){
- $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
+ DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
}else{
use bytes ();
- $DEBUG and
+ DEBUG and
warn sprintf("%4d:%-24s not ok; %d bytes left\n",
$nline, $k, bytes::length($scratch));
delete $ok{$k};
check (I<suspects> as follows). The name of suspects can either be
canonical names or aliases.
+CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.
+
# tries all major Japanese Encodings as well
use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;