X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2Flib%2FEncode%2FGuess.pm;h=5692cee9a4ab2c66067720f8531948daea15e658;hb=7237418a8f516d6002de239a7ae175380a62b3c8;hp=8a4f04c7ed5e37720811c5c6d9903d1cc8f49497;hpb=bc2a472ee2496e18323688445a703595a998e3a4;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm index 8a4f04c..5692cee 100644 --- a/ext/Encode/lib/Encode/Guess.pm +++ b/ext/Encode/lib/Encode/Guess.pm @@ -2,7 +2,7 @@ package Encode::Guess; use strict; use Encode qw(:fallbacks find_encoding); -our $VERSION = do { my @r = (q$Revision: 1.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; my $Canon = 'Guess'; sub DEBUG () { 0 } @@ -19,6 +19,7 @@ sub perlio_ok { 0 } our @EXPORT = qw(guess_encoding); our $NoUTFAutoGuess = 0; +our $UTF8_BOM = pack("C3", 0xef, 0xbb, 0xbf); sub import { # Exporter not used so we do it on our own my $callpkg = caller; @@ -78,12 +79,15 @@ sub guess { # cheat 1: BOM use Encode::Unicode; unless ($NoUTFAutoGuess) { - my $BOM = unpack('n', $octet); - return find_encoding('UTF-16') - if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); + my $BOM = pack('C3', unpack("C3", $octet)); + return find_encoding('utf8') + if (defined $BOM and $BOM eq $UTF8_BOM); $BOM = unpack('N', $octet); return find_encoding('UTF-32') if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); + $BOM = unpack('n', $octet); + return find_encoding('UTF-16') + if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) my $utf; my ($be, $le) = (0, 0); @@ -192,6 +196,8 @@ To use it more practically, you have to give the names of encodings to check (I<suspects> as follows). The name of suspects can either be canonical names or aliases. +CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED. 
+ # tries all major Japanese Encodings as well use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;