ext/Encode/lib/Encode/Guess.pm

   1 package Encode::Guess;
   2 use strict;
   3 use warnings;
   4 use Encode qw(:fallbacks find_encoding);
   5 our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
   6
   7 my $Canon = 'Guess';
   8 sub DEBUG () { 0 }
   9 our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
  10 $Encode::Encoding{$Canon} = bless {
  11     Name     => $Canon,
  12     Suspects => {%DEF_SUSPECTS},
  13 } => __PACKAGE__;
  14
  15 use base qw(Encode::Encoding);
  16 sub needs_lines { 1 }
  17 sub perlio_ok   { 0 }
  18
  19 our @EXPORT         = qw(guess_encoding);
  20 our $NoUTFAutoGuess = 0;
  21 our $UTF8_BOM       = pack( "C3", 0xef, 0xbb, 0xbf );
  22
  23 sub import {    # Exporter not used so we do it on our own
  24     my $callpkg = caller;
  25     for my $item (@EXPORT) {
  26         no strict 'refs';
  27         *{"$callpkg\::$item"} = \&{"$item"};
  28     }
  29     set_suspects(@_);
  30 }
  31
  32 sub set_suspects {
  33     my $class = shift;
  34     my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
  35     $self->{Suspects} = {%DEF_SUSPECTS};
  36     $self->add_suspects(@_);
  37 }
  38
  39 sub add_suspects {
  40     my $class = shift;
  41     my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
  42     for my $c (@_) {
  43         my $e = find_encoding($c) or die "Unknown encoding: $c";
  44         $self->{Suspects}{ $e->name } = $e;
  45         DEBUG and warn "Added: ", $e->name;
  46     }
  47 }
  48
  49 sub decode($$;$) {
  50     my ( $obj, $octet, $chk ) = @_;
  51     my $guessed = guess( $obj, $octet );
  52     unless ( ref($guessed) ) {
  53         require Carp;
  54         Carp::croak($guessed);
  55     }
  56     my $utf8 = $guessed->decode( $octet, $chk );
  57     $_[1] = $octet if $chk;
  58     return $utf8;
  59 }
  60
  61 sub guess_encoding {
  62     guess( $Encode::Encoding{$Canon}, @_ );
  63 }
  64
  65 sub guess {
  66     my $class = shift;
  67     my $obj   = ref($class) ? $class : $Encode::Encoding{$Canon};
  68     my $octet = shift;
  69
  70     # sanity check
  71     return unless defined $octet and length $octet;
  72
  73     # cheat 0: utf8 flag;
  74     if ( Encode::is_utf8($octet) ) {
  75         return find_encoding('utf8') unless $NoUTFAutoGuess;
  76         Encode::_utf8_off($octet);
  77     }
  78
  79     # cheat 1: BOM
  80     use Encode::Unicode;
  81     unless ($NoUTFAutoGuess) {
  82         my $BOM = pack( 'C3', unpack( "C3", $octet ) );
  83         return find_encoding('utf8')
  84           if ( defined $BOM and $BOM eq $UTF8_BOM );
  85         $BOM = unpack( 'N', $octet );
  86         return find_encoding('UTF-32')
  87           if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) );
  88         $BOM = unpack( 'n', $octet );
  89         return find_encoding('UTF-16')
  90           if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) );
  91         if ( $octet =~ /\x00/o )
  92         {    # if \x00 found, we assume UTF-(16|32)(BE|LE)
  93             my $utf;
  94             my ( $be, $le ) = ( 0, 0 );
  95             if ( $octet =~ /\x00\x00/o ) {    # UTF-32(BE|LE) assumed
  96                 $utf = "UTF-32";
  97                 for my $char ( unpack( 'N*', $octet ) ) {
  98                     $char & 0x0000ffff and $be++;
  99                     $char & 0xffff0000 and $le++;
 100                 }
 101             }
 102             else {                            # UTF-16(BE|LE) assumed
 103                 $utf = "UTF-16";
 104                 for my $char ( unpack( 'n*', $octet ) ) {
 105                     $char & 0x00ff and $be++;
 106                     $char & 0xff00 and $le++;
 107                 }
 108             }
 109             DEBUG and warn "$utf, be == $be, le == $le";
 110             $be == $le
 111               and return
 112               "Encodings ambiguous between $utf BE and LE ($be, $le)";
 113             $utf .= ( $be > $le ) ? 'BE' : 'LE';
 114             return find_encoding($utf);
 115         }
 116     }
 117     my %try = %{ $obj->{Suspects} };
 118     for my $c (@_) {
 119         my $e = find_encoding($c) or die "Unknown encoding: $c";
 120         $try{ $e->name } = $e;
 121         DEBUG and warn "Added: ", $e->name;
 122     }
 123     my $nline = 1;
 124     for my $line ( split /\r\n?|\n/, $octet ) {
 125
 126         # cheat 2 -- \e in the string
 127         if ( $line =~ /\e/o ) {
 128             my @keys = keys %try;
 129             delete @try{qw/utf8 ascii/};
 130             for my $k (@keys) {
 131                 ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k};
 132             }
 133         }
 134         my %ok = %try;
 135
 136         # warn join(",", keys %try);
 137         for my $k ( keys %try ) {
 138             my $scratch = $line;
 139             $try{$k}->decode( $scratch, FB_QUIET );
 140             if ( $scratch eq '' ) {
 141                 DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k );
 142             }
 143             else {
 144                 use bytes ();
 145                 DEBUG
 146                   and warn sprintf( "%4d:%-24s not ok; %d bytes left\n",
 147                     $nline, $k, bytes::length($scratch) );
 148                 delete $ok{$k};
 149             }
 150         }
 151         %ok or return "No appropriate encodings found!";
 152         if ( scalar( keys(%ok) ) == 1 ) {
 153             my ($retval) = values(%ok);
 154             return $retval;
 155         }
 156         %try = %ok;
 157         $nline++;
 158     }
 159     $try{ascii}
 160       or return "Encodings too ambiguous: ", join( " or ", keys %try );
 161     return $try{ascii};
 162 }
 163
 164 1;
 165 __END__
 166
 167 =head1 NAME
 168
 169 Encode::Guess -- Guesses encoding from data
 170
 171 =head1 SYNOPSIS
 172
 173   # if you are sure $data won't contain anything bogus
 174
 175   use Encode;
 176   use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
 177   my $utf8 = decode("Guess", $data);
 178   my $data = encode("Guess", $utf8);   # this doesn't work!
 179
 180   # more elaborate way
 181   use Encode::Guess;
 182   my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
 183   ref($enc) or die "Can't guess: $enc"; # trap error this way
 184   $utf8 = $enc->decode($data);
 185   # or
 186   $utf8 = decode($enc->name, $data)
 187
 188 =head1 ABSTRACT
 189
 190 Encode::Guess enables you to guess in what encoding a given data is
 191 encoded, or at least tries to.
 192
 193 =head1 DESCRIPTION
 194
 195 By default, it checks only ascii, utf8 and UTF-16/32 with BOM.
 196
 197   use Encode::Guess; # ascii/utf8/BOMed UTF
 198
 199 To use it more practically, you have to give the names of encodings to
 200 check (I<suspects> as follows).  The name of suspects can either be
 201 canonical names or aliases.
 202
 203 CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.
 204
 205  # tries all major Japanese Encodings as well
 206   use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
 207
 208 If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
 209 value, no heuristics will be applied to UTF8/16/32, and the result
 210 will be limited to the suspects and C<ascii>.
 211
 212 =over 4
 213
 214 =item Encode::Guess->set_suspects
 215
 216 You can also change the internal suspects list via C<set_suspects>
 217 method.
 218
 219   use Encode::Guess;
 220   Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);
 221
 222 =item Encode::Guess->add_suspects
 223
 224 Or you can use C<add_suspects> method.  The difference is that
 225 C<set_suspects> flushes the current suspects list while
 226 C<add_suspects> adds.
 227
 228   use Encode::Guess;
 229   Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
 230   # now the suspects are euc-jp,shiftjis,7bit-jis, AND
 231   # euc-kr,euc-cn, and big5-eten
 232   Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);
 233
 234 =item Encode::decode("Guess" ...)
 235
 236 When you are content with suspects list, you can now
 237
 238   my $utf8 = Encode::decode("Guess", $data);
 239
 240 =item Encode::Guess->guess($data)
 241
 242 But it will croak if:
 243
 244 =over
 245
 246 =item *
 247
 248 Two or more suspects remain
 249
 250 =item *
 251
 252 No suspects left
 253
 254 =back
 255
 256 So you should instead try this;
 257
 258   my $decoder = Encode::Guess->guess($data);
 259
 260 On success, $decoder is an object that is documented in
 261 L<Encode::Encoding>.  So you can now do this;
 262
 263   my $utf8 = $decoder->decode($data);
 264
 265 On failure, $decoder now contains an error message so the whole thing
 266 would be as follows;
 267
 268   my $decoder = Encode::Guess->guess($data);
 269   die $decoder unless ref($decoder);
 270   my $utf8 = $decoder->decode($data);
 271
 272 =item guess_encoding($data, [, I<list of suspects>])
 273
 274 You can also try C<guess_encoding> function which is exported by
 275 default.  It takes $data to check and it also takes the list of
 276 suspects by option.  The optional suspect list is I<not reflected> to
 277 the internal suspects list.
 278
 279   my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
 280   die $decoder unless ref($decoder);
 281   my $utf8 = $decoder->decode($data);
 282   # check only ascii and utf8
 283   my $decoder = guess_encoding($data);
 284
 285 =back
 286
 287 =head1 CAVEATS
 288
 289 =over 4
 290
 291 =item *
 292
 293 Because of the algorithm used, ISO-8859 series and other single-byte
 294 encodings do not work well unless either one of ISO-8859 is the only
 295 one suspect (besides ascii and utf8).
 296
 297   use Encode::Guess;
 298   # perhaps ok
 299   my $decoder = guess_encoding($data, 'latin1');
 300   # definitely NOT ok
 301   my $decoder = guess_encoding($data, qw/latin1 greek/);
 302
 303 The reason is that Encode::Guess guesses encoding by trial and error.
 304 It first splits $data into lines and tries to decode the line for each
 305 suspect.  It keeps it going until all but one encoding is eliminated
 306 out of suspects list.  ISO-8859 series is just too successful for most
 307 cases (because it fills almost all code points in \x00-\xff).
 308
 309 =item *
 310
 311 Do not mix national standard encodings and the corresponding vendor
 312 encodings.
 313
 314   # a very bad idea
 315   my $decoder
 316      = guess_encoding($data, qw/shiftjis MacJapanese cp932/);
 317
 318 The reason is that vendor encoding is usually a superset of national
 319 standard so it becomes too ambiguous for most cases.
 320
 321 =item *
 322
 323 On the other hand, mixing various national standard encodings
 324 automagically works unless $data is too short to allow for guessing.
 325
 326  # This is ok if $data is long enough
 327  my $decoder =
 328   guess_encoding($data, qw/euc-cn
 329                            euc-jp shiftjis 7bit-jis
 330                            euc-kr
 331                            big5-eten/);
 332
 333 =item *
 334
 335 DO NOT PUT TOO MANY SUSPECTS!  Don't you try something like this!
 336
 337   my $decoder = guess_encoding($data,
 338                                Encode->encodings(":all"));
 339
 340 =back
 341
 342 It is, after all, just a guess.  You should alway be explicit when it
 343 comes to encodings.  But there are some, especially Japanese,
 344 environment that guess-coding is a must.  Use this module with care.
 345
 346 =head1 TO DO
 347
 348 Encode::Guess does not work on EBCDIC platforms.
 349
 350 =head1 SEE ALSO
 351
 352 L<Encode>, L<Encode::Encoding>
 353
 354 =cut
 355