[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Guess.pm

package Encode::Guess;
use strict;

use Encode qw(:fallbacks find_encoding);
our $VERSION = do { my @r = (q$Revision: 1.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

my $Canon = 'Guess';
sub DEBUG () { 0 }
our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
$Encode::Encoding{$Canon} = 
    bless { 
	   Name       => $Canon,
	   Suspects => { %DEF_SUSPECTS },
	  } => __PACKAGE__;

use base qw(Encode::Encoding);
sub needs_lines { 1 }
sub perlio_ok { 0 }

our @EXPORT = qw(guess_encoding);
our $NoUTFAutoGuess = 0;

sub import { # Exporter not used so we do it on our own
    my $callpkg = caller;
    for my $item (@EXPORT){
	no strict 'refs';
	*{"$callpkg\::$item"} = \&{"$item"};
    }
    set_suspects(@_);
}

sub set_suspects{
    my $class = shift;
    my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
    $self->{Suspects} = { %DEF_SUSPECTS };
    $self->add_suspects(@_);
}

sub add_suspects{
    my $class = shift;
    my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
    for my $c (@_){
	my $e = find_encoding($c) or die "Unknown encoding: $c";
	$self->{Suspects}{$e->name} = $e;
	DEBUG and warn "Added: ", $e->name;
    }
}

sub decode($$;$){
    my ($obj, $octet, $chk) = @_;
    my $guessed = guess($obj, $octet);
    unless (ref($guessed)){
	require Carp;
	Carp::croak($guessed);
    }
    my $utf8 = $guessed->decode($octet, $chk);
    $_[1] = $octet if $chk;
    return $utf8;
}

sub guess_encoding{
    guess($Encode::Encoding{$Canon}, @_);
}

sub guess {
    my $class = shift;
    my $obj   = ref($class) ? $class : $Encode::Encoding{$Canon};
    my $octet = shift;

    # sanity check
    return unless defined $octet and length $octet;

    # cheat 0: utf8 flag;
    if ( Encode::is_utf8($octet) ) {
	return find_encoding('utf8') unless $NoUTFAutoGuess;
	Encode::_utf8_off($octet);
    }
    # cheat 1: BOM
    use Encode::Unicode;
    unless ($NoUTFAutoGuess) {
	my $BOM = unpack('n', $octet);
	return find_encoding('UTF-16')
	    if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
	$BOM = unpack('N', $octet);
	return find_encoding('UTF-32')
	    if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
	if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
	    my $utf;
	    my ($be, $le) = (0, 0);
	    if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
		$utf = "UTF-32";
		for my $char (unpack('N*', $octet)){
		    $char & 0x0000ffff and $be++;
		    $char & 0xffff0000 and $le++;
		}
	    }else{ # UTF-16(BE|LE) assumed
		$utf = "UTF-16";
		for my $char (unpack('n*', $octet)){
		    $char & 0x00ff and $be++;
		    $char & 0xff00 and $le++;
		}
	    }
	    DEBUG and warn "$utf, be == $be, le == $le";
	    $be == $le 
		and return
		    "Encodings ambiguous between $utf BE and LE ($be, $le)";
	    $utf .= ($be > $le) ? 'BE' : 'LE';
	    return find_encoding($utf);
	}
    }
    my %try =  %{$obj->{Suspects}};
    for my $c (@_){
	my $e = find_encoding($c) or die "Unknown encoding: $c";
	$try{$e->name} = $e;
	DEBUG and warn "Added: ", $e->name;
    }
    my $nline = 1;
    for my $line (split /\r\n?|\n/, $octet){
	# cheat 2 -- \e in the string
	if ($line =~ /\e/o){
	    my @keys = keys %try;
	    delete @try{qw/utf8 ascii/};
	    for my $k (@keys){
		ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
	    }
	}
	my %ok = %try;
	# warn join(",", keys %try);
	for my $k (keys %try){
	    my $scratch = $line;
	    $try{$k}->decode($scratch, FB_QUIET);
	    if ($scratch eq ''){
		DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
	    }else{
		use bytes ();
		DEBUG and 
		    warn sprintf("%4d:%-24s not ok; %d bytes left\n", 
				 $nline, $k, bytes::length($scratch));
		delete $ok{$k};
	    }
	}
	%ok or return "No appropriate encodings found!";
	if (scalar(keys(%ok)) == 1){
	    my ($retval) = values(%ok);
	    return $retval;
	}
	%try = %ok; $nline++;
    }
    $try{ascii} or 
	return  "Encodings too ambiguous: ", join(" or ", keys %try);
    return $try{ascii};
}


1;
__END__

=head1 NAME

Encode::Guess -- Guesses encoding from data

=head1 SYNOPSIS

  # if you are sure $data won't contain anything bogus

  use Encode;
  use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
  my $utf8 = decode("Guess", $data);
  my $data = encode("Guess", $utf8);   # this doesn't work!

  # more elaborate way
  use Encode::Guess;
  my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
  ref($enc) or die "Can't guess: $enc"; # trap error this way
  $utf8 = $enc->decode($data);
  # or
  $utf8 = decode($enc->name, $data)

=head1 ABSTRACT

Encode::Guess enables you to guess in what encoding a given data is
encoded, or at least tries to.  

=head1 DESCRIPTION

By default, it checks only ascii, utf8 and UTF-16/32 with BOM.

  use Encode::Guess; # ascii/utf8/BOMed UTF

To use it more practically, you have to give the names of encodings to
check (I<suspects> as follows).  The name of suspects can either be
canonical names or aliases.

 # tries all major Japanese Encodings as well
  use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;

If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
value, no heuristics will be applied to UTF8/16/32, and the result
will be limited to the suspects and C<ascii>.

=over 4

=item Encode::Guess->set_suspects

You can also change the internal suspects list via C<set_suspects>
method. 

  use Encode::Guess;
  Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);

=item Encode::Guess->add_suspects

Or you can use C<add_suspects> method.  The difference is that
C<set_suspects> flushes the current suspects list while
C<add_suspects> adds.

  use Encode::Guess;
  Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
  # now the suspects are euc-jp,shiftjis,7bit-jis, AND
  # euc-kr,euc-cn, and big5-eten
  Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);

=item Encode::decode("Guess" ...)

When you are content with suspects list, you can now

  my $utf8 = Encode::decode("Guess", $data);

=item Encode::Guess->guess($data)

But it will croak if:

=over

=item *

Two or more suspects remain

=item *

No suspects left

=back

So you should instead try this;

  my $decoder = Encode::Guess->guess($data);

On success, $decoder is an object that is documented in
L<Encode::Encoding>.  So you can now do this;

  my $utf8 = $decoder->decode($data);

On failure, $decoder now contains an error message so the whole thing
would be as follows;

  my $decoder = Encode::Guess->guess($data);
  die $decoder unless ref($decoder);
  my $utf8 = $decoder->decode($data);

=item guess_encoding($data, [, I<list of suspects>])

You can also try C<guess_encoding> function which is exported by
default.  It takes $data to check and it also takes the list of
suspects by option.  The optional suspect list is I<not reflected> to
the internal suspects list.

  my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
  die $decoder unless ref($decoder);
  my $utf8 = $decoder->decode($data);
  # check only ascii and utf8
  my $decoder = guess_encoding($data);

=back

=head1 CAVEATS

=over 4

=item *

Because of the algorithm used, ISO-8859 series and other single-byte
encodings do not work well unless either one of ISO-8859 is the only
one suspect (besides ascii and utf8).

  use Encode::Guess;
  # perhaps ok
  my $decoder = guess_encoding($data, 'latin1');
  # definitely NOT ok
  my $decoder = guess_encoding($data, qw/latin1 greek/);

The reason is that Encode::Guess guesses encoding by trial and error.
It first splits $data into lines and tries to decode the line for each
suspect.  It keeps it going until all but one encoding is eliminated
out of suspects list.  ISO-8859 series is just too successful for most
cases (because it fills almost all code points in \x00-\xff).

=item *

Do not mix national standard encodings and the corresponding vendor
encodings.

  # a very bad idea
  my $decoder
     = guess_encoding($data, qw/shiftjis MacJapanese cp932/);

The reason is that vendor encoding is usually a superset of national
standard so it becomes too ambiguous for most cases.

=item *

On the other hand, mixing various national standard encodings
automagically works unless $data is too short to allow for guessing.

 # This is ok if $data is long enough
 my $decoder =  
  guess_encoding($data, qw/euc-cn
                           euc-jp shiftjis 7bit-jis
                           euc-kr
                           big5-eten/);

=item *

DO NOT PUT TOO MANY SUSPECTS!  Don't you try something like this!

  my $decoder = guess_encoding($data, 
                               Encode->encodings(":all"));

=back

It is, after all, just a guess.  You should alway be explicit when it
comes to encodings.  But there are some, especially Japanese,
environment that guess-coding is a must.  Use this module with care. 

=head1 TO DO

Encode::Guess does not work on EBCDIC platforms.

=head1 SEE ALSO

L<Encode>, L<Encode::Encoding>

=cut
Commit	Line	Data
af1f55d9	1	package Encode::Guess;
af1f55d9	2	use strict;
7e19fb92	3
af1f55d9	4	use Encode qw(:fallbacks find_encoding);
8f139f4c	5	our $VERSION = do { my @r = (q$Revision: 1.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
af1f55d9	6
af1f55d9	7	my $Canon = 'Guess';
8f139f4c	8	sub DEBUG () { 0 }
7e19fb92	9	our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
	10	$Encode::Encoding{$Canon} =
	11	bless {
	12	Name => $Canon,
	13	Suspects => { %DEF_SUSPECTS },
	14	} => __PACKAGE__;
	15
10c5ecbb	16	use base qw(Encode::Encoding);
7e19fb92	17	sub needs_lines { 1 }
7e19fb92	18	sub perlio_ok { 0 }
7e19fb92	19
7e19fb92	20	our @EXPORT = qw(guess_encoding);
23f3589e	21	our $NoUTFAutoGuess = 0;
7e19fb92	22
	23	sub import { # Exporter not used so we do it on our own
	24	my $callpkg = caller;
	25	for my $item (@EXPORT){
	26	no strict 'refs';
	27	*{"$callpkg\::$item"} = \&{"$item"};
	28	}
	29	set_suspects(@_);
	30	}
af1f55d9	31
7e19fb92	32	sub set_suspects{
	33	my $class = shift;
	34	my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
	35	$self->{Suspects} = { %DEF_SUSPECTS };
	36	$self->add_suspects(@_);
	37	}
af1f55d9	38
7e19fb92	39	sub add_suspects{
af1f55d9	40	my $class = shift;
7e19fb92	41	my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
af1f55d9	42	for my $c (@_){
af1f55d9	43	my $e = find_encoding($c) or die "Unknown encoding: $c";
7e19fb92	44	$self->{Suspects}{$e->name} = $e;
8f139f4c	45	DEBUG and warn "Added: ", $e->name;
af1f55d9	46	}
	47	}
	48
af1f55d9	49	sub decode($$;$){
af1f55d9	50	my ($obj, $octet, $chk) = @_;
7e19fb92	51	my $guessed = guess($obj, $octet);
10c5ecbb	52	unless (ref($guessed)){
	53	require Carp;
	54	Carp::croak($guessed);
	55	}
7e19fb92	56	my $utf8 = $guessed->decode($octet, $chk);
af1f55d9	57	$_[1] = $octet if $chk;
	58	return $utf8;
	59	}
	60
7e19fb92	61	sub guess_encoding{
7e19fb92	62	guess($Encode::Encoding{$Canon}, @_);
af1f55d9	63	}
	64
	65	sub guess {
7e19fb92	66	my $class = shift;
	67	my $obj = ref($class) ? $class : $Encode::Encoding{$Canon};
	68	my $octet = shift;
2fc614e0	69
	70	# sanity check
	71	return unless defined $octet and length $octet;
	72
7e19fb92	73	# cheat 0: utf8 flag;
23f3589e	74	if ( Encode::is_utf8($octet) ) {
	75	return find_encoding('utf8') unless $NoUTFAutoGuess;
	76	Encode::_utf8_off($octet);
	77	}
7e19fb92	78	# cheat 1: BOM
7e19fb92	79	use Encode::Unicode;
23f3589e	80	unless ($NoUTFAutoGuess) {
	81	my $BOM = unpack('n', $octet);
	82	return find_encoding('UTF-16')
	83	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
	84	$BOM = unpack('N', $octet);
	85	return find_encoding('UTF-32')
	86	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
	87	if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16\|32)(BE\|LE)
	88	my $utf;
	89	my ($be, $le) = (0, 0);
	90	if ($octet =~ /\x00\x00/o){ # UTF-32(BE\|LE) assumed
	91	$utf = "UTF-32";
	92	for my $char (unpack('N*', $octet)){
	93	$char & 0x0000ffff and $be++;
	94	$char & 0xffff0000 and $le++;
	95	}
	96	}else{ # UTF-16(BE\|LE) assumed
	97	$utf = "UTF-16";
	98	for my $char (unpack('n*', $octet)){
	99	$char & 0x00ff and $be++;
	100	$char & 0xff00 and $le++;
	101	}
	102	}
8f139f4c	103	DEBUG and warn "$utf, be == $be, le == $le";
23f3589e	104	$be == $le
	105	and return
	106	"Encodings ambiguous between $utf BE and LE ($be, $le)";
	107	$utf .= ($be > $le) ? 'BE' : 'LE';
	108	return find_encoding($utf);
	109	}
	110	}
7e19fb92	111	my %try = %{$obj->{Suspects}};
	112	for my $c (@_){
	113	my $e = find_encoding($c) or die "Unknown encoding: $c";
	114	$try{$e->name} = $e;
8f139f4c	115	DEBUG and warn "Added: ", $e->name;
7e19fb92	116	}
23f3589e	117	my $nline = 1;
	118	for my $line (split /\r\n?\|\n/, $octet){
	119	# cheat 2 -- \e in the string
	120	if ($line =~ /\e/o){
	121	my @keys = keys %try;
	122	delete @try{qw/utf8 ascii/};
	123	for my $k (@keys){
	124	ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
af1f55d9	125	}
af1f55d9	126	}
23f3589e	127	my %ok = %try;
	128	# warn join(",", keys %try);
	129	for my $k (keys %try){
	130	my $scratch = $line;
	131	$try{$k}->decode($scratch, FB_QUIET);
	132	if ($scratch eq ''){
8f139f4c	133	DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
23f3589e	134	}else{
23f3589e	135	use bytes ();
8f139f4c	136	DEBUG and
23f3589e	137	warn sprintf("%4d:%-24s not ok; %d bytes left\n",
	138	$nline, $k, bytes::length($scratch));
	139	delete $ok{$k};
8676e7d3	140	}
af1f55d9	141	}
23f3589e	142	%ok or return "No appropriate encodings found!";
	143	if (scalar(keys(%ok)) == 1){
	144	my ($retval) = values(%ok);
	145	return $retval;
	146	}
	147	%try = %ok; $nline++;
af1f55d9	148	}
7e19fb92	149	$try{ascii} or
7e19fb92	150	return "Encodings too ambiguous: ", join(" or ", keys %try);
af1f55d9	151	return $try{ascii};
	152	}
	153
	154
7e19fb92	155
af1f55d9	156	1;
	157	__END__
	158
	159	=head1 NAME
	160
7e19fb92	161	Encode::Guess -- Guesses encoding from data
	162
	163	=head1 SYNOPSIS
	164
	165	# if you are sure $data won't contain anything bogus
	166
e8c86ba6	167	use Encode;
7e19fb92	168	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
	169	my $utf8 = decode("Guess", $data);
	170	my $data = encode("Guess", $utf8); # this doesn't work!
	171
	172	# more elaborate way
9735c3fc	173	use Encode::Guess;
7e19fb92	174	my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
	175	ref($enc) or die "Can't guess: $enc"; # trap error this way
	176	$utf8 = $enc->decode($data);
	177	# or
	178	$utf8 = decode($enc->name, $data)
	179
	180	=head1 ABSTRACT
	181
	182	Encode::Guess enables you to guess in what encoding a given data is
	183	encoded, or at least tries to.
	184
	185	=head1 DESCRIPTION
	186
	187	By default, it checks only ascii, utf8 and UTF-16/32 with BOM.
	188
	189	use Encode::Guess; # ascii/utf8/BOMed UTF
	190
	191	To use it more practically, you have to give the names of encodings to
	192	check (I<suspects> as follows). The name of suspects can either be
	193	canonical names or aliases.
	194
	195	# tries all major Japanese Encodings as well
	196	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
	197
23f3589e	198	If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
	199	value, no heuristics will be applied to UTF8/16/32, and the result
	200	will be limited to the suspects and C<ascii>.
	201
7e19fb92	202	=over 4
	203
	204	=item Encode::Guess->set_suspects
	205
	206	You can also change the internal suspects list via C<set_suspects>
	207	method.
	208
	209	use Encode::Guess;
	210	Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);
	211
	212	=item Encode::Guess->add_suspects
	213
	214	Or you can use C<add_suspects> method. The difference is that
	215	C<set_suspects> flushes the current suspects list while
	216	C<add_suspects> adds.
	217
	218	use Encode::Guess;
	219	Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
	220	# now the suspects are euc-jp,shiftjis,7bit-jis, AND
	221	# euc-kr,euc-cn, and big5-eten
	222	Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);
	223
	224	=item Encode::decode("Guess" ...)
	225
	226	When you are content with suspects list, you can now
	227
	228	my $utf8 = Encode::decode("Guess", $data);
	229
	230	=item Encode::Guess->guess($data)
	231
9735c3fc	232	But it will croak if:
	233
	234	=over
	235
	236	=item *
	237
	238	Two or more suspects remain
	239
	240	=item *
	241
	242	No suspects left
	243
	244	=back
	245
	246	So you should instead try this;
7e19fb92	247
	248	my $decoder = Encode::Guess->guess($data);
	249
	250	On success, $decoder is an object that is documented in
	251	L<Encode::Encoding>. So you can now do this;
	252
	253	my $utf8 = $decoder->decode($data);
	254
	255	On failure, $decoder now contains an error message so the whole thing
	256	would be as follows;
	257
	258	my $decoder = Encode::Guess->guess($data);
	259	die $decoder unless ref($decoder);
	260	my $utf8 = $decoder->decode($data);
	261
	262	=item guess_encoding($data, [, I<list of suspects>])
	263
	264	You can also try C<guess_encoding> function which is exported by
	265	default. It takes $data to check and it also takes the list of
	266	suspects by option. The optional suspect list is I<not reflected> to
	267	the internal suspects list.
	268
	269	my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
	270	die $decoder unless ref($decoder);
	271	my $utf8 = $decoder->decode($data);
	272	# check only ascii and utf8
	273	my $decoder = guess_encoding($data);
	274
	275	=back
	276
	277	=head1 CAVEATS
	278
	279	=over 4
	280
	281	=item *
	282
	283	Because of the algorithm used, ISO-8859 series and other single-byte
	284	encodings do not work well unless either one of ISO-8859 is the only
	285	one suspect (besides ascii and utf8).
	286
	287	use Encode::Guess;
	288	# perhaps ok
	289	my $decoder = guess_encoding($data, 'latin1');
	290	# definitely NOT ok
	291	my $decoder = guess_encoding($data, qw/latin1 greek/);
	292
	293	The reason is that Encode::Guess guesses encoding by trial and error.
	294	It first splits $data into lines and tries to decode the line for each
9735c3fc	295	suspect. It keeps it going until all but one encoding is eliminated
7e19fb92	296	out of suspects list. ISO-8859 series is just too successful for most
	297	cases (because it fills almost all code points in \x00-\xff).
	298
	299	=item *
	300
	301	Do not mix national standard encodings and the corresponding vendor
	302	encodings.
	303
	304	# a very bad idea
	305	my $decoder
	306	= guess_encoding($data, qw/shiftjis MacJapanese cp932/);
	307
	308	The reason is that vendor encoding is usually a superset of national
	309	standard so it becomes too ambiguous for most cases.
	310
	311	=item *
	312
	313	On the other hand, mixing various national standard encodings
	314	automagically works unless $data is too short to allow for guessing.
	315
	316	# This is ok if $data is long enough
	317	my $decoder =
	318	guess_encoding($data, qw/euc-cn
	319	euc-jp shiftjis 7bit-jis
	320	euc-kr
	321	big5-eten/);
	322
	323	=item *
	324
	325	DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this!
	326
	327	my $decoder = guess_encoding($data,
	328	Encode->encodings(":all"));
	329
	330	=back
	331
	332	It is, after all, just a guess. You should alway be explicit when it
	333	comes to encodings. But there are some, especially Japanese,
	334	environment that guess-coding is a must. Use this module with care.
	335
982a4085	336	=head1 TO DO
	337
	338	Encode::Guess does not work on EBCDIC platforms.
	339
7e19fb92	340	=head1 SEE ALSO
	341
	342	L<Encode>, L<Encode::Encoding>
af1f55d9	343
	344	=cut
	345