[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Guess.pm

package Encode::Guess;
use strict;

use Encode qw(:fallbacks find_encoding);
our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

# Name under which this pseudo-encoding is registered in %Encode::Encoding.
my $Canon = 'Guess';
# Debug switch: every diagnostic in this file is written as
# "DEBUG and warn ...", so nothing is emitted while this returns 0.
sub DEBUG () { 0 }
# Suspects that are always tried: name => encoding object for ascii and utf8.
our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
# Create the shared 'Guess' object and register it with Encode.  Its
# Suspects hash starts out as a copy of the defaults above.
$Encode::Encoding{$Canon} = 
    bless { 
	   Name       => $Canon,
	   Suspects => { %DEF_SUSPECTS },
	  } => __PACKAGE__;

use base qw(Encode::Encoding);
# Hooks overriding Encode::Encoding defaults.
# NOTE(review): presumably these tell Encode/PerlIO that this encoding wants
# line-oriented input and must not be used as a PerlIO layer -- confirm
# against the Encode::Encoding documentation.
sub needs_lines { 1 }
sub perlio_ok { 0 }

# Names that import() aliases into the calling package.
our @EXPORT = qw(guess_encoding);
# When true, guess() skips its utf8-flag, BOM and NUL-byte heuristics and
# relies on the suspects list only.
our $NoUTFAutoGuess = 0;
# The three bytes EF BB BF; guess() compares the first three bytes of its
# input against this value.
our $UTF8_BOM = pack("C3", 0xef, 0xbb, 0xbf);

# Hand-rolled replacement for Exporter: alias every name listed in @EXPORT
# into the package that said "use Encode::Guess", then hand the complete
# argument list (class name first) to set_suspects() so that any encoding
# names given on the "use" line become the suspects.
sub import {
    my $target = caller;
    {
        # Symbolic references are needed to build the glob names.
        no strict 'refs';
        *{"$target\::$_"} = \&{"$_"} for @EXPORT;
    }
    set_suspects(@_);
}

# Replace the suspects list.
#
# May be called on an object or as a class method; in the latter case the
# shared object registered under $Canon is used.  The list is first reset
# to a copy of %DEF_SUSPECTS and the given encoding names are then added
# through add_suspects().
sub set_suspects {
    my ($invocant, @names) = @_;

    my $guesser = $Encode::Encoding{$Canon};
    $guesser = $invocant if ref $invocant;

    $guesser->{Suspects} = {%DEF_SUSPECTS};
    $guesser->add_suspects(@names);
}

# Add encodings to the suspects list without discarding the current ones.
#
# May be called on an object or as a class method; in the latter case the
# shared object registered under $Canon is used.  Each argument is resolved
# with find_encoding() and stored under the name reported by the encoding
# object.  Dies with "Unknown encoding: ..." when a name cannot be resolved.
sub add_suspects {
    my ($invocant, @names) = @_;

    my $guesser = $Encode::Encoding{$Canon};
    $guesser = $invocant if ref $invocant;

    foreach my $name (@names) {
        my $enc = find_encoding($name);
        die "Unknown encoding: $name" unless $enc;
        $guesser->{Suspects}{ $enc->name } = $enc;
        DEBUG and warn "Added: ", $enc->name;
    }
}

# Decode $octet using whatever encoding guess() settles on.
#
#   $obj   - the Guess object (or class name)
#   $octet - the data to decode
#   $chk   - optional check value, passed through to the guessed encoding's
#            own decode()
#
# Croaks with guess()'s error message when no single encoding could be
# determined.  When $chk is true, the caller's second argument is
# overwritten with whatever the guessed decoder left in $octet.
sub decode($$;$) {
    my $obj   = $_[0];
    my $octet = $_[1];
    my $chk   = $_[2];

    my $guessed = guess($obj, $octet);
    if (!ref $guessed) {
        # Not an encoding object, so it is an error message.
        require Carp;
        Carp::croak($guessed);
    }

    my $decoded = $guessed->decode($octet, $chk);
    if ($chk) {
        $_[1] = $octet;
    }
    return $decoded;
}

# Exported function form of guess(): runs the guess against the shared
# object registered under $Canon.  Arguments are the data followed by an
# optional list of additional suspects; the result is whatever guess()
# returns.
sub guess_encoding {
    my $guesser = $Encode::Encoding{$Canon};
    return guess($guesser, @_);
}

# Guess the encoding of $octet.
#
# Usage:  Encode::Guess->guess($octet, @extra_suspects)
#         $obj->guess($octet, @extra_suspects)
#
# When called as a class method the shared object registered under $Canon
# supplies the suspects.  @extra_suspects are encoding names tried in
# addition to the stored suspects for this call only; the stored list is
# not modified.
#
# Returns an encoding object on success.  On failure it returns a plain
# string (never a reference) describing the problem, so callers should test
# the result with ref():
#   "Empty string, empty guess"
#   "Encodings ambiguous between UTF-xx BE and LE (n, m)"
#   "No appropriate encodings found!"
#   "Encodings too ambiguous: a or b ..."
# Dies with "Unknown encoding: ..." if an extra suspect name cannot be
# resolved by find_encoding().
sub guess {
    my $class = shift;
    my $obj   = ref($class) ? $class : $Encode::Encoding{$Canon};
    my $octet = shift;

    # sanity check.  Return an error string rather than a bare undef so
    # that "die $decoder unless ref($decoder)" and decode()'s croak carry a
    # meaningful message.
    return "Empty string, empty guess"
        unless defined $octet and length $octet;

    # cheat 0: utf8 flag;
    if ( Encode::is_utf8($octet) ) {
        return find_encoding('utf8') unless $NoUTFAutoGuess;
        # Auto-guessing is disabled: drop the flag on our local copy and
        # examine the data as plain bytes.
        Encode::_utf8_off($octet);
    }

    # cheat 1: BOM
    use Encode::Unicode;
    unless ($NoUTFAutoGuess) {
        # Compare the first three bytes against the UTF-8 BOM.
        my $BOM = pack( 'C3', unpack( "C3", $octet ) );
        return find_encoding('utf8')
            if ( defined $BOM and $BOM eq $UTF8_BOM );

        # First four bytes read as a big-endian 32-bit value; the second
        # constant is the same BOM in the opposite byte order.
        $BOM = unpack( 'N', $octet );
        return find_encoding('UTF-32')
            if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) );

        # Same test with the first two bytes read as a big-endian 16-bit
        # value.
        $BOM = unpack( 'n', $octet );
        return find_encoding('UTF-16')
            if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) );

        if ( $octet =~ /\x00/ ) {
            # if \x00 found, we assume UTF-(16|32)(BE|LE)
            my $utf;
            my ( $be, $le ) = ( 0, 0 );
            if ( $octet =~ /\x00\x00/ ) {    # UTF-32(BE|LE) assumed
                $utf = "UTF-32";
                # Read every unit as big-endian.  A unit with bits set in
                # its low half counts for BE, one with bits set in its high
                # half counts for LE.
                for my $char ( unpack( 'N*', $octet ) ) {
                    $char & 0x0000ffff and $be++;
                    $char & 0xffff0000 and $le++;
                }
            }
            else {                           # UTF-16(BE|LE) assumed
                $utf = "UTF-16";
                for my $char ( unpack( 'n*', $octet ) ) {
                    $char & 0x00ff and $be++;
                    $char & 0xff00 and $le++;
                }
            }
            DEBUG and warn "$utf, be == $be, le == $le";
            $be == $le
                and return
                "Encodings ambiguous between $utf BE and LE ($be, $le)";
            $utf .= ( $be > $le ) ? 'BE' : 'LE';
            return find_encoding($utf);
        }
    }

    # Work on a copy of the stored suspects so that per-call additions and
    # the eliminations below never touch the object itself.
    my %try = %{ $obj->{Suspects} };
    for my $c (@_) {
        my $e = find_encoding($c) or die "Unknown encoding: $c";
        $try{ $e->name } = $e;
        DEBUG and warn "Added: ", $e->name;
    }

    my $nline = 1;
    for my $line ( split /\r\n?|\n/, $octet ) {

        # cheat 2 -- \e in the string
        # A line containing an escape character rules out utf8, ascii and
        # every suspect whose object is of class Encode::XS.
        if ( $line =~ /\e/ ) {
            my @keys = keys %try;
            delete @try{qw/utf8 ascii/};
            for my $k (@keys) {
                ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k};
            }
        }

        my %ok = %try;

        # warn join(",", keys %try);
        for my $k ( keys %try ) {
            # decode() is given a scratch copy; a suspect survives this
            # line only if nothing is left in the copy afterwards.
            my $scratch = $line;
            $try{$k}->decode( $scratch, FB_QUIET );
            if ( $scratch eq '' ) {
                DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k );
            }
            else {
                use bytes ();
                DEBUG
                    and warn sprintf( "%4d:%-24s not ok; %d bytes left\n",
                    $nline, $k, bytes::length($scratch) );
                delete $ok{$k};
            }
        }
        %ok or return "No appropriate encodings found!";
        if ( scalar( keys(%ok) ) == 1 ) {
            my ($retval) = values(%ok);
            return $retval;
        }
        %try = %ok;
        $nline++;
    }

    # More than one suspect decoded every line.  Prefer ascii if it is
    # among them; otherwise report the ambiguity.  The message is built
    # with "." because a comma-separated list would evaluate to only its
    # last element in scalar context and drop the prefix; keys are sorted
    # so the message is deterministic.
    $try{ascii}
        or return "Encodings too ambiguous: "
        . join( " or ", sort keys %try );
    return $try{ascii};
}


1;
__END__

=head1 NAME

Encode::Guess -- Guesses encoding from data

=head1 SYNOPSIS

  # if you are sure $data won't contain anything bogus

  use Encode;
  use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
  my $utf8 = decode("Guess", $data);
  my $data = encode("Guess", $utf8);   # this doesn't work!

  # more elaborate way
  use Encode::Guess;
  my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
  ref($enc) or die "Can't guess: $enc"; # trap error this way
  $utf8 = $enc->decode($data);
  # or
  $utf8 = decode($enc->name, $data);

=head1 ABSTRACT

Encode::Guess enables you to guess in what encoding a given data is
encoded, or at least tries to.  

=head1 DESCRIPTION

By default, it checks only ascii, utf8 and UTF-16/32 with BOM.

  use Encode::Guess; # ascii/utf8/BOMed UTF

To use it more practically, you have to give the names of the encodings
to check (I<suspects> as follows).  Suspect names can be either
canonical names or aliases.

CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.

 # tries all major Japanese Encodings as well
  use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;

If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
value, no heuristics will be applied to UTF8/16/32, and the result
will be limited to the suspects and C<ascii>.

=over 4

=item Encode::Guess->set_suspects

You can also change the internal suspects list via C<set_suspects>
method. 

  use Encode::Guess;
  Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);

=item Encode::Guess->add_suspects

Or you can use C<add_suspects> method.  The difference is that
C<set_suspects> flushes the current suspects list while
C<add_suspects> adds.

  use Encode::Guess;
  Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
  # now the suspects are euc-jp,shiftjis,7bit-jis, AND
  # euc-kr,euc-cn, and big5-eten
  Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);

=item Encode::decode("Guess" ...)

When you are content with suspects list, you can now

  my $utf8 = Encode::decode("Guess", $data);

=item Encode::Guess->guess($data)

But it will croak if:

=over

=item *

Two or more suspects remain

=item *

No suspects left

=back

So you should instead try this;

  my $decoder = Encode::Guess->guess($data);

On success, $decoder is an object that is documented in
L<Encode::Encoding>.  So you can now do this;

  my $utf8 = $decoder->decode($data);

On failure, $decoder now contains an error message so the whole thing
would be as follows;

  my $decoder = Encode::Guess->guess($data);
  die $decoder unless ref($decoder);
  my $utf8 = $decoder->decode($data);

=item guess_encoding($data [, I<list of suspects>])

You can also try the C<guess_encoding> function, which is exported by
default.  It takes the $data to check and, optionally, a list of
suspects.  The optional suspect list is I<not added> to the internal
suspects list.

  my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
  die $decoder unless ref($decoder);
  my $utf8 = $decoder->decode($data);
  # check only ascii and utf8
  my $decoder = guess_encoding($data);

=back

=head1 CAVEATS

=over 4

=item *

Because of the algorithm used, the ISO-8859 series and other single-byte
encodings do not work well unless one of them is the only suspect
(besides ascii and utf8).

  use Encode::Guess;
  # perhaps ok
  my $decoder = guess_encoding($data, 'latin1');
  # definitely NOT ok
  my $decoder = guess_encoding($data, qw/latin1 greek/);

The reason is that Encode::Guess guesses encoding by trial and error.
It first splits $data into lines and tries to decode the line for each
suspect.  It keeps it going until all but one encoding is eliminated
out of suspects list.  ISO-8859 series is just too successful for most
cases (because it fills almost all code points in \x00-\xff).

=item *

Do not mix national standard encodings and the corresponding vendor
encodings.

  # a very bad idea
  my $decoder
     = guess_encoding($data, qw/shiftjis MacJapanese cp932/);

The reason is that vendor encoding is usually a superset of national
standard so it becomes too ambiguous for most cases.

=item *

On the other hand, mixing various national standard encodings
automagically works unless $data is too short to allow for guessing.

 # This is ok if $data is long enough
 my $decoder =  
  guess_encoding($data, qw/euc-cn
                           euc-jp shiftjis 7bit-jis
                           euc-kr
                           big5-eten/);

=item *

DO NOT PUT TOO MANY SUSPECTS!  Don't you try something like this!

  my $decoder = guess_encoding($data, 
                               Encode->encodings(":all"));

=back

It is, after all, just a guess.  You should always be explicit when it
comes to encodings.  But there are some environments, especially
Japanese ones, where guess-coding is a must.  Use this module with care.

=head1 TO DO

Encode::Guess does not work on EBCDIC platforms.

=head1 SEE ALSO

L<Encode>, L<Encode::Encoding>

=cut
Commit	Line	Data
af1f55d9	1	package Encode::Guess;
af1f55d9	2	use strict;
7e19fb92	3
af1f55d9	4	use Encode qw(:fallbacks find_encoding);
7237418a	5	our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
af1f55d9	6
af1f55d9	7	my $Canon = 'Guess';
8f139f4c	8	sub DEBUG () { 0 }
7e19fb92	9	our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
	10	$Encode::Encoding{$Canon} =
	11	bless {
	12	Name => $Canon,
	13	Suspects => { %DEF_SUSPECTS },
	14	} => __PACKAGE__;
	15
10c5ecbb	16	use base qw(Encode::Encoding);
7e19fb92	17	sub needs_lines { 1 }
7e19fb92	18	sub perlio_ok { 0 }
7e19fb92	19
7e19fb92	20	our @EXPORT = qw(guess_encoding);
23f3589e	21	our $NoUTFAutoGuess = 0;
7237418a	22	our $UTF8_BOM = pack("C3", 0xef, 0xbb, 0xbf);
7e19fb92	23
	24	sub import { # Exporter not used so we do it on our own
	25	my $callpkg = caller;
	26	for my $item (@EXPORT){
	27	no strict 'refs';
	28	*{"$callpkg\::$item"} = \&{"$item"};
	29	}
	30	set_suspects(@_);
	31	}
af1f55d9	32
7e19fb92	33	sub set_suspects{
	34	my $class = shift;
	35	my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
	36	$self->{Suspects} = { %DEF_SUSPECTS };
	37	$self->add_suspects(@_);
	38	}
af1f55d9	39
7e19fb92	40	sub add_suspects{
af1f55d9	41	my $class = shift;
7e19fb92	42	my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
af1f55d9	43	for my $c (@_){
af1f55d9	44	my $e = find_encoding($c) or die "Unknown encoding: $c";
7e19fb92	45	$self->{Suspects}{$e->name} = $e;
8f139f4c	46	DEBUG and warn "Added: ", $e->name;
af1f55d9	47	}
	48	}
	49
af1f55d9	50	sub decode($$;$){
af1f55d9	51	my ($obj, $octet, $chk) = @_;
7e19fb92	52	my $guessed = guess($obj, $octet);
10c5ecbb	53	unless (ref($guessed)){
	54	require Carp;
	55	Carp::croak($guessed);
	56	}
7e19fb92	57	my $utf8 = $guessed->decode($octet, $chk);
af1f55d9	58	$_[1] = $octet if $chk;
	59	return $utf8;
	60	}
	61
7e19fb92	62	sub guess_encoding{
7e19fb92	63	guess($Encode::Encoding{$Canon}, @_);
af1f55d9	64	}
	65
	66	sub guess {
7e19fb92	67	my $class = shift;
	68	my $obj = ref($class) ? $class : $Encode::Encoding{$Canon};
	69	my $octet = shift;
2fc614e0	70
	71	# sanity check
	72	return unless defined $octet and length $octet;
	73
7e19fb92	74	# cheat 0: utf8 flag;
23f3589e	75	if ( Encode::is_utf8($octet) ) {
	76	return find_encoding('utf8') unless $NoUTFAutoGuess;
	77	Encode::_utf8_off($octet);
	78	}
7e19fb92	79	# cheat 1: BOM
7e19fb92	80	use Encode::Unicode;
23f3589e	81	unless ($NoUTFAutoGuess) {
7237418a	82	my $BOM = pack('C3', unpack("C3", $octet));
	83	return find_encoding('utf8')
	84	if (defined $BOM and $BOM eq $UTF8_BOM);
23f3589e	85	$BOM = unpack('N', $octet);
	86	return find_encoding('UTF-32')
	87	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
7237418a	88	$BOM = unpack('n', $octet);
	89	return find_encoding('UTF-16')
	90	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
23f3589e	91	if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16\|32)(BE\|LE)
	92	my $utf;
	93	my ($be, $le) = (0, 0);
	94	if ($octet =~ /\x00\x00/o){ # UTF-32(BE\|LE) assumed
	95	$utf = "UTF-32";
	96	for my $char (unpack('N*', $octet)){
	97	$char & 0x0000ffff and $be++;
	98	$char & 0xffff0000 and $le++;
	99	}
	100	}else{ # UTF-16(BE\|LE) assumed
	101	$utf = "UTF-16";
	102	for my $char (unpack('n*', $octet)){
	103	$char & 0x00ff and $be++;
	104	$char & 0xff00 and $le++;
	105	}
	106	}
8f139f4c	107	DEBUG and warn "$utf, be == $be, le == $le";
23f3589e	108	$be == $le
	109	and return
	110	"Encodings ambiguous between $utf BE and LE ($be, $le)";
	111	$utf .= ($be > $le) ? 'BE' : 'LE';
	112	return find_encoding($utf);
	113	}
	114	}
7e19fb92	115	my %try = %{$obj->{Suspects}};
	116	for my $c (@_){
	117	my $e = find_encoding($c) or die "Unknown encoding: $c";
	118	$try{$e->name} = $e;
8f139f4c	119	DEBUG and warn "Added: ", $e->name;
7e19fb92	120	}
23f3589e	121	my $nline = 1;
	122	for my $line (split /\r\n?\|\n/, $octet){
	123	# cheat 2 -- \e in the string
	124	if ($line =~ /\e/o){
	125	my @keys = keys %try;
	126	delete @try{qw/utf8 ascii/};
	127	for my $k (@keys){
	128	ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
af1f55d9	129	}
af1f55d9	130	}
23f3589e	131	my %ok = %try;
	132	# warn join(",", keys %try);
	133	for my $k (keys %try){
	134	my $scratch = $line;
	135	$try{$k}->decode($scratch, FB_QUIET);
	136	if ($scratch eq ''){
8f139f4c	137	DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
23f3589e	138	}else{
23f3589e	139	use bytes ();
8f139f4c	140	DEBUG and
23f3589e	141	warn sprintf("%4d:%-24s not ok; %d bytes left\n",
	142	$nline, $k, bytes::length($scratch));
	143	delete $ok{$k};
8676e7d3	144	}
af1f55d9	145	}
23f3589e	146	%ok or return "No appropriate encodings found!";
	147	if (scalar(keys(%ok)) == 1){
	148	my ($retval) = values(%ok);
	149	return $retval;
	150	}
	151	%try = %ok; $nline++;
af1f55d9	152	}
7e19fb92	153	$try{ascii} or
7e19fb92	154	return "Encodings too ambiguous: ", join(" or ", keys %try);
af1f55d9	155	return $try{ascii};
	156	}
	157
	158
7e19fb92	159
af1f55d9	160	1;
	161	__END__
	162
	163	=head1 NAME
	164
7e19fb92	165	Encode::Guess -- Guesses encoding from data
	166
	167	=head1 SYNOPSIS
	168
	169	# if you are sure $data won't contain anything bogus
	170
e8c86ba6	171	use Encode;
7e19fb92	172	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
	173	my $utf8 = decode("Guess", $data);
	174	my $data = encode("Guess", $utf8); # this doesn't work!
	175
	176	# more elaborate way
9735c3fc	177	use Encode::Guess;
7e19fb92	178	my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
	179	ref($enc) or die "Can't guess: $enc"; # trap error this way
	180	$utf8 = $enc->decode($data);
	181	# or
	182	$utf8 = decode($enc->name, $data)
	183
	184	=head1 ABSTRACT
	185
	186	Encode::Guess enables you to guess in what encoding a given data is
	187	encoded, or at least tries to.
	188
	189	=head1 DESCRIPTION
	190
	191	By default, it checks only ascii, utf8 and UTF-16/32 with BOM.
	192
	193	use Encode::Guess; # ascii/utf8/BOMed UTF
	194
	195	To use it more practically, you have to give the names of encodings to
	196	check (I<suspects> as follows). The name of suspects can either be
	197	canonical names or aliases.
	198
7237418a	199	CAVEAT: Unlike UTF-(16\|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.
7237418a	200
7e19fb92	201	# tries all major Japanese Encodings as well
	202	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
	203
23f3589e	204	If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
	205	value, no heuristics will be applied to UTF8/16/32, and the result
	206	will be limited to the suspects and C<ascii>.
	207
7e19fb92	208	=over 4
	209
	210	=item Encode::Guess->set_suspects
	211
	212	You can also change the internal suspects list via C<set_suspects>
	213	method.
	214
	215	use Encode::Guess;
	216	Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);
	217
	218	=item Encode::Guess->add_suspects
	219
	220	Or you can use C<add_suspects> method. The difference is that
	221	C<set_suspects> flushes the current suspects list while
	222	C<add_suspects> adds.
	223
	224	use Encode::Guess;
	225	Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
	226	# now the suspects are euc-jp,shiftjis,7bit-jis, AND
	227	# euc-kr,euc-cn, and big5-eten
	228	Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);
	229
	230	=item Encode::decode("Guess" ...)
	231
	232	When you are content with suspects list, you can now
	233
	234	my $utf8 = Encode::decode("Guess", $data);
	235
	236	=item Encode::Guess->guess($data)
	237
9735c3fc	238	But it will croak if:
	239
	240	=over
	241
	242	=item *
	243
	244	Two or more suspects remain
	245
	246	=item *
	247
	248	No suspects left
	249
	250	=back
	251
	252	So you should instead try this;
7e19fb92	253
	254	my $decoder = Encode::Guess->guess($data);
	255
	256	On success, $decoder is an object that is documented in
	257	L<Encode::Encoding>. So you can now do this;
	258
	259	my $utf8 = $decoder->decode($data);
	260
	261	On failure, $decoder now contains an error message so the whole thing
	262	would be as follows;
	263
	264	my $decoder = Encode::Guess->guess($data);
	265	die $decoder unless ref($decoder);
	266	my $utf8 = $decoder->decode($data);
	267
	268	=item guess_encoding($data, [, I<list of suspects>])
	269
	270	You can also try C<guess_encoding> function which is exported by
	271	default. It takes $data to check and it also takes the list of
	272	suspects by option. The optional suspect list is I<not reflected> to
	273	the internal suspects list.
	274
	275	my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
	276	die $decoder unless ref($decoder);
	277	my $utf8 = $decoder->decode($data);
	278	# check only ascii and utf8
	279	my $decoder = guess_encoding($data);
	280
	281	=back
	282
	283	=head1 CAVEATS
	284
	285	=over 4
	286
	287	=item *
	288
	289	Because of the algorithm used, ISO-8859 series and other single-byte
	290	encodings do not work well unless either one of ISO-8859 is the only
	291	one suspect (besides ascii and utf8).
	292
	293	use Encode::Guess;
	294	# perhaps ok
	295	my $decoder = guess_encoding($data, 'latin1');
	296	# definitely NOT ok
	297	my $decoder = guess_encoding($data, qw/latin1 greek/);
	298
	299	The reason is that Encode::Guess guesses encoding by trial and error.
	300	It first splits $data into lines and tries to decode the line for each
9735c3fc	301	suspect. It keeps it going until all but one encoding is eliminated
7e19fb92	302	out of suspects list. ISO-8859 series is just too successful for most
	303	cases (because it fills almost all code points in \x00-\xff).
	304
	305	=item *
	306
	307	Do not mix national standard encodings and the corresponding vendor
	308	encodings.
	309
	310	# a very bad idea
	311	my $decoder
	312	= guess_encoding($data, qw/shiftjis MacJapanese cp932/);
	313
	314	The reason is that vendor encoding is usually a superset of national
	315	standard so it becomes too ambiguous for most cases.
	316
	317	=item *
	318
	319	On the other hand, mixing various national standard encodings
	320	automagically works unless $data is too short to allow for guessing.
	321
	322	# This is ok if $data is long enough
	323	my $decoder =
	324	guess_encoding($data, qw/euc-cn
	325	euc-jp shiftjis 7bit-jis
	326	euc-kr
	327	big5-eten/);
	328
	329	=item *
	330
	331	DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this!
	332
	333	my $decoder = guess_encoding($data,
	334	Encode->encodings(":all"));
	335
	336	=back
	337
	338	It is, after all, just a guess. You should alway be explicit when it
	339	comes to encodings. But there are some, especially Japanese,
	340	environment that guess-coding is a must. Use this module with care.
	341
982a4085	342	=head1 TO DO
	343
	344	Encode::Guess does not work on EBCDIC platforms.
	345
7e19fb92	346	=head1 SEE ALSO
	347
	348	L<Encode>, L<Encode::Encoding>
af1f55d9	349
	350	=cut
	351