[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Guess.pm

package Encode::Guess;
use strict;

use Encode qw(:fallbacks find_encoding);
# The version is derived from the "Revision" keyword string below: every
# run of digits is extracted, the first is printed as-is and each later
# one is zero-padded to two digits (so "1.8" yields "1.08").
our $VERSION = do { my @r = (q$Revision: 1.8 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

# Canonical name under which this encoding is registered with Encode.
my $Canon = 'Guess';

# Set to a true value to get warn() traces while suspects are added and
# while lines are being test-decoded.
our $DEBUG = 0;

# Suspects that are always present: encoding objects keyed by the name
# they were looked up with.
our %DEF_SUSPECTS = map { ( $_, find_encoding($_) ) } qw(ascii utf8);

# Register the shared Guess object in Encode's table.  It starts out with
# its own copy of the default suspects.
$Encode::Encoding{$Canon} = bless(
    {
        Name     => $Canon,
        Suspects => { %DEF_SUSPECTS },
    },
    __PACKAGE__
);

use base qw(Encode::Encoding);
# Encode::Encoding hook; always true.  Presumably this tells Encode that
# the encoding wants line-oriented input (see Encode::Encoding to confirm).
sub needs_lines { return 1 }
# Encode::Encoding hook; always false.  Presumably this tells Encode that
# the encoding cannot be used as a PerlIO layer (see Encode::Encoding).
sub perlio_ok { return 0 }

# Functions installed into every package that says "use Encode::Guess".
our @EXPORT = qw(guess_encoding);

# import($class, @suspects)
#
# Hand-rolled replacement for Exporter: aliases each function listed in
# @EXPORT into the calling package, then resets the shared suspects list
# to the defaults plus whatever encoding names were given on the "use"
# line.  @_ still starts with the class name, which set_suspects() shifts
# off as its invocant.
sub import {
    my $target = caller;
    {
        no strict 'refs';    # symbolic glob and code references below
        *{"$target\::$_"} = \&{"$_"} for @EXPORT;
    }
    set_suspects(@_);
}

# set_suspects($class_or_obj, @names)
#
# Throw away the current suspects, start again from %DEF_SUSPECTS and add
# the encodings named in @names (canonical names or aliases).
# A reference invocant is used as the object to modify; anything else
# (such as the class name) selects the shared object registered in
# %Encode::Encoding.  Unknown names make add_suspects() die.
sub set_suspects {
    my $invocant = shift;
    my $guesser  = $Encode::Encoding{$Canon};
    $guesser = $invocant if ref $invocant;
    $guesser->{Suspects} = { %DEF_SUSPECTS };
    $guesser->add_suspects(@_);
}

# add_suspects($class_or_obj, @names)
#
# Add the encodings named in @names (canonical names or aliases) to the
# suspects list without removing the ones already there.  Each entry is
# stored under the encoding's canonical name.
# A reference invocant is the object to modify; anything else selects the
# shared object registered in %Encode::Encoding.
# Dies with "Unknown encoding: NAME" for a name Encode cannot resolve.
sub add_suspects {
    my $invocant = shift;
    my $guesser  = ref($invocant) ? $invocant : $Encode::Encoding{$Canon};
    foreach my $name (@_) {
        my $enc = find_encoding($name);
        $enc or die "Unknown encoding: $name";
        $guesser->{Suspects}{ $enc->name } = $enc;
        warn "Added: ", $enc->name if $DEBUG;
    }
}

# decode($obj, $octets [, $check])
#
# Guess the encoding of $octets with guess() and decode with the result.
# Croaks (from the caller's point of view) with guess()'s error message
# when no single encoding could be determined.
# When $check is true, the local copy of the octets is written back to
# the caller's variable after decoding -- presumably so the caller sees
# whatever the underlying decoder left unprocessed.
sub decode($$;$) {
    my ( $obj, $octet, $chk ) = @_;
    my $guessed = guess( $obj, $octet );
    if ( !ref $guessed ) {
        require Carp;
        Carp::croak($guessed);
    }
    my $decoded = $guessed->decode( $octet, $chk );
    $_[1] = $octet if $chk;
    return $decoded;
}

# guess_encoding($octets [, @extra_suspects])
#
# Exported, function-style front end to guess(): always works on the
# shared object registered in %Encode::Encoding and passes its arguments
# through unchanged.  Returns whatever guess() returns.
sub guess_encoding {
    return guess( $Encode::Encoding{$Canon}, @_ );
}

# guess($class_or_obj, $octets [, @extra_suspects])
#
# Work out which encoding $octets is in.
#
#   $class_or_obj   - a reference is used as the guessing object; anything
#                     else (e.g. the class name) selects the shared object
#                     registered in %Encode::Encoding.
#   $octets         - the data to examine.
#   @extra_suspects - encoding names or aliases tried in addition to the
#                     object's own suspects; they are not stored.
#
# Returns an encoding object on success, or a plain string describing the
# failure, so callers should test the result with ref().
# Dies with "Unknown encoding: NAME" if an extra suspect cannot be resolved.
sub guess {
    my $class = shift;
    my $obj   = ref($class) ? $class : $Encode::Encoding{$Canon};
    my $octet = shift;

    # sanity check: nothing to guess from.  Return a message, not undef,
    # so that "die $decoder unless ref($decoder)" and decode()'s croak
    # have something to report.
    return "Empty string, empty guess"
        unless defined $octet and length $octet;

    # cheat 0: utf8 flag;
    Encode::is_utf8($octet) and return find_encoding('utf8');

    # cheat 1: BOM
    # UTF-32 has to be tested first: the UTF-32LE BOM (FF FE 00 00) starts
    # with the same two bytes as the UTF-16LE BOM (FF FE).
    use Encode::Unicode;
    my $BOM = unpack('N', $octet);    # undef when fewer than 4 bytes
    return find_encoding('UTF-32')
        if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
    $BOM = unpack('n', $octet);       # undef when fewer than 2 bytes
    return find_encoding('UTF-16')
        if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));

    # Candidates for this call only: the object's suspects plus the extras.
    my %try = %{$obj->{Suspects}};
    for my $c (@_){
        my $e = find_encoding($c) or die "Unknown encoding: $c";
        $try{$e->name} = $e;
        $DEBUG and warn "Added: ", $e->name;
    }

    if ($octet =~ /\x00/){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
        my $utf;
        my ($be, $le) = (0, 0);
        # Read the data as big-endian units and count which half of each
        # unit carries non-zero bits; the side with more hits decides the
        # byte order.
        if ($octet =~ /\x00\x00/){ # UTF-32(BE|LE) assumed
            $utf = "UTF-32";
            for my $char (unpack('N*', $octet)){
                $char & 0x0000ffff and $be++;
                $char & 0xffff0000 and $le++;
            }
        }else{ # UTF-16(BE|LE) assumed
            $utf = "UTF-16";
            for my $char (unpack('n*', $octet)){
                $char & 0x00ff and $be++;
                $char & 0xff00 and $le++;
            }
        }
        $DEBUG and warn "$utf, be == $be, le == $le";
        $be == $le
            and return "Encodings ambiguous between $utf BE and LE ($be, $le)";
        $utf .= ($be > $le) ? 'BE' : 'LE';
        return find_encoding($utf);
    }else{
        # Trial and error: decode line by line with every remaining
        # candidate and drop those that cannot consume the whole line.
        my $nline = 1;
        for my $line (split /\r\n?|\n/, $octet){
            # cheat 2 -- \e in the string
            # An escape character rules out utf8, ascii and every suspect
            # implemented by Encode::XS (presumably leaving only
            # escape-sequence based encodings).
            if ($line =~ /\e/){
                my @keys = keys %try;
                delete @try{qw/utf8 ascii/};
                for my $k (@keys){
                    ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
                }
            }
            my %ok = %try;
            # warn join(",", keys %try);
            for my $k (keys %try){
                # decode() with FB_QUIET is expected to leave any
                # undecodable remainder in $scratch; empty means the
                # whole line decoded.
                my $scratch = $line;
                $try{$k}->decode($scratch, FB_QUIET);
                if ($scratch eq ''){
                    $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
                }else{
                    use bytes ();
                    $DEBUG and
                        warn sprintf("%4d:%-24s not ok; %d bytes left\n",
                                     $nline, $k, bytes::length($scratch));
                    delete $ok{$k};
                }
            }
            %ok or return "No appropriate encodings found!";
            if (scalar(keys(%ok)) == 1){
                my ($retval) = values(%ok);
                return $retval;
            }
            %try = %ok; $nline++;
        }
    }

    # Several candidates survived every line.  Prefer ascii when it is one
    # of them; otherwise report the ambiguity.  The message is built with
    # "." on purpose: a comma list would evaluate to just the joined names
    # in scalar context and lose the prefix.  Names are sorted so the
    # message is deterministic.
    $try{ascii} or
        return "Encodings too ambiguous: " . join(" or ", sort keys %try);
    return $try{ascii};
}


1;
__END__

=head1 NAME

Encode::Guess -- Guesses encoding from data

=head1 SYNOPSIS

  # if you are sure $data won't contain anything bogus

  use Encode;
  use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
  my $utf8 = decode("Guess", $data);
  my $data = encode("Guess", $utf8);   # this doesn't work!

  # more elaborate way
  use Encode::Guess;
  my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
  ref($enc) or die "Can't guess: $enc"; # trap error this way
  $utf8 = $enc->decode($data);
  # or
  $utf8 = decode($enc->name, $data);

=head1 ABSTRACT

Encode::Guess enables you to guess the encoding in which given data is
encoded, or at least tries to.

=head1 DESCRIPTION

By default, it checks only ascii, utf8 and UTF-16/32 with BOM.

  use Encode::Guess; # ascii/utf8/BOMed UTF

To use it more practically, you have to give the names of the encodings
to check (called I<suspects> below).  The names of suspects can be
either canonical names or aliases.

 # tries all major Japanese Encodings as well
  use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;

=over 4

=item Encode::Guess->set_suspects

You can also change the internal suspects list via C<set_suspects>
method. 

  use Encode::Guess;
  Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);

=item Encode::Guess->add_suspects

Or you can use C<add_suspects> method.  The difference is that
C<set_suspects> flushes the current suspects list while
C<add_suspects> adds.

  use Encode::Guess;
  Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
  # after the next call the suspects are euc-jp, shiftjis, 7bit-jis, AND
  # euc-kr, euc-cn, and big5-eten
  Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);

=item Encode::decode("Guess" ...)

When you are content with suspects list, you can now

  my $utf8 = Encode::decode("Guess", $data);

=item Encode::Guess->guess($data)

But C<Encode::decode("Guess", ...)> will croak if:

=over

=item *

Two or more suspects remain

=item *

No suspects left

=back

So you should instead try this;

  my $decoder = Encode::Guess->guess($data);

On success, $decoder is an object that is documented in
L<Encode::Encoding>.  So you can now do this;

  my $utf8 = $decoder->decode($data);

On failure, $decoder now contains an error message so the whole thing
would be as follows;

  my $decoder = Encode::Guess->guess($data);
  die $decoder unless ref($decoder);
  my $utf8 = $decoder->decode($data);

=item guess_encoding($data [, I<list of suspects>])

You can also try the C<guess_encoding> function, which is exported by
default.  It takes the $data to check and, optionally, a list of
suspects.  The optional suspect list is I<not reflected> in the
internal suspects list.

  my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
  die $decoder unless ref($decoder);
  my $utf8 = $decoder->decode($data);
  # check only ascii and utf8
  my $decoder = guess_encoding($data);

=back

=head1 CAVEATS

=over 4

=item *

Because of the algorithm used, the ISO-8859 series and other single-byte
encodings do not work well unless one of the ISO-8859 encodings is the
only suspect (besides ascii and utf8).

  use Encode::Guess;
  # perhaps ok
  my $decoder = guess_encoding($data, 'latin1');
  # definitely NOT ok
  my $decoder = guess_encoding($data, qw/latin1 greek/);

The reason is that Encode::Guess guesses the encoding by trial and error.
It first splits $data into lines and tries to decode each line with
every suspect.  It keeps going until all but one encoding have been
eliminated from the suspects list.  The ISO-8859 series is just too
successful in most cases (because it fills almost all code points in
\x00-\xff).

=item *

Do not mix national standard encodings and the corresponding vendor
encodings.

  # a very bad idea
  my $decoder
     = guess_encoding($data, qw/shiftjis MacJapanese cp932/);

The reason is that a vendor encoding is usually a superset of the
national standard, so the guess becomes too ambiguous in most cases.

=item *

On the other hand, mixing various national standard encodings
automagically works unless $data is too short to allow for guessing.

 # This is ok if $data is long enough
 my $decoder =  
  guess_encoding($data, qw/euc-cn
                           euc-jp shiftjis 7bit-jis
                           euc-kr
                           big5-eten/);

=item *

DO NOT PUT TOO MANY SUSPECTS!  Don't you try something like this!

  my $decoder = guess_encoding($data, 
                               Encode->encodings(":all"));

=back

It is, after all, just a guess.  You should always be explicit when it
comes to encodings.  But there are some environments, especially
Japanese ones, where guess-coding is a must.  Use this module with care.

=head1 TO DO

Encode::Guess does not work on EBCDIC platforms.

=head1 SEE ALSO

L<Encode>, L<Encode::Encoding>

=cut
Commit	Line	Data
af1f55d9	1	package Encode::Guess;
af1f55d9	2	use strict;
7e19fb92	3
af1f55d9	4	use Encode qw(:fallbacks find_encoding);
8676e7d3	5	our $VERSION = do { my @r = (q$Revision: 1.8 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
af1f55d9	6
af1f55d9	7	my $Canon = 'Guess';
af1f55d9	8	our $DEBUG = 0;
7e19fb92	9	our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
	10	$Encode::Encoding{$Canon} =
	11	bless {
	12	Name => $Canon,
	13	Suspects => { %DEF_SUSPECTS },
	14	} => __PACKAGE__;
	15
10c5ecbb	16	use base qw(Encode::Encoding);
7e19fb92	17	sub needs_lines { 1 }
7e19fb92	18	sub perlio_ok { 0 }
7e19fb92	19
	20	our @EXPORT = qw(guess_encoding);
	21
	22	sub import { # Exporter not used so we do it on our own
	23	my $callpkg = caller;
	24	for my $item (@EXPORT){
	25	no strict 'refs';
	26	*{"$callpkg\::$item"} = \&{"$item"};
	27	}
	28	set_suspects(@_);
	29	}
af1f55d9	30
7e19fb92	31	sub set_suspects{
	32	my $class = shift;
	33	my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
	34	$self->{Suspects} = { %DEF_SUSPECTS };
	35	$self->add_suspects(@_);
	36	}
af1f55d9	37
7e19fb92	38	sub add_suspects{
af1f55d9	39	my $class = shift;
7e19fb92	40	my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
af1f55d9	41	for my $c (@_){
af1f55d9	42	my $e = find_encoding($c) or die "Unknown encoding: $c";
7e19fb92	43	$self->{Suspects}{$e->name} = $e;
af1f55d9	44	$DEBUG and warn "Added: ", $e->name;
	45	}
	46	}
	47
af1f55d9	48	sub decode($$;$){
af1f55d9	49	my ($obj, $octet, $chk) = @_;
7e19fb92	50	my $guessed = guess($obj, $octet);
10c5ecbb	51	unless (ref($guessed)){
	52	require Carp;
	53	Carp::croak($guessed);
	54	}
7e19fb92	55	my $utf8 = $guessed->decode($octet, $chk);
af1f55d9	56	$_[1] = $octet if $chk;
	57	return $utf8;
	58	}
	59
7e19fb92	60	sub guess_encoding{
7e19fb92	61	guess($Encode::Encoding{$Canon}, @_);
af1f55d9	62	}
	63
	64	sub guess {
7e19fb92	65	my $class = shift;
	66	my $obj = ref($class) ? $class : $Encode::Encoding{$Canon};
	67	my $octet = shift;
2fc614e0	68
	69	# sanity check
	70	return unless defined $octet and length $octet;
	71
7e19fb92	72	# cheat 0: utf8 flag;
af1f55d9	73	Encode::is_utf8($octet) and return find_encoding('utf8');
7e19fb92	74	# cheat 1: BOM
	75	use Encode::Unicode;
	76	my $BOM = unpack('n', $octet);
	77	return find_encoding('UTF-16')
2fc614e0	78	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
7e19fb92	79	$BOM = unpack('N', $octet);
7e19fb92	80	return find_encoding('UTF-32')
2fc614e0	81	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
7e19fb92	82	my %try = %{$obj->{Suspects}};
	83	for my $c (@_){
	84	my $e = find_encoding($c) or die "Unknown encoding: $c";
	85	$try{$e->name} = $e;
	86	$DEBUG and warn "Added: ", $e->name;
	87	}
8676e7d3	88	if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16\|32)(BE\|LE)
	89	my $utf;
	90	my ($be, $le) = (0, 0);
	91	if ($octet =~ /\x00\x00/o){ # UTF-32(BE\|LE) assumed
	92	$utf = "UTF-32";
	93	for my $char (unpack('N*', $octet)){
	94	$char & 0x0000ffff and $be++;
	95	$char & 0xffff0000 and $le++;
af1f55d9	96	}
8676e7d3	97	}else{ # UTF-16(BE\|LE) assumed
	98	$utf = "UTF-16";
	99	for my $char (unpack('n*', $octet)){
	100	$char & 0x00ff and $be++;
	101	$char & 0xff00 and $le++;
af1f55d9	102	}
af1f55d9	103	}
8676e7d3	104	$DEBUG and warn "$utf, be == $be, le == $le";
	105	$be == $le
	106	and return "Encodings ambiguous between $utf BE and LE ($be, $le)";
	107	$utf .= ($be > $le) ? 'BE' : 'LE';
	108	return find_encoding($utf);
	109	}else{
	110	my $nline = 1;
	111	for my $line (split /\r\n?\|\n/, $octet){
	112	# cheat 2 -- \e in the string
	113	if ($line =~ /\e/o){
	114	my @keys = keys %try;
	115	delete @try{qw/utf8 ascii/};
	116	for my $k (@keys){
	117	ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
	118	}
	119	}
	120	my %ok = %try;
	121	# warn join(",", keys %try);
	122	for my $k (keys %try){
	123	my $scratch = $line;
	124	$try{$k}->decode($scratch, FB_QUIET);
	125	if ($scratch eq ''){
	126	$DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
	127	}else{
	128	use bytes ();
	129	$DEBUG and
	130	warn sprintf("%4d:%-24s not ok; %d bytes left\n",
	131	$nline, $k, bytes::length($scratch));
	132	delete $ok{$k};
	133	}
	134	}
	135	%ok or return "No appropriate encodings found!";
	136	if (scalar(keys(%ok)) == 1){
	137	my ($retval) = values(%ok);
	138	return $retval;
	139	}
	140	%try = %ok; $nline++;
af1f55d9	141	}
af1f55d9	142	}
7e19fb92	143	$try{ascii} or
7e19fb92	144	return "Encodings too ambiguous: ", join(" or ", keys %try);
af1f55d9	145	return $try{ascii};
	146	}
	147
	148
7e19fb92	149
af1f55d9	150	1;
	151	__END__
	152
	153	=head1 NAME
	154
7e19fb92	155	Encode::Guess -- Guesses encoding from data
	156
	157	=head1 SYNOPSIS
	158
	159	# if you are sure $data won't contain anything bogus
	160
e8c86ba6	161	use Encode;
7e19fb92	162	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
	163	my $utf8 = decode("Guess", $data);
	164	my $data = encode("Guess", $utf8); # this doesn't work!
	165
	166	# more elaborate way
9735c3fc	167	use Encode::Guess;
7e19fb92	168	my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
	169	ref($enc) or die "Can't guess: $enc"; # trap error this way
	170	$utf8 = $enc->decode($data);
	171	# or
	172	$utf8 = decode($enc->name, $data)
	173
	174	=head1 ABSTRACT
	175
	176	Encode::Guess enables you to guess in what encoding a given data is
	177	encoded, or at least tries to.
	178
	179	=head1 DESCRIPTION
	180
	181	By default, it checks only ascii, utf8 and UTF-16/32 with BOM.
	182
	183	use Encode::Guess; # ascii/utf8/BOMed UTF
	184
	185	To use it more practically, you have to give the names of encodings to
	186	check (I<suspects> as follows). The name of suspects can either be
	187	canonical names or aliases.
	188
	189	# tries all major Japanese Encodings as well
	190	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
	191
	192	=over 4
	193
	194	=item Encode::Guess->set_suspects
	195
	196	You can also change the internal suspects list via C<set_suspects>
	197	method.
	198
	199	use Encode::Guess;
	200	Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);
	201
	202	=item Encode::Guess->add_suspects
	203
	204	Or you can use C<add_suspects> method. The difference is that
	205	C<set_suspects> flushes the current suspects list while
	206	C<add_suspects> adds.
	207
	208	use Encode::Guess;
	209	Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
	210	# now the suspects are euc-jp,shiftjis,7bit-jis, AND
	211	# euc-kr,euc-cn, and big5-eten
	212	Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);
	213
	214	=item Encode::decode("Guess" ...)
	215
	216	When you are content with suspects list, you can now
	217
	218	my $utf8 = Encode::decode("Guess", $data);
	219
	220	=item Encode::Guess->guess($data)
	221
9735c3fc	222	But it will croak if:
	223
	224	=over
	225
	226	=item *
	227
	228	Two or more suspects remain
	229
	230	=item *
	231
	232	No suspects left
	233
	234	=back
	235
	236	So you should instead try this;
7e19fb92	237
	238	my $decoder = Encode::Guess->guess($data);
	239
	240	On success, $decoder is an object that is documented in
	241	L<Encode::Encoding>. So you can now do this;
	242
	243	my $utf8 = $decoder->decode($data);
	244
	245	On failure, $decoder now contains an error message so the whole thing
	246	would be as follows;
	247
	248	my $decoder = Encode::Guess->guess($data);
	249	die $decoder unless ref($decoder);
	250	my $utf8 = $decoder->decode($data);
	251
	252	=item guess_encoding($data, [, I<list of suspects>])
	253
	254	You can also try C<guess_encoding> function which is exported by
	255	default. It takes $data to check and it also takes the list of
	256	suspects by option. The optional suspect list is I<not reflected> to
	257	the internal suspects list.
	258
	259	my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
	260	die $decoder unless ref($decoder);
	261	my $utf8 = $decoder->decode($data);
	262	# check only ascii and utf8
	263	my $decoder = guess_encoding($data);
	264
	265	=back
	266
	267	=head1 CAVEATS
	268
	269	=over 4
	270
	271	=item *
	272
	273	Because of the algorithm used, ISO-8859 series and other single-byte
	274	encodings do not work well unless either one of ISO-8859 is the only
	275	one suspect (besides ascii and utf8).
	276
	277	use Encode::Guess;
	278	# perhaps ok
	279	my $decoder = guess_encoding($data, 'latin1');
	280	# definitely NOT ok
	281	my $decoder = guess_encoding($data, qw/latin1 greek/);
	282
	283	The reason is that Encode::Guess guesses encoding by trial and error.
	284	It first splits $data into lines and tries to decode the line for each
9735c3fc	285	suspect. It keeps it going until all but one encoding is eliminated
7e19fb92	286	out of suspects list. ISO-8859 series is just too successful for most
	287	cases (because it fills almost all code points in \x00-\xff).
	288
	289	=item *
	290
	291	Do not mix national standard encodings and the corresponding vendor
	292	encodings.
	293
	294	# a very bad idea
	295	my $decoder
	296	= guess_encoding($data, qw/shiftjis MacJapanese cp932/);
	297
	298	The reason is that vendor encoding is usually a superset of national
	299	standard so it becomes too ambiguous for most cases.
	300
	301	=item *
	302
	303	On the other hand, mixing various national standard encodings
	304	automagically works unless $data is too short to allow for guessing.
	305
	306	# This is ok if $data is long enough
	307	my $decoder =
	308	guess_encoding($data, qw/euc-cn
	309	euc-jp shiftjis 7bit-jis
	310	euc-kr
	311	big5-eten/);
	312
	313	=item *
	314
	315	DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this!
	316
	317	my $decoder = guess_encoding($data,
	318	Encode->encodings(":all"));
	319
	320	=back
	321
	322	It is, after all, just a guess. You should alway be explicit when it
	323	comes to encodings. But there are some, especially Japanese,
	324	environment that guess-coding is a must. Use this module with care.
	325
982a4085	326	=head1 TO DO
	327
	328	Encode::Guess does not work on EBCDIC platforms.
	329
7e19fb92	330	=head1 SEE ALSO
	331
	332	L<Encode>, L<Encode::Encoding>
af1f55d9	333
	334	=cut
	335