Commit | Line | Data |
af1f55d9 |
1 | package Encode::Guess; |
2 | use strict; |
656ebd29 |
3 | use warnings; |
af1f55d9 |
4 | use Encode qw(:fallbacks find_encoding); |
656ebd29 |
5 | our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
af1f55d9 |
6 | |
7 | my $Canon = 'Guess'; |
8f139f4c |
8 | sub DEBUG () { 0 } |
7e19fb92 |
9 | our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); |
d1256cb1 |
10 | $Encode::Encoding{$Canon} = bless { |
11 | Name => $Canon, |
12 | Suspects => {%DEF_SUSPECTS}, |
13 | } => __PACKAGE__; |
7e19fb92 |
14 | |
10c5ecbb |
15 | use base qw(Encode::Encoding); |
7e19fb92 |
16 | sub needs_lines { 1 } |
d1256cb1 |
17 | sub perlio_ok { 0 } |
7e19fb92 |
18 | |
d1256cb1 |
19 | our @EXPORT = qw(guess_encoding); |
23f3589e |
20 | our $NoUTFAutoGuess = 0; |
d1256cb1 |
21 | our $UTF8_BOM = pack( "C3", 0xef, 0xbb, 0xbf ); |
7e19fb92 |
22 | |
d1256cb1 |
23 | sub import { # Exporter not used so we do it on our own |
7e19fb92 |
24 | my $callpkg = caller; |
d1256cb1 |
25 | for my $item (@EXPORT) { |
26 | no strict 'refs'; |
27 | *{"$callpkg\::$item"} = \&{"$item"}; |
7e19fb92 |
28 | } |
29 | set_suspects(@_); |
30 | } |
af1f55d9 |
31 | |
d1256cb1 |
32 | sub set_suspects { |
7e19fb92 |
33 | my $class = shift; |
34 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; |
d1256cb1 |
35 | $self->{Suspects} = {%DEF_SUSPECTS}; |
7e19fb92 |
36 | $self->add_suspects(@_); |
37 | } |
af1f55d9 |
38 | |
d1256cb1 |
39 | sub add_suspects { |
af1f55d9 |
40 | my $class = shift; |
7e19fb92 |
41 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; |
d1256cb1 |
42 | for my $c (@_) { |
43 | my $e = find_encoding($c) or die "Unknown encoding: $c"; |
44 | $self->{Suspects}{ $e->name } = $e; |
45 | DEBUG and warn "Added: ", $e->name; |
af1f55d9 |
46 | } |
47 | } |
48 | |
d1256cb1 |
49 | sub decode($$;$) { |
50 | my ( $obj, $octet, $chk ) = @_; |
51 | my $guessed = guess( $obj, $octet ); |
52 | unless ( ref($guessed) ) { |
53 | require Carp; |
54 | Carp::croak($guessed); |
10c5ecbb |
55 | } |
d1256cb1 |
56 | my $utf8 = $guessed->decode( $octet, $chk ); |
af1f55d9 |
57 | $_[1] = $octet if $chk; |
58 | return $utf8; |
59 | } |
60 | |
d1256cb1 |
61 | sub guess_encoding { |
62 | guess( $Encode::Encoding{$Canon}, @_ ); |
af1f55d9 |
63 | } |
64 | |
65 | sub guess { |
7e19fb92 |
66 | my $class = shift; |
67 | my $obj = ref($class) ? $class : $Encode::Encoding{$Canon}; |
68 | my $octet = shift; |
2fc614e0 |
69 | |
70 | # sanity check |
71 | return unless defined $octet and length $octet; |
72 | |
7e19fb92 |
73 | # cheat 0: utf8 flag; |
23f3589e |
74 | if ( Encode::is_utf8($octet) ) { |
d1256cb1 |
75 | return find_encoding('utf8') unless $NoUTFAutoGuess; |
76 | Encode::_utf8_off($octet); |
23f3589e |
77 | } |
d1256cb1 |
78 | |
7e19fb92 |
79 | # cheat 1: BOM |
80 | use Encode::Unicode; |
23f3589e |
81 | unless ($NoUTFAutoGuess) { |
d1256cb1 |
82 | my $BOM = pack( 'C3', unpack( "C3", $octet ) ); |
83 | return find_encoding('utf8') |
84 | if ( defined $BOM and $BOM eq $UTF8_BOM ); |
85 | $BOM = unpack( 'N', $octet ); |
86 | return find_encoding('UTF-32') |
87 | if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) ); |
88 | $BOM = unpack( 'n', $octet ); |
89 | return find_encoding('UTF-16') |
90 | if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) ); |
91 | if ( $octet =~ /\x00/o ) |
92 | { # if \x00 found, we assume UTF-(16|32)(BE|LE) |
93 | my $utf; |
94 | my ( $be, $le ) = ( 0, 0 ); |
95 | if ( $octet =~ /\x00\x00/o ) { # UTF-32(BE|LE) assumed |
96 | $utf = "UTF-32"; |
97 | for my $char ( unpack( 'N*', $octet ) ) { |
98 | $char & 0x0000ffff and $be++; |
99 | $char & 0xffff0000 and $le++; |
100 | } |
101 | } |
102 | else { # UTF-16(BE|LE) assumed |
103 | $utf = "UTF-16"; |
104 | for my $char ( unpack( 'n*', $octet ) ) { |
105 | $char & 0x00ff and $be++; |
106 | $char & 0xff00 and $le++; |
107 | } |
108 | } |
109 | DEBUG and warn "$utf, be == $be, le == $le"; |
110 | $be == $le |
111 | and return |
112 | "Encodings ambiguous between $utf BE and LE ($be, $le)"; |
113 | $utf .= ( $be > $le ) ? 'BE' : 'LE'; |
114 | return find_encoding($utf); |
115 | } |
23f3589e |
116 | } |
d1256cb1 |
117 | my %try = %{ $obj->{Suspects} }; |
118 | for my $c (@_) { |
119 | my $e = find_encoding($c) or die "Unknown encoding: $c"; |
120 | $try{ $e->name } = $e; |
121 | DEBUG and warn "Added: ", $e->name; |
7e19fb92 |
122 | } |
23f3589e |
123 | my $nline = 1; |
d1256cb1 |
124 | for my $line ( split /\r\n?|\n/, $octet ) { |
125 | |
126 | # cheat 2 -- \e in the string |
127 | if ( $line =~ /\e/o ) { |
128 | my @keys = keys %try; |
129 | delete @try{qw/utf8 ascii/}; |
130 | for my $k (@keys) { |
131 | ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k}; |
132 | } |
133 | } |
134 | my %ok = %try; |
135 | |
136 | # warn join(",", keys %try); |
137 | for my $k ( keys %try ) { |
138 | my $scratch = $line; |
139 | $try{$k}->decode( $scratch, FB_QUIET ); |
140 | if ( $scratch eq '' ) { |
141 | DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k ); |
142 | } |
143 | else { |
144 | use bytes (); |
145 | DEBUG |
146 | and warn sprintf( "%4d:%-24s not ok; %d bytes left\n", |
147 | $nline, $k, bytes::length($scratch) ); |
148 | delete $ok{$k}; |
149 | } |
150 | } |
151 | %ok or return "No appropriate encodings found!"; |
152 | if ( scalar( keys(%ok) ) == 1 ) { |
153 | my ($retval) = values(%ok); |
154 | return $retval; |
155 | } |
156 | %try = %ok; |
157 | $nline++; |
af1f55d9 |
158 | } |
d1256cb1 |
159 | $try{ascii} |
160 | or return "Encodings too ambiguous: ", join( " or ", keys %try ); |
af1f55d9 |
161 | return $try{ascii}; |
162 | } |
163 | |
af1f55d9 |
164 | 1; |
165 | __END__ |
166 | |
167 | =head1 NAME |
168 | |
7e19fb92 |
169 | Encode::Guess -- Guesses encoding from data |
170 | |
171 | =head1 SYNOPSIS |
172 | |
173 | # if you are sure $data won't contain anything bogus |
174 | |
e8c86ba6 |
175 | use Encode; |
7e19fb92 |
176 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; |
177 | my $utf8 = decode("Guess", $data); |
178 | my $data = encode("Guess", $utf8); # this doesn't work! |
179 | |
180 | # more elaborate way |
9735c3fc |
181 | use Encode::Guess; |
7e19fb92 |
182 | my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/); |
183 | ref($enc) or die "Can't guess: $enc"; # trap error this way |
184 | $utf8 = $enc->decode($data); |
185 | # or |
186 | $utf8 = decode($enc->name, $data) |
187 | |
188 | =head1 ABSTRACT |
189 | |
190 | Encode::Guess enables you to guess in what encoding a given data is |
191 | encoded, or at least tries to. |
192 | |
193 | =head1 DESCRIPTION |
194 | |
195 | By default, it checks only ascii, utf8 and UTF-16/32 with BOM. |
196 | |
197 | use Encode::Guess; # ascii/utf8/BOMed UTF |
198 | |
199 | To use it more practically, you have to give the names of encodings to |
200 | check (I<suspects> as follows). The name of suspects can either be |
201 | canonical names or aliases. |
202 | |
7237418a |
203 | CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED. |
204 | |
7e19fb92 |
205 | # tries all major Japanese Encodings as well |
206 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; |
207 | |
23f3589e |
208 | If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true |
209 | value, no heuristics will be applied to UTF8/16/32, and the result |
210 | will be limited to the suspects and C<ascii>. |
211 | |
7e19fb92 |
212 | =over 4 |
213 | |
214 | =item Encode::Guess->set_suspects |
215 | |
216 | You can also change the internal suspects list via C<set_suspects> |
217 | method. |
218 | |
219 | use Encode::Guess; |
220 | Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/); |
221 | |
222 | =item Encode::Guess->add_suspects |
223 | |
224 | Or you can use C<add_suspects> method. The difference is that |
225 | C<set_suspects> flushes the current suspects list while |
226 | C<add_suspects> adds. |
227 | |
228 | use Encode::Guess; |
229 | Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/); |
230 | # now the suspects are euc-jp,shiftjis,7bit-jis, AND |
231 | # euc-kr,euc-cn, and big5-eten |
232 | Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/); |
233 | |
234 | =item Encode::decode("Guess" ...) |
235 | |
236 | When you are content with suspects list, you can now |
237 | |
238 | my $utf8 = Encode::decode("Guess", $data); |
239 | |
240 | =item Encode::Guess->guess($data) |
241 | |
9735c3fc |
242 | But it will croak if: |
243 | |
244 | =over |
245 | |
246 | =item * |
247 | |
248 | Two or more suspects remain |
249 | |
250 | =item * |
251 | |
252 | No suspects left |
253 | |
254 | =back |
255 | |
256 | So you should instead try this; |
7e19fb92 |
257 | |
258 | my $decoder = Encode::Guess->guess($data); |
259 | |
260 | On success, $decoder is an object that is documented in |
261 | L<Encode::Encoding>. So you can now do this; |
262 | |
263 | my $utf8 = $decoder->decode($data); |
264 | |
265 | On failure, $decoder now contains an error message so the whole thing |
266 | would be as follows; |
267 | |
268 | my $decoder = Encode::Guess->guess($data); |
269 | die $decoder unless ref($decoder); |
270 | my $utf8 = $decoder->decode($data); |
271 | |
272 | =item guess_encoding($data, [, I<list of suspects>]) |
273 | |
274 | You can also try C<guess_encoding> function which is exported by |
275 | default. It takes $data to check and it also takes the list of |
276 | suspects by option. The optional suspect list is I<not reflected> to |
277 | the internal suspects list. |
278 | |
279 | my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/); |
280 | die $decoder unless ref($decoder); |
281 | my $utf8 = $decoder->decode($data); |
282 | # check only ascii and utf8 |
283 | my $decoder = guess_encoding($data); |
284 | |
285 | =back |
286 | |
287 | =head1 CAVEATS |
288 | |
289 | =over 4 |
290 | |
291 | =item * |
292 | |
293 | Because of the algorithm used, ISO-8859 series and other single-byte |
294 | encodings do not work well unless either one of ISO-8859 is the only |
295 | one suspect (besides ascii and utf8). |
296 | |
297 | use Encode::Guess; |
298 | # perhaps ok |
299 | my $decoder = guess_encoding($data, 'latin1'); |
300 | # definitely NOT ok |
301 | my $decoder = guess_encoding($data, qw/latin1 greek/); |
302 | |
303 | The reason is that Encode::Guess guesses encoding by trial and error. |
304 | It first splits $data into lines and tries to decode the line for each |
9735c3fc |
305 | suspect. It keeps it going until all but one encoding is eliminated |
7e19fb92 |
306 | out of suspects list. ISO-8859 series is just too successful for most |
307 | cases (because it fills almost all code points in \x00-\xff). |
308 | |
309 | =item * |
310 | |
311 | Do not mix national standard encodings and the corresponding vendor |
312 | encodings. |
313 | |
314 | # a very bad idea |
315 | my $decoder |
316 | = guess_encoding($data, qw/shiftjis MacJapanese cp932/); |
317 | |
318 | The reason is that vendor encoding is usually a superset of national |
319 | standard so it becomes too ambiguous for most cases. |
320 | |
321 | =item * |
322 | |
323 | On the other hand, mixing various national standard encodings |
324 | automagically works unless $data is too short to allow for guessing. |
325 | |
326 | # This is ok if $data is long enough |
327 | my $decoder = |
328 | guess_encoding($data, qw/euc-cn |
329 | euc-jp shiftjis 7bit-jis |
330 | euc-kr |
331 | big5-eten/); |
332 | |
333 | =item * |
334 | |
335 | DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this! |
336 | |
337 | my $decoder = guess_encoding($data, |
338 | Encode->encodings(":all")); |
339 | |
340 | =back |
341 | |
342 | It is, after all, just a guess. You should alway be explicit when it |
343 | comes to encodings. But there are some, especially Japanese, |
344 | environment that guess-coding is a must. Use this module with care. |
345 | |
982a4085 |
346 | =head1 TO DO |
347 | |
348 | Encode::Guess does not work on EBCDIC platforms. |
349 | |
7e19fb92 |
350 | =head1 SEE ALSO |
351 | |
352 | L<Encode>, L<Encode::Encoding> |
af1f55d9 |
353 | |
354 | =cut |
355 | |