Commit | Line | Data |
af1f55d9 |
1 | package Encode::Guess; |
2 | use strict; |
7e19fb92 |
3 | |
af1f55d9 |
4 | use Encode qw(:fallbacks find_encoding); |
8676e7d3 |
5 | our $VERSION = do { my @r = (q$Revision: 1.8 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
af1f55d9 |
6 | |
7 | my $Canon = 'Guess'; |
af1f55d9 |
8 | our $DEBUG = 0; |
7e19fb92 |
9 | our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); |
10 | $Encode::Encoding{$Canon} = |
11 | bless { |
12 | Name => $Canon, |
13 | Suspects => { %DEF_SUSPECTS }, |
14 | } => __PACKAGE__; |
15 | |
10c5ecbb |
16 | use base qw(Encode::Encoding); |
7e19fb92 |
17 | sub needs_lines { 1 } |
18 | sub perlio_ok { 0 } |
7e19fb92 |
19 | |
20 | our @EXPORT = qw(guess_encoding); |
21 | |
22 | sub import { # Exporter not used so we do it on our own |
23 | my $callpkg = caller; |
24 | for my $item (@EXPORT){ |
25 | no strict 'refs'; |
26 | *{"$callpkg\::$item"} = \&{"$item"}; |
27 | } |
28 | set_suspects(@_); |
29 | } |
af1f55d9 |
30 | |
7e19fb92 |
31 | sub set_suspects{ |
32 | my $class = shift; |
33 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; |
34 | $self->{Suspects} = { %DEF_SUSPECTS }; |
35 | $self->add_suspects(@_); |
36 | } |
af1f55d9 |
37 | |
7e19fb92 |
38 | sub add_suspects{ |
af1f55d9 |
39 | my $class = shift; |
7e19fb92 |
40 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; |
af1f55d9 |
41 | for my $c (@_){ |
42 | my $e = find_encoding($c) or die "Unknown encoding: $c"; |
7e19fb92 |
43 | $self->{Suspects}{$e->name} = $e; |
af1f55d9 |
44 | $DEBUG and warn "Added: ", $e->name; |
45 | } |
46 | } |
47 | |
af1f55d9 |
48 | sub decode($$;$){ |
49 | my ($obj, $octet, $chk) = @_; |
7e19fb92 |
50 | my $guessed = guess($obj, $octet); |
10c5ecbb |
51 | unless (ref($guessed)){ |
52 | require Carp; |
53 | Carp::croak($guessed); |
54 | } |
7e19fb92 |
55 | my $utf8 = $guessed->decode($octet, $chk); |
af1f55d9 |
56 | $_[1] = $octet if $chk; |
57 | return $utf8; |
58 | } |
59 | |
7e19fb92 |
60 | sub guess_encoding{ |
61 | guess($Encode::Encoding{$Canon}, @_); |
af1f55d9 |
62 | } |
63 | |
64 | sub guess { |
7e19fb92 |
65 | my $class = shift; |
66 | my $obj = ref($class) ? $class : $Encode::Encoding{$Canon}; |
67 | my $octet = shift; |
2fc614e0 |
68 | |
69 | # sanity check |
70 | return unless defined $octet and length $octet; |
71 | |
7e19fb92 |
72 | # cheat 0: utf8 flag; |
af1f55d9 |
73 | Encode::is_utf8($octet) and return find_encoding('utf8'); |
7e19fb92 |
74 | # cheat 1: BOM |
75 | use Encode::Unicode; |
76 | my $BOM = unpack('n', $octet); |
77 | return find_encoding('UTF-16') |
2fc614e0 |
78 | if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); |
7e19fb92 |
79 | $BOM = unpack('N', $octet); |
80 | return find_encoding('UTF-32') |
2fc614e0 |
81 | if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); |
7e19fb92 |
82 | my %try = %{$obj->{Suspects}}; |
83 | for my $c (@_){ |
84 | my $e = find_encoding($c) or die "Unknown encoding: $c"; |
85 | $try{$e->name} = $e; |
86 | $DEBUG and warn "Added: ", $e->name; |
87 | } |
8676e7d3 |
88 | if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) |
89 | my $utf; |
90 | my ($be, $le) = (0, 0); |
91 | if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed |
92 | $utf = "UTF-32"; |
93 | for my $char (unpack('N*', $octet)){ |
94 | $char & 0x0000ffff and $be++; |
95 | $char & 0xffff0000 and $le++; |
af1f55d9 |
96 | } |
8676e7d3 |
97 | }else{ # UTF-16(BE|LE) assumed |
98 | $utf = "UTF-16"; |
99 | for my $char (unpack('n*', $octet)){ |
100 | $char & 0x00ff and $be++; |
101 | $char & 0xff00 and $le++; |
af1f55d9 |
102 | } |
103 | } |
8676e7d3 |
104 | $DEBUG and warn "$utf, be == $be, le == $le"; |
105 | $be == $le |
106 | and return "Encodings ambiguous between $utf BE and LE ($be, $le)"; |
107 | $utf .= ($be > $le) ? 'BE' : 'LE'; |
108 | return find_encoding($utf); |
109 | }else{ |
110 | my $nline = 1; |
111 | for my $line (split /\r\n?|\n/, $octet){ |
112 | # cheat 2 -- \e in the string |
113 | if ($line =~ /\e/o){ |
114 | my @keys = keys %try; |
115 | delete @try{qw/utf8 ascii/}; |
116 | for my $k (@keys){ |
117 | ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; |
118 | } |
119 | } |
120 | my %ok = %try; |
121 | # warn join(",", keys %try); |
122 | for my $k (keys %try){ |
123 | my $scratch = $line; |
124 | $try{$k}->decode($scratch, FB_QUIET); |
125 | if ($scratch eq ''){ |
126 | $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); |
127 | }else{ |
128 | use bytes (); |
129 | $DEBUG and |
130 | warn sprintf("%4d:%-24s not ok; %d bytes left\n", |
131 | $nline, $k, bytes::length($scratch)); |
132 | delete $ok{$k}; |
133 | } |
134 | } |
135 | %ok or return "No appropriate encodings found!"; |
136 | if (scalar(keys(%ok)) == 1){ |
137 | my ($retval) = values(%ok); |
138 | return $retval; |
139 | } |
140 | %try = %ok; $nline++; |
af1f55d9 |
141 | } |
af1f55d9 |
142 | } |
7e19fb92 |
143 | $try{ascii} or |
144 | return "Encodings too ambiguous: ", join(" or ", keys %try); |
af1f55d9 |
145 | return $try{ascii}; |
146 | } |
147 | |
148 | |
7e19fb92 |
149 | |
af1f55d9 |
150 | 1; |
151 | __END__ |
152 | |
153 | =head1 NAME |
154 | |
7e19fb92 |
155 | Encode::Guess -- Guesses encoding from data |
156 | |
157 | =head1 SYNOPSIS |
158 | |
159 | # if you are sure $data won't contain anything bogus |
160 | |
e8c86ba6 |
161 | use Encode; |
7e19fb92 |
162 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; |
163 | my $utf8 = decode("Guess", $data); |
164 | my $data = encode("Guess", $utf8); # this doesn't work! |
165 | |
166 | # more elaborate way |
9735c3fc |
167 | use Encode::Guess; |
7e19fb92 |
168 | my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/); |
169 | ref($enc) or die "Can't guess: $enc"; # trap error this way |
170 | $utf8 = $enc->decode($data); |
171 | # or |
172 | $utf8 = decode($enc->name, $data) |
173 | |
174 | =head1 ABSTRACT |
175 | |
176 | Encode::Guess enables you to guess in what encoding a given data is |
177 | encoded, or at least tries to. |
178 | |
179 | =head1 DESCRIPTION |
180 | |
181 | By default, it checks only ascii, utf8 and UTF-16/32 with BOM. |
182 | |
183 | use Encode::Guess; # ascii/utf8/BOMed UTF |
184 | |
185 | To use it more practically, you have to give the names of encodings to |
186 | check (I<suspects> as follows). The name of suspects can either be |
187 | canonical names or aliases. |
188 | |
189 | # tries all major Japanese Encodings as well |
190 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; |
191 | |
192 | =over 4 |
193 | |
194 | =item Encode::Guess->set_suspects |
195 | |
196 | You can also change the internal suspects list via C<set_suspects> |
197 | method. |
198 | |
199 | use Encode::Guess; |
200 | Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/); |
201 | |
202 | =item Encode::Guess->add_suspects |
203 | |
204 | Or you can use C<add_suspects> method. The difference is that |
205 | C<set_suspects> flushes the current suspects list while |
206 | C<add_suspects> adds. |
207 | |
208 | use Encode::Guess; |
209 | Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/); |
210 | # now the suspects are euc-jp,shiftjis,7bit-jis, AND |
211 | # euc-kr,euc-cn, and big5-eten |
212 | Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/); |
213 | |
214 | =item Encode::decode("Guess" ...) |
215 | |
216 | When you are content with suspects list, you can now |
217 | |
218 | my $utf8 = Encode::decode("Guess", $data); |
219 | |
220 | =item Encode::Guess->guess($data) |
221 | |
9735c3fc |
222 | But it will croak if: |
223 | |
224 | =over |
225 | |
226 | =item * |
227 | |
228 | Two or more suspects remain |
229 | |
230 | =item * |
231 | |
232 | No suspects left |
233 | |
234 | =back |
235 | |
236 | So you should instead try this; |
7e19fb92 |
237 | |
238 | my $decoder = Encode::Guess->guess($data); |
239 | |
240 | On success, $decoder is an object that is documented in |
241 | L<Encode::Encoding>. So you can now do this; |
242 | |
243 | my $utf8 = $decoder->decode($data); |
244 | |
245 | On failure, $decoder now contains an error message so the whole thing |
246 | would be as follows; |
247 | |
248 | my $decoder = Encode::Guess->guess($data); |
249 | die $decoder unless ref($decoder); |
250 | my $utf8 = $decoder->decode($data); |
251 | |
252 | =item guess_encoding($data, [, I<list of suspects>]) |
253 | |
254 | You can also try C<guess_encoding> function which is exported by |
255 | default. It takes $data to check and it also takes the list of |
256 | suspects by option. The optional suspect list is I<not reflected> to |
257 | the internal suspects list. |
258 | |
259 | my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/); |
260 | die $decoder unless ref($decoder); |
261 | my $utf8 = $decoder->decode($data); |
262 | # check only ascii and utf8 |
263 | my $decoder = guess_encoding($data); |
264 | |
265 | =back |
266 | |
267 | =head1 CAVEATS |
268 | |
269 | =over 4 |
270 | |
271 | =item * |
272 | |
273 | Because of the algorithm used, ISO-8859 series and other single-byte |
274 | encodings do not work well unless either one of ISO-8859 is the only |
275 | one suspect (besides ascii and utf8). |
276 | |
277 | use Encode::Guess; |
278 | # perhaps ok |
279 | my $decoder = guess_encoding($data, 'latin1'); |
280 | # definitely NOT ok |
281 | my $decoder = guess_encoding($data, qw/latin1 greek/); |
282 | |
283 | The reason is that Encode::Guess guesses encoding by trial and error. |
284 | It first splits $data into lines and tries to decode the line for each |
9735c3fc |
285 | suspect. It keeps it going until all but one encoding is eliminated |
7e19fb92 |
286 | out of suspects list. ISO-8859 series is just too successful for most |
287 | cases (because it fills almost all code points in \x00-\xff). |
288 | |
289 | =item * |
290 | |
291 | Do not mix national standard encodings and the corresponding vendor |
292 | encodings. |
293 | |
294 | # a very bad idea |
295 | my $decoder |
296 | = guess_encoding($data, qw/shiftjis MacJapanese cp932/); |
297 | |
298 | The reason is that vendor encoding is usually a superset of national |
299 | standard so it becomes too ambiguous for most cases. |
300 | |
301 | =item * |
302 | |
303 | On the other hand, mixing various national standard encodings |
304 | automagically works unless $data is too short to allow for guessing. |
305 | |
306 | # This is ok if $data is long enough |
307 | my $decoder = |
308 | guess_encoding($data, qw/euc-cn |
309 | euc-jp shiftjis 7bit-jis |
310 | euc-kr |
311 | big5-eten/); |
312 | |
313 | =item * |
314 | |
315 | DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this! |
316 | |
317 | my $decoder = guess_encoding($data, |
318 | Encode->encodings(":all")); |
319 | |
320 | =back |
321 | |
322 | It is, after all, just a guess. You should alway be explicit when it |
323 | comes to encodings. But there are some, especially Japanese, |
324 | environment that guess-coding is a must. Use this module with care. |
325 | |
982a4085 |
326 | =head1 TO DO |
327 | |
328 | Encode::Guess does not work on EBCDIC platforms. |
329 | |
7e19fb92 |
330 | =head1 SEE ALSO |
331 | |
332 | L<Encode>, L<Encode::Encoding> |
af1f55d9 |
333 | |
334 | =cut |
335 | |