Commit | Line | Data |
3ef515df |
1 | #!./perl |
1a1e8f7d |
2 | # $Id: piconv,v 2.4 2009/07/08 13:34:15 dankogai Exp $ |
67d7b5ef |
3 | # |
9160fdbd |
4 | use 5.8.0; |
67d7b5ef |
5 | use strict; |
6 | use Encode ; |
7 | use Encode::Alias; |
8 | my %Scheme = map {$_ => 1} qw(from_to decode_encode perlio); |
9 | |
bedba681 |
10 | use File::Basename; |
11 | my $name = basename($0); |
12 | |
c14d784c |
13 | use Getopt::Long qw(:config no_ignore_case); |
bedba681 |
14 | |
15 | my %Opt; |
16 | |
17 | help() |
18 | unless |
19 | GetOptions(\%Opt, |
d1256cb1 |
20 | 'from|f=s', |
21 | 'to|t=s', |
22 | 'list|l', |
23 | 'string|s=s', |
24 | 'check|C=i', |
25 | 'c', |
26 | 'perlqq|p', |
27 | 'htmlcref', |
28 | 'xmlcref', |
29 | 'debug|D', |
30 | 'scheme|S=s', |
31 | 'resolve|r=s', |
32 | 'help', |
33 | ); |
bedba681 |
34 | |
35 | $Opt{help} and help(); |
36 | $Opt{list} and list_encodings(); |
67d7b5ef |
37 | my $locale = $ENV{LC_CTYPE} || $ENV{LC_ALL} || $ENV{LANG}; |
bedba681 |
38 | defined $Opt{resolve} and resolve_encoding($Opt{resolve}); |
39 | $Opt{from} || $Opt{to} || help(); |
40 | my $from = $Opt{from} || $locale or help("from_encoding unspecified"); |
41 | my $to = $Opt{to} || $locale or help("to_encoding unspecified"); |
42 | $Opt{string} and Encode::from_to($Opt{string}, $from, $to) and print $Opt{string} and exit; |
1a1e8f7d |
43 | my $scheme = do { |
44 | if (defined $Opt{scheme}) { |
45 | if (!exists $Scheme{$Opt{scheme}}) { |
46 | warn "Unknown scheme '$Opt{scheme}', fallback to 'from_to'.\n"; |
47 | 'from_to'; |
48 | } else { |
49 | $Opt{scheme}; |
50 | } |
51 | } else { |
52 | 'from_to'; |
53 | } |
54 | }; |
55 | |
bedba681 |
56 | $Opt{check} ||= $Opt{c}; |
d1256cb1 |
57 | $Opt{perlqq} and $Opt{check} = Encode::PERLQQ; |
58 | $Opt{htmlcref} and $Opt{check} = Encode::HTMLCREF; |
59 | $Opt{xmlcref} and $Opt{check} = Encode::XMLCREF; |
bedba681 |
60 | |
61 | if ($Opt{debug}){ |
67d7b5ef |
62 | my $cfrom = Encode->getEncoding($from)->name; |
63 | my $cto = Encode->getEncoding($to)->name; |
ce912cd4 |
64 | print <<"EOT"; |
67d7b5ef |
65 | Scheme: $scheme |
66 | From: $from => $cfrom |
67 | To: $to => $cto |
68 | EOT |
69 | } |
70 | |
1a1e8f7d |
71 | my %use_bom = map { $_ => 1 } qw/UTF-16 UTF-32/; |
72 | |
8f1ed24a |
73 | # we do not use <> (or ARGV) for the sake of binmode() |
d1256cb1 |
74 | @ARGV or push @ARGV, \*STDIN; |
8f1ed24a |
75 | |
d1256cb1 |
76 | unless ( $scheme eq 'perlio' ) { |
8f1ed24a |
77 | binmode STDOUT; |
1a1e8f7d |
78 | my $need2slurp = $use_bom{ find_encoding($to)->name }; |
d1256cb1 |
79 | for my $argv (@ARGV) { |
80 | my $ifh = ref $argv ? $argv : undef; |
1a1e8f7d |
81 | $ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next; |
d1256cb1 |
82 | $ifh or open $ifh, "<", $argv or next; |
83 | binmode $ifh; |
84 | if ( $scheme eq 'from_to' ) { # default |
1a1e8f7d |
85 | if ($need2slurp){ |
86 | local $/; |
87 | $_ = <$ifh>; |
88 | Encode::from_to( $_, $from, $to, $Opt{check} ); |
89 | print; |
90 | }else{ |
91 | while (<$ifh>) { |
92 | Encode::from_to( $_, $from, $to, $Opt{check} ); |
93 | print; |
94 | } |
95 | } |
d1256cb1 |
96 | } |
97 | elsif ( $scheme eq 'decode_encode' ) { # step-by-step |
1a1e8f7d |
98 | if ($need2slurp){ |
99 | local $/; |
100 | $_ = <$ifh>; |
d1256cb1 |
101 | my $decoded = decode( $from, $_, $Opt{check} ); |
102 | my $encoded = encode( $to, $decoded ); |
103 | print $encoded; |
1a1e8f7d |
104 | }else{ |
105 | while (<$ifh>) { |
106 | my $decoded = decode( $from, $_, $Opt{check} ); |
107 | my $encoded = encode( $to, $decoded ); |
108 | print $encoded; |
109 | } |
110 | } |
111 | } |
112 | else { # won't reach |
d1256cb1 |
113 | die "$name: unknown scheme: $scheme"; |
114 | } |
8f1ed24a |
115 | } |
d1256cb1 |
116 | } |
117 | else { |
118 | |
8f1ed24a |
119 | # NI-S favorite |
120 | binmode STDOUT => "raw:encoding($to)"; |
d1256cb1 |
121 | for my $argv (@ARGV) { |
122 | my $ifh = ref $argv ? $argv : undef; |
1a1e8f7d |
123 | $ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next; |
d1256cb1 |
124 | $ifh or open $ifh, "<", $argv or next; |
125 | binmode $ifh => "raw:encoding($from)"; |
126 | print while (<$ifh>); |
8f1ed24a |
127 | } |
67d7b5ef |
128 | } |
129 | |
d1256cb1 |
130 | sub list_encodings { |
131 | print join( "\n", Encode->encodings(":all") ), "\n"; |
bedba681 |
132 | exit 0; |
133 | } |
134 | |
135 | sub resolve_encoding { |
d1256cb1 |
136 | if ( my $alias = Encode::resolve_alias( $_[0] ) ) { |
137 | print $alias, "\n"; |
138 | exit 0; |
139 | } |
140 | else { |
141 | warn "$name: $_[0] is not known to Encode\n"; |
142 | exit 1; |
bedba681 |
143 | } |
67d7b5ef |
144 | } |
145 | |
d1256cb1 |
146 | sub help { |
67d7b5ef |
147 | my $message = shift; |
67d7b5ef |
148 | $message and print STDERR "$name error: $message\n"; |
149 | print STDERR <<"EOT"; |
150 | $name [-f from_encoding] [-t to_encoding] [-s string] [files...] |
151 | $name -l |
bedba681 |
152 | $name -r encoding_alias |
153 | -l,--list |
154 | lists all available encodings |
155 | -r,--resolve encoding_alias |
156 | resolve encoding to its (Encode) canonical name |
157 | -f,--from from_encoding |
158 | when omitted, the current locale will be used |
159 | -t,--to to_encoding |
160 | when omitted, the current locale will be used |
161 | -s,--string string |
162 | "string" will be the input instead of STDIN or files |
163 | The following are mainly of interest to Encode hackers: |
164 | -D,--debug show debug information |
d1256cb1 |
165 | -C N | -c check the validity of the input |
bedba681 |
166 | -S,--scheme scheme use the scheme for conversion |
d1256cb1 |
167 | Those are handy when you can only see ascii characters: |
168 | -p,--perlqq |
169 | --htmlcref |
170 | --xmlcref |
67d7b5ef |
171 | EOT |
d1256cb1 |
172 | exit; |
67d7b5ef |
173 | } |
174 | |
175 | __END__ |
176 | |
177 | =head1 NAME |
178 | |
179 | piconv -- iconv(1), reinvented in perl |
180 | |
181 | =head1 SYNOPSIS |
182 | |
183 | piconv [-f from_encoding] [-t to_encoding] [-s string] [files...] |
184 | piconv -l |
bedba681 |
185 | piconv [-C N|-c|-p] |
186 | piconv -S scheme ... |
187 | piconv -r encoding |
188 | piconv -D ... |
189 | piconv -h |
67d7b5ef |
190 | |
191 | =head1 DESCRIPTION |
192 | |
0ab8f81e |
193 | B<piconv> is perl version of B<iconv>, a character encoding converter |
194 | widely available for various Unixen today. This script was primarily |
195 | a technology demonstrator for Perl 5.8.0, but you can use piconv in the |
196 | place of iconv for virtually any case. |
67d7b5ef |
197 | |
0ab8f81e |
198 | piconv converts the character encoding of either STDIN or files |
199 | specified in the argument and prints out to STDOUT. |
67d7b5ef |
200 | |
bedba681 |
201 | Here is the list of options. Each option can be in short format (-f) |
202 | or long (--from). |
67d7b5ef |
203 | |
204 | =over 4 |
205 | |
bedba681 |
206 | =item -f,--from from_encoding |
67d7b5ef |
207 | |
0ab8f81e |
208 | Specifies the encoding you are converting from. Unlike B<iconv>, |
209 | this option can be omitted. In such cases, the current locale is used. |
67d7b5ef |
210 | |
bedba681 |
211 | =item -t,--to to_encoding |
67d7b5ef |
212 | |
0ab8f81e |
213 | Specifies the encoding you are converting to. Unlike B<iconv>, |
214 | this option can be omitted. In such cases, the current locale is used. |
67d7b5ef |
215 | |
0ab8f81e |
216 | Therefore, when both -f and -t are omitted, B<piconv> just acts |
217 | like B<cat>. |
67d7b5ef |
218 | |
bedba681 |
219 | =item -s,--string I<string> |
67d7b5ef |
220 | |
bedba681 |
221 | uses I<string> instead of file for the source of text. |
67d7b5ef |
222 | |
bedba681 |
223 | =item -l,--list |
67d7b5ef |
224 | |
ce912cd4 |
225 | Lists all available encodings, one per line, in case-insensitive |
0ab8f81e |
226 | order. Note that only the canonical names are listed; many aliases |
ce912cd4 |
227 | exist. For example, the names are case-insensitive, and many standard |
0ab8f81e |
228 | and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850" |
ce912cd4 |
229 | instead of "cp850", or "winlatin1" for "cp1252". See L<Encode::Supported> |
0ab8f81e |
230 | for a full discussion. |
67d7b5ef |
231 | |
bedba681 |
232 | =item -C,--check I<N> |
b2704119 |
233 | |
234 | Check the validity of the stream if I<N> = 1. When I<N> = -1, something |
235 | interesting happens when it encounters an invalid character. |
236 | |
237 | =item -c |
238 | |
239 | Same as C<-C 1>. |
240 | |
bedba681 |
241 | =item -p,--perlqq |
b2704119 |
242 | |
d1256cb1 |
243 | =item --htmlcref |
244 | |
245 | =item --xmlcref |
246 | |
247 | Applies PERLQQ, HTMLCREF, XMLCREF, respectively. Try |
248 | |
249 | piconv -f utf8 -t ascii --perlqq |
250 | |
251 | To see what it does. |
b2704119 |
252 | |
bedba681 |
253 | =item -h,--help |
d31fa6c4 |
254 | |
255 | Show usage. |
256 | |
bedba681 |
257 | =item -D,--debug |
67d7b5ef |
258 | |
7748829a |
259 | Invokes debugging mode. Primarily for Encode hackers. |
67d7b5ef |
260 | |
bedba681 |
261 | =item -S,--scheme scheme |
67d7b5ef |
262 | |
263 | Selects which scheme is to be used for conversion. Available schemes |
0ab8f81e |
264 | are as follows: |
67d7b5ef |
265 | |
266 | =over 4 |
267 | |
268 | =item from_to |
269 | |
270 | Uses Encode::from_to for conversion. This is the default. |
271 | |
272 | =item decode_encode |
273 | |
7748829a |
274 | Input strings are decode()d then encode()d. A straight two-step |
67d7b5ef |
275 | implementation. |
276 | |
277 | =item perlio |
278 | |
7748829a |
279 | The new perlIO layer is used. NI-S' favorite. |
67d7b5ef |
280 | |
51e4e64d |
281 | You should use this option if you are using UTF-16 and others which |
282 | linefeed is not $/. |
283 | |
67d7b5ef |
284 | =back |
285 | |
0ab8f81e |
286 | Like the I<-D> option, this is also for Encode hackers. |
67d7b5ef |
287 | |
288 | =back |
289 | |
290 | =head1 SEE ALSO |
291 | |
1a1e8f7d |
292 | L<iconv(1)> |
293 | L<locale(3)> |
67d7b5ef |
294 | L<Encode> |
ce912cd4 |
295 | L<Encode::Supported> |
296 | L<Encode::Alias> |
67d7b5ef |
297 | L<PerlIO> |
298 | |
299 | =cut |