Commit | Line | Data |
d1edabcf |
1 | package open; |
99ef548b |
2 | use warnings; |
ac27b0f5 |
3 | use Carp; |
9cfe5470 |
4 | $open::hint_bits = 0x20000; # HINT_LOCALIZE_HH |
16fe6d59 |
5 | |
0c4f7ff0 |
6 | our $VERSION = '1.01'; |
b75c8c73 |
7 | |
58d53262 |
8 | my $locale_encoding; |
9 | |
b178108d |
# in_locale()
#
# Masks the current compile-time hints in $^H with $locale::hint_bits and
# returns the result.  When $locale::hint_bits is unset (or zero) the mask
# is 0, so the result is 0.  A non-zero result means the locale hint bits
# are set in $^H.
sub in_locale {
    my $locale_bits = $locale::hint_bits || 0;
    return $^H & $locale_bits;
}
58d53262 |
11 | |
# _get_locale_encoding()
#
# Fills in the file-level $locale_encoding, unless it is already defined,
# by trying in order:
#
#   1. langinfo(CODESET) from I18N::Langinfo, if that module can be loaded;
#   2. if 1. gave nothing and in_locale() is true, the part after the "."
#      in $ENV{LC_ALL} or, failing that, in $ENV{LANG};
#   3. if 1. gave nothing and in_locale() is false, 'utf8' when LC_ALL or
#      LANG contains something that looks like UTF-8 / UTF8.
#
# A bare 'euc' found together with a language/country part is refined to
# euc-jp, euc-kr, euc-cn or euc-tw; if none of those patterns match, the
# sub croaks with "Locale encoding 'euc' too ambiguous".
#
# Takes no arguments.  The result is left in $locale_encoding, which may
# remain unset/empty when none of the steps produced an encoding.
sub _get_locale_encoding {
    unless (defined $locale_encoding) {
        # I18N::Langinfo isn't available everywhere, so failure to load
        # it (or to query the codeset) is deliberately ignored.
        eval {
            require I18N::Langinfo;
            I18N::Langinfo->import(qw(langinfo CODESET));
            $locale_encoding = langinfo(CODESET());
        };
        my $country_language;

        # The environment variables matched below may be undefined.
        no warnings 'uninitialized';

        # Use '!' rather than 'not' here: 'not' binds more loosely than
        # '&&', so "not $x && f()" means "not ($x && f())".  That made
        # this branch run even outside the locale pragma, let it clobber
        # an encoding already obtained from langinfo, and left the elsif
        # branch below unreachable.
        if (!$locale_encoding && in_locale()) {
            if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) {
                ($country_language, $locale_encoding) = ($1, $2);
            } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) {
                ($country_language, $locale_encoding) = ($1, $2);
            }
            # LANGUAGE affects only LC_MESSAGES, and only on glibc
        } elsif (!$locale_encoding) {
            if ($ENV{LC_ALL} =~ /\butf-?8\b/i ||
                $ENV{LANG}   =~ /\butf-?8\b/i) {
                $locale_encoding = 'utf8';
            }
            # Could do more heuristics based on the country and language
            # parts of LC_ALL and LANG (the parts before the dot (if any)),
            # since we have Locale::Country and Locale::Language available.
            # TODO: get a database of Language -> Encoding mappings
            # (the Estonian database at http://www.eki.ee/letter/
            # would be excellent!) --jhi
        }

        # A bare 'euc' is ambiguous; disambiguate it from the
        # language/country part when we have one.
        # NOTE(review): in the patterns below '^' applies only to the first
        # alternative and '$' only to the second -- confirm that is intended.
        if (defined $locale_encoding &&
            $locale_encoding eq 'euc' &&
            defined $country_language) {
            if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) {
                $locale_encoding = 'euc-jp';
            } elsif ($country_language =~ /^ko_KR|korean?$/i) {
                $locale_encoding = 'euc-kr';
            } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) {
                $locale_encoding = 'euc-cn';
            } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) {
                $locale_encoding = 'euc-tw';
            }
            croak "Locale encoding 'euc' too ambiguous"
                if $locale_encoding eq 'euc';
        }
    }
}
60 | |
16fe6d59 |
# import($class, @args)
#
# Implements "use open LIST".  Croaks when LIST is empty.
#
# Each element of @args is handled as one of:
#   * ":utf8", ":locale" or ":encoding(...)" (leading colon optional):
#     shorthand that applies the layer to both input and output (type IO);
#   * ":std": remember that the standard handles should be adjusted too;
#   * anything else: a layer class (IN, OUT or IO) followed by a
#     whitespace-separated layer description taken from the next element.
#     Any other class name croaks.
#
# The "locale" layer is resolved through _get_locale_encoding() into either
# "utf8" or "encoding(NAME)" and implies :std; if no encoding can be found a
# "layer" warning is issued and the rest of that description is skipped.
# Other layers are looked up with PerlIO::Layer::->find and a "layer"
# warning is issued for unknown ones (they are still recorded).
#
# Side effects: sets $open::hint_bits in $^H, records crlf/raw layers in
# %^H under "open_<type>", stores the input/output defaults in ${^OPEN},
# and, when :std is in effect, calls binmode on STDIN and/or STDOUT+STDERR.
sub import {
    my ($class, @args) = @_;
    croak("`use open' needs explicit list of PerlIO layers") unless @args;

    $^H |= $open::hint_bits;

    # ${^OPEN} holds "input-layers\0output-layers"; start from what is there.
    my ($in, $out) = split(/\0/, (${^OPEN} || "\0"), -1);
    my $std;

  ARG:
    while (@args) {
        my $type = shift(@args);

        if ($type eq ':std') {
            $std = 1;
            next ARG;
        }

        my $dscp;
        if ($type =~ /^:?(utf8|locale|encoding\(.+\))$/) {
            # Bare layer name: applies to both directions.
            $dscp = ":$1";
            $type = 'IO';
        }
        else {
            $dscp = shift(@args) || '';
        }

        my @val;
      LAYER:
        foreach my $layer (split(/\s+/, $dscp)) {
            $layer =~ s/^://;

            if ($layer eq 'locale') {
                use Encode;
                _get_locale_encoding() unless defined $locale_encoding;
                unless (defined $locale_encoding) {
                    warnings::warnif("layer", "Cannot figure out an encoding to use");
                    last LAYER;
                }
                $layer = $locale_encoding =~ /^utf-?8$/i
                       ? "utf8"
                       : "encoding($locale_encoding)";
                $std = 1;
            }
            else {
                # Look up the layer by its bare name, without parameters.
                (my $target = $layer) =~ s/^(\w+)\(.+\)$/$1/;
                warnings::warnif("layer", "Unknown PerlIO layer '$layer'")
                    unless PerlIO::Layer::->find($target);
            }

            push(@val, ":$layer");
            $^H{"open_$type"} = $layer if $layer =~ /^(crlf|raw)$/;
        }

        my $layers = join(' ', @val);
        if ($type eq 'IN') {
            $in = $layers;
        }
        elsif ($type eq 'OUT') {
            $out = $layers;
        }
        elsif ($type eq 'IO') {
            $in = $out = $layers;
        }
        else {
            croak "Unknown PerlIO layer class '$type'";
        }
    }

    ${^OPEN} = join("\0", $in, $out) if $in or $out;

    if ($std) {
        # Input defaults go to STDIN, output defaults to STDOUT and STDERR.
        for my $group ([ $in, \*STDIN ], [ $out, \*STDOUT, \*STDERR ]) {
            my ($spec, @handles) = @$group;
            next unless $spec;

            my $mode = $spec =~ /:utf8\b/       ? ":utf8"
                     : $spec =~ /(\w+\(.+\))/   ? ":$1"
                     :                            undef;
            next unless defined $mode;

            binmode($_, $mode) for @handles;
        }
    }
}
140 | |
141 | 1; |
142 | __END__ |
d1edabcf |
143 | |
144 | =head1 NAME |
145 | |
e2d9456f |
146 | open - perl pragma to set default PerlIO layers for input and output |
d1edabcf |
147 | |
148 | =head1 SYNOPSIS |
149 | |
d5563ed7 |
150 | use open IN => ":crlf", OUT => ":bytes"; |
1e616cf5 |
151 | use open OUT => ':utf8'; |
152 | use open IO => ":encoding(iso-8859-7)"; |
153 | |
154 | use open IO => ':locale'; |
725d232a |
155 | |
1e616cf5 |
156 | use open ':utf8'; |
157 | use open ':locale'; |
158 | use open ':encoding(iso-8859-7)'; |
d1edabcf |
159 | |
b178108d |
160 | use open ':std'; |
161 | |
d1edabcf |
162 | =head1 DESCRIPTION |
163 | |
e2d9456f |
164 | Full-fledged support for I/O layers is now implemented provided |
d151aa0e |
165 | Perl is configured to use PerlIO as its IO system (which is now the |
166 | default). |
16fe6d59 |
167 | |
7d3b96bb |
168 | The C<open> pragma serves as one of the interfaces to declare default |
fae2c0fb |
169 | "layers" (also known as "disciplines") for all I/O. Any open(), |
170 | readpipe() (aka qx//) and similar operators found within the lexical |
171 | scope of this pragma will use the declared defaults. |
7d3b96bb |
172 | |
1e616cf5 |
173 | With the C<IN> subpragma you can declare the default layers |
d8d29d4f |
174 | of input streams, and with the C<OUT> subpragma you can declare |
1e616cf5 |
175 | the default layers of output streams. With the C<IO> subpragma |
176 | you can control both input and output streams simultaneously. |
177 | |
178 | If you have a legacy encoding, you can use the C<:encoding(...)> tag. |
179 | |
e2d9456f |
180 | If you want to set your encoding layers based on your |
1e616cf5 |
181 | locale environment variables, you can use the C<:locale> tag. |
182 | For example: |
183 | |
184 | $ENV{LANG} = 'ru_RU.KOI8-R'; |
dbd62f41 |
185 | # the :locale will probe the locale environment variables like LANG |
186 | use open OUT => ':locale'; |
1e616cf5 |
187 | open(O, ">koi8"); |
23bcb45a |
188 | print O chr(0x430); # Unicode CYRILLIC SMALL LETTER A = KOI8-R 0xc1 |
1e616cf5 |
189 | close O; |
190 | open(I, "<koi8"); |
23bcb45a |
191 | printf "%#x\n", ord(<I>); # this should print 0xc1 |
1e616cf5 |
192 | close I; |
193 | |
194 | These are equivalent |
195 | |
196 | use open ':utf8'; |
197 | use open IO => ':utf8'; |
198 | |
199 | as are these |
200 | |
201 | use open ':locale'; |
202 | use open IO => ':locale'; |
203 | |
204 | and these |
205 | |
206 | use open ':encoding(iso-8859-7)'; |
207 | use open IO => ':encoding(iso-8859-7)'; |
208 | |
b5d8778e |
209 | The matching of encoding names is loose: case does not matter, and |
210 | many encodings have several aliases. See L<Encode::Supported> for |
211 | details and the list of supported encodings. |
212 | |
e2d9456f |
213 | Note that the C<:utf8> PerlIO layer must always be specified exactly like |
b5d8778e |
214 | that, it is not subject to the loose matching of encoding names. |
215 | |
d151aa0e |
216 | When open() is given an explicit list of layers they are appended to |
217 | the list declared using this pragma. |
7d3b96bb |
218 | |
b178108d |
219 | The C<:std> subpragma on its own has no effect, but if combined with |
220 | the C<:utf8> or C<:encoding> subpragmas, it converts the standard |
221 | filehandles (STDIN, STDOUT, STDERR) to comply with the encoding selected |
222 | for input/output handles. For example, if both input and output are |
223 | chosen to be C<:utf8>, a C<:std> will mean that STDIN, STDOUT, and |
224 | STDERR are also in C<:utf8>. On the other hand, if only output is |
fb80c70c |
225 | chosen to be in C<< :encoding(koi8r) >>, a C<:std> will cause only the |
b178108d |
226 | STDOUT and STDERR to be in C<koi8r>. The C<:locale> subpragma |
227 | implicitly turns on C<:std>. |
228 | |
ba9a69eb |
229 | The logic of C<:locale> is as follows: |
230 | |
231 | =over 4 |
232 | |
233 | =item 1. |
234 | |
235 | If the platform supports the langinfo(CODESET) interface, the codeset |
236 | returned is used as the default encoding for the open pragma. |
237 | |
238 | =item 2. |
239 | |
240 | If 1. didn't work but we are under the locale pragma, the environment |
241 | variables LC_ALL and LANG (in that order) are matched for encodings |
242 | (the part after C<.>, if any), and if any found, that is used |
243 | as the default encoding for the open pragma. |
244 | |
245 | =item 3. |
246 | |
247 | If 1. and 2. didn't work, the environment variables LC_ALL and LANG |
248 | (in that order) are matched for anything looking like UTF-8, and if |
249 | any found, C<:utf8> is used as the default encoding for the open |
250 | pragma. |
251 | |
252 | =back |
253 | |
61de9fb5 |
254 | If your locale environment variables (LC_ALL, LC_CTYPE, LANG) |
b310b053 |
255 | contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), |
256 | the default encoding of your STDIN, STDOUT, and STDERR, and of |
257 | B<any subsequent file open>, is UTF-8. |
258 | |
e2d9456f |
259 | Directory handles may also support PerlIO layers in the future. |
7d3b96bb |
260 | |
261 | =head1 NONPERLIO FUNCTIONALITY |
262 | |
d151aa0e |
263 | If Perl is not built to use PerlIO as its IO system then only the two |
e2d9456f |
264 | pseudo-layers C<:bytes> and C<:crlf> are available. |
16fe6d59 |
265 | |
e2d9456f |
266 | The C<:bytes> layer corresponds to "binary mode" and the C<:crlf> |
267 | layer corresponds to "text mode" on platforms that distinguish |
16fe6d59 |
268 | between the two modes when opening files (which is many DOS-like |
e2d9456f |
269 | platforms, including Windows). These two layers are no-ops on |
d151aa0e |
270 | platforms where binmode() is a no-op, but perform their functions |
271 | everywhere if PerlIO is enabled. |
7d3b96bb |
272 | |
273 | =head1 IMPLEMENTATION DETAILS |
d1edabcf |
274 | |
f3b00462 |
275 | There is a class method in C<PerlIO::Layer> C<find> which is |
276 | implemented as XS code. It is called by C<import> to validate the |
277 | layers: |
0c4f7ff0 |
278 | |
279 | PerlIO::Layer::->find("perlio") |
280 | |
f3b00462 |
281 | The return value (if defined) is a Perl object, of class |
282 | C<PerlIO::Layer> which is created by the C code in F<perlio.c>. As |
283 | yet there is nothing useful you can do with the object at the perl |
284 | level. |
16fe6d59 |
285 | |
d1edabcf |
286 | =head1 SEE ALSO |
287 | |
1768d7eb |
288 | L<perlfunc/"binmode">, L<perlfunc/"open">, L<perlunicode>, L<PerlIO>, |
289 | L<encoding> |
d1edabcf |
290 | |
291 | =cut |