Commit | Line | Data |
de1df517 |
1 | # $File: //member/autrijus/.vimrc $ $Author: autrijus $ |
2 | # $Revision: #14 $ $Change: 4137 $ $DateTime: 2003/02/08 11:41:59 $ |
3 | |
4 | package encoding::warnings; |
5 | $encoding::warnings::VERSION = '0.05'; |
6 | |
7 | use strict; |
8 | |
9 | =head1 NAME |
10 | |
11 | encoding::warnings - Warn on implicit encoding conversions |
12 | |
13 | =head1 VERSION |
14 | |
15 | This document describes version 0.05 of encoding::warnings, released |
16 | July 15, 2004. |
17 | |
18 | =head1 SYNOPSIS |
19 | |
20 | use encoding::warnings; # or 'FATAL' to raise fatal exceptions |
21 | |
22 | utf8::encode($a = chr(20000)); # a byte-string (raw bytes) |
23 | $b = chr(20000); # a unicode-string (wide characters) |
24 | |
25 | # "Bytes implicitly upgraded into wide characters as iso-8859-1" |
26 | $c = $a . $b; |
27 | |
28 | =head1 DESCRIPTION |
29 | |
30 | =head2 Overview of the problem |
31 | |
32 | By default, there is a fundamental asymmetry in Perl's unicode model: |
33 | implicit upgrading from byte-strings to unicode-strings assumes that |
34 | they were encoded in I<ISO 8859-1 (Latin-1)>, but unicode-strings are |
35 | downgraded with UTF-8 encoding. This happens because the first 256 |
36 | codepoints in Unicode happen to agree with Latin-1. |
37 | |
38 | However, this silent upgrading can easily cause problems, if you happen |
39 | to mix unicode strings with non-Latin1 data -- i.e. byte-strings encoded |
40 | in UTF-8 or other encodings. The error will not manifest until the |
41 | combined string is written to output, at which time it would be impossible |
42 | to see where the silent upgrading occurred. |
43 | |
44 | =head2 Detecting the problem |
45 | |
46 | This module simplifies the process of diagnosing such problems. Just put |
47 | this line on top of your main program: |
48 | |
49 | use encoding::warnings; |
50 | |
51 | Afterwards, implicit upgrading of high-bit bytes will raise a warning. |
52 | Ex.: C<Bytes implicitly upgraded into wide characters as iso-8859-1 at |
53 | - line 7>. |
54 | |
55 | However, strings composed purely of ASCII code points (C<0x00>..C<0x7F>) |
56 | will I<not> trigger this warning. |
57 | |
58 | You can also make the warnings fatal by importing this module as: |
59 | |
60 | use encoding::warnings 'FATAL'; |
61 | |
62 | =head2 Solving the problem |
63 | |
64 | Most of the time, this warning occurs when a byte-string is concatenated |
65 | with a unicode-string. There are a number of ways to solve it: |
66 | |
67 | =over 4 |
68 | |
69 | =item * Upgrade both sides to unicode-strings |
70 | |
71 | If your program does not need compatibility with Perl 5.6 and earlier, |
72 | the recommended approach is to apply appropriate IO disciplines, so all |
73 | data in your program become unicode-strings. See L<encoding>, L<open> and |
74 | L<perlfunc/binmode> for how. |
75 | |
76 | =item * Downgrade both sides to byte-strings |
77 | |
78 | The other way works too, especially if you are sure that all your data |
79 | are under the same encoding, or if compatibility with older versions |
80 | of Perl is desired. |
81 | |
82 | You may downgrade strings with C<Encode::encode> and C<utf8::encode>. |
83 | See L<Encode> and L<utf8> for details. |
84 | |
85 | =item * Specify the encoding for implicit byte-string upgrading |
86 | |
87 | If you are confident that all byte-strings will be in a specific |
88 | encoding like UTF-8, I<and> need not support older versions of Perl, |
89 | use the C<encoding> pragma: |
90 | |
91 | use encoding 'utf8'; |
92 | |
93 | Similarly, this will silence warnings from this module, and preserve the |
94 | default behaviour: |
95 | |
96 | use encoding 'iso-8859-1'; |
97 | |
98 | However, note that C<use encoding> actually has three distinct effects: |
99 | |
100 | =over 4 |
101 | |
102 | =item * PerlIO layers for B<STDIN> and B<STDOUT> |
103 | |
104 | This is similar to what the L<open> pragma does. |
105 | |
106 | =item * Literal conversions |
107 | |
108 | This turns I<all> literal strings in your program into unicode-strings |
109 | (equivalent to a C<use utf8>), by decoding them using the specified |
110 | encoding. |
111 | |
112 | =item * Implicit upgrading for byte-strings |
113 | |
114 | This will silence warnings from this module, as shown above. |
115 | |
116 | =back |
117 | |
118 | Because literal conversions also work on empty strings, it may surprise |
119 | some people: |
120 | |
121 | use encoding 'big5'; |
122 | |
123 | my $byte_string = pack("C*", 0xA4, 0x40); |
124 | print length $byte_string; # 2 here. |
125 | $byte_string .= ""; # concatenating with a unicode string... |
126 | print length $byte_string; # 1 here! |
127 | |
128 | In other words, do not C<use encoding> unless you are certain that the |
129 | program will not deal with any raw, 8-bit binary data at all. |
130 | |
131 | However, the C<Filter =E<gt> 1> flavor of C<use encoding> will I<not> |
132 | affect implicit upgrading for byte-strings, and is thus incapable of |
133 | silencing warnings from this module. See L<encoding> for more details. |
134 | |
135 | =back |
136 | |
137 | =head1 CAVEATS |
138 | |
139 | This module currently affects the whole script, instead of inside its |
140 | lexical block. This is expected to be addressed during Perl 5.9 development, |
141 | where the B<encoding> module will also be made lexical. |
142 | |
143 | =cut |
144 | |
# Constants.
# Slot indices into the array-based object that import() blesses:
# the us-ascii encoding, the iso-8859-1 encoding, and the name of the
# Carp routine used for reporting.
use constant {
    ASCII  => 0,
    LATIN1 => 1,
    FATAL  => 2,
};
149 | |
# Install a ${^ENCODING} handler if no other one is already in place.
#
# Called as a class method by "use encoding::warnings" with an optional
# single argument: the string 'FATAL' selects Carp::croak as the reporting
# routine, anything else selects Carp::carp.
#
# Returns without installing anything when:
#   - the running perl no longer supports ${^ENCODING} (5.25.3 and later;
#     a warning is issued instead of letting the assignment below die),
#   - ${^ENCODING} already holds an object of some other class,
#   - Encode cannot be loaded, or either encoding cannot be found.
sub import {
    my $class = shift;
    my $fatal = shift || '';

    # ${^ENCODING} was removed from perl during the 5.25.x series, and
    # assigning a defined value to it later became a fatal error.
    # Degrade to a warning so that "use encoding::warnings" does not abort
    # compilation of the calling program.
    if ($] >= 5.025003) {
        require Carp;
        Carp::carp(
            "encoding::warnings is not supported on Perl 5.26.0 and later"
        );
        return;
    }

    # Keep the eval below from clobbering the caller's $@.
    local $@;
    return if ${^ENCODING} and ref(${^ENCODING}) ne $class;
    return unless eval { require Encode; 1 };

    my $ascii  = Encode::find_encoding('us-ascii')   or return;
    my $latin1 = Encode::find_encoding('iso-8859-1') or return;

    # Have to undef explicitly here
    undef ${^ENCODING};

    # Install a warning handler for decode().  The slots are positional:
    # us-ascii encoding, iso-8859-1 encoding, name of the reporting routine
    # (kept as a string and called by name from decode()).
    ${^ENCODING} = bless(
        [
            $ascii,
            $latin1,
            (($fatal eq 'FATAL') ? 'Carp::croak' : 'Carp::carp'),
        ], $class,
    );
}
174 | |
# Don't worry about source code literals: hand the call straight to the
# iso-8859-1 encoding object without emitting any diagnostic.
# @_ is forwarded untouched (no copying), so the delegate operates on the
# very same argument variables the caller supplied.
sub cat_decode {
    my $self   = shift;
    my $latin1 = $self->[LATIN1];
    return $latin1->cat_decode(@_);
}
180 | |
# Warn if the data is not purely US-ASCII.
#
# The first argument after the invocant is first decoded strictly with the
# us-ascii encoding.  If that succeeds, its result is returned and nothing
# is reported.  Otherwise the reporting routine named in the FATAL slot
# (Carp::carp or Carp::croak, chosen in import()) is called with the
# diagnostic message, and -- when that routine returns -- the arguments
# are decoded with the iso-8859-1 encoding instead.
sub decode {
    my $self = shift;

    my ($decoded, $is_ascii);
    {
        # Confine the localisation of $@ to the probe itself.  If it also
        # covered the croak below, perls before 5.14 would restore the old
        # $@ while unwinding and the caller's eval would lose the error.
        local $@;

        # Success is taken from the eval's own return value, not from $@.
        $is_ascii = eval {
            $decoded = $self->[ASCII]->decode($_[0], Encode::FB_CROAK());
            1;
        };
    }
    return $decoded if $is_ascii;

    require Carp;
    {
        # The reporting routine is stored by name, hence the symbolic call.
        no strict 'refs';
        $self->[FATAL]->(
            "Bytes implicitly upgraded into wide characters as iso-8859-1"
        );
    }
    return $self->[LATIN1]->decode(@_);
}
196 | |
# Encoding name reported by this handler object: always the literal
# string 'iso-8859-1', regardless of any arguments passed.
sub name {
    return 'iso-8859-1';
}
198 | |
199 | 1; |
200 | |
201 | __END__ |
202 | |
203 | =head1 SEE ALSO |
204 | |
205 | L<perlunicode>, L<perluniintro> |
206 | |
207 | L<open>, L<utf8>, L<encoding>, L<Encode> |
208 | |
209 | =head1 AUTHORS |
210 | |
211 | Autrijus Tang E<lt>autrijus@autrijus.orgE<gt> |
212 | |
213 | =head1 COPYRIGHT |
214 | |
215 | Copyright 2004 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>. |
216 | |
217 | This program is free software; you can redistribute it and/or modify it |
218 | under the same terms as Perl itself. |
219 | |
220 | See L<http://www.perl.com/perl/misc/Artistic.html> |
221 | |
222 | =cut |