Commit | Line | Data |
2c674647 |
package Encode;

# Module version.
$VERSION = 0.01;

# Exporter supplies the import machinery and DynaLoader supplies
# bootstrap(); both are pulled in at run time and import nothing.
require Exporter;
require DynaLoader;

# Inheritance order is kept as-is: Exporter first, DynaLoader second.
@ISA = ('Exporter', 'DynaLoader');

# Names a caller may import on request.
@EXPORT_OK = qw(
    bytes_to_utf8
    utf8_to_bytes
    chars_to_utf8
    utf8_to_chars
    utf8_to_chars_check
    bytes_to_chars
    chars_to_bytes
    from_to
    is_utf8
    on_utf8
    off_utf8
    utf_to_utf
);

# Load the compiled part of the module through DynaLoader (method-call
# spelling of the original indirect-object "bootstrap Encode ()").
Encode->bootstrap();
27 | |
28 | =pod |
29 | |
30 | =head1 NAME |
31 | |
32 | Encode - character encodings |
33 | |
34 | =head2 TERMINOLOGY |
35 | |
36 | =over |
37 | |
38 | =item * |
39 | |
40 | I<char>: a character in the range 0..maxint (at least 2**32-1) |
41 | |
42 | =item * |
43 | |
44 | I<byte>: a character in the range 0..255 |
45 | |
46 | =back |
47 | |
48 | The marker [INTERNAL] marks Internal Implementation Details, in |
49 | general meant only for those who think they know what they are doing, |
50 | and such details may change in future releases. |
51 | |
52 | =head2 bytes |
53 | |
54 | =over 4 |
55 | |
56 | =item * |
57 | |
58 | bytes_to_utf8(STRING [, FROM]) |
59 | |
60 | The bytes in STRING are recoded in-place into UTF-8. If no FROM is |
61 | specified the bytes are expected to be encoded in US-ASCII or ISO |
62 | 8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if |
63 | there's a failure. |
64 | |
65 | [INTERNAL] Also the UTF-8 flag of STRING is turned on. |
66 | |
67 | =item * |
68 | |
69 | utf8_to_bytes(STRING [, TO [, CHECK]]) |
70 | |
71 | The UTF-8 in STRING is decoded in-place into bytes. If no TO encoding |
72 | is specified the bytes are expected to be encoded in US-ASCII or ISO |
73 | 8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if |
74 | there's a failure. |
75 | |
76 | What if there are characters > 255? What if the UTF-8 in STRING is |
77 | malformed? See L</"Handling Malformed Data">. |
78 | |
79 | [INTERNAL] The UTF-8 flag of STRING is not checked. |
80 | |
81 | =back |
82 | |
83 | =head2 chars |
84 | |
85 | =over 4 |
86 | |
87 | =item * |
88 | |
89 | chars_to_utf8(STRING) |
90 | |
91 | The chars in STRING are encoded in-place into UTF-8. Returns the new |
92 | size of STRING, or C<undef> if there's a failure. |
93 | |
94 | No assumptions are made on the encoding of the chars. If you want to |
95 | assume that the chars are Unicode and to trap illegal Unicode |
96 | characters, you must use C<from_to('Unicode', ...)>. |
97 | |
98 | [INTERNAL] Also the UTF-8 flag of STRING is turned on. |
99 | |
102 | =item * |
103 | |
104 | utf8_to_chars(STRING) |
105 | |
106 | The UTF-8 in STRING is decoded in-place into chars. Returns the new |
107 | size of STRING, or C<undef> if there's a failure. |
108 | |
109 | If the UTF-8 in STRING is malformed C<undef> is returned, and also an |
110 | optional lexical warning (category utf8) is given. |
111 | |
112 | [INTERNAL] The UTF-8 flag of STRING is not checked. |
113 | |
114 | =item * |
115 | |
116 | utf8_to_chars_check(STRING [, CHECK]) |
117 | |
118 | (Note the special naming of this interface: it is needed because a |
119 | two-argument utf8_to_chars() has different semantics.) |
120 | |
121 | The UTF-8 in STRING is decoded in-place into chars. Returns the new |
122 | size of STRING, or C<undef> if there is a failure. |
123 | |
124 | What if the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">. |
125 | |
126 | [INTERNAL] The UTF-8 flag of STRING is not checked. |
127 | |
128 | =back |
129 | |
130 | =head2 chars With Encoding |
131 | |
132 | =over 4 |
133 | |
134 | =item * |
135 | |
136 | chars_to_utf8(STRING, FROM [, CHECK]) |
137 | |
138 | The chars in STRING encoded in FROM are recoded in-place into UTF-8. |
139 | Returns the new size of STRING, or C<undef> if there's a failure. |
140 | |
141 | No assumptions are made on the encoding of the chars. If you want to |
142 | assume that the chars are Unicode and to trap illegal Unicode |
143 | characters, you must use C<from_to('Unicode', ...)>. |
144 | |
145 | [INTERNAL] Also the UTF-8 flag of STRING is turned on. |
146 | |
147 | =item * |
148 | |
149 | utf8_to_chars(STRING, TO [, CHECK]) |
150 | |
151 | The UTF-8 in STRING is decoded in-place into chars encoded in TO. |
152 | Returns the new size of STRING, or C<undef> if there's a failure. |
153 | |
154 | What if the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">. |
155 | |
156 | [INTERNAL] The UTF-8 flag of STRING is not checked. |
157 | |
158 | =item * |
159 | |
160 | bytes_to_chars(STRING, FROM [, CHECK]) |
161 | |
162 | The bytes in STRING encoded in FROM are recoded in-place into chars. |
163 | Returns the new size of STRING in bytes, or C<undef> if there's a |
164 | failure. |
165 | |
166 | What if the mapping is impossible? See L</"Handling Malformed Data">. |
167 | |
168 | =item * |
169 | |
170 | chars_to_bytes(STRING, TO [, CHECK]) |
171 | |
172 | The chars in STRING are recoded in-place to bytes encoded in TO. |
173 | Returns the new size of STRING in bytes, or C<undef> if there's a |
174 | failure. |
175 | |
176 | What if the mapping is impossible? See L</"Handling Malformed Data">. |
177 | |
178 | =item * |
179 | |
180 | from_to(STRING, FROM, TO [, CHECK]) |
181 | |
182 | The chars in STRING encoded in FROM are recoded in-place into TO. |
183 | Returns the new size of STRING, or C<undef> if there's a failure. |
184 | |
185 | What if mapping between the encodings is impossible? |
186 | See L</"Handling Malformed Data">. |
187 | |
188 | [INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on. |
189 | |
190 | =back |
191 | |
192 | =head2 Testing For UTF-8 |
193 | |
194 | =over 4 |
195 | |
196 | =item * |
197 | |
198 | is_utf8(STRING [, CHECK]) |
199 | |
200 | [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. |
201 | If CHECK is true, also checks the data in STRING for being |
202 | well-formed UTF-8. Returns true if successful, false otherwise. |
203 | |
204 | =back |
205 | |
206 | =head2 Toggling UTF-8-ness |
207 | |
208 | =over 4 |
209 | |
210 | =item * |
211 | |
212 | on_utf8(STRING) |
213 | |
214 | [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is |
215 | B<not> checked for being well-formed UTF-8. Do not use unless you |
216 | B<know> that the STRING is well-formed UTF-8. Returns the previous |
217 | state of the UTF-8 flag (which does I<not> indicate success or |
218 | failure, so please don't test it as such), or C<undef> if STRING is not a string. |
219 | |
220 | =item * |
221 | |
222 | off_utf8(STRING) |
223 | |
224 | [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously. |
225 | Returns the previous state of the UTF-8 flag (which does I<not> |
226 | indicate success or failure, so please don't test it as such), or |
227 | C<undef> if STRING is not a string. |
228 | |
229 | =back |
230 | |
231 | =head2 UTF-16 and UTF-32 Encodings |
232 | |
233 | =over 4 |
234 | |
235 | =item * |
236 | |
237 | utf_to_utf(STRING, FROM, TO [, CHECK]) |
238 | |
239 | The data in STRING is converted from Unicode Transfer Encoding FROM to |
240 | Unicode Transfer Encoding TO. Both FROM and TO may be any of the |
241 | following tags (case-insensitive, with or without 'utf' or 'utf-' prefix): |
242 | |
243 | tag meaning |
244 | |
245 | '7' UTF-7 |
246 | '8' UTF-8 |
247 | '16be' UTF-16 big-endian |
248 | '16le' UTF-16 little-endian |
249 | '16' UTF-16 native-endian |
250 | '32be' UTF-32 big-endian |
251 | '32le' UTF-32 little-endian |
252 | '32' UTF-32 native-endian |
253 | |
254 | UTF-16 is also known as UCS-2, 16-bit or 2-byte chunks, and UTF-32 as |
255 | UCS-4, 32-bit or 4-byte chunks. Returns the new size of STRING, or |
256 | C<undef> if there's a failure. |
257 | |
258 | What if FROM is UTF-8 and the UTF-8 in STRING is malformed? See |
259 | L</"Handling Malformed Data">. |
260 | |
261 | [INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of |
262 | STRING is not checked. If TO is UTF-8, also the UTF-8 flag of STRING is |
263 | turned on. Identical FROM and TO are fine. |
264 | |
265 | =back |
266 | |
267 | =head2 Handling Malformed Data |
268 | |
269 | If CHECK is not set, C<undef> is returned. If the data is supposed to |
270 | be UTF-8, an optional lexical warning (category utf8) is given. If |
271 | CHECK is true but not a code reference, dies. If CHECK is a code |
272 | reference, it is called with the arguments |
273 | |
274 | (MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR) |
275 | |
276 | Two return values are expected from the call: the string to be used in |
277 | the result string in place of the malformed section, and the length of |
278 | the malformed section in bytes. |
279 | |
280 | =cut |
281 | |
# bytes_to_utf8(STRING [, FROM])
# Public wrapper: hands the caller's arguments to _bytes_to_utf8 and
# returns whatever it returns. _bytes_to_utf8 is not defined in this
# file's Perl code (presumably provided by the bootstrapped XS).
sub bytes_to_utf8 {
    # '&' keeps the call free of prototype checking; the elements of @_
    # stay aliased, so STRING can still be modified in place.
    return &_bytes_to_utf8(@_);
}
285 | |
# utf8_to_bytes(STRING [, TO [, CHECK]])
# Public wrapper: passes the caller's arguments straight through to
# _utf8_to_bytes and returns its result. The helper is not defined in
# this file's Perl code (presumably provided by the bootstrapped XS).
sub utf8_to_bytes {
    # Explicit argument list; '&' still bypasses any prototype.
    return &_utf8_to_bytes(@_);
}
289 | |
# chars_to_utf8(STRING) / chars_to_utf8(STRING, FROM [, CHECK])
# Public wrapper: forwards the caller's arguments to _chars_to_utf8 and
# returns its result.
#
# The helper name follows the same "_<public name>" convention used by
# every other wrapper in this file; the previous target, C_to_utf8, is
# not referenced or exported anywhere else here.
# NOTE(review): _chars_to_utf8 is presumably provided by the
# bootstrapped XS code -- confirm the symbol name there.
sub chars_to_utf8 {
    # Bare "&name;" calls the helper with the current @_ unchanged.
    &_chars_to_utf8;
}
293 | |
# utf8_to_chars(STRING) / utf8_to_chars(STRING, TO [, CHECK])
# Public wrapper: delegates to _utf8_to_chars with the caller's
# arguments and returns its result. The helper is not defined in this
# file's Perl code (presumably provided by the bootstrapped XS).
sub utf8_to_chars {
    # '&' form retained so no prototype is applied to the argument list.
    return &_utf8_to_chars(@_);
}
297 | |
# utf8_to_chars_check(STRING [, CHECK])
# Public wrapper: delegates to _utf8_to_chars_check with the caller's
# arguments and returns its result. The helper is not defined in this
# file's Perl code (presumably provided by the bootstrapped XS).
sub utf8_to_chars_check {
    # Elements of @_ remain aliases of the caller's variables.
    return &_utf8_to_chars_check(@_);
}
301 | |
# bytes_to_chars(STRING, FROM [, CHECK])
# Public wrapper: forwards the caller's arguments to _bytes_to_chars
# and returns its result. The helper is not defined in this file's
# Perl code (presumably provided by the bootstrapped XS).
sub bytes_to_chars {
    # Called with '&' so the helper sees the arguments without any
    # prototype coercion.
    return &_bytes_to_chars(@_);
}
305 | |
# chars_to_bytes(STRING, TO [, CHECK])
# Public wrapper: forwards the caller's arguments to _chars_to_bytes
# and returns its result. The helper is not defined in this file's
# Perl code (presumably provided by the bootstrapped XS).
sub chars_to_bytes {
    # '&' bypasses prototypes; the argument list is passed unchanged.
    return &_chars_to_bytes(@_);
}
309 | |
# from_to(STRING, FROM, TO [, CHECK])
# Public wrapper: hands the caller's arguments to _from_to and returns
# its result. The helper is not defined in this file's Perl code
# (presumably provided by the bootstrapped XS).
sub from_to {
    # Arguments stay aliased to the caller's variables.
    return &_from_to(@_);
}
313 | |
# is_utf8(STRING [, CHECK])
# Public wrapper: passes the caller's arguments to _is_utf8 and returns
# its result. The helper is not defined in this file's Perl code
# (presumably provided by the bootstrapped XS).
sub is_utf8 {
    # '&' form keeps the call exempt from prototype checking.
    return &_is_utf8(@_);
}
317 | |
# on_utf8(STRING)
# Public wrapper: passes the caller's arguments to _on_utf8 and returns
# its result. The helper is not defined in this file's Perl code
# (presumably provided by the bootstrapped XS).
sub on_utf8 {
    # STRING is passed by alias so the helper can act on the caller's
    # own variable.
    return &_on_utf8(@_);
}
321 | |
# off_utf8(STRING)
# Public wrapper: passes the caller's arguments to _off_utf8 and
# returns its result. The helper is not defined in this file's Perl
# code (presumably provided by the bootstrapped XS).
sub off_utf8 {
    # STRING is passed by alias so the helper can act on the caller's
    # own variable.
    return &_off_utf8(@_);
}
325 | |
# utf_to_utf(STRING, FROM, TO [, CHECK])
# Public wrapper: forwards the caller's arguments to _utf_to_utf and
# returns its result. The helper is not defined in this file's Perl
# code (presumably provided by the bootstrapped XS).
sub utf_to_utf {
    # '&' bypasses prototypes; the argument list is passed unchanged.
    return &_utf_to_utf(@_);
}
329 | |