Commit | Line | Data |
f2a2953c |
1 | package Encode::Unicode; |
2 | |
df1df145 |
3 | use strict; |
f2a2953c |
4 | use warnings; |
5 | |
0ab8f81e |
6 | our $VERSION = do { my @r = (q$Revision: 1.34 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
f2a2953c |
7 | |
85982a32 |
8 | use XSLoader; |
9 | XSLoader::load(__PACKAGE__,$VERSION); |
df1df145 |
10 | |
f2a2953c |
11 | # |
12 | # Object Generator 8 transcoders all at once! |
13 | # |
df1df145 |
14 | |
f2a2953c |
15 | require Encode; |
16 | for my $name (qw(UTF-16 UTF-16BE UTF-16LE |
17 | UTF-32 UTF-32BE UTF-32LE |
18 | UCS-2BE UCS-2LE)) |
df1df145 |
19 | { |
f2a2953c |
20 | my ($size, $endian, $ucs2, $mask); |
21 | $name =~ /^(\w+)-(\d+)(\w*)$/o; |
22 | if ($ucs2 = ($1 eq 'UCS')){ |
23 | $size = 2; |
24 | }else{ |
25 | $size = $2/8; |
df1df145 |
26 | } |
f2a2953c |
27 | $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ; |
28 | $size == 4 and $endian = uc($endian); |
29 | |
30 | $Encode::Encoding{$name} = |
31 | bless { |
32 | Name => $name, |
33 | size => $size, |
34 | endian => $endian, |
35 | ucs2 => $ucs2, |
c731e18e |
36 | } => __PACKAGE__; |
f2a2953c |
37 | |
df1df145 |
38 | } |
39 | |
f2a2953c |
40 | sub name { shift->{'Name'} } |
aae85ceb |
41 | sub new_sequence |
42 | { |
43 | my $self = shift; |
44 | # Return the original if endian known |
45 | return $self if ($self->{endian}); |
46 | # Return a clone |
47 | return bless {%$self},ref($self); |
48 | } |
49 | |
0ab8f81e |
50 | sub needs_lines { 0 }; |
51 | |
52 | sub perlio_ok { |
53 | exists $INC{"PerlIO/encoding.pm"} or return 0; |
54 | return 1; |
55 | } |
56 | |
f2a2953c |
57 | |
58 | # |
0ab8f81e |
59 | # three implementations of (en|de)code exist. The XS version is the |
60 | # fastest. *_modern uses an array and *_classic sticks with substr. |
61 | # *_classic is much slower but more memory conservative. |
62 | # *_xs is the default. |
f2a2953c |
63 | |
64 | sub set_transcoder{ |
65 | no warnings qw(redefine); |
66 | my $type = shift; |
aae85ceb |
67 | if ($type eq "xs"){ |
68 | *decode = \&decode_xs; |
69 | *encode = \&encode_xs; |
70 | }elsif($type eq "modern"){ |
f2a2953c |
71 | *decode = \&decode_modern; |
72 | *encode = \&encode_modern; |
73 | }elsif($type eq "classic"){ |
74 | *decode = \&decode_classic; |
75 | *encode = \&encode_classic; |
76 | }else{ |
fcb875d4 |
77 | require Carp; |
aae85ceb |
78 | Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)"; |
f2a2953c |
79 | } |
80 | } |
81 | |
aae85ceb |
82 | set_transcoder("xs"); |
f2a2953c |
83 | |
84 | # |
85982a32 |
85 | # Aux. subs & constants |
86 | # |
87 | |
88 | sub FBCHAR(){ 0xFFFd } |
89 | sub BOM_BE(){ 0xFeFF } |
90 | sub BOM16LE(){ 0xFFFe } |
91 | sub BOM32LE(){ 0xFFFe0000 } |
92 | |
93 | sub valid_ucs2($){ |
94 | return |
95 | (0 <= $_[0] && $_[0] < 0xD800) |
96 | || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF); |
97 | } |
98 | |
99 | sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF } |
100 | sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 } |
101 | sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF } |
102 | |
103 | sub ensurrogate($){ |
104 | use integer; # we have divisions |
105 | my $uni = shift; |
106 | my $hi = ($uni - 0x10000) / 0x400 + 0xD800; |
107 | my $lo = ($uni - 0x10000) % 0x400 + 0xDC00; |
108 | return ($hi, $lo); |
109 | } |
110 | |
111 | sub desurrogate($$){ |
112 | my ($hi, $lo) = @_; |
113 | return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00); |
114 | } |
115 | |
116 | sub Mask { {2 => 0xffff, 4 => 0xffffffff} } |
117 | |
118 | # |
f2a2953c |
119 | # *_modern are much faster but guzzle more memory |
120 | # |
121 | |
aae85ceb |
122 | sub decode_modern($$;$) |
df1df145 |
123 | { |
f2a2953c |
124 | my ($obj, $str, $chk ) = @_; |
125 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; |
126 | |
127 | # warn "$size, $endian, $ucs2"; |
128 | $endian ||= BOMB($size, substr($str, 0, $size, '')) |
129 | or poisoned2death($obj, "Where's the BOM?"); |
130 | my $mask = Mask->{$size}; |
131 | my $utf8 = ''; |
132 | my @ord = unpack("$endian*", $str); |
133 | undef $str; # to conserve memory |
134 | while (@ord){ |
135 | my $ord = shift @ord; |
136 | unless ($size == 4 or valid_ucs2($ord &= $mask)){ |
137 | if ($ucs2){ |
fcb875d4 |
138 | $chk and |
f2a2953c |
139 | poisoned2death($obj, "no surrogates allowed", $ord); |
140 | shift @ord; # skip the next one as well |
141 | $ord = FBCHAR; |
142 | }else{ |
143 | unless (isHiSurrogate($ord)){ |
144 | poisoned2death($obj, "Malformed HI surrogate", $ord); |
145 | } |
146 | my $lo = shift @ord; |
147 | unless (isLoSurrogate($lo &= $mask)){ |
148 | poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); |
149 | } |
150 | $ord = desurrogate($ord, $lo); |
151 | } |
152 | } |
153 | $utf8 .= chr($ord); |
df1df145 |
154 | } |
f2a2953c |
155 | utf8::upgrade($utf8); |
156 | return $utf8; |
157 | } |
158 | |
aae85ceb |
159 | sub encode_modern($$;$) |
f2a2953c |
160 | { |
161 | my ($obj, $utf8, $chk) = @_; |
162 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; |
163 | my @str = (); |
164 | unless ($endian){ |
165 | $endian = ($size == 4) ? 'N' : 'n'; |
166 | push @str, BOM_BE; |
167 | } |
168 | my @ord = unpack("U*", $utf8); |
169 | undef $utf8; # to conserve memory |
170 | for my $ord (@ord){ |
171 | unless ($size == 4 or valid_ucs2($ord)) { |
172 | unless(issurrogate($ord)){ |
173 | if ($ucs2){ |
fcb875d4 |
174 | $chk and |
f2a2953c |
175 | poisoned2death($obj, "code point too high", $ord); |
176 | |
177 | push @str, FBCHAR; |
178 | }else{ |
fcb875d4 |
179 | |
f2a2953c |
180 | push @str, ensurrogate($ord); |
181 | } |
182 | }else{ # not supposed to happen |
183 | push @str, FBCHAR; |
184 | } |
185 | }else{ |
186 | push @str, $ord; |
187 | } |
188 | } |
189 | return pack("$endian*", @str); |
190 | } |
191 | |
192 | # |
193 | # *_classic are slower but more memory conservative |
194 | # |
195 | |
aae85ceb |
196 | sub decode_classic($$;$) |
f2a2953c |
197 | { |
198 | my ($obj, $str, $chk ) = @_; |
199 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; |
200 | |
201 | # warn "$size, $endian, $ucs2"; |
202 | $endian ||= BOMB($size, substr($str, 0, $size, '')) |
203 | or poisoned2death($obj, "Where's the BOM?"); |
204 | my $mask = Mask->{$size}; |
205 | my $utf8 = ''; |
206 | my @ord = unpack("$endian*", $str); |
207 | while (length($str)){ |
208 | my $ord = unpack($endian, substr($str, 0, $size, '')); |
209 | unless ($size == 4 or valid_ucs2($ord &= $mask)){ |
210 | if ($ucs2){ |
fcb875d4 |
211 | $chk and |
f2a2953c |
212 | poisoned2death($obj, "no surrogates allowed", $ord); |
213 | substr($str,0,$size,''); # skip the next one as well |
214 | $ord = FBCHAR; |
215 | }else{ |
216 | unless (isHiSurrogate($ord)){ |
217 | poisoned2death($obj, "Malformed HI surrogate", $ord); |
218 | } |
219 | my $lo = unpack($endian ,substr($str,0,$size,'')); |
220 | unless (isLoSurrogate($lo &= $mask)){ |
221 | poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); |
222 | } |
223 | $ord = desurrogate($ord, $lo); |
224 | } |
225 | } |
226 | $utf8 .= chr($ord); |
227 | } |
228 | utf8::upgrade($utf8); |
229 | return $utf8; |
230 | } |
231 | |
aae85ceb |
232 | sub encode_classic($$;$) |
f2a2953c |
233 | { |
234 | my ($obj, $utf8, $chk) = @_; |
235 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; |
236 | # warn join ", ", $size, $ucs2, $endian, $mask; |
237 | my $str = ''; |
238 | unless ($endian){ |
239 | $endian = ($size == 4) ? 'N' : 'n'; |
240 | $str .= pack($endian, BOM_BE); |
241 | } |
242 | while (length($utf8)){ |
243 | my $ord = ord(substr($utf8,0,1,'')); |
244 | unless ($size == 4 or valid_ucs2($ord)) { |
245 | unless(issurrogate($ord)){ |
246 | if ($ucs2){ |
fcb875d4 |
247 | $chk and |
f2a2953c |
248 | poisoned2death($obj, "code point too high", $ord); |
249 | $str .= pack($endian, FBCHAR); |
250 | }else{ |
251 | $str .= pack($endian.2, ensurrogate($ord)); |
252 | } |
253 | }else{ # not supposed to happen |
254 | $str .= pack($endian, FBCHAR); |
255 | } |
256 | }else{ |
257 | $str .= pack($endian, $ord); |
258 | } |
259 | } |
260 | return $str; |
261 | } |
262 | |
263 | sub BOMB { |
264 | my ($size, $bom) = @_; |
265 | my $N = $size == 2 ? 'n' : 'N'; |
266 | my $ord = unpack($N, $bom); |
fcb875d4 |
267 | return ($ord eq BOM_BE) ? $N : |
f2a2953c |
268 | ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef; |
269 | } |
270 | |
271 | sub poisoned2death{ |
272 | my $obj = shift; |
273 | my $msg = shift; |
274 | my $pair = join(", ", map {sprintf "\\x%x", $_} @_); |
275 | require Carp; |
276 | Carp::croak $obj->name, ":", $msg, "<$pair>.", caller; |
df1df145 |
277 | } |
278 | |
279 | 1; |
280 | __END__ |
67d7b5ef |
281 | |
282 | =head1 NAME |
283 | |
0ab8f81e |
284 | Encode::Unicode -- Various Unicode Transformation Formats |
67d7b5ef |
285 | |
286 | =cut |
f2a2953c |
287 | |
288 | =head1 SYNOPSIS |
289 | |
fcb875d4 |
290 | use Encode qw/encode decode/; |
f2a2953c |
291 | $ucs2 = encode("UCS-2BE", $utf8); |
292 | $utf8 = decode("UCS-2BE", $ucs2); |
293 | |
294 | =head1 ABSTRACT |
295 | |
296 | This module implements all Character Encoding Schemes of Unicode that |
297 | are officially documented by Unicode Consortium (except, of course, |
298 | for UTF-8, which is a native format in perl). |
299 | |
300 | =over 4 |
301 | |
302 | =item L<http://www.unicode.org/glossary/> says: |
303 | |
304 | I<Character Encoding Scheme> A character encoding form plus byte |
305 | serialization. There are seven character encoding schemes in Unicode: |
306 | UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE. |
307 | |
308 | =item Quick Reference |
309 | |
310 | Decodes from ord(N) Encodes chr(N) to... |
311 | octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} == |
312 | ---------------+-----------------+------------------------------ |
313 | UCS-2BE 2 N N is bogus Not Available |
314 | UCS-2LE 2 N N bogus Not Available |
315 | UTF-16 2/4 Y Y is S.P S.P BE/LE |
316 | UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd |
317 | UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf |
318 | UTF-32 4 Y - is bogus As is BE/LE |
0ab8f81e |
319 | UTF-32BE 4 N - bogus As is 0x0001abcd |
320 | UTF-32LE 4 N - bogus As is 0xcdab0100 |
f2a2953c |
321 | UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d |
322 | ---------------+-----------------+------------------------------ |
323 | |
324 | =back |
325 | |
326 | =head1 Size, Endianness, and BOM |
327 | |
0ab8f81e |
328 | You can categorize these CES by 3 criteria: size of each character, |
329 | endianness, and Byte Order Mark. |
f2a2953c |
330 | |
0ab8f81e |
331 | =head2 by size |
f2a2953c |
332 | |
333 | UCS-2 is a fixed-length encoding with each character taking 16 bits. |
0ab8f81e |
334 | It B<does not> support I<surrogate pairs>. When a surrogate pair |
335 | is encountered during decode(), its place is filled with \x{FFFD} |
336 | if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a |
337 | character whose ord value is larger than 0xFFFF is encountered, |
338 | its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine |
339 | croaks if I<CHECK> is 1. |
340 | |
341 | UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>. |
f2a2953c |
342 | When it encounters a high surrogate (0xD800-0xDBFF), it fetches the |
0ab8f81e |
343 | following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to |
344 | form a character. Bogus surrogates result in death. When \x{10000} |
345 | or above is encountered during encode(), it C<ensurrogate>s them and |
346 | pushes the surrogate pair to the output stream. |
f2a2953c |
347 | |
348 | UTF-32 is a fixed-length encoding with each character taking 32 bits. |
0ab8f81e |
349 | Since it is 32-bit, there is no need for I<surrogate pairs>. |
f2a2953c |
350 | |
0ab8f81e |
351 | =head2 by endianness |
f2a2953c |
352 | |
0ab8f81e |
353 | The first (and now failed) goal of Unicode was to map all character |
354 | repertoires into a fixed-length integer so that programmers are happy. |
355 | Since each character is either a I<short> or I<long> in C, you have to |
356 | pay attention to the endianness of each platform when you pass data |
357 | to one another. |
f2a2953c |
358 | |
359 | Anything marked as BE is Big Endian (or network byte order) and LE is |
0ab8f81e |
360 | Little Endian (aka VAX byte order). For anything not marked either |
361 | BE or LE, a character called Byte Order Mark (BOM) indicating the |
362 | endianness is prepended to the string. |
f2a2953c |
363 | |
364 | =over 4 |
365 | |
fcb875d4 |
366 | =item BOM as integer when fetched in network byte order |
f2a2953c |
367 | |
fcb875d4 |
368 | 16 32 bits/char |
369 | ------------------------- |
370 | BE 0xFeFF 0x0000FeFF |
371 | LE 0xFFeF 0xFFFe0000 |
372 | ------------------------- |
f2a2953c |
373 | |
374 | =back |
fcb875d4 |
375 | |
0ab8f81e |
376 | This modules handles the BOM as follows. |
f2a2953c |
377 | |
378 | =over 4 |
379 | |
380 | =item * |
381 | |
382 | When BE or LE is explicitly stated as the name of encoding, BOM is |
0ab8f81e |
383 | simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE). |
f2a2953c |
384 | |
385 | =item * |
386 | |
0ab8f81e |
387 | When BE or LE is omitted during decode(), it checks if BOM is at the |
388 | beginning of the string; if one is found, the endianness is set to |
389 | what the BOM says. If no BOM is found, the routine dies. |
f2a2953c |
390 | |
391 | =item * |
392 | |
393 | When BE or LE is omitted during encode(), it returns a BE-encoded |
394 | string with BOM prepended. So when you want to encode a whole text |
0ab8f81e |
395 | file, make sure you encode() the whole text at once, not line by line |
396 | or each line, not file, will have a BOM prepended. |
f2a2953c |
397 | |
398 | =item * |
399 | |
0ab8f81e |
400 | C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE. |
f2a2953c |
401 | UCS-2 is already registered by IANA and others that way. |
402 | |
fdd579e2 |
403 | =back |
f2a2953c |
404 | |
fcb875d4 |
405 | =head1 Surrogate Pairs |
f2a2953c |
406 | |
fcb875d4 |
407 | To say the least, surrogate pairs were the biggest mistake of the |
408 | Unicode Consortium. But according to the late Douglas Adams in I<The |
409 | Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the |
410 | Universe was created. This has made a lot of people very angry and |
411 | been widely regarded as a bad move>. Their mistake was not of this |
412 | magnitude so let's forgive them. |
f2a2953c |
413 | |
414 | (I don't dare make any comparison with Unicode Consortium and the |
c731e18e |
415 | Vogons here ;) Or, comparing Encode to Babel Fish is completely |
416 | appropriate -- if you can only stick this into your ear :) |
f2a2953c |
417 | |
0ab8f81e |
418 | Surrogate pairs were born when the Unicode Consortium finally |
fcb875d4 |
419 | admitted that 16 bits were not big enough to hold all the world's |
0ab8f81e |
420 | character repertoires. But they already made UCS-2 16-bit. What |
f2a2953c |
421 | do we do? |
422 | |
0ab8f81e |
423 | Back then, the range 0xD800-0xDFFF was not allocated. Let's split |
424 | that range in half and use the first half to represent the C<upper |
425 | half of a character> and the second half to represent the C<lower |
426 | half of a character>. That way, you can represent 1024 * 1024 = |
427 | 1048576 more characters. Now we can store character ranges up to |
428 | \x{10ffff} even with 16-bit encodings. This pair of half-character is |
429 | now called a I<surrogate pair> and UTF-16 is the name of the encoding |
430 | that embraces them. |
f2a2953c |
431 | |
448e90bb |
432 | Here is a formula to ensurrogate a Unicode character \x{10000} and |
f2a2953c |
433 | above; |
434 | |
435 | $hi = ($uni - 0x10000) / 0x400 + 0xD800; |
436 | $lo = ($uni - 0x10000) % 0x400 + 0xDC00; |
437 | |
438 | And to desurrogate; |
439 | |
440 | $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); |
441 | |
fcb875d4 |
442 | Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but |
443 | perl does not prohibit the use of characters within this range. To perl, |
444 | every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>. |
445 | |
446 | (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit |
0ab8f81e |
447 | integer support! |
f2a2953c |
448 | |
449 | =head1 SEE ALSO |
450 | |
fdd579e2 |
451 | L<Encode>, L<http://www.unicode.org/glossary/>, |
f2a2953c |
452 | |
fdd579e2 |
453 | RFC 2781 L<http://rfc.net/rfc2781.html>, |
454 | |
455 | L<http://www.unicode.org/unicode/faq/utf_bom.html> |
456 | |
fcb875d4 |
457 | Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)> |
458 | by Larry Wall, Tom Christiansen, Jon Orwant; |
459 | O'Reilly & Associates; ISBN 0-596-00027-8 |
460 | |
fdd579e2 |
461 | =cut |