Commit | Line | Data |
f2a2953c |
1 | package Encode::Unicode; |
2 | |
df1df145 |
3 | use strict; |
f2a2953c |
4 | use warnings; |
5 | |
c731e18e |
6 | our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
f2a2953c |
7 | |
8 | # |
9 | # Aux. subs & constants |
10 | # |
11 | |
12 | sub FBCHAR(){ 0xFFFd } |
13 | sub BOM_BE(){ 0xFeFF } |
14 | sub BOM16LE(){ 0xFFFe } |
15 | sub BOM32LE(){ 0xFeFF0000 } |
16 | |
17 | sub valid_ucs2($){ |
18 | if ($_[0] < 0xD800){ |
19 | return $_[0] > 0; |
20 | }else{ |
21 | return ($_[0] > 0xDFFFF && $_[0] <= 0xFFFF); |
22 | } |
23 | } |
24 | |
25 | sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF } |
26 | sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 } |
27 | sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF } |
28 | |
29 | sub ensurrogate($){ |
30 | use integer; # we have divisions |
31 | my $uni = shift; |
32 | my $hi = ($uni - 0x10000) / 0x400 + 0xD800; |
33 | my $lo = ($uni - 0x10000) % 0x400 + 0xDC00; |
34 | return ($hi, $lo); |
35 | } |
36 | |
37 | sub desurrogate($$){ |
38 | my ($hi, $lo) = @_; |
39 | return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00); |
40 | } |
ee981de6 |
41 | |
f2a2953c |
42 | sub Mask { {2 => 0xffff, 4 => 0xffffffff} } |
df1df145 |
43 | |
f2a2953c |
44 | # |
45 | # Object Generator 8 transcoders all at once! |
46 | # |
df1df145 |
47 | |
f2a2953c |
48 | require Encode; |
49 | for my $name (qw(UTF-16 UTF-16BE UTF-16LE |
50 | UTF-32 UTF-32BE UTF-32LE |
51 | UCS-2BE UCS-2LE)) |
df1df145 |
52 | { |
f2a2953c |
53 | my ($size, $endian, $ucs2, $mask); |
54 | $name =~ /^(\w+)-(\d+)(\w*)$/o; |
55 | if ($ucs2 = ($1 eq 'UCS')){ |
56 | $size = 2; |
57 | }else{ |
58 | $size = $2/8; |
df1df145 |
59 | } |
f2a2953c |
60 | $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ; |
61 | $size == 4 and $endian = uc($endian); |
62 | |
63 | $Encode::Encoding{$name} = |
64 | bless { |
65 | Name => $name, |
66 | size => $size, |
67 | endian => $endian, |
68 | ucs2 => $ucs2, |
c731e18e |
69 | } => __PACKAGE__; |
f2a2953c |
70 | |
df1df145 |
71 | } |
72 | |
f2a2953c |
73 | sub name { shift->{'Name'} } |
74 | sub new_sequence { $_[0] }; |
75 | |
76 | # |
77 | # the two implementation of (en|de)code exist. *_modern use |
78 | # array and *_classic stick with substr. *_classic is much |
79 | # slower but more memory conservative. *_moder is default. |
80 | |
81 | sub set_transcoder{ |
82 | no warnings qw(redefine); |
83 | my $type = shift; |
84 | if ($type eq "modern"){ |
85 | *decode = \&decode_modern; |
86 | *encode = \&encode_modern; |
87 | }elsif($type eq "classic"){ |
88 | *decode = \&decode_classic; |
89 | *encode = \&encode_classic; |
90 | }else{ |
91 | require Carp; |
92 | Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)"; |
93 | } |
94 | } |
95 | |
96 | set_transcoder("modern"); |
97 | |
98 | # |
99 | # *_modern are much faster but guzzle more memory |
100 | # |
101 | |
102 | sub decode_modern |
df1df145 |
103 | { |
f2a2953c |
104 | my ($obj, $str, $chk ) = @_; |
105 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; |
106 | |
107 | # warn "$size, $endian, $ucs2"; |
108 | $endian ||= BOMB($size, substr($str, 0, $size, '')) |
109 | or poisoned2death($obj, "Where's the BOM?"); |
110 | my $mask = Mask->{$size}; |
111 | my $utf8 = ''; |
112 | my @ord = unpack("$endian*", $str); |
113 | undef $str; # to conserve memory |
114 | while (@ord){ |
115 | my $ord = shift @ord; |
116 | unless ($size == 4 or valid_ucs2($ord &= $mask)){ |
117 | if ($ucs2){ |
118 | $chk and |
119 | poisoned2death($obj, "no surrogates allowed", $ord); |
120 | shift @ord; # skip the next one as well |
121 | $ord = FBCHAR; |
122 | }else{ |
123 | unless (isHiSurrogate($ord)){ |
124 | poisoned2death($obj, "Malformed HI surrogate", $ord); |
125 | } |
126 | my $lo = shift @ord; |
127 | unless (isLoSurrogate($lo &= $mask)){ |
128 | poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); |
129 | } |
130 | $ord = desurrogate($ord, $lo); |
131 | } |
132 | } |
133 | $utf8 .= chr($ord); |
df1df145 |
134 | } |
f2a2953c |
135 | utf8::upgrade($utf8); |
136 | return $utf8; |
137 | } |
138 | |
139 | sub encode_modern |
140 | { |
141 | my ($obj, $utf8, $chk) = @_; |
142 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; |
143 | my @str = (); |
144 | unless ($endian){ |
145 | $endian = ($size == 4) ? 'N' : 'n'; |
146 | push @str, BOM_BE; |
147 | } |
148 | my @ord = unpack("U*", $utf8); |
149 | undef $utf8; # to conserve memory |
150 | for my $ord (@ord){ |
151 | unless ($size == 4 or valid_ucs2($ord)) { |
152 | unless(issurrogate($ord)){ |
153 | if ($ucs2){ |
154 | $chk and |
155 | poisoned2death($obj, "code point too high", $ord); |
156 | |
157 | push @str, FBCHAR; |
158 | }else{ |
159 | |
160 | push @str, ensurrogate($ord); |
161 | } |
162 | }else{ # not supposed to happen |
163 | push @str, FBCHAR; |
164 | } |
165 | }else{ |
166 | push @str, $ord; |
167 | } |
168 | } |
169 | return pack("$endian*", @str); |
170 | } |
171 | |
172 | # |
173 | # *_classic are slower but more memory conservative |
174 | # |
175 | |
176 | sub decode_classic |
177 | { |
178 | my ($obj, $str, $chk ) = @_; |
179 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; |
180 | |
181 | # warn "$size, $endian, $ucs2"; |
182 | $endian ||= BOMB($size, substr($str, 0, $size, '')) |
183 | or poisoned2death($obj, "Where's the BOM?"); |
184 | my $mask = Mask->{$size}; |
185 | my $utf8 = ''; |
186 | my @ord = unpack("$endian*", $str); |
187 | while (length($str)){ |
188 | my $ord = unpack($endian, substr($str, 0, $size, '')); |
189 | unless ($size == 4 or valid_ucs2($ord &= $mask)){ |
190 | if ($ucs2){ |
191 | $chk and |
192 | poisoned2death($obj, "no surrogates allowed", $ord); |
193 | substr($str,0,$size,''); # skip the next one as well |
194 | $ord = FBCHAR; |
195 | }else{ |
196 | unless (isHiSurrogate($ord)){ |
197 | poisoned2death($obj, "Malformed HI surrogate", $ord); |
198 | } |
199 | my $lo = unpack($endian ,substr($str,0,$size,'')); |
200 | unless (isLoSurrogate($lo &= $mask)){ |
201 | poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); |
202 | } |
203 | $ord = desurrogate($ord, $lo); |
204 | } |
205 | } |
206 | $utf8 .= chr($ord); |
207 | } |
208 | utf8::upgrade($utf8); |
209 | return $utf8; |
210 | } |
211 | |
212 | sub encode_classic |
213 | { |
214 | my ($obj, $utf8, $chk) = @_; |
215 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; |
216 | # warn join ", ", $size, $ucs2, $endian, $mask; |
217 | my $str = ''; |
218 | unless ($endian){ |
219 | $endian = ($size == 4) ? 'N' : 'n'; |
220 | $str .= pack($endian, BOM_BE); |
221 | } |
222 | while (length($utf8)){ |
223 | my $ord = ord(substr($utf8,0,1,'')); |
224 | unless ($size == 4 or valid_ucs2($ord)) { |
225 | unless(issurrogate($ord)){ |
226 | if ($ucs2){ |
227 | $chk and |
228 | poisoned2death($obj, "code point too high", $ord); |
229 | $str .= pack($endian, FBCHAR); |
230 | }else{ |
231 | $str .= pack($endian.2, ensurrogate($ord)); |
232 | } |
233 | }else{ # not supposed to happen |
234 | $str .= pack($endian, FBCHAR); |
235 | } |
236 | }else{ |
237 | $str .= pack($endian, $ord); |
238 | } |
239 | } |
240 | return $str; |
241 | } |
242 | |
243 | sub BOMB { |
244 | my ($size, $bom) = @_; |
245 | my $N = $size == 2 ? 'n' : 'N'; |
246 | my $ord = unpack($N, $bom); |
247 | return ($ord eq BOM_BE) ? $N : |
248 | ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef; |
249 | } |
250 | |
251 | sub poisoned2death{ |
252 | my $obj = shift; |
253 | my $msg = shift; |
254 | my $pair = join(", ", map {sprintf "\\x%x", $_} @_); |
255 | require Carp; |
256 | Carp::croak $obj->name, ":", $msg, "<$pair>.", caller; |
df1df145 |
257 | } |
258 | |
259 | 1; |
260 | __END__ |
67d7b5ef |
261 | |
262 | =head1 NAME |
263 | |
f2a2953c |
264 | Encode::Unicode -- Various Unicode Transform Format |
67d7b5ef |
265 | |
266 | =cut |
f2a2953c |
267 | |
268 | =head1 SYNOPSIS |
269 | |
270 | use Encode qw/encode decode/; |
271 | $ucs2 = encode("UCS-2BE", $utf8); |
272 | $utf8 = decode("UCS-2BE", $ucs2); |
273 | |
274 | =head1 ABSTRACT |
275 | |
276 | This module implements all Character Encoding Schemes of Unicode that |
277 | are officially documented by Unicode Consortium (except, of course, |
278 | for UTF-8, which is a native format in perl). |
279 | |
280 | =over 4 |
281 | |
282 | =item L<http://www.unicode.org/glossary/> says: |
283 | |
284 | I<Character Encoding Scheme> A character encoding form plus byte |
285 | serialization. There are seven character encoding schemes in Unicode: |
286 | UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE. |
287 | |
288 | =item Quick Reference |
289 | |
290 | Decodes from ord(N) Encodes chr(N) to... |
291 | octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} == |
292 | ---------------+-----------------+------------------------------ |
293 | UCS-2BE 2 N N is bogus Not Available |
294 | UCS-2LE 2 N N bogus Not Available |
295 | UTF-16 2/4 Y Y is S.P S.P BE/LE |
296 | UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd |
297 | UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf |
298 | UTF-32 4 Y - is bogus As is BE/LE |
299 | UTF-32BE 4 N - bogus As is 0x0010abcd |
300 | UTF-32LE 4 N - bogus As is 0xcdab1000 |
301 | UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d |
302 | ---------------+-----------------+------------------------------ |
303 | |
304 | =back |
305 | |
306 | =head1 Size, Endianness, and BOM |
307 | |
308 | You can categorize these CES by 3 criteria; Size of each character, |
309 | Endianness, and Byte Order Mark. |
310 | |
311 | =head2 by Size |
312 | |
313 | UCS-2 is a fixed-length encoding with each character taking 16 bits. |
314 | It B<does not> support I<Surrogate Pair>. When surrogate pair is |
315 | encountered during decode(), it fills its place with \xFFFD without |
316 | I<CHECK> or croaks if I<CHECK>. When a character which ord value is |
317 | larger than 0xFFFF, it uses 0xFFFD without I<CHECK> or croaks if |
318 | <CHECK>. |
319 | |
320 | UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pair>. |
321 | When it encounters a high surrogate (0xD800-0xDBFF), it fetches the |
322 | following low surrogate (0xDC00-0xDFFF), C<desurrogate> them to form a |
323 | character. Bogus surrogates result in death. When \x{10000} or above |
324 | is encountered during encode(), it C<ensurrogate>s them and push the |
325 | surrogate pair to the output stream. |
326 | |
327 | UTF-32 is a fixed-length encoding with each character taking 32 bits. |
328 | Since it is 32-bit there is no need for I<Surrogate Pair>. |
329 | |
330 | =head2 by Endianness |
331 | |
332 | First (and now failed) goal of Unicode was to map all character |
333 | repartories into a fixed-length integer so programmers are happy. |
334 | Since each character is either I<short> or I<long> in C, you have to |
335 | put endianness of each platform when you pass data to one another. |
336 | |
337 | Anything marked as BE is Big Endian (or network byte order) and LE is |
338 | Little Endian (aka VAX byte order). For anything without, a character |
339 | called Byte Order Mark (BOM) is prepended to the head of string. |
340 | |
341 | =over 4 |
342 | |
343 | =item BOM as integer |
344 | |
345 | 16 32 bits/char |
346 | ------------------------- |
347 | BE 0xFeFF 0x0000FeFF |
348 | LE 0xFFeF 0xFeFF0000 |
349 | ------------------------- |
350 | |
351 | =back |
352 | |
353 | This modules handles BOM as follows. |
354 | |
355 | =over 4 |
356 | |
357 | =item * |
358 | |
359 | When BE or LE is explicitly stated as the name of encoding, BOM is |
360 | simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE). |
361 | |
362 | =item * |
363 | |
364 | When BE or LE is omitted during decode(), it checks if BOM is in the |
365 | beginning of the string and if found endianness is set to what BOM |
366 | says. if not found, dies. |
367 | |
368 | =item * |
369 | |
370 | When BE or LE is omitted during encode(), it returns a BE-encoded |
371 | string with BOM prepended. So when you want to encode a whole text |
372 | file, make sure you encode() by whole text, not line by line or each |
373 | line, not file, is prepended with BOMs. |
374 | |
375 | =item * |
376 | |
377 | C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE. |
378 | UCS-2 is already registered by IANA and others that way. |
379 | |
380 | |
381 | =head1 The Surrogate Pair |
382 | |
383 | To say the least, surrogate pair was the biggest mistake by Unicode |
384 | Consortium. I don't give a darn if they admit it or not. But |
385 | according to late Douglas Adams in I<The Hitchhiker's Guide to the |
386 | Galaxy> Triology, C<First the Universe was created and it was a bad |
387 | move>. Their mistake was not this magnitude so let's forgive them. |
388 | |
389 | (I don't dare make any comparison with Unicode Consortium and the |
c731e18e |
390 | Vogons here ;) Or, comparing Encode to Babel Fish is completely |
391 | appropriate -- if you can only stick this into your ear :) |
f2a2953c |
392 | |
393 | A surrogate pair was born when Unicode Consortium had finally |
394 | admitted that 16 bit was not big enough to hold all the world's |
395 | character repartorie. But they have already made UCS-2 16-bit. What |
396 | do we do? |
397 | |
398 | Back then 0xD800-0xDFFF was not allocated. Let's split them half and |
399 | use the first half to represent C<upper half of a character> and the |
400 | latter C<lower half of a character>. That way you can represent 1024 |
401 | * 1024 = 1048576 more characters. Now we can store character ranges |
402 | up to \x{10ffff} even with 16-bit encodings. This pair of |
403 | half-character is now called a I<Surrogate Pair> and UTF-16 is the |
404 | name of encoding that embraces them. |
405 | |
406 | Here is a fomula to ensurrogate a Unicode character \x{10000} and |
407 | above; |
408 | |
409 | $hi = ($uni - 0x10000) / 0x400 + 0xD800; |
410 | $lo = ($uni - 0x10000) % 0x400 + 0xDC00; |
411 | |
412 | And to desurrogate; |
413 | |
414 | $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); |
415 | |
416 | Note this move has made \x{D800}-\x{DFFF} forbidden zone but perl |
417 | does not prohibit them for uses. |
418 | |
419 | =head1 SEE ALSO |
420 | |
421 | L<Encode>, L<http://www.unicode.org/glossary/> |
422 | |
423 | =back |