1 package Encode::Unicode;
6 our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9 # Aux. subs & constants
12 sub FBCHAR(){ 0xFFFd }
13 sub BOM_BE(){ 0xFeFF }
14 sub BOM16LE(){ 0xFFFe }
15 sub BOM32LE(){ 0xFFFe0000 }
19 (0 <= $_[0] && $_[0] < 0xD800)
20 || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
23 sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
24 sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
25 sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
28 use integer; # we have divisions
30 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
31 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
37 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
40 sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
43 # Object Generator 8 transcoders all at once!
47 for my $name (qw(UTF-16 UTF-16BE UTF-16LE
48 UTF-32 UTF-32BE UTF-32LE
51 my ($size, $endian, $ucs2, $mask);
52 $name =~ /^(\w+)-(\d+)(\w*)$/o;
53 if ($ucs2 = ($1 eq 'UCS')){
58 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
59 $size == 4 and $endian = uc($endian);
61 $Encode::Encoding{$name} =
71 sub name { shift->{'Name'} }
75 # Return the original if endian known
76 return $self if ($self->{endian});
78 return bless {%$self},ref($self);
83 # three implementation of (en|de)code exist. XS version is the fastest.
84 # *_modern use # an array and *_classic stick with substr. *_classic is
85 # much slower but more memory conservative. *_xs is default.
88 no warnings qw(redefine);
91 *decode = \&decode_xs;
92 *encode = \&encode_xs;
93 }elsif($type eq "modern"){
94 *decode = \&decode_modern;
95 *encode = \&encode_modern;
96 }elsif($type eq "classic"){
97 *decode = \&decode_classic;
98 *encode = \&encode_classic;
101 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
105 set_transcoder("xs");
108 # *_modern are much faster but guzzle more memory
111 sub decode_modern($$;$)
113 my ($obj, $str, $chk ) = @_;
114 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
116 # warn "$size, $endian, $ucs2";
117 $endian ||= BOMB($size, substr($str, 0, $size, ''))
118 or poisoned2death($obj, "Where's the BOM?");
119 my $mask = Mask->{$size};
121 my @ord = unpack("$endian*", $str);
122 undef $str; # to conserve memory
124 my $ord = shift @ord;
125 unless ($size == 4 or valid_ucs2($ord &= $mask)){
128 poisoned2death($obj, "no surrogates allowed", $ord);
129 shift @ord; # skip the next one as well
132 unless (isHiSurrogate($ord)){
133 poisoned2death($obj, "Malformed HI surrogate", $ord);
136 unless (isLoSurrogate($lo &= $mask)){
137 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
139 $ord = desurrogate($ord, $lo);
144 utf8::upgrade($utf8);
148 sub encode_modern($$;$)
150 my ($obj, $utf8, $chk) = @_;
151 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
154 $endian = ($size == 4) ? 'N' : 'n';
157 my @ord = unpack("U*", $utf8);
158 undef $utf8; # to conserve memory
160 unless ($size == 4 or valid_ucs2($ord)) {
161 unless(issurrogate($ord)){
164 poisoned2death($obj, "code point too high", $ord);
169 push @str, ensurrogate($ord);
171 }else{ # not supposed to happen
178 return pack("$endian*", @str);
182 # *_classic are slower but more memory conservative
185 sub decode_classic($$;$)
187 my ($obj, $str, $chk ) = @_;
188 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
190 # warn "$size, $endian, $ucs2";
191 $endian ||= BOMB($size, substr($str, 0, $size, ''))
192 or poisoned2death($obj, "Where's the BOM?");
193 my $mask = Mask->{$size};
195 my @ord = unpack("$endian*", $str);
196 while (length($str)){
197 my $ord = unpack($endian, substr($str, 0, $size, ''));
198 unless ($size == 4 or valid_ucs2($ord &= $mask)){
201 poisoned2death($obj, "no surrogates allowed", $ord);
202 substr($str,0,$size,''); # skip the next one as well
205 unless (isHiSurrogate($ord)){
206 poisoned2death($obj, "Malformed HI surrogate", $ord);
208 my $lo = unpack($endian ,substr($str,0,$size,''));
209 unless (isLoSurrogate($lo &= $mask)){
210 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
212 $ord = desurrogate($ord, $lo);
217 utf8::upgrade($utf8);
221 sub encode_classic($$;$)
223 my ($obj, $utf8, $chk) = @_;
224 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
225 # warn join ", ", $size, $ucs2, $endian, $mask;
228 $endian = ($size == 4) ? 'N' : 'n';
229 $str .= pack($endian, BOM_BE);
231 while (length($utf8)){
232 my $ord = ord(substr($utf8,0,1,''));
233 unless ($size == 4 or valid_ucs2($ord)) {
234 unless(issurrogate($ord)){
237 poisoned2death($obj, "code point too high", $ord);
238 $str .= pack($endian, FBCHAR);
240 $str .= pack($endian.2, ensurrogate($ord));
242 }else{ # not supposed to happen
243 $str .= pack($endian, FBCHAR);
246 $str .= pack($endian, $ord);
253 my ($size, $bom) = @_;
254 my $N = $size == 2 ? 'n' : 'N';
255 my $ord = unpack($N, $bom);
256 return ($ord eq BOM_BE) ? $N :
257 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
263 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
265 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
273 Encode::Unicode -- Various Unicode Transform Format
279 use Encode qw/encode decode/;
280 $ucs2 = encode("UCS-2BE", $utf8);
281 $utf8 = decode("UCS-2BE", $ucs2);
285 This module implements all Character Encoding Schemes of Unicode that
286 are officially documented by Unicode Consortium (except, of course,
287 for UTF-8, which is a native format in perl).
291 =item L<http://www.unicode.org/glossary/> says:
293 I<Character Encoding Scheme> A character encoding form plus byte
294 serialization. There are seven character encoding schemes in Unicode:
295 UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
297 =item Quick Reference
299 Decodes from ord(N) Encodes chr(N) to...
300 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
301 ---------------+-----------------+------------------------------
302 UCS-2BE 2 N N is bogus Not Available
303 UCS-2LE 2 N N bogus Not Available
304 UTF-16 2/4 Y Y is S.P S.P BE/LE
305 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
306 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
307 UTF-32 4 Y - is bogus As is BE/LE
308 UTF-32BE 4 N - bogus As is 0x0010abcd
309 UTF-32LE 4 N - bogus As is 0xcdab1000
310 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
311 ---------------+-----------------+------------------------------
315 =head1 Size, Endianness, and BOM
317 You can categorize these CES by 3 criteria; Size of each character,
318 Endianness, and Byte Order Mark.
322 UCS-2 is a fixed-length encoding with each character taking 16 bits.
323 It B<does not> support I<Surrogate Pairs>. When a surrogate pair is
324 encountered during decode(), its place is filled with \xFFFD without
325 I<CHECK> or croaks if I<CHECK>. When a character whose ord value is
326 larger than 0xFFFF is encountered, it uses 0xFFFD without I<CHECK> or
329 UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pairs>.
330 When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
331 following low surrogate (0xDC00-0xDFFF), C<desurrogate>s them to form a
332 character. Bogus surrogates result in death. When \x{10000} or above
333 is encountered during encode(), it C<ensurrogate>s them and pushes the
334 surrogate pair to the output stream.
336 UTF-32 is a fixed-length encoding with each character taking 32 bits.
337 Since it is 32-bit there is no need for I<Surrogate Pairs>.
341 First (and now failed) goal of Unicode was to map all character
342 repertories into a fixed-length integer so programmers are happy.
343 Since each character is either I<short> or I<long> in C, you have to
344 put endianness of each platform when you pass data to one another.
346 Anything marked as BE is Big Endian (or network byte order) and LE is
347 Little Endian (aka VAX byte order). For anything without, a character
348 called Byte Order Mark (BOM) is prepended to the head of string.
352 =item BOM as integer when fetched in network byte order
355 -------------------------
358 -------------------------
362 This modules handles BOM as follows.
368 When BE or LE is explicitly stated as the name of encoding, BOM is
369 simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
373 When BE or LE is omitted during decode(), it checks if BOM is in the
374 beginning of the string and if found endianness is set to what BOM
375 says. If not found, dies.
379 When BE or LE is omitted during encode(), it returns a BE-encoded
380 string with BOM prepended. So when you want to encode a whole text
381 file, make sure you encode() by whole text, not line by line or each
382 line, not file, is prepended with BOMs.
386 C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE.
387 UCS-2 is already registered by IANA and others that way.
391 =head1 Surrogate Pairs
393 To say the least, surrogate pairs were the biggest mistake of the
394 Unicode Consortium. But according to the late Douglas Adams in I<The
395 Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
396 Universe was created. This has made a lot of people very angry and
397 been widely regarded as a bad move>. Their mistake was not of this
398 magnitude so let's forgive them.
400 (I don't dare make any comparison with Unicode Consortium and the
401 Vogons here ;) Or, comparing Encode to Babel Fish is completely
402 appropriate -- if you can only stick this into your ear :)
404 Surrogate pairs were born when Unicode Consortium finally
405 admitted that 16 bits were not big enough to hold all the world's
406 character repertoire. But they have already made UCS-2 16-bit. What
409 Back then 0xD800-0xDFFF was not allocated. Let's split them half and
410 use the first half to represent C<upper half of a character> and the
411 latter C<lower half of a character>. That way you can represent 1024
412 * 1024 = 1048576 more characters. Now we can store character ranges
413 up to \x{10ffff} even with 16-bit encodings. This pair of
414 half-character is now called a I<Surrogate Pair> and UTF-16 is the
415 name of the encoding that embraces them.
417 Here is a formula to ensurrogate a Unicode character \x{10000} and
420 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
421 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
425 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
427 Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
428 perl does not prohibit the use of characters within this range. To perl,
429 every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
431 (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
432 integer support! (**)
434 (**) Is anything beyond \x{11_0000} still Unicode :?
438 L<Encode>, L<http://www.unicode.org/glossary/>,
440 RFC 2781 L<http://rfc.net/rfc2781.html>,
442 L<http://www.unicode.org/unicode/faq/utf_bom.html>
444 Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
445 by Larry Wall, Tom Christiansen, Jon Orwant;
446 O'Reilly & Associates; ISBN 0-596-00027-8