1 package Encode::Unicode;
6 our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9 XSLoader::load(__PACKAGE__,$VERSION);
12 # Object Generator 8 transcoders all at once!
16 for my $name (qw(UTF-16 UTF-16BE UTF-16LE
17 UTF-32 UTF-32BE UTF-32LE
20 my ($size, $endian, $ucs2, $mask);
21 $name =~ /^(\w+)-(\d+)(\w*)$/o;
22 if ($ucs2 = ($1 eq 'UCS')){
27 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
28 $size == 4 and $endian = uc($endian);
30 $Encode::Encoding{$name} =
40 sub name { shift->{'Name'} }
44 # Return the original if endian known
45 return $self if ($self->{endian});
47 return bless {%$self},ref($self);
52 # three implementation of (en|de)code exist. XS version is the fastest.
53 # *_modern use # an array and *_classic stick with substr. *_classic is
54 # much slower but more memory conservative. *_xs is default.
57 no warnings qw(redefine);
60 *decode = \&decode_xs;
61 *encode = \&encode_xs;
62 }elsif($type eq "modern"){
63 *decode = \&decode_modern;
64 *encode = \&encode_modern;
65 }elsif($type eq "classic"){
66 *decode = \&decode_classic;
67 *encode = \&encode_classic;
70 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
77 # Aux. subs & constants
80 sub FBCHAR(){ 0xFFFd }
81 sub BOM_BE(){ 0xFeFF }
82 sub BOM16LE(){ 0xFFFe }
83 sub BOM32LE(){ 0xFFFe0000 }
87 (0 <= $_[0] && $_[0] < 0xD800)
88 || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
91 sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
92 sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
93 sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
96 use integer; # we have divisions
98 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
99 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
105 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
108 sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
111 # *_modern are much faster but guzzle more memory
114 sub decode_modern($$;$)
116 my ($obj, $str, $chk ) = @_;
117 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
119 # warn "$size, $endian, $ucs2";
120 $endian ||= BOMB($size, substr($str, 0, $size, ''))
121 or poisoned2death($obj, "Where's the BOM?");
122 my $mask = Mask->{$size};
124 my @ord = unpack("$endian*", $str);
125 undef $str; # to conserve memory
127 my $ord = shift @ord;
128 unless ($size == 4 or valid_ucs2($ord &= $mask)){
131 poisoned2death($obj, "no surrogates allowed", $ord);
132 shift @ord; # skip the next one as well
135 unless (isHiSurrogate($ord)){
136 poisoned2death($obj, "Malformed HI surrogate", $ord);
139 unless (isLoSurrogate($lo &= $mask)){
140 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
142 $ord = desurrogate($ord, $lo);
147 utf8::upgrade($utf8);
151 sub encode_modern($$;$)
153 my ($obj, $utf8, $chk) = @_;
154 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
157 $endian = ($size == 4) ? 'N' : 'n';
160 my @ord = unpack("U*", $utf8);
161 undef $utf8; # to conserve memory
163 unless ($size == 4 or valid_ucs2($ord)) {
164 unless(issurrogate($ord)){
167 poisoned2death($obj, "code point too high", $ord);
172 push @str, ensurrogate($ord);
174 }else{ # not supposed to happen
181 return pack("$endian*", @str);
185 # *_classic are slower but more memory conservative
188 sub decode_classic($$;$)
190 my ($obj, $str, $chk ) = @_;
191 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
193 # warn "$size, $endian, $ucs2";
194 $endian ||= BOMB($size, substr($str, 0, $size, ''))
195 or poisoned2death($obj, "Where's the BOM?");
196 my $mask = Mask->{$size};
198 my @ord = unpack("$endian*", $str);
199 while (length($str)){
200 my $ord = unpack($endian, substr($str, 0, $size, ''));
201 unless ($size == 4 or valid_ucs2($ord &= $mask)){
204 poisoned2death($obj, "no surrogates allowed", $ord);
205 substr($str,0,$size,''); # skip the next one as well
208 unless (isHiSurrogate($ord)){
209 poisoned2death($obj, "Malformed HI surrogate", $ord);
211 my $lo = unpack($endian ,substr($str,0,$size,''));
212 unless (isLoSurrogate($lo &= $mask)){
213 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
215 $ord = desurrogate($ord, $lo);
220 utf8::upgrade($utf8);
224 sub encode_classic($$;$)
226 my ($obj, $utf8, $chk) = @_;
227 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
228 # warn join ", ", $size, $ucs2, $endian, $mask;
231 $endian = ($size == 4) ? 'N' : 'n';
232 $str .= pack($endian, BOM_BE);
234 while (length($utf8)){
235 my $ord = ord(substr($utf8,0,1,''));
236 unless ($size == 4 or valid_ucs2($ord)) {
237 unless(issurrogate($ord)){
240 poisoned2death($obj, "code point too high", $ord);
241 $str .= pack($endian, FBCHAR);
243 $str .= pack($endian.2, ensurrogate($ord));
245 }else{ # not supposed to happen
246 $str .= pack($endian, FBCHAR);
249 $str .= pack($endian, $ord);
256 my ($size, $bom) = @_;
257 my $N = $size == 2 ? 'n' : 'N';
258 my $ord = unpack($N, $bom);
259 return ($ord eq BOM_BE) ? $N :
260 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
266 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
268 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
276 Encode::Unicode -- Various Unicode Transform Format
282 use Encode qw/encode decode/;
283 $ucs2 = encode("UCS-2BE", $utf8);
284 $utf8 = decode("UCS-2BE", $ucs2);
288 This module implements all Character Encoding Schemes of Unicode that
289 are officially documented by Unicode Consortium (except, of course,
290 for UTF-8, which is a native format in perl).
294 =item L<http://www.unicode.org/glossary/> says:
296 I<Character Encoding Scheme> A character encoding form plus byte
297 serialization. There are seven character encoding schemes in Unicode:
298 UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
300 =item Quick Reference
302 Decodes from ord(N) Encodes chr(N) to...
303 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
304 ---------------+-----------------+------------------------------
305 UCS-2BE 2 N N is bogus Not Available
306 UCS-2LE 2 N N bogus Not Available
307 UTF-16 2/4 Y Y is S.P S.P BE/LE
308 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
309 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
310 UTF-32 4 Y - is bogus As is BE/LE
311 UTF-32BE 4 N - bogus As is 0x0010abcd
312 UTF-32LE 4 N - bogus As is 0xcdab1000
313 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
314 ---------------+-----------------+------------------------------
318 =head1 Size, Endianness, and BOM
320 You can categorize these CES by 3 criteria; Size of each character,
321 Endianness, and Byte Order Mark.
325 UCS-2 is a fixed-length encoding with each character taking 16 bits.
326 It B<does not> support I<Surrogate Pairs>. When a surrogate pair is
327 encountered during decode(), its place is filled with \xFFFD without
328 I<CHECK> or croaks if I<CHECK>. When a character whose ord value is
329 larger than 0xFFFF is encountered, it uses 0xFFFD without I<CHECK> or
332 UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pairs>.
333 When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
334 following low surrogate (0xDC00-0xDFFF), C<desurrogate>s them to form a
335 character. Bogus surrogates result in death. When \x{10000} or above
336 is encountered during encode(), it C<ensurrogate>s them and pushes the
337 surrogate pair to the output stream.
339 UTF-32 is a fixed-length encoding with each character taking 32 bits.
340 Since it is 32-bit there is no need for I<Surrogate Pairs>.
344 First (and now failed) goal of Unicode was to map all character
345 repertories into a fixed-length integer so programmers are happy.
346 Since each character is either I<short> or I<long> in C, you have to
347 put endianness of each platform when you pass data to one another.
349 Anything marked as BE is Big Endian (or network byte order) and LE is
350 Little Endian (aka VAX byte order). For anything without, a character
351 called Byte Order Mark (BOM) is prepended to the head of string.
355 =item BOM as integer when fetched in network byte order
358 -------------------------
361 -------------------------
365 This modules handles BOM as follows.
371 When BE or LE is explicitly stated as the name of encoding, BOM is
372 simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
376 When BE or LE is omitted during decode(), it checks if BOM is in the
377 beginning of the string and if found endianness is set to what BOM
378 says. If not found, dies.
382 When BE or LE is omitted during encode(), it returns a BE-encoded
383 string with BOM prepended. So when you want to encode a whole text
384 file, make sure you encode() by whole text, not line by line or each
385 line, not file, is prepended with BOMs.
389 C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE.
390 UCS-2 is already registered by IANA and others that way.
394 =head1 Surrogate Pairs
396 To say the least, surrogate pairs were the biggest mistake of the
397 Unicode Consortium. But according to the late Douglas Adams in I<The
398 Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
399 Universe was created. This has made a lot of people very angry and
400 been widely regarded as a bad move>. Their mistake was not of this
401 magnitude so let's forgive them.
403 (I don't dare make any comparison with Unicode Consortium and the
404 Vogons here ;) Or, comparing Encode to Babel Fish is completely
405 appropriate -- if you can only stick this into your ear :)
407 Surrogate pairs were born when Unicode Consortium finally
408 admitted that 16 bits were not big enough to hold all the world's
409 character repertoire. But they have already made UCS-2 16-bit. What
412 Back then 0xD800-0xDFFF was not allocated. Let's split them half and
413 use the first half to represent C<upper half of a character> and the
414 latter C<lower half of a character>. That way you can represent 1024
415 * 1024 = 1048576 more characters. Now we can store character ranges
416 up to \x{10ffff} even with 16-bit encodings. This pair of
417 half-character is now called a I<Surrogate Pair> and UTF-16 is the
418 name of the encoding that embraces them.
420 Here is a formula to ensurrogate a Unicode character \x{10000} and
423 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
424 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
428 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
430 Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
431 perl does not prohibit the use of characters within this range. To perl,
432 every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
434 (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
435 integer support! (**)
437 (**) Is anything beyond \x{11_0000} still Unicode :?
441 L<Encode>, L<http://www.unicode.org/glossary/>,
443 RFC 2781 L<http://rfc.net/rfc2781.html>,
445 L<http://www.unicode.org/unicode/faq/utf_bom.html>
447 Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
448 by Larry Wall, Tom Christiansen, Jon Orwant;
449 O'Reilly & Associates; ISBN 0-596-00027-8