1 package Encode::Unicode;
6 our $VERSION = do { my @r = (q$Revision: 1.29 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9 # Aux. subs & constants
12 sub FBCHAR(){ 0xFFFd }
13 sub BOM_BE(){ 0xFeFF }
14 sub BOM16LE(){ 0xFFFe }
15 sub BOM32LE(){ 0xFFFe0000 }
19 (0 <= $_[0] && $_[0] < 0xD800)
20 || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
23 sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
24 sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
25 sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
28 use integer; # we have divisions
30 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
31 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
37 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
40 sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
43 # Object Generator 8 transcoders all at once!
47 for my $name (qw(UTF-16 UTF-16BE UTF-16LE
48 UTF-32 UTF-32BE UTF-32LE
51 my ($size, $endian, $ucs2, $mask);
52 $name =~ /^(\w+)-(\d+)(\w*)$/o;
53 if ($ucs2 = ($1 eq 'UCS')){
58 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
59 $size == 4 and $endian = uc($endian);
61 $Encode::Encoding{$name} =
71 sub name { shift->{'Name'} }
72 sub new_sequence { $_[0] };
75 # two implementation of (en|de)code exist. *_modern use
76 # an array and *_classic stick with substr. *_classic is much
77 # slower but more memory conservative. *_modern is default.
80 no warnings qw(redefine);
82 if ($type eq "modern"){
83 *decode = \&decode_modern;
84 *encode = \&encode_modern;
85 }elsif($type eq "classic"){
86 *decode = \&decode_classic;
87 *encode = \&encode_classic;
90 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)";
94 set_transcoder("modern");
97 # *_modern are much faster but guzzle more memory
102 my ($obj, $str, $chk ) = @_;
103 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
105 # warn "$size, $endian, $ucs2";
106 $endian ||= BOMB($size, substr($str, 0, $size, ''))
107 or poisoned2death($obj, "Where's the BOM?");
108 my $mask = Mask->{$size};
110 my @ord = unpack("$endian*", $str);
111 undef $str; # to conserve memory
113 my $ord = shift @ord;
114 unless ($size == 4 or valid_ucs2($ord &= $mask)){
117 poisoned2death($obj, "no surrogates allowed", $ord);
118 shift @ord; # skip the next one as well
121 unless (isHiSurrogate($ord)){
122 poisoned2death($obj, "Malformed HI surrogate", $ord);
125 unless (isLoSurrogate($lo &= $mask)){
126 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
128 $ord = desurrogate($ord, $lo);
133 utf8::upgrade($utf8);
139 my ($obj, $utf8, $chk) = @_;
140 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
143 $endian = ($size == 4) ? 'N' : 'n';
146 my @ord = unpack("U*", $utf8);
147 undef $utf8; # to conserve memory
149 unless ($size == 4 or valid_ucs2($ord)) {
150 unless(issurrogate($ord)){
153 poisoned2death($obj, "code point too high", $ord);
158 push @str, ensurrogate($ord);
160 }else{ # not supposed to happen
167 return pack("$endian*", @str);
171 # *_classic are slower but more memory conservative
176 my ($obj, $str, $chk ) = @_;
177 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
179 # warn "$size, $endian, $ucs2";
180 $endian ||= BOMB($size, substr($str, 0, $size, ''))
181 or poisoned2death($obj, "Where's the BOM?");
182 my $mask = Mask->{$size};
184 my @ord = unpack("$endian*", $str);
185 while (length($str)){
186 my $ord = unpack($endian, substr($str, 0, $size, ''));
187 unless ($size == 4 or valid_ucs2($ord &= $mask)){
190 poisoned2death($obj, "no surrogates allowed", $ord);
191 substr($str,0,$size,''); # skip the next one as well
194 unless (isHiSurrogate($ord)){
195 poisoned2death($obj, "Malformed HI surrogate", $ord);
197 my $lo = unpack($endian ,substr($str,0,$size,''));
198 unless (isLoSurrogate($lo &= $mask)){
199 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
201 $ord = desurrogate($ord, $lo);
206 utf8::upgrade($utf8);
212 my ($obj, $utf8, $chk) = @_;
213 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
214 # warn join ", ", $size, $ucs2, $endian, $mask;
217 $endian = ($size == 4) ? 'N' : 'n';
218 $str .= pack($endian, BOM_BE);
220 while (length($utf8)){
221 my $ord = ord(substr($utf8,0,1,''));
222 unless ($size == 4 or valid_ucs2($ord)) {
223 unless(issurrogate($ord)){
226 poisoned2death($obj, "code point too high", $ord);
227 $str .= pack($endian, FBCHAR);
229 $str .= pack($endian.2, ensurrogate($ord));
231 }else{ # not supposed to happen
232 $str .= pack($endian, FBCHAR);
235 $str .= pack($endian, $ord);
242 my ($size, $bom) = @_;
243 my $N = $size == 2 ? 'n' : 'N';
244 my $ord = unpack($N, $bom);
245 return ($ord eq BOM_BE) ? $N :
246 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
252 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
254 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
262 Encode::Unicode -- Various Unicode Transform Format
268 use Encode qw/encode decode/;
269 $ucs2 = encode("UCS-2BE", $utf8);
270 $utf8 = decode("UCS-2BE", $ucs2);
274 This module implements all Character Encoding Schemes of Unicode that
275 are officially documented by Unicode Consortium (except, of course,
276 for UTF-8, which is a native format in perl).
280 =item L<http://www.unicode.org/glossary/> says:
282 I<Character Encoding Scheme> A character encoding form plus byte
283 serialization. There are seven character encoding schemes in Unicode:
284 UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
286 =item Quick Reference
288 Decodes from ord(N) Encodes chr(N) to...
289 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
290 ---------------+-----------------+------------------------------
291 UCS-2BE 2 N N is bogus Not Available
292 UCS-2LE 2 N N bogus Not Available
293 UTF-16 2/4 Y Y is S.P S.P BE/LE
294 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
295 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
296 UTF-32 4 Y - is bogus As is BE/LE
297 UTF-32BE 4 N - bogus As is 0x0010abcd
298 UTF-32LE 4 N - bogus As is 0xcdab1000
299 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
300 ---------------+-----------------+------------------------------
304 =head1 Size, Endianness, and BOM
306 You can categorize these CES by 3 criteria; Size of each character,
307 Endianness, and Byte Order Mark.
311 UCS-2 is a fixed-length encoding with each character taking 16 bits.
312 It B<does not> support I<Surrogate Pairs>. When a surrogate pair is
313 encountered during decode(), its place is filled with \xFFFD without
314 I<CHECK> or croaks if I<CHECK>. When a character whose ord value is
315 larger than 0xFFFF is encountered, it uses 0xFFFD without I<CHECK> or
318 UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pairs>.
319 When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
320 following low surrogate (0xDC00-0xDFFF), C<desurrogate>s them to form a
321 character. Bogus surrogates result in death. When \x{10000} or above
322 is encountered during encode(), it C<ensurrogate>s them and pushes the
323 surrogate pair to the output stream.
325 UTF-32 is a fixed-length encoding with each character taking 32 bits.
326 Since it is 32-bit there is no need for I<Surrogate Pairs>.
330 First (and now failed) goal of Unicode was to map all character
331 repertories into a fixed-length integer so programmers are happy.
332 Since each character is either I<short> or I<long> in C, you have to
333 put endianness of each platform when you pass data to one another.
335 Anything marked as BE is Big Endian (or network byte order) and LE is
336 Little Endian (aka VAX byte order). For anything without, a character
337 called Byte Order Mark (BOM) is prepended to the head of string.
341 =item BOM as integer when fetched in network byte order
344 -------------------------
347 -------------------------
351 This modules handles BOM as follows.
357 When BE or LE is explicitly stated as the name of encoding, BOM is
358 simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
362 When BE or LE is omitted during decode(), it checks if BOM is in the
363 beginning of the string and if found endianness is set to what BOM
364 says. If not found, dies.
368 When BE or LE is omitted during encode(), it returns a BE-encoded
369 string with BOM prepended. So when you want to encode a whole text
370 file, make sure you encode() by whole text, not line by line or each
371 line, not file, is prepended with BOMs.
375 C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE.
376 UCS-2 is already registered by IANA and others that way.
380 =head1 Surrogate Pairs
382 To say the least, surrogate pairs were the biggest mistake of the
383 Unicode Consortium. But according to the late Douglas Adams in I<The
384 Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
385 Universe was created. This has made a lot of people very angry and
386 been widely regarded as a bad move>. Their mistake was not of this
387 magnitude so let's forgive them.
389 (I don't dare make any comparison with Unicode Consortium and the
390 Vogons here ;) Or, comparing Encode to Babel Fish is completely
391 appropriate -- if you can only stick this into your ear :)
393 Surrogate pairs were born when Unicode Consortium finally
394 admitted that 16 bits were not big enough to hold all the world's
395 character repertoire. But they have already made UCS-2 16-bit. What
398 Back then 0xD800-0xDFFF was not allocated. Let's split them half and
399 use the first half to represent C<upper half of a character> and the
400 latter C<lower half of a character>. That way you can represent 1024
401 * 1024 = 1048576 more characters. Now we can store character ranges
402 up to \x{10ffff} even with 16-bit encodings. This pair of
403 half-character is now called a I<Surrogate Pair> and UTF-16 is the
404 name of the encoding that embraces them.
406 Here is a fomula to ensurrogate a Unicode character \x{10000} and
409 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
410 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
414 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
416 Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
417 perl does not prohibit the use of characters within this range. To perl,
418 every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
420 (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
421 integer support! (**)
423 (**) Is anything beyond \x{11_0000} still Unicode :?
427 L<Encode>, L<http://www.unicode.org/glossary/>,
429 RFC 2781 L<http://rfc.net/rfc2781.html>,
431 L<http://www.unicode.org/unicode/faq/utf_bom.html>
433 Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
434 by Larry Wall, Tom Christiansen, Jon Orwant;
435 O'Reilly & Associates; ISBN 0-596-00027-8