1 package Encode::Unicode;
6 our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
9 # Aux. subs & constants
12 sub FBCHAR(){ 0xFFFd }
13 sub BOM_BE(){ 0xFeFF }
14 sub BOM16LE(){ 0xFFFe }
15 sub BOM32LE(){ 0xFeFF0000 }
21 return ($_[0] > 0xDFFFF && $_[0] <= 0xFFFF);
25 sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
26 sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
27 sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
30 use integer; # we have divisions
32 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
33 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
39 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
42 sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
45 # Object Generator 8 transcoders all at once!
49 for my $name (qw(UTF-16 UTF-16BE UTF-16LE
50 UTF-32 UTF-32BE UTF-32LE
53 my ($size, $endian, $ucs2, $mask);
54 $name =~ /^(\w+)-(\d+)(\w*)$/o;
55 if ($ucs2 = ($1 eq 'UCS')){
60 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
61 $size == 4 and $endian = uc($endian);
63 $Encode::Encoding{$name} =
73 sub name { shift->{'Name'} }
74 sub new_sequence { $_[0] };
77 # the two implementation of (en|de)code exist. *_modern use
78 # array and *_classic stick with substr. *_classic is much
79 # slower but more memory conservative. *_moder is default.
82 no warnings qw(redefine);
84 if ($type eq "modern"){
85 *decode = \&decode_modern;
86 *encode = \&encode_modern;
87 }elsif($type eq "classic"){
88 *decode = \&decode_classic;
89 *encode = \&encode_classic;
92 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)";
96 set_transcoder("modern");
99 # *_modern are much faster but guzzle more memory
104 my ($obj, $str, $chk ) = @_;
105 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
107 # warn "$size, $endian, $ucs2";
108 $endian ||= BOMB($size, substr($str, 0, $size, ''))
109 or poisoned2death($obj, "Where's the BOM?");
110 my $mask = Mask->{$size};
112 my @ord = unpack("$endian*", $str);
113 undef $str; # to conserve memory
115 my $ord = shift @ord;
116 unless ($size == 4 or valid_ucs2($ord &= $mask)){
119 poisoned2death($obj, "no surrogates allowed", $ord);
120 shift @ord; # skip the next one as well
123 unless (isHiSurrogate($ord)){
124 poisoned2death($obj, "Malformed HI surrogate", $ord);
127 unless (isLoSurrogate($lo &= $mask)){
128 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
130 $ord = desurrogate($ord, $lo);
135 utf8::upgrade($utf8);
141 my ($obj, $utf8, $chk) = @_;
142 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
145 $endian = ($size == 4) ? 'N' : 'n';
148 my @ord = unpack("U*", $utf8);
149 undef $utf8; # to conserve memory
151 unless ($size == 4 or valid_ucs2($ord)) {
152 unless(issurrogate($ord)){
155 poisoned2death($obj, "code point too high", $ord);
160 push @str, ensurrogate($ord);
162 }else{ # not supposed to happen
169 return pack("$endian*", @str);
173 # *_classic are slower but more memory conservative
178 my ($obj, $str, $chk ) = @_;
179 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
181 # warn "$size, $endian, $ucs2";
182 $endian ||= BOMB($size, substr($str, 0, $size, ''))
183 or poisoned2death($obj, "Where's the BOM?");
184 my $mask = Mask->{$size};
186 my @ord = unpack("$endian*", $str);
187 while (length($str)){
188 my $ord = unpack($endian, substr($str, 0, $size, ''));
189 unless ($size == 4 or valid_ucs2($ord &= $mask)){
192 poisoned2death($obj, "no surrogates allowed", $ord);
193 substr($str,0,$size,''); # skip the next one as well
196 unless (isHiSurrogate($ord)){
197 poisoned2death($obj, "Malformed HI surrogate", $ord);
199 my $lo = unpack($endian ,substr($str,0,$size,''));
200 unless (isLoSurrogate($lo &= $mask)){
201 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
203 $ord = desurrogate($ord, $lo);
208 utf8::upgrade($utf8);
214 my ($obj, $utf8, $chk) = @_;
215 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
216 # warn join ", ", $size, $ucs2, $endian, $mask;
219 $endian = ($size == 4) ? 'N' : 'n';
220 $str .= pack($endian, BOM_BE);
222 while (length($utf8)){
223 my $ord = ord(substr($utf8,0,1,''));
224 unless ($size == 4 or valid_ucs2($ord)) {
225 unless(issurrogate($ord)){
228 poisoned2death($obj, "code point too high", $ord);
229 $str .= pack($endian, FBCHAR);
231 $str .= pack($endian.2, ensurrogate($ord));
233 }else{ # not supposed to happen
234 $str .= pack($endian, FBCHAR);
237 $str .= pack($endian, $ord);
244 my ($size, $bom) = @_;
245 my $N = $size == 2 ? 'n' : 'N';
246 my $ord = unpack($N, $bom);
247 return ($ord eq BOM_BE) ? $N :
248 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
254 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
256 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
264 Encode::Unicode -- Various Unicode Transform Format
270 use Encode qw/encode decode/;
271 $ucs2 = encode("UCS-2BE", $utf8);
272 $utf8 = decode("UCS-2BE", $ucs2);
276 This module implements all Character Encoding Schemes of Unicode that
277 are officially documented by Unicode Consortium (except, of course,
278 for UTF-8, which is a native format in perl).
282 =item L<http://www.unicode.org/glossary/> says:
284 I<Character Encoding Scheme> A character encoding form plus byte
285 serialization. There are seven character encoding schemes in Unicode:
286 UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
288 =item Quick Reference
290 Decodes from ord(N) Encodes chr(N) to...
291 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
292 ---------------+-----------------+------------------------------
293 UCS-2BE 2 N N is bogus Not Available
294 UCS-2LE 2 N N bogus Not Available
295 UTF-16 2/4 Y Y is S.P S.P BE/LE
296 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
297 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
298 UTF-32 4 Y - is bogus As is BE/LE
299 UTF-32BE 4 N - bogus As is 0x0010abcd
300 UTF-32LE 4 N - bogus As is 0xcdab1000
301 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
302 ---------------+-----------------+------------------------------
306 =head1 Size, Endianness, and BOM
308 You can categorize these CES by 3 criteria; Size of each character,
309 Endianness, and Byte Order Mark.
313 UCS-2 is a fixed-length encoding with each character taking 16 bits.
314 It B<does not> support I<Surrogate Pair>. When surrogate pair is
315 encountered during decode(), it fills its place with \xFFFD without
316 I<CHECK> or croaks if I<CHECK>. When a character which ord value is
317 larger than 0xFFFF, it uses 0xFFFD without I<CHECK> or croaks if
320 UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pair>.
321 When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
322 following low surrogate (0xDC00-0xDFFF), C<desurrogate> them to form a
323 character. Bogus surrogates result in death. When \x{10000} or above
324 is encountered during encode(), it C<ensurrogate>s them and push the
325 surrogate pair to the output stream.
327 UTF-32 is a fixed-length encoding with each character taking 32 bits.
328 Since it is 32-bit there is no need for I<Surrogate Pair>.
332 First (and now failed) goal of Unicode was to map all character
333 repartories into a fixed-length integer so programmers are happy.
334 Since each character is either I<short> or I<long> in C, you have to
335 put endianness of each platform when you pass data to one another.
337 Anything marked as BE is Big Endian (or network byte order) and LE is
338 Little Endian (aka VAX byte order). For anything without, a character
339 called Byte Order Mark (BOM) is prepended to the head of string.
346 -------------------------
349 -------------------------
353 This modules handles BOM as follows.
359 When BE or LE is explicitly stated as the name of encoding, BOM is
360 simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
364 When BE or LE is omitted during decode(), it checks if BOM is in the
365 beginning of the string and if found endianness is set to what BOM
366 says. if not found, dies.
370 When BE or LE is omitted during encode(), it returns a BE-encoded
371 string with BOM prepended. So when you want to encode a whole text
372 file, make sure you encode() by whole text, not line by line or each
373 line, not file, is prepended with BOMs.
377 C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE.
378 UCS-2 is already registered by IANA and others that way.
381 =head1 The Surrogate Pair
383 To say the least, surrogate pair was the biggest mistake by Unicode
384 Consortium. I don't give a darn if they admit it or not. But
385 according to late Douglas Adams in I<The Hitchhiker's Guide to the
386 Galaxy> Triology, C<First the Universe was created and it was a bad
387 move>. Their mistake was not this magnitude so let's forgive them.
389 (I don't dare make any comparison with Unicode Consortium and the
390 Vogons here ;) Or, comparing Encode to Babel Fish is completely
391 appropriate -- if you can only stick this into your ear :)
393 A surrogate pair was born when Unicode Consortium had finally
394 admitted that 16 bit was not big enough to hold all the world's
395 character repartorie. But they have already made UCS-2 16-bit. What
398 Back then 0xD800-0xDFFF was not allocated. Let's split them half and
399 use the first half to represent C<upper half of a character> and the
400 latter C<lower half of a character>. That way you can represent 1024
401 * 1024 = 1048576 more characters. Now we can store character ranges
402 up to \x{10ffff} even with 16-bit encodings. This pair of
403 half-character is now called a I<Surrogate Pair> and UTF-16 is the
404 name of encoding that embraces them.
406 Here is a fomula to ensurrogate a Unicode character \x{10000} and
409 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
410 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
414 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
416 Note this move has made \x{D800}-\x{DFFF} forbidden zone but perl
417 does not prohibit them for uses.
421 L<Encode>, L<http://www.unicode.org/glossary/>