ext/Encode/Unicode/Unicode.pm

   1 package Encode::Unicode;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   7
   8 use XSLoader;
   9 XSLoader::load(__PACKAGE__,$VERSION);
  10
  11 #
  12 # Object Generator 8 transcoders all at once!
  13 #
  14
  15 require Encode;
  16 for my $name (qw(UTF-16 UTF-16BE UTF-16LE
  17                  UTF-32 UTF-32BE UTF-32LE
  18                         UCS-2BE  UCS-2LE))
  19 {
  20     my ($size, $endian, $ucs2, $mask);
  21     $name =~ /^(\w+)-(\d+)(\w*)$/o;
  22     if ($ucs2 = ($1 eq 'UCS')){
  23         $size = 2;
  24     }else{
  25         $size = $2/8;
  26     }
  27     $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
  28     $size == 4 and $endian = uc($endian);
  29
  30     $Encode::Encoding{$name} =
  31         bless {
  32                Name   =>   $name,
  33                size   =>   $size,
  34                endian => $endian,
  35                ucs2   =>   $ucs2,
  36               } => __PACKAGE__;
  37
  38 }
  39
  40 sub name { shift->{'Name'} }
  41 sub new_sequence
  42 {
  43     my $self = shift;
  44     # Return the original if endian known
  45     return $self if ($self->{endian});
  46     # Return a clone
  47     return bless {%$self},ref($self);
  48 }
  49
  50 sub needs_lines { 0 };
  51
  52 sub perlio_ok {
  53     eval{ require PerlIO::encoding };
  54     if ($@){
  55         return 0;
  56     }else{
  57         return 1;
  58     }
  59 }
  60
  61
  62 #
  63 # three implementations of (en|de)code exist.  The XS version is the
  64 # fastest.  *_modern uses an array and *_classic sticks with substr.
  65 # *_classic is  much slower but more memory conservative.
  66 # *_xs is the default.
  67
  68 sub set_transcoder{
  69     no warnings qw(redefine);
  70     my $type = shift;
  71     if    ($type eq "xs"){
  72         *decode = \&decode_xs;
  73         *encode = \&encode_xs;
  74     }elsif($type eq "modern"){
  75         *decode = \&decode_modern;
  76         *encode = \&encode_modern;
  77     }elsif($type eq "classic"){
  78         *decode = \&decode_classic;
  79         *encode = \&encode_classic;
  80     }else{
  81         require Carp;
  82         Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
  83     }
  84 }
  85
  86 set_transcoder("xs");
  87
  88 #
  89 # Aux. subs & constants
  90 #
  91
  92 sub FBCHAR(){ 0xFFFd }
  93 sub BOM_BE(){ 0xFeFF }
  94 sub BOM16LE(){ 0xFFFe }
  95 sub BOM32LE(){ 0xFFFe0000 }
  96
  97 sub valid_ucs2($){
  98     return
  99         (0 <= $_[0] && $_[0] < 0xD800)
 100             ||  ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
 101 }
 102
 103 sub issurrogate($){   0xD800 <= $_[0]  && $_[0] <= 0xDFFF }
 104 sub isHiSurrogate($){ 0xD800 <= $_[0]  && $_[0] <  0xDC00 }
 105 sub isLoSurrogate($){ 0xDC00 <= $_[0]  && $_[0] <= 0xDFFF }
 106
 107 sub ensurrogate($){
 108     use integer; # we have divisions
 109     my $uni = shift;
 110     my  $hi = ($uni - 0x10000) / 0x400 + 0xD800;
 111     my  $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
 112     return ($hi, $lo);
 113 }
 114
 115 sub desurrogate($$){
 116     my ($hi, $lo) = @_;
 117     return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
 118 }
 119
 120 sub Mask { {2 => 0xffff,  4 => 0xffffffff} }
 121
 122 #
 123 # *_modern are much faster but guzzle more memory
 124 #
 125
 126 sub decode_modern($$;$)
 127 {
 128     my ($obj, $str, $chk ) = @_;
 129     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 130
 131     # warn "$size, $endian, $ucs2";
 132     $endian ||= BOMB($size, substr($str, 0, $size, ''))
 133         or poisoned2death($obj, "Where's the BOM?");
 134     my  $mask = Mask->{$size};
 135     my $utf8   = '';
 136     my @ord = unpack("$endian*", $str);
 137     undef $str; # to conserve memory
 138     while (@ord){
 139         my $ord = shift @ord;
 140         unless ($size == 4 or valid_ucs2($ord &= $mask)){
 141             if ($ucs2){
 142                 $chk and
 143                     poisoned2death($obj, "no surrogates allowed", $ord);
 144                 shift @ord; # skip the next one as well
 145                 $ord = FBCHAR;
 146             }else{
 147                 unless (isHiSurrogate($ord)){
 148                     poisoned2death($obj, "Malformed HI surrogate", $ord);
 149                 }
 150                 my $lo = shift @ord;
 151                 unless (isLoSurrogate($lo &= $mask)){
 152                     poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
 153                 }
 154                 $ord = desurrogate($ord, $lo);
 155             }
 156         }
 157         $utf8 .= chr($ord);
 158     }
 159     utf8::upgrade($utf8);
 160     return $utf8;
 161 }
 162
 163 sub encode_modern($$;$)
 164 {
 165     my ($obj, $utf8, $chk) = @_;
 166     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 167     my @str = ();
 168     unless ($endian){
 169         $endian = ($size == 4) ? 'N' : 'n';
 170         push @str, BOM_BE;
 171     }
 172     my @ord = unpack("U*", $utf8);
 173     undef $utf8; # to conserve memory
 174     for my $ord (@ord){
 175         unless ($size == 4 or valid_ucs2($ord)) {
 176             unless(issurrogate($ord)){
 177                 if ($ucs2){
 178                     $chk and
 179                         poisoned2death($obj, "code point too high", $ord);
 180
 181                     push @str, FBCHAR;
 182                 }else{
 183
 184                     push @str, ensurrogate($ord);
 185                 }
 186             }else{  # not supposed to happen
 187                 push @str, FBCHAR;
 188             }
 189         }else{
 190             push @str, $ord;
 191         }
 192     }
 193     return pack("$endian*", @str);
 194 }
 195
 196 #
 197 # *_classic are slower but more memory conservative
 198 #
 199
 200 sub decode_classic($$;$)
 201 {
 202     my ($obj, $str, $chk ) = @_;
 203     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 204
 205     # warn "$size, $endian, $ucs2";
 206     $endian ||= BOMB($size, substr($str, 0, $size, ''))
 207         or poisoned2death($obj, "Where's the BOM?");
 208     my  $mask = Mask->{$size};
 209     my $utf8   = '';
 210     my @ord = unpack("$endian*", $str);
 211     while (length($str)){
 212          my $ord = unpack($endian, substr($str, 0, $size, ''));
 213         unless ($size == 4 or valid_ucs2($ord &= $mask)){
 214             if ($ucs2){
 215                 $chk and
 216                     poisoned2death($obj, "no surrogates allowed", $ord);
 217                 substr($str,0,$size,''); # skip the next one as well
 218                 $ord = FBCHAR;
 219             }else{
 220                 unless (isHiSurrogate($ord)){
 221                     poisoned2death($obj, "Malformed HI surrogate", $ord);
 222                 }
 223                 my $lo = unpack($endian ,substr($str,0,$size,''));
 224                 unless (isLoSurrogate($lo &= $mask)){
 225                     poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
 226                 }
 227                 $ord = desurrogate($ord, $lo);
 228             }
 229         }
 230         $utf8 .= chr($ord);
 231     }
 232     utf8::upgrade($utf8);
 233     return $utf8;
 234 }
 235
 236 sub encode_classic($$;$)
 237 {
 238     my ($obj, $utf8, $chk) = @_;
 239     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 240     # warn join ", ", $size, $ucs2, $endian, $mask;
 241     my $str   = '';
 242     unless ($endian){
 243         $endian = ($size == 4) ? 'N' : 'n';
 244         $str .= pack($endian, BOM_BE);
 245     }
 246     while (length($utf8)){
 247         my $ord  = ord(substr($utf8,0,1,''));
 248         unless ($size == 4 or valid_ucs2($ord)) {
 249             unless(issurrogate($ord)){
 250                 if ($ucs2){
 251                     $chk and
 252                         poisoned2death($obj, "code point too high", $ord);
 253                     $str .= pack($endian, FBCHAR);
 254                 }else{
 255                     $str .= pack($endian.2, ensurrogate($ord));
 256                 }
 257             }else{  # not supposed to happen
 258                 $str .= pack($endian, FBCHAR);
 259             }
 260         }else{
 261             $str .= pack($endian, $ord);
 262         }
 263     }
 264     return $str;
 265 }
 266
 267 sub BOMB {
 268     my ($size, $bom) = @_;
 269     my $N = $size == 2 ? 'n' : 'N';
 270     my $ord = unpack($N, $bom);
 271     return ($ord eq BOM_BE) ? $N :
 272         ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
 273 }
 274
 275 sub poisoned2death{
 276     my $obj = shift;
 277     my $msg = shift;
 278     my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
 279     require Carp;
 280     Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
 281 }
 282
 283 1;
 284 __END__
 285
 286 =head1 NAME
 287
 288 Encode::Unicode -- Various Unicode Transformation Formats
 289
 290 =cut
 291
 292 =head1 SYNOPSIS
 293
 294     use Encode qw/encode decode/;
 295     $ucs2 = encode("UCS-2BE", $utf8);
 296     $utf8 = decode("UCS-2BE", $ucs2);
 297
 298 =head1 ABSTRACT
 299
 300 This module implements all Character Encoding Schemes of Unicode that
 301 are officially documented by Unicode Consortium (except, of course,
 302 for UTF-8, which is a native format in perl).
 303
 304 =over 4
 305
 306 =item L<http://www.unicode.org/glossary/> says:
 307
 308 I<Character Encoding Scheme> A character encoding form plus byte
 309 serialization. There are seven character encoding schemes in Unicode:
 310 UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
 311
 312 =item Quick Reference
 313
 314                 Decodes from ord(N)           Encodes chr(N) to...
 315        octet/char BOM S.P d800-dfff  ord > 0xffff     \x{1abcd} ==
 316   ---------------+-----------------+------------------------------
 317   UCS-2BE       2   N   N  is bogus                  Not Available
 318   UCS-2LE       2   N   N     bogus                  Not Available
 319   UTF-16      2/4   Y   Y  is   S.P           S.P            BE/LE
 320   UTF-16BE    2/4   N   Y       S.P           S.P    0xd82a,0xdfcd
 321   UTF-16LE      2   N   Y       S.P           S.P    0x2ad8,0xcddf
 322   UTF-32        4   Y   -  is bogus         As is            BE/LE
 323   UTF-32BE      4   N   -     bogus         As is       0x0001abcd
 324   UTF-32LE      4   N   -     bogus         As is       0xcdab0100
 325   UTF-8       1-4   -   -     bogus   >= 4 octets   \xf0\x9a\af\8d
 326   ---------------+-----------------+------------------------------
 327
 328 =back
 329
 330 =head1 Size, Endianness, and BOM
 331
 332 You can categorize these CES by 3 criteria:  size of each character,
 333 endianness, and Byte Order Mark.
 334
 335 =head2 by size
 336
 337 UCS-2 is a fixed-length encoding with each character taking 16 bits.
 338 It B<does not> support I<surrogate pairs>.  When a surrogate pair
 339 is encountered during decode(), its place is filled with \x{FFFD}
 340 if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1.  When a
 341 character whose ord value is larger than 0xFFFF is encountered,
 342 its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
 343 croaks if I<CHECK> is 1.
 344
 345 UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
 346 When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
 347 following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
 348 form a character.  Bogus surrogates result in death.  When \x{10000}
 349 or above is encountered during encode(), it C<ensurrogate>s them and
 350 pushes the surrogate pair to the output stream.
 351
 352 UTF-32 is a fixed-length encoding with each character taking 32 bits.
 353 Since it is 32-bit, there is no need for I<surrogate pairs>.
 354
 355 =head2 by endianness
 356
 357 The first (and now failed) goal of Unicode was to map all character
 358 repertoires into a fixed-length integer so that programmers are happy.
 359 Since each character is either a I<short> or I<long> in C, you have to
 360 pay attention to the endianness of each platform when you pass data
 361 to one another.
 362
 363 Anything marked as BE is Big Endian (or network byte order) and LE is
 364 Little Endian (aka VAX byte order).  For anything not marked either
 365 BE or LE, a character called Byte Order Mark (BOM) indicating the
 366 endianness is prepended to the string.
 367
 368 =over 4
 369
 370 =item BOM as integer when fetched in network byte order
 371
 372               16         32 bits/char
 373   -------------------------
 374   BE      0xFeFF 0x0000FeFF
 375   LE      0xFFeF 0xFFFe0000
 376   -------------------------
 377
 378 =back
 379
 380 This modules handles the BOM as follows.
 381
 382 =over 4
 383
 384 =item *
 385
 386 When BE or LE is explicitly stated as the name of encoding, BOM is
 387 simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
 388
 389 =item *
 390
 391 When BE or LE is omitted during decode(), it checks if BOM is at the
 392 beginning of the string; if one is found, the endianness is set to
 393 what the BOM says.  If no BOM is found, the routine dies.
 394
 395 =item *
 396
 397 When BE or LE is omitted during encode(), it returns a BE-encoded
 398 string with BOM prepended.  So when you want to encode a whole text
 399 file, make sure you encode() the whole text at once, not line by line
 400 or each line, not file, will have a BOM prepended.
 401
 402 =item *
 403
 404 C<UCS-2> is an exception.  Unlike others, this is an alias of UCS-2BE.
 405 UCS-2 is already registered by IANA and others that way.
 406
 407 =back
 408
 409 =head1 Surrogate Pairs
 410
 411 To say the least, surrogate pairs were the biggest mistake of the
 412 Unicode Consortium.  But according to the late Douglas Adams in I<The
 413 Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
 414 Universe was created. This has made a lot of people very angry and
 415 been widely regarded as a bad move>.  Their mistake was not of this
 416 magnitude so let's forgive them.
 417
 418 (I don't dare make any comparison with Unicode Consortium and the
 419 Vogons here ;)  Or, comparing Encode to Babel Fish is completely
 420 appropriate -- if you can only stick this into your ear :)
 421
 422 Surrogate pairs were born when the Unicode Consortium finally
 423 admitted that 16 bits were not big enough to hold all the world's
 424 character repertoires.  But they already made UCS-2 16-bit.  What
 425 do we do?
 426
 427 Back then, the range 0xD800-0xDFFF was not allocated.  Let's split
 428 that range in half and use the first half to represent the C<upper
 429 half of a character> and the second half to represent the C<lower
 430 half of a character>.  That way, you can represent 1024 * 1024 =
 431 1048576 more characters.  Now we can store character ranges up to
 432 \x{10ffff} even with 16-bit encodings.  This pair of half-character is
 433 now called a I<surrogate pair> and UTF-16 is the name of the encoding
 434 that embraces them.
 435
 436 Here is a formula to ensurrogate a Unicode character \x{10000} and
 437 above;
 438
 439   $hi = ($uni - 0x10000) / 0x400 + 0xD800;
 440   $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
 441
 442 And to desurrogate;
 443
 444  $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
 445
 446 Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
 447 perl does not prohibit the use of characters within this range.  To perl,
 448 every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
 449
 450   (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
 451   integer support!
 452
 453 =head1 SEE ALSO
 454
 455 L<Encode>, L<http://www.unicode.org/glossary/>,
 456
 457 RFC 2781 L<http://rfc.net/rfc2781.html>,
 458
 459 L<http://www.unicode.org/unicode/faq/utf_bom.html>
 460
 461 Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
 462 by Larry Wall, Tom Christiansen, Jon Orwant;
 463 O'Reilly & Associates; ISBN 0-596-00027-8
 464
 465 =cut