ext/Encode/Unicode/Unicode.pm

   1 package Encode::Unicode;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   7
   8 use XSLoader;
   9 XSLoader::load(__PACKAGE__,$VERSION);
  10
  11 #
  12 # Object Generator 8 transcoders all at once!
  13 #
  14
  15 require Encode;
  16 for my $name (qw(UTF-16 UTF-16BE UTF-16LE
  17                  UTF-32 UTF-32BE UTF-32LE
  18                         UCS-2BE  UCS-2LE))
  19 {
  20     my ($size, $endian, $ucs2, $mask);
  21     $name =~ /^(\w+)-(\d+)(\w*)$/o;
  22     if ($ucs2 = ($1 eq 'UCS')){
  23         $size = 2;
  24     }else{
  25         $size = $2/8;
  26     }
  27     $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
  28     $size == 4 and $endian = uc($endian);
  29
  30     $Encode::Encoding{$name} =
  31         bless {
  32                Name   =>   $name,
  33                size   =>   $size,
  34                endian => $endian,
  35                ucs2   =>   $ucs2,
  36               } => __PACKAGE__;
  37
  38 }
  39
  40 sub name { shift->{'Name'} }
  41 sub new_sequence
  42 {
  43     my $self = shift;
  44     # Return the original if endian known
  45     return $self if ($self->{endian});
  46     # Return a clone
  47     return bless {%$self},ref($self);
  48 }
  49
  50
  51 #
  52 # three implementation of (en|de)code exist.  XS version is the fastest.
  53 # *_modern use # an array and *_classic stick with substr.  *_classic is
  54 #  much slower but more memory conservative.  *_xs is default.
  55
  56 sub set_transcoder{
  57     no warnings qw(redefine);
  58     my $type = shift;
  59     if    ($type eq "xs"){
  60         *decode = \&decode_xs;
  61         *encode = \&encode_xs;
  62     }elsif($type eq "modern"){
  63         *decode = \&decode_modern;
  64         *encode = \&encode_modern;
  65     }elsif($type eq "classic"){
  66         *decode = \&decode_classic;
  67         *encode = \&encode_classic;
  68     }else{
  69         require Carp;
  70         Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
  71     }
  72 }
  73
  74 set_transcoder("xs");
  75
  76 #
  77 # Aux. subs & constants
  78 #
  79
  80 sub FBCHAR(){ 0xFFFd }
  81 sub BOM_BE(){ 0xFeFF }
  82 sub BOM16LE(){ 0xFFFe }
  83 sub BOM32LE(){ 0xFFFe0000 }
  84
  85 sub valid_ucs2($){
  86     return
  87         (0 <= $_[0] && $_[0] < 0xD800)
  88             ||  ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
  89 }
  90
  91 sub issurrogate($){   0xD800 <= $_[0]  && $_[0] <= 0xDFFF }
  92 sub isHiSurrogate($){ 0xD800 <= $_[0]  && $_[0] <  0xDC00 }
  93 sub isLoSurrogate($){ 0xDC00 <= $_[0]  && $_[0] <= 0xDFFF }
  94
  95 sub ensurrogate($){
  96     use integer; # we have divisions
  97     my $uni = shift;
  98     my  $hi = ($uni - 0x10000) / 0x400 + 0xD800;
  99     my  $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
 100     return ($hi, $lo);
 101 }
 102
 103 sub desurrogate($$){
 104     my ($hi, $lo) = @_;
 105     return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
 106 }
 107
 108 sub Mask { {2 => 0xffff,  4 => 0xffffffff} }
 109
 110 #
 111 # *_modern are much faster but guzzle more memory
 112 #
 113
 114 sub decode_modern($$;$)
 115 {
 116     my ($obj, $str, $chk ) = @_;
 117     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 118
 119     # warn "$size, $endian, $ucs2";
 120     $endian ||= BOMB($size, substr($str, 0, $size, ''))
 121         or poisoned2death($obj, "Where's the BOM?");
 122     my  $mask = Mask->{$size};
 123     my $utf8   = '';
 124     my @ord = unpack("$endian*", $str);
 125     undef $str; # to conserve memory
 126     while (@ord){
 127         my $ord = shift @ord;
 128         unless ($size == 4 or valid_ucs2($ord &= $mask)){
 129             if ($ucs2){
 130                 $chk and
 131                     poisoned2death($obj, "no surrogates allowed", $ord);
 132                 shift @ord; # skip the next one as well
 133                 $ord = FBCHAR;
 134             }else{
 135                 unless (isHiSurrogate($ord)){
 136                     poisoned2death($obj, "Malformed HI surrogate", $ord);
 137                 }
 138                 my $lo = shift @ord;
 139                 unless (isLoSurrogate($lo &= $mask)){
 140                     poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
 141                 }
 142                 $ord = desurrogate($ord, $lo);
 143             }
 144         }
 145         $utf8 .= chr($ord);
 146     }
 147     utf8::upgrade($utf8);
 148     return $utf8;
 149 }
 150
 151 sub encode_modern($$;$)
 152 {
 153     my ($obj, $utf8, $chk) = @_;
 154     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 155     my @str = ();
 156     unless ($endian){
 157         $endian = ($size == 4) ? 'N' : 'n';
 158         push @str, BOM_BE;
 159     }
 160     my @ord = unpack("U*", $utf8);
 161     undef $utf8; # to conserve memory
 162     for my $ord (@ord){
 163         unless ($size == 4 or valid_ucs2($ord)) {
 164             unless(issurrogate($ord)){
 165                 if ($ucs2){
 166                     $chk and
 167                         poisoned2death($obj, "code point too high", $ord);
 168
 169                     push @str, FBCHAR;
 170                 }else{
 171
 172                     push @str, ensurrogate($ord);
 173                 }
 174             }else{  # not supposed to happen
 175                 push @str, FBCHAR;
 176             }
 177         }else{
 178             push @str, $ord;
 179         }
 180     }
 181     return pack("$endian*", @str);
 182 }
 183
 184 #
 185 # *_classic are slower but more memory conservative
 186 #
 187
 188 sub decode_classic($$;$)
 189 {
 190     my ($obj, $str, $chk ) = @_;
 191     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 192
 193     # warn "$size, $endian, $ucs2";
 194     $endian ||= BOMB($size, substr($str, 0, $size, ''))
 195         or poisoned2death($obj, "Where's the BOM?");
 196     my  $mask = Mask->{$size};
 197     my $utf8   = '';
 198     my @ord = unpack("$endian*", $str);
 199     while (length($str)){
 200          my $ord = unpack($endian, substr($str, 0, $size, ''));
 201         unless ($size == 4 or valid_ucs2($ord &= $mask)){
 202             if ($ucs2){
 203                 $chk and
 204                     poisoned2death($obj, "no surrogates allowed", $ord);
 205                 substr($str,0,$size,''); # skip the next one as well
 206                 $ord = FBCHAR;
 207             }else{
 208                 unless (isHiSurrogate($ord)){
 209                     poisoned2death($obj, "Malformed HI surrogate", $ord);
 210                 }
 211                 my $lo = unpack($endian ,substr($str,0,$size,''));
 212                 unless (isLoSurrogate($lo &= $mask)){
 213                     poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
 214                 }
 215                 $ord = desurrogate($ord, $lo);
 216             }
 217         }
 218         $utf8 .= chr($ord);
 219     }
 220     utf8::upgrade($utf8);
 221     return $utf8;
 222 }
 223
 224 sub encode_classic($$;$)
 225 {
 226     my ($obj, $utf8, $chk) = @_;
 227     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 228     # warn join ", ", $size, $ucs2, $endian, $mask;
 229     my $str   = '';
 230     unless ($endian){
 231         $endian = ($size == 4) ? 'N' : 'n';
 232         $str .= pack($endian, BOM_BE);
 233     }
 234     while (length($utf8)){
 235         my $ord  = ord(substr($utf8,0,1,''));
 236         unless ($size == 4 or valid_ucs2($ord)) {
 237             unless(issurrogate($ord)){
 238                 if ($ucs2){
 239                     $chk and
 240                         poisoned2death($obj, "code point too high", $ord);
 241                     $str .= pack($endian, FBCHAR);
 242                 }else{
 243                     $str .= pack($endian.2, ensurrogate($ord));
 244                 }
 245             }else{  # not supposed to happen
 246                 $str .= pack($endian, FBCHAR);
 247             }
 248         }else{
 249             $str .= pack($endian, $ord);
 250         }
 251     }
 252     return $str;
 253 }
 254
 255 sub BOMB {
 256     my ($size, $bom) = @_;
 257     my $N = $size == 2 ? 'n' : 'N';
 258     my $ord = unpack($N, $bom);
 259     return ($ord eq BOM_BE) ? $N :
 260         ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
 261 }
 262
 263 sub poisoned2death{
 264     my $obj = shift;
 265     my $msg = shift;
 266     my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
 267     require Carp;
 268     Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
 269 }
 270
 271 1;
 272 __END__
 273
 274 =head1 NAME
 275
 276 Encode::Unicode -- Various Unicode Transform Format
 277
 278 =cut
 279
 280 =head1 SYNOPSIS
 281
 282     use Encode qw/encode decode/;
 283     $ucs2 = encode("UCS-2BE", $utf8);
 284     $utf8 = decode("UCS-2BE", $ucs2);
 285
 286 =head1 ABSTRACT
 287
 288 This module implements all Character Encoding Schemes of Unicode that
 289 are officially documented by Unicode Consortium (except, of course,
 290 for UTF-8, which is a native format in perl).
 291
 292 =over 4
 293
 294 =item L<http://www.unicode.org/glossary/> says:
 295
 296 I<Character Encoding Scheme> A character encoding form plus byte
 297 serialization. There are seven character encoding schemes in Unicode:
 298 UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
 299
 300 =item Quick Reference
 301
 302                 Decodes from ord(N)           Encodes chr(N) to...
 303        octet/char BOM S.P d800-dfff  ord > 0xffff     \x{1abcd} ==
 304   ---------------+-----------------+------------------------------
 305   UCS-2BE       2   N   N  is bogus                  Not Available
 306   UCS-2LE       2   N   N     bogus                  Not Available
 307   UTF-16      2/4   Y   Y  is   S.P           S.P            BE/LE
 308   UTF-16BE    2/4   N   Y       S.P           S.P    0xd82a,0xdfcd
 309   UTF-16LE      2   N   Y       S.P           S.P    0x2ad8,0xcddf
 310   UTF-32        4   Y   -  is bogus         As is            BE/LE
 311   UTF-32BE      4   N   -     bogus         As is       0x0010abcd
 312   UTF-32LE      4   N   -     bogus         As is       0xcdab1000
 313   UTF-8       1-4   -   -     bogus   >= 4 octets   \xf0\x9a\af\8d
 314   ---------------+-----------------+------------------------------
 315
 316 =back
 317
 318 =head1 Size, Endianness, and BOM
 319
 320 You can categorize these CES by 3 criteria;  Size of each character,
 321 Endianness, and Byte Order Mark.
 322
 323 =head2 by Size
 324
 325 UCS-2 is a fixed-length encoding with each character taking 16 bits.
 326 It B<does not> support I<Surrogate Pairs>.  When a surrogate pair is
 327 encountered during decode(), its place is filled with \xFFFD without
 328 I<CHECK> or croaks if I<CHECK>.  When a character whose ord value is
 329 larger than 0xFFFF is encountered, it uses 0xFFFD without I<CHECK> or
 330 croaks if <CHECK>.
 331
 332 UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pairs>.
 333 When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
 334 following low surrogate (0xDC00-0xDFFF), C<desurrogate>s them to form a
 335 character.  Bogus surrogates result in death.  When \x{10000} or above
 336 is encountered during encode(), it C<ensurrogate>s them and pushes the
 337 surrogate pair to the output stream.
 338
 339 UTF-32 is a fixed-length encoding with each character taking 32 bits.
 340 Since it is 32-bit there is no need for I<Surrogate Pairs>.
 341
 342 =head2 by Endianness
 343
 344 First (and now failed) goal of Unicode was to map all character
 345 repertories into a fixed-length integer so programmers are happy.
 346 Since each character is either I<short> or I<long> in C, you have to
 347 put endianness of each platform when you pass data to one another.
 348
 349 Anything marked as BE is Big Endian (or network byte order) and LE is
 350 Little Endian (aka VAX byte order).  For anything without, a character
 351 called Byte Order Mark (BOM) is prepended to the head of string.
 352
 353 =over 4
 354
 355 =item BOM as integer when fetched in network byte order
 356
 357               16         32 bits/char
 358   -------------------------
 359   BE      0xFeFF 0x0000FeFF
 360   LE      0xFFeF 0xFFFe0000
 361   -------------------------
 362
 363 =back
 364
 365 This modules handles BOM as follows.
 366
 367 =over 4
 368
 369 =item *
 370
 371 When BE or LE is explicitly stated as the name of encoding, BOM is
 372 simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
 373
 374 =item *
 375
 376 When BE or LE is omitted during decode(), it checks if BOM is in the
 377 beginning of the string and if found endianness is set to what BOM
 378 says.  If not found, dies.
 379
 380 =item *
 381
 382 When BE or LE is omitted during encode(), it returns a BE-encoded
 383 string with BOM prepended.  So when you want to encode a whole text
 384 file, make sure you encode() by whole text, not line by line or each
 385 line, not file, is prepended with BOMs.
 386
 387 =item *
 388
 389 C<UCS-2> is an exception.  Unlike others this is an alias of UCS-2BE.
 390 UCS-2 is already registered by IANA and others that way.
 391
 392 =back
 393
 394 =head1 Surrogate Pairs
 395
 396 To say the least, surrogate pairs were the biggest mistake of the
 397 Unicode Consortium.  But according to the late Douglas Adams in I<The
 398 Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
 399 Universe was created. This has made a lot of people very angry and
 400 been widely regarded as a bad move>.  Their mistake was not of this
 401 magnitude so let's forgive them.
 402
 403 (I don't dare make any comparison with Unicode Consortium and the
 404 Vogons here ;)  Or, comparing Encode to Babel Fish is completely
 405 appropriate -- if you can only stick this into your ear :)
 406
 407 Surrogate pairs were born when Unicode Consortium finally
 408 admitted that 16 bits were not big enough to hold all the world's
 409 character repertoire.  But they have already made UCS-2 16-bit.  What
 410 do we do?
 411
 412 Back then 0xD800-0xDFFF was not allocated.  Let's split them half and
 413 use the first half to represent C<upper half of a character> and the
 414 latter C<lower half of a character>.  That way you can represent 1024
 415 * 1024 = 1048576 more characters.  Now we can store character ranges
 416 up to \x{10ffff} even with 16-bit encodings.  This pair of
 417 half-character is now called a I<Surrogate Pair> and UTF-16 is the
 418 name of the encoding that embraces them.
 419
 420 Here is a formula to ensurrogate a Unicode character \x{10000} and
 421 above;
 422
 423   $hi = ($uni - 0x10000) / 0x400 + 0xD800;
 424   $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
 425
 426 And to desurrogate;
 427
 428  $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
 429
 430 Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
 431 perl does not prohibit the use of characters within this range.  To perl,
 432 every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
 433
 434   (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
 435   integer support! (**)
 436
 437   (**) Is anything beyond \x{11_0000} still Unicode :?
 438
 439 =head1 SEE ALSO
 440
 441 L<Encode>, L<http://www.unicode.org/glossary/>,
 442
 443 RFC 2781 L<http://rfc.net/rfc2781.html>,
 444
 445 L<http://www.unicode.org/unicode/faq/utf_bom.html>
 446
 447 Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
 448 by Larry Wall, Tom Christiansen, Jon Orwant;
 449 O'Reilly & Associates; ISBN 0-596-00027-8
 450
 451 =cut