ext/Encode/lib/Encode/Unicode.pm

   1 package Encode::Unicode;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = do { my @r = (q$Revision: 1.29 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   7
   8 #
   9 # Aux. subs & constants
  10 #
  11
  12 sub FBCHAR(){ 0xFFFd }
  13 sub BOM_BE(){ 0xFeFF }
  14 sub BOM16LE(){ 0xFFFe }
  15 sub BOM32LE(){ 0xFFFe0000 }
  16
  17 sub valid_ucs2($){
  18     return
  19         (0 <= $_[0] && $_[0] < 0xD800)
  20             ||  ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
  21 }
  22
  23 sub issurrogate($){   0xD800 <= $_[0]  && $_[0] <= 0xDFFF }
  24 sub isHiSurrogate($){ 0xD800 <= $_[0]  && $_[0] <  0xDC00 }
  25 sub isLoSurrogate($){ 0xDC00 <= $_[0]  && $_[0] <= 0xDFFF }
  26
  27 sub ensurrogate($){
  28     use integer; # we have divisions
  29     my $uni = shift;
  30     my  $hi = ($uni - 0x10000) / 0x400 + 0xD800;
  31     my  $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
  32     return ($hi, $lo);
  33 }
  34
  35 sub desurrogate($$){
  36     my ($hi, $lo) = @_;
  37     return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
  38 }
  39
  40 sub Mask { {2 => 0xffff,  4 => 0xffffffff} }
  41
  42 #
  43 # Object Generator 8 transcoders all at once!
  44 #
  45
  46 require Encode;
  47 for my $name (qw(UTF-16 UTF-16BE UTF-16LE
  48                  UTF-32 UTF-32BE UTF-32LE
  49                         UCS-2BE  UCS-2LE))
  50 {
  51     my ($size, $endian, $ucs2, $mask);
  52     $name =~ /^(\w+)-(\d+)(\w*)$/o;
  53     if ($ucs2 = ($1 eq 'UCS')){
  54         $size = 2;
  55     }else{
  56         $size = $2/8;
  57     }
  58     $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
  59     $size == 4 and $endian = uc($endian);
  60
  61     $Encode::Encoding{$name} =
  62         bless {
  63                Name   =>   $name,
  64                size   =>   $size,
  65                endian => $endian,
  66                ucs2   =>   $ucs2,
  67               } => __PACKAGE__;
  68
  69 }
  70
  71 sub name { shift->{'Name'} }
  72 sub new_sequence { $_[0] };
  73
  74 #
  75 # two implementation of (en|de)code exist.  *_modern use
  76 # an array and *_classic stick with substr.  *_classic is much
  77 # slower but more memory conservative.  *_modern is default.
  78
  79 sub set_transcoder{
  80     no warnings qw(redefine);
  81     my $type = shift;
  82     if     ($type eq "modern"){
  83         *decode = \&decode_modern;
  84         *encode = \&encode_modern;
  85     }elsif($type eq "classic"){
  86         *decode = \&decode_classic;
  87         *encode = \&encode_classic;
  88     }else{
  89         require Carp;
  90         Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)";
  91     }
  92 }
  93
  94 set_transcoder("modern");
  95
  96 #
  97 # *_modern are much faster but guzzle more memory
  98 #
  99
 100 sub decode_modern
 101 {
 102     my ($obj, $str, $chk ) = @_;
 103     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 104
 105     # warn "$size, $endian, $ucs2";
 106     $endian ||= BOMB($size, substr($str, 0, $size, ''))
 107         or poisoned2death($obj, "Where's the BOM?");
 108     my  $mask = Mask->{$size};
 109     my $utf8   = '';
 110     my @ord = unpack("$endian*", $str);
 111     undef $str; # to conserve memory
 112     while (@ord){
 113         my $ord = shift @ord;
 114         unless ($size == 4 or valid_ucs2($ord &= $mask)){
 115             if ($ucs2){
 116                 $chk and
 117                     poisoned2death($obj, "no surrogates allowed", $ord);
 118                 shift @ord; # skip the next one as well
 119                 $ord = FBCHAR;
 120             }else{
 121                 unless (isHiSurrogate($ord)){
 122                     poisoned2death($obj, "Malformed HI surrogate", $ord);
 123                 }
 124                 my $lo = shift @ord;
 125                 unless (isLoSurrogate($lo &= $mask)){
 126                     poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
 127                 }
 128                 $ord = desurrogate($ord, $lo);
 129             }
 130         }
 131         $utf8 .= chr($ord);
 132     }
 133     utf8::upgrade($utf8);
 134     return $utf8;
 135 }
 136
 137 sub encode_modern
 138 {
 139     my ($obj, $utf8, $chk) = @_;
 140     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 141     my @str = ();
 142     unless ($endian){
 143         $endian = ($size == 4) ? 'N' : 'n';
 144         push @str, BOM_BE;
 145     }
 146     my @ord = unpack("U*", $utf8);
 147     undef $utf8; # to conserve memory
 148     for my $ord (@ord){
 149         unless ($size == 4 or valid_ucs2($ord)) {
 150             unless(issurrogate($ord)){
 151                 if ($ucs2){
 152                     $chk and
 153                         poisoned2death($obj, "code point too high", $ord);
 154
 155                     push @str, FBCHAR;
 156                 }else{
 157
 158                     push @str, ensurrogate($ord);
 159                 }
 160             }else{  # not supposed to happen
 161                 push @str, FBCHAR;
 162             }
 163         }else{
 164             push @str, $ord;
 165         }
 166     }
 167     return pack("$endian*", @str);
 168 }
 169
 170 #
 171 # *_classic are slower but more memory conservative
 172 #
 173
 174 sub decode_classic
 175 {
 176     my ($obj, $str, $chk ) = @_;
 177     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 178
 179     # warn "$size, $endian, $ucs2";
 180     $endian ||= BOMB($size, substr($str, 0, $size, ''))
 181         or poisoned2death($obj, "Where's the BOM?");
 182     my  $mask = Mask->{$size};
 183     my $utf8   = '';
 184     my @ord = unpack("$endian*", $str);
 185     while (length($str)){
 186          my $ord = unpack($endian, substr($str, 0, $size, ''));
 187         unless ($size == 4 or valid_ucs2($ord &= $mask)){
 188             if ($ucs2){
 189                 $chk and
 190                     poisoned2death($obj, "no surrogates allowed", $ord);
 191                 substr($str,0,$size,''); # skip the next one as well
 192                 $ord = FBCHAR;
 193             }else{
 194                 unless (isHiSurrogate($ord)){
 195                     poisoned2death($obj, "Malformed HI surrogate", $ord);
 196                 }
 197                 my $lo = unpack($endian ,substr($str,0,$size,''));
 198                 unless (isLoSurrogate($lo &= $mask)){
 199                     poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
 200                 }
 201                 $ord = desurrogate($ord, $lo);
 202             }
 203         }
 204         $utf8 .= chr($ord);
 205     }
 206     utf8::upgrade($utf8);
 207     return $utf8;
 208 }
 209
 210 sub encode_classic
 211 {
 212     my ($obj, $utf8, $chk) = @_;
 213     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 214     # warn join ", ", $size, $ucs2, $endian, $mask;
 215     my $str   = '';
 216     unless ($endian){
 217         $endian = ($size == 4) ? 'N' : 'n';
 218         $str .= pack($endian, BOM_BE);
 219     }
 220     while (length($utf8)){
 221         my $ord  = ord(substr($utf8,0,1,''));
 222         unless ($size == 4 or valid_ucs2($ord)) {
 223             unless(issurrogate($ord)){
 224                 if ($ucs2){
 225                     $chk and
 226                         poisoned2death($obj, "code point too high", $ord);
 227                     $str .= pack($endian, FBCHAR);
 228                 }else{
 229                     $str .= pack($endian.2, ensurrogate($ord));
 230                 }
 231             }else{  # not supposed to happen
 232                 $str .= pack($endian, FBCHAR);
 233             }
 234         }else{
 235             $str .= pack($endian, $ord);
 236         }
 237     }
 238     return $str;
 239 }
 240
 241 sub BOMB {
 242     my ($size, $bom) = @_;
 243     my $N = $size == 2 ? 'n' : 'N';
 244     my $ord = unpack($N, $bom);
 245     return ($ord eq BOM_BE) ? $N :
 246         ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
 247 }
 248
 249 sub poisoned2death{
 250     my $obj = shift;
 251     my $msg = shift;
 252     my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
 253     require Carp;
 254     Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
 255 }
 256
 257 1;
 258 __END__
 259
 260 =head1 NAME
 261
 262 Encode::Unicode -- Various Unicode Transform Format
 263
 264 =cut
 265
 266 =head1 SYNOPSIS
 267
 268     use Encode qw/encode decode/;
 269     $ucs2 = encode("UCS-2BE", $utf8);
 270     $utf8 = decode("UCS-2BE", $ucs2);
 271
 272 =head1 ABSTRACT
 273
 274 This module implements all Character Encoding Schemes of Unicode that
 275 are officially documented by Unicode Consortium (except, of course,
 276 for UTF-8, which is a native format in perl).
 277
 278 =over 4
 279
 280 =item L<http://www.unicode.org/glossary/> says:
 281
 282 I<Character Encoding Scheme> A character encoding form plus byte
 283 serialization. There are seven character encoding schemes in Unicode:
 284 UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
 285
 286 =item Quick Reference
 287
 288                 Decodes from ord(N)           Encodes chr(N) to...
 289        octet/char BOM S.P d800-dfff  ord > 0xffff     \x{1abcd} ==
 290   ---------------+-----------------+------------------------------
 291   UCS-2BE       2   N   N  is bogus                  Not Available
 292   UCS-2LE       2   N   N     bogus                  Not Available
 293   UTF-16      2/4   Y   Y  is   S.P           S.P            BE/LE
 294   UTF-16BE    2/4   N   Y       S.P           S.P    0xd82a,0xdfcd
 295   UTF-16LE      2   N   Y       S.P           S.P    0x2ad8,0xcddf
 296   UTF-32        4   Y   -  is bogus         As is            BE/LE
 297   UTF-32BE      4   N   -     bogus         As is       0x0010abcd
 298   UTF-32LE      4   N   -     bogus         As is       0xcdab1000
 299   UTF-8       1-4   -   -     bogus   >= 4 octets   \xf0\x9a\af\8d
 300   ---------------+-----------------+------------------------------
 301
 302 =back
 303
 304 =head1 Size, Endianness, and BOM
 305
 306 You can categorize these CES by 3 criteria;  Size of each character,
 307 Endianness, and Byte Order Mark.
 308
 309 =head2 by Size
 310
 311 UCS-2 is a fixed-length encoding with each character taking 16 bits.
 312 It B<does not> support I<Surrogate Pairs>.  When a surrogate pair is
 313 encountered during decode(), its place is filled with \xFFFD without
 314 I<CHECK> or croaks if I<CHECK>.  When a character whose ord value is
 315 larger than 0xFFFF is encountered, it uses 0xFFFD without I<CHECK> or
 316 croaks if <CHECK>.
 317
 318 UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pairs>.
 319 When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
 320 following low surrogate (0xDC00-0xDFFF), C<desurrogate>s them to form a
 321 character.  Bogus surrogates result in death.  When \x{10000} or above
 322 is encountered during encode(), it C<ensurrogate>s them and pushes the
 323 surrogate pair to the output stream.
 324
 325 UTF-32 is a fixed-length encoding with each character taking 32 bits.
 326 Since it is 32-bit there is no need for I<Surrogate Pairs>.
 327
 328 =head2 by Endianness
 329
 330 First (and now failed) goal of Unicode was to map all character
 331 repertories into a fixed-length integer so programmers are happy.
 332 Since each character is either I<short> or I<long> in C, you have to
 333 put endianness of each platform when you pass data to one another.
 334
 335 Anything marked as BE is Big Endian (or network byte order) and LE is
 336 Little Endian (aka VAX byte order).  For anything without, a character
 337 called Byte Order Mark (BOM) is prepended to the head of string.
 338
 339 =over 4
 340
 341 =item BOM as integer when fetched in network byte order
 342
 343               16         32 bits/char
 344   -------------------------
 345   BE      0xFeFF 0x0000FeFF
 346   LE      0xFFeF 0xFFFe0000
 347   -------------------------
 348
 349 =back
 350
 351 This modules handles BOM as follows.
 352
 353 =over 4
 354
 355 =item *
 356
 357 When BE or LE is explicitly stated as the name of encoding, BOM is
 358 simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
 359
 360 =item *
 361
 362 When BE or LE is omitted during decode(), it checks if BOM is in the
 363 beginning of the string and if found endianness is set to what BOM
 364 says.  If not found, dies.
 365
 366 =item *
 367
 368 When BE or LE is omitted during encode(), it returns a BE-encoded
 369 string with BOM prepended.  So when you want to encode a whole text
 370 file, make sure you encode() by whole text, not line by line or each
 371 line, not file, is prepended with BOMs.
 372
 373 =item *
 374
 375 C<UCS-2> is an exception.  Unlike others this is an alias of UCS-2BE.
 376 UCS-2 is already registered by IANA and others that way.
 377
 378 =back
 379
 380 =head1 Surrogate Pairs
 381
 382 To say the least, surrogate pairs were the biggest mistake of the
 383 Unicode Consortium.  But according to the late Douglas Adams in I<The
 384 Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
 385 Universe was created. This has made a lot of people very angry and
 386 been widely regarded as a bad move>.  Their mistake was not of this
 387 magnitude so let's forgive them.
 388
 389 (I don't dare make any comparison with Unicode Consortium and the
 390 Vogons here ;)  Or, comparing Encode to Babel Fish is completely
 391 appropriate -- if you can only stick this into your ear :)
 392
 393 Surrogate pairs were born when Unicode Consortium finally
 394 admitted that 16 bits were not big enough to hold all the world's
 395 character repertoire.  But they have already made UCS-2 16-bit.  What
 396 do we do?
 397
 398 Back then 0xD800-0xDFFF was not allocated.  Let's split them half and
 399 use the first half to represent C<upper half of a character> and the
 400 latter C<lower half of a character>.  That way you can represent 1024
 401 * 1024 = 1048576 more characters.  Now we can store character ranges
 402 up to \x{10ffff} even with 16-bit encodings.  This pair of
 403 half-character is now called a I<Surrogate Pair> and UTF-16 is the
 404 name of the encoding that embraces them.
 405
 406 Here is a fomula to ensurrogate a Unicode character \x{10000} and
 407 above;
 408
 409   $hi = ($uni - 0x10000) / 0x400 + 0xD800;
 410   $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
 411
 412 And to desurrogate;
 413
 414  $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
 415
 416 Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
 417 perl does not prohibit the use of characters within this range.  To perl,
 418 every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
 419
 420   (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
 421   integer support! (**)
 422
 423   (**) Is anything beyond \x{11_0000} still Unicode :?
 424
 425 =head1 SEE ALSO
 426
 427 L<Encode>, L<http://www.unicode.org/glossary/>,
 428
 429 RFC 2781 L<http://rfc.net/rfc2781.html>,
 430
 431 L<http://www.unicode.org/unicode/faq/utf_bom.html>
 432
 433 Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
 434 by Larry Wall, Tom Christiansen, Jon Orwant;
 435 O'Reilly & Associates; ISBN 0-596-00027-8
 436
 437 =cut