ext/Encode/lib/Encode/Unicode.pm

   1 package Encode::Unicode;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   7
   8 #
   9 # Aux. subs & constants
  10 #
  11
  12 sub FBCHAR(){ 0xFFFd }
  13 sub BOM_BE(){ 0xFeFF }
  14 sub BOM16LE(){ 0xFFFe }
  15 sub BOM32LE(){ 0xFeFF0000 }
  16
  17 sub valid_ucs2($){
  18     if ($_[0] < 0xD800){
  19         return $_[0] > 0;
  20     }else{
  21         return ($_[0] > 0xDFFFF && $_[0] <= 0xFFFF);
  22     }
  23 }
  24
  25 sub issurrogate($){   0xD800 <= $_[0]  && $_[0] <= 0xDFFF }
  26 sub isHiSurrogate($){ 0xD800 <= $_[0]  && $_[0] <  0xDC00 }
  27 sub isLoSurrogate($){ 0xDC00 <= $_[0]  && $_[0] <= 0xDFFF }
  28
  29 sub ensurrogate($){
  30     use integer; # we have divisions
  31     my $uni = shift;
  32     my  $hi = ($uni - 0x10000) / 0x400 + 0xD800;
  33     my  $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
  34     return ($hi, $lo);
  35 }
  36
  37 sub desurrogate($$){
  38     my ($hi, $lo) = @_;
  39     return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
  40 }
  41
  42 sub Mask { {2 => 0xffff,  4 => 0xffffffff} }
  43
  44 #
  45 # Object Generator 8 transcoders all at once!
  46 #
  47
  48 require Encode;
  49 for my $name (qw(UTF-16 UTF-16BE UTF-16LE
  50                  UTF-32 UTF-32BE UTF-32LE
  51                         UCS-2BE  UCS-2LE))
  52 {
  53     my ($size, $endian, $ucs2, $mask);
  54     $name =~ /^(\w+)-(\d+)(\w*)$/o;
  55     if ($ucs2 = ($1 eq 'UCS')){
  56         $size = 2;
  57     }else{
  58         $size = $2/8;
  59     }
  60     $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
  61     $size == 4 and $endian = uc($endian);
  62
  63     $Encode::Encoding{$name} =
  64         bless {
  65                Name   =>   $name,
  66                size   =>   $size,
  67                endian => $endian,
  68                ucs2   =>   $ucs2,
  69               } => __PACKAGE__;
  70
  71 }
  72
  73 sub name { shift->{'Name'} }
  74 sub new_sequence { $_[0] };
  75
  76 #
  77 # the two implementation of (en|de)code exist.  *_modern use
  78 # array and *_classic stick with substr.  *_classic is much
  79 # slower but more memory conservative.  *_moder is default.
  80
  81 sub set_transcoder{
  82     no warnings qw(redefine);
  83     my $type = shift;
  84     if     ($type eq "modern"){
  85         *decode = \&decode_modern;
  86         *encode = \&encode_modern;
  87     }elsif($type eq "classic"){
  88         *decode = \&decode_classic;
  89         *encode = \&encode_classic;
  90     }else{
  91         require Carp;
  92         Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)";
  93     }
  94 }
  95
  96 set_transcoder("modern");
  97
  98 #
  99 # *_modern are much faster but guzzle more memory
 100 #
 101
 102 sub decode_modern
 103 {
 104     my ($obj, $str, $chk ) = @_;
 105     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 106
 107     # warn "$size, $endian, $ucs2";
 108     $endian ||= BOMB($size, substr($str, 0, $size, ''))
 109         or poisoned2death($obj, "Where's the BOM?");
 110     my  $mask = Mask->{$size};
 111     my $utf8   = '';
 112     my @ord = unpack("$endian*", $str);
 113     undef $str; # to conserve memory
 114     while (@ord){
 115         my $ord = shift @ord;
 116         unless ($size == 4 or valid_ucs2($ord &= $mask)){
 117             if ($ucs2){
 118                 $chk and
 119                     poisoned2death($obj, "no surrogates allowed", $ord);
 120                 shift @ord; # skip the next one as well
 121                 $ord = FBCHAR;
 122             }else{
 123                 unless (isHiSurrogate($ord)){
 124                     poisoned2death($obj, "Malformed HI surrogate", $ord);
 125                 }
 126                 my $lo = shift @ord;
 127                 unless (isLoSurrogate($lo &= $mask)){
 128                     poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
 129                 }
 130                 $ord = desurrogate($ord, $lo);
 131             }
 132         }
 133         $utf8 .= chr($ord);
 134     }
 135     utf8::upgrade($utf8);
 136     return $utf8;
 137 }
 138
 139 sub encode_modern
 140 {
 141     my ($obj, $utf8, $chk) = @_;
 142     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 143     my @str = ();
 144     unless ($endian){
 145         $endian = ($size == 4) ? 'N' : 'n';
 146         push @str, BOM_BE;
 147     }
 148     my @ord = unpack("U*", $utf8);
 149     undef $utf8; # to conserve memory
 150     for my $ord (@ord){
 151         unless ($size == 4 or valid_ucs2($ord)) {
 152             unless(issurrogate($ord)){
 153                 if ($ucs2){
 154                     $chk and
 155                         poisoned2death($obj, "code point too high", $ord);
 156
 157                     push @str, FBCHAR;
 158                 }else{
 159
 160                     push @str, ensurrogate($ord);
 161                 }
 162             }else{  # not supposed to happen
 163                 push @str, FBCHAR;
 164             }
 165         }else{
 166             push @str, $ord;
 167         }
 168     }
 169     return pack("$endian*", @str);
 170 }
 171
 172 #
 173 # *_classic are slower but more memory conservative
 174 #
 175
 176 sub decode_classic
 177 {
 178     my ($obj, $str, $chk ) = @_;
 179     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 180
 181     # warn "$size, $endian, $ucs2";
 182     $endian ||= BOMB($size, substr($str, 0, $size, ''))
 183         or poisoned2death($obj, "Where's the BOM?");
 184     my  $mask = Mask->{$size};
 185     my $utf8   = '';
 186     my @ord = unpack("$endian*", $str);
 187     while (length($str)){
 188          my $ord = unpack($endian, substr($str, 0, $size, ''));
 189         unless ($size == 4 or valid_ucs2($ord &= $mask)){
 190             if ($ucs2){
 191                 $chk and
 192                     poisoned2death($obj, "no surrogates allowed", $ord);
 193                 substr($str,0,$size,''); # skip the next one as well
 194                 $ord = FBCHAR;
 195             }else{
 196                 unless (isHiSurrogate($ord)){
 197                     poisoned2death($obj, "Malformed HI surrogate", $ord);
 198                 }
 199                 my $lo = unpack($endian ,substr($str,0,$size,''));
 200                 unless (isLoSurrogate($lo &= $mask)){
 201                     poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
 202                 }
 203                 $ord = desurrogate($ord, $lo);
 204             }
 205         }
 206         $utf8 .= chr($ord);
 207     }
 208     utf8::upgrade($utf8);
 209     return $utf8;
 210 }
 211
 212 sub encode_classic
 213 {
 214     my ($obj, $utf8, $chk) = @_;
 215     my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
 216     # warn join ", ", $size, $ucs2, $endian, $mask;
 217     my $str   = '';
 218     unless ($endian){
 219         $endian = ($size == 4) ? 'N' : 'n';
 220         $str .= pack($endian, BOM_BE);
 221     }
 222     while (length($utf8)){
 223         my $ord  = ord(substr($utf8,0,1,''));
 224         unless ($size == 4 or valid_ucs2($ord)) {
 225             unless(issurrogate($ord)){
 226                 if ($ucs2){
 227                     $chk and
 228                         poisoned2death($obj, "code point too high", $ord);
 229                     $str .= pack($endian, FBCHAR);
 230                 }else{
 231                     $str .= pack($endian.2, ensurrogate($ord));
 232                 }
 233             }else{  # not supposed to happen
 234                 $str .= pack($endian, FBCHAR);
 235             }
 236         }else{
 237             $str .= pack($endian, $ord);
 238         }
 239     }
 240     return $str;
 241 }
 242
 243 sub BOMB {
 244     my ($size, $bom) = @_;
 245     my $N = $size == 2 ? 'n' : 'N';
 246     my $ord = unpack($N, $bom);
 247     return ($ord eq BOM_BE) ? $N :
 248         ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
 249 }
 250
 251 sub poisoned2death{
 252     my $obj = shift;
 253     my $msg = shift;
 254     my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
 255     require Carp;
 256     Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
 257 }
 258
 259 1;
 260 __END__
 261
 262 =head1 NAME
 263
 264 Encode::Unicode -- Various Unicode Transform Format
 265
 266 =cut
 267
 268 =head1 SYNOPSIS
 269
 270     use Encode qw/encode decode/;
 271     $ucs2 = encode("UCS-2BE", $utf8);
 272     $utf8 = decode("UCS-2BE", $ucs2);
 273
 274 =head1 ABSTRACT
 275
 276 This module implements all Character Encoding Schemes of Unicode that
 277 are officially documented by Unicode Consortium (except, of course,
 278 for UTF-8, which is a native format in perl).
 279
 280 =over 4
 281
 282 =item L<http://www.unicode.org/glossary/> says:
 283
 284 I<Character Encoding Scheme> A character encoding form plus byte
 285 serialization. There are seven character encoding schemes in Unicode:
 286 UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
 287
 288 =item Quick Reference
 289
 290                 Decodes from ord(N)           Encodes chr(N) to...
 291        octet/char BOM S.P d800-dfff  ord > 0xffff     \x{1abcd} ==
 292   ---------------+-----------------+------------------------------
 293   UCS-2BE       2   N   N  is bogus                  Not Available
 294   UCS-2LE       2   N   N     bogus                  Not Available
 295   UTF-16      2/4   Y   Y  is   S.P           S.P            BE/LE
 296   UTF-16BE    2/4   N   Y       S.P           S.P    0xd82a,0xdfcd
 297   UTF-16LE      2   N   Y       S.P           S.P    0x2ad8,0xcddf
 298   UTF-32        4   Y   -  is bogus         As is            BE/LE
 299   UTF-32BE      4   N   -     bogus         As is       0x0010abcd
 300   UTF-32LE      4   N   -     bogus         As is       0xcdab1000
 301   UTF-8       1-4   -   -     bogus   >= 4 octets   \xf0\x9a\af\8d
 302   ---------------+-----------------+------------------------------
 303
 304 =back
 305
 306 =head1 Size, Endianness, and BOM
 307
 308 You can categorize these CES by 3 criteria;  Size of each character,
 309 Endianness, and Byte Order Mark.
 310
 311 =head2 by Size
 312
 313 UCS-2 is a fixed-length encoding with each character taking 16 bits.
 314 It B<does not> support I<Surrogate Pair>.  When surrogate pair is
 315 encountered during decode(), it fills its place with \xFFFD without
 316 I<CHECK> or croaks if I<CHECK>.  When a character which ord value is
 317 larger than 0xFFFF, it uses 0xFFFD without I<CHECK> or croaks if
 318 <CHECK>.
 319
 320 UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pair>.
 321 When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
 322 following low surrogate (0xDC00-0xDFFF), C<desurrogate> them to form a
 323 character.  Bogus surrogates result in death.  When \x{10000} or above
 324 is encountered during encode(), it C<ensurrogate>s them and push the
 325 surrogate pair to the output stream.
 326
 327 UTF-32 is a fixed-length encoding with each character taking 32 bits.
 328 Since it is 32-bit there is no need for I<Surrogate Pair>.
 329
 330 =head2 by Endianness
 331
 332 First (and now failed) goal of Unicode was to map all character
 333 repartories into a fixed-length integer so programmers are happy.
 334 Since each character is either I<short> or I<long> in C, you have to
 335 put endianness of each platform when you pass data to one another.
 336
 337 Anything marked as BE is Big Endian (or network byte order) and LE is
 338 Little Endian (aka VAX byte order).  For anything without, a character
 339 called Byte Order Mark (BOM) is prepended to the head of string.
 340
 341 =over 4
 342
 343 =item BOM as integer
 344
 345             16         32 bits/char
 346 -------------------------
 347 BE      0xFeFF 0x0000FeFF
 348 LE      0xFFeF 0xFeFF0000
 349 -------------------------
 350
 351 =back
 352
 353 This modules handles BOM as follows.
 354
 355 =over 4
 356
 357 =item *
 358
 359 When BE or LE is explicitly stated as the name of encoding, BOM is
 360 simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
 361
 362 =item *
 363
 364 When BE or LE is omitted during decode(), it checks if BOM is in the
 365 beginning of the string and if found endianness is set to what BOM
 366 says.  if not found, dies.
 367
 368 =item *
 369
 370 When BE or LE is omitted during encode(), it returns a BE-encoded
 371 string with BOM prepended.  So when you want to encode a whole text
 372 file, make sure you encode() by whole text, not line by line or each
 373 line, not file, is prepended with BOMs.
 374
 375 =item *
 376
 377 C<UCS-2> is an exception.  Unlike others this is an alias of UCS-2BE.
 378 UCS-2 is already registered by IANA and others that way.
 379
 380
 381 =head1 The Surrogate Pair
 382
 383 To say the least, surrogate pair was the biggest mistake by Unicode
 384 Consortium.  I don't give a darn if they admit it or not.  But
 385 according to late Douglas Adams in I<The Hitchhiker's Guide to the
 386 Galaxy> Triology,  C<First the Universe was created and it was a bad
 387 move>. Their mistake was not this magnitude so let's forgive them.
 388
 389 (I don't dare make any comparison with Unicode Consortium and the
 390 Vogons here ;)  Or, comparing Encode to Babel Fish is completely
 391 appropriate -- if you can only stick this into your ear :)
 392
 393 A surrogate pair was born when Unicode Consortium had finally
 394 admitted that 16 bit was not big enough to hold all the world's
 395 character repartorie. But they have already made UCS-2 16-bit.  What
 396 do we do?
 397
 398 Back then 0xD800-0xDFFF was not allocated.  Let's split them half and
 399 use the first half to represent C<upper half of a character> and the
 400 latter C<lower half of a character>.  That way you can represent 1024
 401 * 1024 = 1048576 more characters.  Now we can store character ranges
 402 up to \x{10ffff} even with 16-bit encodings.  This pair of
 403 half-character is now called a I<Surrogate Pair> and UTF-16 is the
 404 name of encoding that embraces them.
 405
 406 Here is a fomula to ensurrogate a Unicode character \x{10000} and
 407 above;
 408
 409   $hi = ($uni - 0x10000) / 0x400 + 0xD800;
 410   $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
 411
 412 And to desurrogate;
 413
 414  $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
 415
 416 Note this move has made \x{D800}-\x{DFFF} forbidden zone  but perl
 417 does not prohibit them for uses.
 418
 419 =head1 SEE ALSO
 420
 421 L<Encode>, L<http://www.unicode.org/glossary/>
 422
 423 =back