ext/Encode/Encode.pm

   1 package Encode;
   2
   3 $VERSION = 0.01;
   4
   5 require DynaLoader;
   6 require Exporter;
   7
   8 @ISA = qw(Exporter DynaLoader);
   9
  10 @EXPORT_OK =
  11     qw(
  12        bytes_to_utf8
  13        utf8_to_bytes
  14        chars_to_utf8
  15        utf8_to_chars
  16        utf8_to_chars_check
  17        bytes_to_chars
  18        chars_to_bytes
  19        from_to
  20        is_utf8
  21        on_utf8
  22        off_utf8
  23        utf_to_utf
  24        encodings
  25        utf8_decode
  26        utf8_encode
  27        utf8_upgrade
  28        utf8_downgrade
  29       );
  30
  31 bootstrap Encode ();
  32
  33 =pod
  34
  35 =head1 NAME
  36
  37 Encode - character encodings
  38
  39 =head2 TERMINOLOGY
  40
  41 =over
  42
  43 =item *
  44
  45 I<char>: a character in the range 0..maxint (at least 2**32-1)
  46
  47 =item *
  48
  49 I<byte>: a character in the range 0..255
  50
  51 =back
  52
  53 The marker [INTERNAL] marks Internal Implementation Details, in
  54 general meant only for those who think they know what they are doing,
  55 and such details may change in future releases.
  56
  57 =head2 bytes
  58
  59 =over 4
  60
  61 =item *
  62
  63         bytes_to_utf8(STRING [, FROM])
  64
  65 The bytes in STRING are recoded in-place into UTF-8.  If no FROM is
  66 specified the bytes are expected to be encoded in US-ASCII or ISO
  67 8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
  68 there's a failure.
  69
  70 [INTERNAL] Also the UTF-8 flag of STRING is turned on.
  71
  72 =item *
  73
  74         utf8_to_bytes(STRING [, TO [, CHECK]])
  75
  76 The UTF-8 in STRING is decoded in-place into bytes.  If no TO encoding
  77 is specified the bytes are expected to be encoded in US-ASCII or ISO
  78 8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
  79 there's a failure.
  80
  81 What if there are characters > 255?  What if the UTF-8 in STRING is
  82 malformed?  See L</"Handling Malformed Data">.
  83
  84 [INTERNAL] The UTF-8 flag of STRING is not checked.
  85
  86 =back
  87
  88 =head2 chars
  89
  90 =over 4
  91
  92 =item *
  93
  94         chars_to_utf8(STRING)
  95
  96 The chars in STRING are encoded in-place into UTF-8.  Returns the new
  97 size of STRING, or C<undef> if there's a failure.
  98
  99 No assumptions are made on the encoding of the chars.  If you want to
 100 assume that the chars are Unicode and to trap illegal Unicode
 101 characters, you must use C<from_to('Unicode', ...)>.
 102
 103 [INTERNAL] Also the UTF-8 flag of STRING is turned on.
 104
 105 =over 4
 106
 107 =item *
 108
 109         utf8_to_chars(STRING)
 110
 111 The UTF-8 in STRING is decoded in-place into chars.  Returns the new
 112 size of STRING, or C<undef> if there's a failure.
 113
 114 If the UTF-8 in STRING is malformed C<undef> is returned, and also an
 115 optional lexical warning (category utf8) is given.
 116
 117 [INTERNAL] The UTF-8 flag of STRING is not checked.
 118
 119 =item *
 120
 121         utf8_to_chars_check(STRING [, CHECK])
 122
 123 (Note that special naming of this interface since a two-argument
 124 utf8_to_chars() has different semantics.)
 125
 126 The UTF-8 in STRING is decoded in-place into chars.  Returns the new
 127 size of STRING, or C<undef> if there is a failure.
 128
 129 If the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.
 130
 131 [INTERNAL] The UTF-8 flag of STRING is not checked.
 132
 133 =back
 134
 135 =head2 chars With Encoding
 136
 137 =over 4
 138
 139 =item *
 140
 141         chars_to_utf8(STRING, FROM [, CHECK])
 142
 143 The chars in STRING encoded in FROM are recoded in-place into UTF-8.
 144 Returns the new size of STRING, or C<undef> if there's a failure.
 145
 146 No assumptions are made on the encoding of the chars.  If you want to
 147 assume that the chars are Unicode and to trap illegal Unicode
 148 characters, you must use C<from_to('Unicode', ...)>.
 149
 150 [INTERNAL] Also the UTF-8 flag of STRING is turned on.
 151
 152 =item *
 153
 154         utf8_to_chars(STRING, TO [, CHECK])
 155
 156 The UTF-8 in STRING is decoded in-place into chars encoded in TO.
 157 Returns the new size of STRING, or C<undef> if there's a failure.
 158
 159 If the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.
 160
 161 [INTERNAL] The UTF-8 flag of STRING is not checked.
 162
 163 =item *
 164
 165         bytes_to_chars(STRING, FROM [, CHECK])
 166
 167 The bytes in STRING encoded in FROM are recoded in-place into chars.
 168 Returns the new size of STRING in bytes, or C<undef> if there's a
 169 failure.
 170
 171 If the mapping is impossible?  See L</"Handling Malformed Data">.
 172
 173 =item *
 174
 175         chars_to_bytes(STRING, TO [, CHECK])
 176
 177 The chars in STRING are recoded in-place to bytes encoded in TO.
 178 Returns the new size of STRING in bytes, or C<undef> if there's a
 179 failure.
 180
 181 If the mapping is impossible?  See L</"Handling Malformed Data">.
 182
 183 =item *
 184
 185         from_to(STRING, FROM, TO [, CHECK])
 186
 187 The chars in STRING encoded in FROM are recoded in-place into TO.
 188 Returns the new size of STRING, or C<undef> if there's a failure.
 189
 190 If mapping between the encodings is impossible?
 191 See L</"Handling Malformed Data">.
 192
 193 [INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.
 194
 195 =back
 196
 197 =head2 Testing For UTF-8
 198
 199 =over 4
 200
 201 =item *
 202
 203         is_utf8(STRING [, CHECK])
 204
 205 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 206 If CHECK is true, also checks the data in STRING for being
 207 well-formed UTF-8.  Returns true if successful, false otherwise.
 208
 209 =back
 210
 211 =head2 Toggling UTF-8-ness
 212
 213 =over 4
 214
 215 =item *
 216
 217         on_utf8(STRING)
 218
 219 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 220 B<not> checked for being well-formed UTF-8.  Do not use unless you
 221 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 222 state of the UTF-8 flag (so please don't test the return value as
 223 I<not> success or failure), or C<undef> if STRING is not a string.
 224
 225 =item *
 226
 227         off_utf8(STRING)
 228
 229 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 230 Returns the previous state of the UTF-8 flag (so please don't test the
 231 return value as I<not> success or failure), or C<undef> if STRING is
 232 not a string.
 233
 234 =back
 235
 236 =head2 UTF-16 and UTF-32 Encodings
 237
 238 =over 4
 239
 240 =item *
 241
 242         utf_to_utf(STRING, FROM, TO [, CHECK])
 243
 244 The data in STRING is converted from Unicode Transfer Encoding FROM to
 245 Unicode Transfer Encoding TO.  Both FROM and TO may be any of the
 246 following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):
 247
 248         tag             meaning
 249
 250         '7'             UTF-7
 251         '8'             UTF-8
 252         '16be'          UTF-16 big-endian
 253         '16le'          UTF-16 little-endian
 254         '16'            UTF-16 native-endian
 255         '32be'          UTF-32 big-endian
 256         '32le'          UTF-32 little-endian
 257         '32'            UTF-32 native-endian
 258
 259 UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
 260 UCS-4, 32-bit or 4-byte chunks.  Returns the new size of STRING, or
 261 C<undef> is there's a failure.
 262
 263 If FROM is UTF-8 and the UTF-8 in STRING is malformed?  See
 264 L</"Handling Malformed Data">.
 265
 266 [INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
 267 STRING is not checked.  If TO is UTF-8, also the UTF-8 flag of STRING is
 268 turned on.  Identical FROM and TO are fine.
 269
 270 =back
 271
 272 =head2 Handling Malformed Data
 273
 274 If CHECK is not set, C<undef> is returned.  If the data is supposed to
 275 be UTF-8, an optional lexical warning (category utf8) is given.  If
 276 CHECK is true but not a code reference, dies.  If CHECK is a code
 277 reference, it is called with the arguments
 278
 279         (MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)
 280
 281 Two return values are expected from the call: the string to be used in
 282 the result string in place of the malformed section, and the length of
 283 the malformed section in bytes.
 284
 285 =cut
 286
 287 sub bytes_to_utf8 {
 288     &_bytes_to_utf8;
 289 }
 290
 291 sub utf8_to_bytes {
 292     &_utf8_to_bytes;
 293 }
 294
 295 sub chars_to_utf8 {
 296     &C_to_utf8;
 297 }
 298
 299 sub utf8_to_chars {
 300     &_utf8_to_chars;
 301 }
 302
 303 sub utf8_to_chars_check {
 304     &_utf8_to_chars_check;
 305 }
 306
 307 sub bytes_to_chars {
 308     &_bytes_to_chars;
 309 }
 310
 311 sub chars_to_bytes {
 312     &_chars_to_bytes;
 313 }
 314
 315 sub is_utf8 {
 316     &_is_utf8;
 317 }
 318
 319 sub on_utf8 {
 320     &_on_utf8;
 321 }
 322
 323 sub off_utf8 {
 324     &_off_utf8;
 325 }
 326
 327 sub utf_to_utf {
 328     &_utf_to_utf;
 329 }
 330
 331 use Carp;
 332
 333 sub from_to
 334 {
 335  my ($string,$from,$to,$check) = @_;
 336  my $f = __PACKAGE__->getEncoding($from);
 337  croak("Unknown encoding '$from'") unless $f;
 338  my $t = __PACKAGE__->getEncoding($to);
 339  croak("Unknown encoding '$to'") unless $t;
 340  my $uni = $f->toUnicode($string,$check);
 341  return undef if ($check && length($string));
 342  $string = $t->fromUnicode($uni,$check);
 343  return undef if ($check && length($uni));
 344  return length($_[0] = $string);
 345 }
 346
 347 # The global hash is declared in XS code
 348 $encoding{Unicode}    = bless({},'Encode::Unicode');
 349 $encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
 350
 351 sub encodings
 352 {
 353  my ($class) = @_;
 354  foreach my $dir (@INC)
 355   {
 356    if (opendir(my $dh,"$dir/Encode"))
 357     {
 358      while (defined(my $name = readdir($dh)))
 359       {
 360        if ($name =~ /^(.*)\.enc$/)
 361         {
 362          next if exists $encoding{$1};
 363          $encoding{$1} = "$dir/$name";
 364         }
 365       }
 366      closedir($dh);
 367     }
 368   }
 369  return keys %encoding;
 370 }
 371
 372 sub loadEncoding
 373 {
 374  my ($class,$name,$file) = @_;
 375  if (open(my $fh,$file))
 376   {
 377    my $type;
 378    while (1)
 379     {
 380      my $line = <$fh>;
 381      $type = substr($line,0,1);
 382      last unless $type eq '#';
 383     }
 384    $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
 385    #warn "Loading $file";
 386    return $class->read($fh,$name,$type);
 387   }
 388  else
 389   {
 390    return undef;
 391   }
 392 }
 393
 394 sub getEncoding
 395 {
 396  my ($class,$name) = @_;
 397  my $enc;
 398  unless (ref($enc = $encoding{$name}))
 399   {
 400    $enc = $class->loadEncoding($name,$enc) if defined $enc;
 401    unless (ref($enc))
 402     {
 403      foreach my $dir (@INC)
 404       {
 405        last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
 406       }
 407     }
 408    $encoding{$name} = $enc;
 409   }
 410  return $enc;
 411 }
 412
 413 package Encode::Unicode;
 414
 415 # Dummy package that provides the encode interface but leaves data
 416 # as UTF-8 encoded. It is here so that from_to() works.
 417
 418 sub name { 'Unicode' }
 419
 420 sub toUnicode
 421 {
 422  my ($obj,$str,$chk) = @_;
 423  Encode::utf8_upgrade($str);
 424  $_[1] = '' if $chk;
 425  return $str;
 426 }
 427
 428 *fromUnicode = \&toUnicode;
 429
 430 package Encode::Table;
 431
 432 sub read
 433 {
 434  my ($class,$fh,$name,$type) = @_;
 435  my $rep = $class->can("rep_$type");
 436  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 437  my @touni;
 438  my %fmuni;
 439  my $count = 0;
 440  $def = hex($def);
 441  while ($pages--)
 442   {
 443    my $line = <$fh>;
 444    chomp($line);
 445    my $page = hex($line);
 446    my @page;
 447    my $ch = $page * 256;
 448    for (my $i = 0; $i < 16; $i++)
 449     {
 450      my $line = <$fh>;
 451      for (my $j = 0; $j < 16; $j++)
 452       {
 453        my $val = hex(substr($line,0,4,''));
 454        if ($val || !$ch)
 455         {
 456          my $uch = chr($val);
 457          push(@page,$uch);
 458          $fmuni{$uch} = $ch;
 459          $count++;
 460         }
 461        else
 462         {
 463          push(@page,undef);
 464         }
 465        $ch++;
 466       }
 467     }
 468    $touni[$page] = \@page;
 469   }
 470
 471  return bless {Name  => $name,
 472                Rep   => $rep,
 473                ToUni => \@touni,
 474                FmUni => \%fmuni,
 475                Def   => $def,
 476                Num   => $count,
 477               },$class;
 478 }
 479
 480 sub name { shift->{'Name'} }
 481
 482 sub rep_S { 'C' }
 483
 484 sub rep_D { 'n' }
 485
 486 sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
 487
 488 sub representation
 489 {
 490  my ($obj,$ch) = @_;
 491  $ch = 0 unless @_ > 1;
 492  $obj-{'Rep'}->($ch);
 493 }
 494
 495 sub toUnicode
 496 {
 497  my ($obj,$str,$chk) = @_;
 498  my $rep   = $obj->{'Rep'};
 499  my $touni = $obj->{'ToUni'};
 500  my $uni   = '';
 501  while (length($str))
 502   {
 503    my $ch = ord(substr($str,0,1,''));
 504    my $x;
 505    if (&$rep($ch) eq 'C')
 506     {
 507      $x = $touni->[0][$ch];
 508     }
 509    else
 510     {
 511      $x = $touni->[$ch][ord(substr($str,0,1,''))];
 512     }
 513    unless (defined $x)
 514     {
 515      last if $chk;
 516      # What do we do here ?
 517      $x = '';
 518     }
 519    $uni .= $x;
 520   }
 521  $_[1] = $str if $chk;
 522  return $uni;
 523 }
 524
 525 sub fromUnicode
 526 {
 527  my ($obj,$uni,$chk) = @_;
 528  my $fmuni = $obj->{'FmUni'};
 529  my $str   = '';
 530  my $def   = $obj->{'Def'};
 531  my $rep   = $obj->{'Rep'};
 532  while (length($uni))
 533   {
 534    my $ch = substr($uni,0,1,'');
 535    my $x  = $fmuni->{chr(ord($ch))};
 536    unless (defined $x)
 537     {
 538      last if ($chk);
 539      $x = $def;
 540     }
 541    $str .= pack(&$rep($x),$x);
 542   }
 543  $_[1] = $uni if $chk;
 544  return $str;
 545 }
 546
 547 package Encode::iso10646_1;
 548 # Encoding is 16-bit network order Unicode
 549 # Used for X font encodings
 550
 551 sub name { 'iso10646-1' }
 552
 553 sub toUnicode
 554 {
 555  my ($obj,$str,$chk) = @_;
 556  my $uni   = '';
 557  while (length($str))
 558   {
 559    my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
 560    $uni .= chr($code);
 561   }
 562  $_[1] = $str if $chk;
 563  Encode::utf8_upgrade($uni);
 564  return $uni;
 565 }
 566
 567 sub fromUnicode
 568 {
 569  my ($obj,$uni,$chk) = @_;
 570  my $str   = '';
 571  while (length($uni))
 572   {
 573    my $ch = substr($uni,0,1,'');
 574    my $x  = ord($ch);
 575    unless ($x < 32768)
 576     {
 577      last if ($chk);
 578      $x = 0;
 579     }
 580    $str .= pack('n',$x);
 581   }
 582  $_[1] = $uni if $chk;
 583  return $str;
 584 }
 585
 586
 587 package Encode::Escape;
 588 use Carp;
 589
 590 sub read
 591 {
 592  my ($class,$fh,$name) = @_;
 593  my %self = (Name => $name, Num => 0);
 594  while (<$fh>)
 595   {
 596    my ($key,$val) = /^(\S+)\s+(.*)$/;
 597    $val =~ s/^\{(.*?)\}/$1/g;
 598    $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
 599    $self{$key} = $val;
 600   }
 601  return bless \%self,$class;
 602 }
 603
 604 sub name { shift->{'Name'} }
 605
 606 sub toUnicode
 607 {
 608  croak("Not implemented yet");
 609 }
 610
 611 sub fromUnicode
 612 {
 613  croak("Not implemented yet");
 614 }
 615
 616 1;
 617
 618 __END__