ext/Encode/Encode.pm

   1 package Encode;
   2
   3 $VERSION = 0.01;
   4
   5 require DynaLoader;
   6 require Exporter;
   7
   8 @ISA = qw(Exporter DynaLoader);
   9
  10 @EXPORT_OK =
  11     qw(
  12        bytes_to_utf8
  13        utf8_to_bytes
  14        chars_to_utf8
  15        utf8_to_chars
  16        utf8_to_chars_check
  17        bytes_to_chars
  18        chars_to_bytes
  19        from_to
  20        is_utf8
  21        on_utf8
  22        off_utf8
  23        utf_to_utf
  24        encodings
  25       );
  26
  27 bootstrap Encode ();
  28
  29 =pod
  30
  31 =head1 NAME
  32
  33 Encode - character encodings
  34
  35 =head2 TERMINOLOGY
  36
  37 =over
  38
  39 =item *
  40
  41 I<char>: a character in the range 0..maxint (at least 2**32-1)
  42
  43 =item *
  44
  45 I<byte>: a character in the range 0..255
  46
  47 =back
  48
  49 The marker [INTERNAL] marks Internal Implementation Details, in
  50 general meant only for those who think they know what they are doing,
  51 and such details may change in future releases.
  52
  53 =head2 bytes
  54
  55 =over 4
  56
  57 =item *
  58
  59         bytes_to_utf8(STRING [, FROM])
  60
  61 The bytes in STRING are recoded in-place into UTF-8.  If no FROM is
  62 specified the bytes are expected to be encoded in US-ASCII or ISO
  63 8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
  64 there's a failure.
  65
  66 [INTERNAL] Also the UTF-8 flag of STRING is turned on.
  67
  68 =item *
  69
  70         utf8_to_bytes(STRING [, TO [, CHECK]])
  71
  72 The UTF-8 in STRING is decoded in-place into bytes.  If no TO encoding
  73 is specified the bytes are expected to be encoded in US-ASCII or ISO
  74 8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
  75 there's a failure.
  76
  77 What if there are characters > 255?  What if the UTF-8 in STRING is
  78 malformed?  See L</"Handling Malformed Data">.
  79
  80 [INTERNAL] The UTF-8 flag of STRING is not checked.
  81
  82 =back
  83
  84 =head2 chars
  85
  86 =over 4
  87
  88 =item *
  89
  90         chars_to_utf8(STRING)
  91
  92 The chars in STRING are encoded in-place into UTF-8.  Returns the new
  93 size of STRING, or C<undef> if there's a failure.
  94
  95 No assumptions are made on the encoding of the chars.  If you want to
  96 assume that the chars are Unicode and to trap illegal Unicode
  97 characters, you must use C<from_to('Unicode', ...)>.
  98
  99 [INTERNAL] Also the UTF-8 flag of STRING is turned on.
 100
 101 =over 4
 102
 103 =item *
 104
 105         utf8_to_chars(STRING)
 106
 107 The UTF-8 in STRING is decoded in-place into chars.  Returns the new
 108 size of STRING, or C<undef> if there's a failure.
 109
 110 If the UTF-8 in STRING is malformed C<undef> is returned, and also an
 111 optional lexical warning (category utf8) is given.
 112
 113 [INTERNAL] The UTF-8 flag of STRING is not checked.
 114
 115 =item *
 116
 117         utf8_to_chars_check(STRING [, CHECK])
 118
 119 (Note that special naming of this interface since a two-argument
 120 utf8_to_chars() has different semantics.)
 121
 122 The UTF-8 in STRING is decoded in-place into chars.  Returns the new
 123 size of STRING, or C<undef> if there is a failure.
 124
 125 If the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.
 126
 127 [INTERNAL] The UTF-8 flag of STRING is not checked.
 128
 129 =back
 130
 131 =head2 chars With Encoding
 132
 133 =over 4
 134
 135 =item *
 136
 137         chars_to_utf8(STRING, FROM [, CHECK])
 138
 139 The chars in STRING encoded in FROM are recoded in-place into UTF-8.
 140 Returns the new size of STRING, or C<undef> if there's a failure.
 141
 142 No assumptions are made on the encoding of the chars.  If you want to
 143 assume that the chars are Unicode and to trap illegal Unicode
 144 characters, you must use C<from_to('Unicode', ...)>.
 145
 146 [INTERNAL] Also the UTF-8 flag of STRING is turned on.
 147
 148 =item *
 149
 150         utf8_to_chars(STRING, TO [, CHECK])
 151
 152 The UTF-8 in STRING is decoded in-place into chars encoded in TO.
 153 Returns the new size of STRING, or C<undef> if there's a failure.
 154
 155 If the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.
 156
 157 [INTERNAL] The UTF-8 flag of STRING is not checked.
 158
 159 =item *
 160
 161         bytes_to_chars(STRING, FROM [, CHECK])
 162
 163 The bytes in STRING encoded in FROM are recoded in-place into chars.
 164 Returns the new size of STRING in bytes, or C<undef> if there's a
 165 failure.
 166
 167 If the mapping is impossible?  See L</"Handling Malformed Data">.
 168
 169 =item *
 170
 171         chars_to_bytes(STRING, TO [, CHECK])
 172
 173 The chars in STRING are recoded in-place to bytes encoded in TO.
 174 Returns the new size of STRING in bytes, or C<undef> if there's a
 175 failure.
 176
 177 If the mapping is impossible?  See L</"Handling Malformed Data">.
 178
 179 =item *
 180
 181         from_to(STRING, FROM, TO [, CHECK])
 182
 183 The chars in STRING encoded in FROM are recoded in-place into TO.
 184 Returns the new size of STRING, or C<undef> if there's a failure.
 185
 186 If mapping between the encodings is impossible?
 187 See L</"Handling Malformed Data">.
 188
 189 [INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.
 190
 191 =back
 192
 193 =head2 Testing For UTF-8
 194
 195 =over 4
 196
 197 =item *
 198
 199         is_utf8(STRING [, CHECK])
 200
 201 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
 202 If CHECK is true, also checks the data in STRING for being
 203 well-formed UTF-8.  Returns true if successful, false otherwise.
 204
 205 =back
 206
 207 =head2 Toggling UTF-8-ness
 208
 209 =over 4
 210
 211 =item *
 212
 213         on_utf8(STRING)
 214
 215 [INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
 216 B<not> checked for being well-formed UTF-8.  Do not use unless you
 217 B<know> that the STRING is well-formed UTF-8.  Returns the previous
 218 state of the UTF-8 flag (so please don't test the return value as
 219 I<not> success or failure), or C<undef> if STRING is not a string.
 220
 221 =item *
 222
 223         off_utf8(STRING)
 224
 225 [INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
 226 Returns the previous state of the UTF-8 flag (so please don't test the
 227 return value as I<not> success or failure), or C<undef> if STRING is
 228 not a string.
 229
 230 =back
 231
 232 =head2 UTF-16 and UTF-32 Encodings
 233
 234 =over 4
 235
 236 =item *
 237
 238         utf_to_utf(STRING, FROM, TO [, CHECK])
 239
 240 The data in STRING is converted from Unicode Transfer Encoding FROM to
 241 Unicode Transfer Encoding TO.  Both FROM and TO may be any of the
 242 following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):
 243
 244         tag             meaning
 245
 246         '7'             UTF-7
 247         '8'             UTF-8
 248         '16be'          UTF-16 big-endian
 249         '16le'          UTF-16 little-endian
 250         '16'            UTF-16 native-endian
 251         '32be'          UTF-32 big-endian
 252         '32le'          UTF-32 little-endian
 253         '32'            UTF-32 native-endian
 254
 255 UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
 256 UCS-4, 32-bit or 4-byte chunks.  Returns the new size of STRING, or
 257 C<undef> is there's a failure.
 258
 259 If FROM is UTF-8 and the UTF-8 in STRING is malformed?  See
 260 L</"Handling Malformed Data">.
 261
 262 [INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
 263 STRING is not checked.  If TO is UTF-8, also the UTF-8 flag of STRING is
 264 turned on.  Identical FROM and TO are fine.
 265
 266 =back
 267
 268 =head2 Handling Malformed Data
 269
 270 If CHECK is not set, C<undef> is returned.  If the data is supposed to
 271 be UTF-8, an optional lexical warning (category utf8) is given.  If
 272 CHECK is true but not a code reference, dies.  If CHECK is a code
 273 reference, it is called with the arguments
 274
 275         (MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)
 276
 277 Two return values are expected from the call: the string to be used in
 278 the result string in place of the malformed section, and the length of
 279 the malformed section in bytes.
 280
 281 =cut
 282
 283 sub bytes_to_utf8 {
 284     &_bytes_to_utf8;
 285 }
 286
 287 sub utf8_to_bytes {
 288     &_utf8_to_bytes;
 289 }
 290
 291 sub chars_to_utf8 {
 292     &C_to_utf8;
 293 }
 294
 295 sub utf8_to_chars {
 296     &_utf8_to_chars;
 297 }
 298
 299 sub utf8_to_chars_check {
 300     &_utf8_to_chars_check;
 301 }
 302
 303 sub bytes_to_chars {
 304     &_bytes_to_chars;
 305 }
 306
 307 sub chars_to_bytes {
 308     &_chars_to_bytes;
 309 }
 310
 311 sub is_utf8 {
 312     &_is_utf8;
 313 }
 314
 315 sub on_utf8 {
 316     &_on_utf8;
 317 }
 318
 319 sub off_utf8 {
 320     &_off_utf8;
 321 }
 322
 323 sub utf_to_utf {
 324     &_utf_to_utf;
 325 }
 326
 327 use Carp;
 328
 329 sub from_to
 330 {
 331  my ($string,$from,$to,$check) = @_;
 332  my $f = __PACKAGE__->getEncoding($from);
 333  croak("Unknown encoding '$from'") unless $f;
 334  my $t = __PACKAGE__->getEncoding($to);
 335  croak("Unknown encoding '$to'") unless $t;
 336  my $uni = $f->toUnicode($string,$check);
 337  return undef if ($check && length($string));
 338  $string = $t->fromUnicode($uni,$check);
 339  return undef if ($check && length($uni));
 340  return length($_[0] = $string);
 341 }
 342
 343 # The global hash is declared in XS code
 344 $encoding{Unicode}    = bless({},'Encode::Unicode');
 345 $encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
 346
 347 sub encodings
 348 {
 349  my ($class) = @_;
 350  foreach my $dir (@INC)
 351   {
 352    if (opendir(my $dh,"$dir/Encode"))
 353     {
 354      while (defined(my $name = readdir($dh)))
 355       {
 356        if ($name =~ /^(.*)\.enc$/)
 357         {
 358          next if exists $encoding{$1};
 359          $encoding{$1} = "$dir/$name";
 360         }
 361       }
 362      closedir($dh);
 363     }
 364   }
 365  return keys %encoding;
 366 }
 367
 368 sub loadEncoding
 369 {
 370  my ($class,$name,$file) = @_;
 371  if (open(my $fh,$file))
 372   {
 373    my $type;
 374    while (1)
 375     {
 376      my $line = <$fh>;
 377      $type = substr($line,0,1);
 378      last unless $type eq '#';
 379     }
 380    $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
 381    #warn "Loading $file";
 382    return $class->read($fh,$name,$type);
 383   }
 384  else
 385   {
 386    return undef;
 387   }
 388 }
 389
 390 sub getEncoding
 391 {
 392  my ($class,$name) = @_;
 393  my $enc;
 394  unless (ref($enc = $encoding{$name}))
 395   {
 396    $enc = $class->loadEncoding($name,$enc) if defined $enc;
 397    unless (ref($enc))
 398     {
 399      foreach my $dir (@INC)
 400       {
 401        last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
 402       }
 403     }
 404    $encoding{$name} = $enc;
 405   }
 406  return $enc;
 407 }
 408
 409 package Encode::Unicode;
 410
 411 # Dummy package that provides the encode interface but leaves data
 412 # as UTF-8 encoded. It is here so that from_to()
 413
 414 sub name { 'Unicode' }
 415
 416 sub toUnicode   { $_[1] }
 417
 418 sub fromUnicode { $_[1] }
 419
 420 package Encode::Table;
 421
 422 sub read
 423 {
 424  my ($class,$fh,$name,$type) = @_;
 425  my $rep = $class->can("rep_$type");
 426  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 427  my @touni;
 428  my %fmuni;
 429  my $count = 0;
 430  $def = hex($def);
 431  while ($pages--)
 432   {
 433    my $line = <$fh>;
 434    chomp($line);
 435    my $page = hex($line);
 436    my @page;
 437    my $ch = $page * 256;
 438    for (my $i = 0; $i < 16; $i++)
 439     {
 440      my $line = <$fh>;
 441      for (my $j = 0; $j < 16; $j++)
 442       {
 443        my $val = hex(substr($line,0,4,''));
 444        if ($val || !$ch)
 445         {
 446          my $uch = chr($val);
 447          push(@page,$uch);
 448          $fmuni{$uch} = $ch;
 449          $count++;
 450         }
 451        else
 452         {
 453          push(@page,undef);
 454         }
 455        $ch++;
 456       }
 457     }
 458    $touni[$page] = \@page;
 459   }
 460
 461  return bless {Name  => $name,
 462                Rep   => $rep,
 463                ToUni => \@touni,
 464                FmUni => \%fmuni,
 465                Def   => $def,
 466                Num   => $count,
 467               },$class;
 468 }
 469
 470 sub name { shift->{'Name'} }
 471
 472 sub rep_S { 'C' }
 473
 474 sub rep_D { 'n' }
 475
 476 sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
 477
 478 sub representation
 479 {
 480  my ($obj,$ch) = @_;
 481  $ch = 0 unless @_ > 1;
 482  $obj-{'Rep'}->($ch);
 483 }
 484
 485 sub toUnicode
 486 {
 487  my ($obj,$str,$chk) = @_;
 488  my $rep   = $obj->{'Rep'};
 489  my $touni = $obj->{'ToUni'};
 490  my $uni   = '';
 491  while (length($str))
 492   {
 493    my $ch = ord(substr($str,0,1,''));
 494    my $x;
 495    if (&$rep($ch) eq 'C')
 496     {
 497      $x = $touni->[0][$ch];
 498     }
 499    else
 500     {
 501      $x = $touni->[$ch][ord(substr($str,0,1,''))];
 502     }
 503    unless (defined $x)
 504     {
 505      last if $chk;
 506      # What do we do here ?
 507      $x = '';
 508     }
 509    $uni .= $x;
 510   }
 511  $_[1] = $str if $chk;
 512  return $uni;
 513 }
 514
 515 sub fromUnicode
 516 {
 517  my ($obj,$uni,$chk) = @_;
 518  my $fmuni = $obj->{'FmUni'};
 519  my $str   = '';
 520  my $def   = $obj->{'Def'};
 521  my $rep   = $obj->{'Rep'};
 522  while (length($uni))
 523   {
 524    my $ch = substr($uni,0,1,'');
 525    my $x  = $fmuni->{chr(ord($ch))};
 526    unless (defined $x)
 527     {
 528      last if ($chk);
 529      $x = $def;
 530     }
 531    $str .= pack(&$rep($x),$x);
 532   }
 533  $_[1] = $uni if $chk;
 534  return $str;
 535 }
 536
 537 package Encode::iso10646_1;
 538 # Encoding is 16-bit network order Unicode
 539 # Used for X font encodings
 540
 541 sub name { 'iso10646-1' }
 542
 543 sub toUnicode
 544 {
 545  my ($obj,$str,$chk) = @_;
 546  my $uni   = '';
 547  while (length($str))
 548   {
 549    my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
 550    $uni .= chr($code);
 551   }
 552  $_[1] = $str if $chk;
 553  return $uni;
 554 }
 555
 556 sub fromUnicode
 557 {
 558  my ($obj,$uni,$chk) = @_;
 559  my $str   = '';
 560  while (length($uni))
 561   {
 562    my $ch = substr($uni,0,1,'');
 563    my $x  = ord($ch);
 564    unless ($x < 32768)
 565     {
 566      last if ($chk);
 567      $x = 0;
 568     }
 569    $str .= pack('n',$x);
 570   }
 571  $_[1] = $uni if $chk;
 572  return $str;
 573 }
 574
 575
 576 package Encode::Escape;
 577 use Carp;
 578
 579 sub read
 580 {
 581  my ($class,$fh,$name) = @_;
 582  my %self = (Name => $name, Num => 0);
 583  while (<$fh>)
 584   {
 585    my ($key,$val) = /^(\S+)\s+(.*)$/;
 586    $val =~ s/^\{(.*?)\}/$1/g;
 587    $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
 588    $self{$key} = $val;
 589   }
 590  return bless \%self,$class;
 591 }
 592
 593 sub name { shift->{'Name'} }
 594
 595 sub toUnicode
 596 {
 597  croak("Not implemented yet");
 598 }
 599
 600 sub fromUnicode
 601 {
 602  croak("Not implemented yet");
 603 }
 604
 605 1;
 606
 607 __END__