# [p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

# Encode - character encoding conversion routines (documented in the POD
# below).  Part of the implementation lives in compiled code that is
# loaded by the bootstrap call at the end of this header.
package Encode;

$VERSION = 0.01;

# DynaLoader provides bootstrap(); Exporter provides import().
require DynaLoader;
require Exporter;

@ISA = qw(Exporter DynaLoader);

# Only @EXPORT_OK is populated here, so these names are exported on
# request rather than by default.
@EXPORT_OK =
    qw(
       bytes_to_utf8
       utf8_to_bytes
       chars_to_utf8
       utf8_to_chars
       utf8_to_chars_check
       bytes_to_chars
       chars_to_bytes
       from_to
       is_utf8
       on_utf8
       off_utf8
       utf_to_utf
       encodings
       utf8_decode
       utf8_encode
       utf8_upgrade
       utf8_downgrade
      );

# Load the compiled half of the module (inherited from DynaLoader).
bootstrap Encode ();

=pod

=head1 NAME

Encode - character encodings

=head2 TERMINOLOGY

=over

=item *

I<char>: a character in the range 0..maxint (at least 2**32-1)

=item *

I<byte>: a character in the range 0..255

=back

The marker [INTERNAL] marks Internal Implementation Details, in
general meant only for those who think they know what they are doing,
and such details may change in future releases.

=head2 bytes

=over 4

=item *

        bytes_to_utf8(STRING [, FROM])

The bytes in STRING are recoded in-place into UTF-8.  If no FROM is
specified the bytes are expected to be encoded in US-ASCII or ISO
8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
there's a failure.

[INTERNAL] Also the UTF-8 flag of STRING is turned on.

=item *

        utf8_to_bytes(STRING [, TO [, CHECK]])

The UTF-8 in STRING is decoded in-place into bytes.  If no TO encoding
is specified the bytes are expected to be encoded in US-ASCII or ISO
8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
there's a failure.

What if there are characters > 255?  What if the UTF-8 in STRING is
malformed?  See L</"Handling Malformed Data">.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=back

=head2 chars

=over 4

=item *

        chars_to_utf8(STRING)

The chars in STRING are encoded in-place into UTF-8.  Returns the new
size of STRING, or C<undef> if there's a failure.

No assumptions are made on the encoding of the chars.  If you want to
assume that the chars are Unicode and to trap illegal Unicode
characters, you must use C<from_to('Unicode', ...)>.

[INTERNAL] Also the UTF-8 flag of STRING is turned on.

=item *

        utf8_to_chars(STRING)

The UTF-8 in STRING is decoded in-place into chars.  Returns the new
size of STRING, or C<undef> if there's a failure.

If the UTF-8 in STRING is malformed C<undef> is returned, and also an
optional lexical warning (category utf8) is given.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=item *

        utf8_to_chars_check(STRING [, CHECK])

(Note the special naming of this interface: it is needed because a
two-argument utf8_to_chars() has different semantics.)

The UTF-8 in STRING is decoded in-place into chars.  Returns the new
size of STRING, or C<undef> if there is a failure.

What if the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=back

=head2 chars With Encoding

=over 4

=item *

        chars_to_utf8(STRING, FROM [, CHECK])

The chars in STRING encoded in FROM are recoded in-place into UTF-8.
Returns the new size of STRING, or C<undef> if there's a failure.

No assumptions are made on the encoding of the chars.  If you want to
assume that the chars are Unicode and to trap illegal Unicode
characters, you must use C<from_to('Unicode', ...)>.

[INTERNAL] Also the UTF-8 flag of STRING is turned on.

=item *

        utf8_to_chars(STRING, TO [, CHECK])

The UTF-8 in STRING is decoded in-place into chars encoded in TO.
Returns the new size of STRING, or C<undef> if there's a failure.

What if the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=item *

	bytes_to_chars(STRING, FROM [, CHECK])

The bytes in STRING encoded in FROM are recoded in-place into chars.
Returns the new size of STRING in bytes, or C<undef> if there's a
failure.

What if the mapping is impossible?  See L</"Handling Malformed Data">.

=item *

	chars_to_bytes(STRING, TO [, CHECK])

The chars in STRING are recoded in-place to bytes encoded in TO.
Returns the new size of STRING in bytes, or C<undef> if there's a
failure.

What if the mapping is impossible?  See L</"Handling Malformed Data">.

=item *

        from_to(STRING, FROM, TO [, CHECK])

The chars in STRING encoded in FROM are recoded in-place into TO.
Returns the new size of STRING, or C<undef> if there's a failure.

What if mapping between the encodings is impossible?
See L</"Handling Malformed Data">.

[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.

=back

=head2 Testing For UTF-8

=over 4

=item *

        is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being
well-formed UTF-8.  Returns true if successful, false otherwise.

=back

=head2 Toggling UTF-8-ness

=over 4

=item *

        on_utf8(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please do B<not> treat the return value as
an indication of success or failure), or C<undef> if STRING is not a
string.

=item *

        off_utf8(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please do B<not> treat
the return value as an indication of success or failure), or C<undef>
if STRING is not a string.

=back

=head2 UTF-16 and UTF-32 Encodings

=over 4

=item *

        utf_to_utf(STRING, FROM, TO [, CHECK])

The data in STRING is converted from Unicode Transfer Encoding FROM to
Unicode Transfer Encoding TO.  Both FROM and TO may be any of the
following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):

        tag             meaning

        '7'             UTF-7
        '8'             UTF-8
        '16be'          UTF-16 big-endian
        '16le'          UTF-16 little-endian
        '16'            UTF-16 native-endian
        '32be'          UTF-32 big-endian
        '32le'          UTF-32 little-endian
        '32'            UTF-32 native-endian

UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
UCS-4, 32-bit or 4-byte chunks.  Returns the new size of STRING, or
C<undef> if there's a failure.

What if FROM is UTF-8 and the UTF-8 in STRING is malformed?  See
L</"Handling Malformed Data">.

[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
STRING is not checked.  If TO is UTF-8, also the UTF-8 flag of STRING is
turned on.  Identical FROM and TO are fine.

=back

=head2 Handling Malformed Data

If CHECK is not set, C<undef> is returned.  If the data is supposed to
be UTF-8, an optional lexical warning (category utf8) is given.  If
CHECK is true but not a code reference, dies.  If CHECK is a code
reference, it is called with the arguments

	(MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)

Two return values are expected from the call: the string to be used in
the result string in place of the malformed section, and the length of
the malformed section in bytes.

=cut

# Public wrapper for bytes_to_utf8(STRING [, FROM]); see the POD above.
# Forwards the caller's arguments unchanged to _bytes_to_utf8, which is
# not defined in this file (presumably supplied by the compiled code
# loaded via bootstrap -- confirm).  The arguments stay aliased to the
# caller's variables, so in-place modification still reaches STRING.
sub bytes_to_utf8 {
    return &_bytes_to_utf8(@_);
}

# Public wrapper for utf8_to_bytes(STRING [, TO [, CHECK]]); see the POD
# above.  Forwards the caller's arguments unchanged to _utf8_to_bytes,
# which is not defined in this file (presumably compiled code -- confirm).
sub utf8_to_bytes {
    return &_utf8_to_bytes(@_);
}

# Public wrapper for chars_to_utf8(STRING [, FROM [, CHECK]]); see the
# POD above.  The caller's @_ is handed through unchanged (the "&name;"
# call form reuses the current argument list).
#
# Every other wrapper in this file forwards to the helper named after
# itself with a leading underscore.  This one used to call C_to_utf8,
# which is not defined anywhere in this file and breaks that pattern, so
# it is treated as a typo.
# NOTE(review): assumes the compiled code provides _chars_to_utf8 like
# its siblings -- confirm against the XS source.
sub chars_to_utf8 {
    &_chars_to_utf8;
}

# Public wrapper for utf8_to_chars(STRING [, TO [, CHECK]]); see the POD
# above.  Forwards the caller's arguments unchanged to _utf8_to_chars,
# which is not defined in this file (presumably compiled code -- confirm).
sub utf8_to_chars {
    return &_utf8_to_chars(@_);
}

# Public wrapper for utf8_to_chars_check(STRING [, CHECK]); see the POD
# above.  Forwards the caller's arguments unchanged to
# _utf8_to_chars_check, which is not defined in this file (presumably
# compiled code -- confirm).
sub utf8_to_chars_check {
    return &_utf8_to_chars_check(@_);
}

# Public wrapper for bytes_to_chars(STRING, FROM [, CHECK]); see the POD
# above.  Forwards the caller's arguments unchanged to _bytes_to_chars,
# which is not defined in this file (presumably compiled code -- confirm).
sub bytes_to_chars {
    return &_bytes_to_chars(@_);
}

# Public wrapper for chars_to_bytes(STRING, TO [, CHECK]); see the POD
# above.  Forwards the caller's arguments unchanged to _chars_to_bytes,
# which is not defined in this file (presumably compiled code -- confirm).
sub chars_to_bytes {
    return &_chars_to_bytes(@_);
}

# Public wrapper for is_utf8(STRING [, CHECK]); see the POD above.
# Forwards the caller's arguments unchanged to _is_utf8, which is not
# defined in this file (presumably compiled code -- confirm).
sub is_utf8 {
    return &_is_utf8(@_);
}

# Public wrapper for on_utf8(STRING); see the POD above.  Forwards the
# caller's arguments unchanged to _on_utf8, which is not defined in this
# file (presumably compiled code -- confirm).
sub on_utf8 {
    return &_on_utf8(@_);
}

# Public wrapper for off_utf8(STRING); see the POD above.  Forwards the
# caller's arguments unchanged to _off_utf8, which is not defined in this
# file (presumably compiled code -- confirm).
sub off_utf8 {
    return &_off_utf8(@_);
}

# Public wrapper for utf_to_utf(STRING, FROM, TO [, CHECK]); see the POD
# above.  Forwards the caller's arguments unchanged to _utf_to_utf, which
# is not defined in this file (presumably compiled code -- confirm).
sub utf_to_utf {
    return &_utf_to_utf(@_);
}

use Carp;

# from_to(STRING, FROM, TO [, CHECK])
#
# Recodes STRING in place from encoding FROM to encoding TO, going
# through an intermediate string produced by FROM's toUnicode() and
# consumed by TO's fromUnicode().
#
# Croaks when either encoding name cannot be resolved by getEncoding().
# With a true CHECK, each conversion step reports unconverted input by
# leaving it in its (aliased) input argument; any such leftover makes
# this function return undef without touching the caller's STRING.
# On success the caller's STRING is overwritten and its new length is
# returned.
sub from_to
{
 my ($string,$from,$to,$check) = @_;

 my $source = __PACKAGE__->getEncoding($from);
 croak("Unknown encoding '$from'") unless $source;
 my $target = __PACKAGE__->getEncoding($to);
 croak("Unknown encoding '$to'") unless $target;

 # Step 1: FROM -> intermediate.  In CHECK mode $string now holds
 # whatever could not be decoded.
 my $uni = $source->toUnicode($string,$check);
 if ($check && length($string))
  {
   return undef;
  }

 # Step 2: intermediate -> TO.  In CHECK mode $uni now holds whatever
 # could not be encoded.
 $string = $target->fromUnicode($uni,$check);
 if ($check && length($uni))
  {
   return undef;
  }

 # $_[0] is an alias of the caller's variable: write the result back.
 $_[0] = $string;
 return length($_[0]);
}

# The global hash is declared in XS code
# As used in this file, %encoding maps an encoding name either to an
# encoding object or (after encodings() has run) to the path of a .enc
# file that has not been loaded yet.  The two encodings implemented
# directly in this file are registered here as empty blessed hashes.
$encoding{Unicode}    = bless({},'Encode::Unicode');
$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');

# encodings(CLASS)
#
# Scans the "Encode" subdirectory of every @INC entry for "*.enc" files
# and records each encoding that is not yet present in %encoding, keyed
# by the file name without its ".enc" suffix and valued with the path of
# the file.  The first file found for a name wins.  Returns the names of
# all known encodings (the keys of %encoding).
#
# The recorded path includes the "Encode/" directory component.  The
# previous code stored "$dir/$name", a path that does not point at the
# file that was just seen, so a later attempt to open it would fail.
sub encodings
{
 my ($class) = @_;
 foreach my $dir (@INC)
  {
   # @INC may hold references (hooks); those are not directories.
   next if ref($dir);
   my $encdir = "$dir/Encode";
   if (opendir(my $dh,$encdir))
    {
     while (defined(my $name = readdir($dh)))
      {
       if ($name =~ /^(.*)\.enc$/)
        {
         next if exists $encoding{$1};
         $encoding{$1} = "$encdir/$name";
        }
      }
     closedir($dh);
    }
  }
 return keys %encoding;
}

# loadEncoding(CLASS, NAME, FILE)
#
# Opens the encoding definition FILE, skips leading lines that start
# with '#', and uses the first character of the next line as the type
# code.  Type 'E' selects the "${CLASS}::Escape" reader, anything else
# "${CLASS}::Table"; that class's read() is called with the still-open
# handle (positioned just after the type line), NAME and the type code,
# and its result is returned.
#
# Returns undef when FILE is undefined, cannot be opened, or contains no
# type line (empty or comment-only file).
sub loadEncoding
{
 my ($class,$name,$file) = @_;
 return undef unless defined $file;

 # Three-argument open: the file name can never be interpreted as a
 # mode, pipe or dup specification, and surrounding whitespace in the
 # name is preserved.
 open(my $fh,'<',$file) or return undef;

 my $type;
 while (defined(my $line = <$fh>))
  {
   my $first = substr($line,0,1);
   next if $first eq '#';
   $type = $first;
   last;
  }
 # End of file reached without a type line: nothing to load.
 return undef unless defined $type;

 $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
 #warn "Loading $file";
 return $class->read($fh,$name,$type);
}

# getEncoding(CLASS, NAME)
#
# Returns the encoding object registered under NAME in %encoding,
# loading it on demand:
#   * a reference stored in %encoding is returned as is;
#   * a defined non-reference entry is treated as a file path and handed
#     to loadEncoding();
#   * otherwise (or if that fails) "Encode/NAME.enc" is tried under each
#     @INC entry in order.
# A successfully loaded object is cached in %encoding.  Returns a false
# value when the encoding cannot be found.
#
# Failures are deliberately NOT cached.  Storing undef under NAME (as
# the previous code did) made every unknown name that was ever asked for
# show up in the key list returned by encodings().
sub getEncoding
{
 my ($class,$name) = @_;
 my $enc = $encoding{$name};
 return $enc if ref($enc);

 $enc = $class->loadEncoding($name,$enc) if defined $enc;
 unless (ref($enc))
  {
   foreach my $dir (@INC)
    {
     last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
    }
  }
 $encoding{$name} = $enc if ref($enc);
 return $enc;
}

package Encode::Unicode;

# Dummy package that provides the encode interface but leaves data
# as UTF-8 encoded. It is here so that from_to() works.

# Name under which this encoding is known.
sub name
{
 return 'Unicode';
}

# toUnicode(OBJ, STRING [, CHECK])
#
# Pass-through conversion: a copy of STRING is handed to
# Encode::utf8_upgrade() and returned.  With a true CHECK the caller's
# STRING (aliased through $_[1]) is emptied, i.e. all input is reported
# as consumed.
sub toUnicode
{
 my ($self,$text,$check) = @_;
 Encode::utf8_upgrade($text);
 if ($check)
  {
   $_[1] = '';
  }
 return $text;
}

# fromUnicode is the very same subroutine as toUnicode: the glob
# assignment installs one code reference under both names.
*fromUnicode = \&toUnicode;

package Encode::Table;

# read(CLASS, FH, NAME, TYPE)
#
# Builds a table-driven encoding object from the open handle FH.
#
# Input consumed from FH:
#   * one header line of three whitespace-separated fields; the first is
#     a hexadecimal value kept as 'Def', the second is unused, the third
#     is the number of pages that follow;
#   * per page: one line holding the page number in hex, followed by 16
#     lines each read as 16 four-digit hex values.
#
# Cell number (page*256 + position) maps to the character whose code is
# the cell's value.  A zero value means "no mapping", except for cell 0
# itself, which is always mapped.
#
# Returns a hash blessed into CLASS with the keys
#   Name  - NAME
#   Rep   - CLASS->can("rep_TYPE")
#   ToUni - array of pages; each page is an array of 256 entries holding
#           a character or undef
#   FmUni - hash from character to cell number (a later cell overrides an
#           earlier one for the same character)
#   Def   - numeric value of the first header field
#   Num   - number of mapped cells
sub read
{
 my ($class,$fh,$name,$type) = @_;
 my $rep = $class->can("rep_$type");
 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 $def = hex($def);

 my (@touni,%fmuni);
 my $count = 0;
 while ($pages--)
  {
   my $header = <$fh>;
   chomp($header);
   my $page = hex($header);
   my $base = $page * 256;
   my @cells;
   foreach my $row (0 .. 15)
    {
     my $line = <$fh>;
     foreach my $col (0 .. 15)
      {
       my $code = $base + 16 * $row + $col;
       # Peel the next four hex digits off the front of the line.
       my $val  = hex(substr($line,0,4,''));
       if (!$val && $code)
        {
         push(@cells,undef);
         next;
        }
       my $uch = chr($val);
       push(@cells,$uch);
       $fmuni{$uch} = $code;
       $count++;
      }
    }
   $touni[$page] = \@cells;
  }

 my %table = (Name  => $name,
              Rep   => $rep,
              ToUni => \@touni,
              FmUni => \%fmuni,
              Def   => $def,
              Num   => $count,
             );
 return bless \%table,$class;
}

# Name this table encoding was loaded under (the object's 'Name' entry).
sub name
{
 my ($self) = @_;
 return $self->{'Name'};
}

# Representation selector for type 'S': always the pack template 'C'
# (one unsigned 8-bit value).
sub rep_S
{
 return 'C';
}

# Representation selector for type 'D': always the pack template 'n'
# (one unsigned 16-bit value in network byte order).
sub rep_D
{
 return 'n';
}

# Representation selector for type 'M': pack template 'n' for values
# above 255, 'C' otherwise.
sub rep_M
{
 return 'n' if $_[0] > 255;
 return 'C';
}

# representation(OBJ [, CH])
#
# Returns the pack template that this encoding's 'Rep' selector chooses
# for the value CH (0 when CH is not passed).
#
# The previous body read "$obj-{'Rep'}->($ch)": a subtraction, with an
# anonymous hash being called as code, which could only die.  The
# intended hash dereference "$obj->{'Rep'}" is used here.
sub representation
{
 my ($obj,$ch) = @_;
 $ch = 0 unless @_ > 1;
 return $obj->{'Rep'}->($ch);
}

# toUnicode(OBJ, STRING [, CHECK])
#
# Decodes STRING through the object's 'ToUni' pages and returns the
# resulting characters.  For each leading byte the 'Rep' selector
# decides whether it stands alone ('C': looked up in page 0) or is
# followed by a second byte (looked up in the page numbered by the
# leading byte).
#
# Unmapped input:
#   * CHECK false - the unit is dropped and decoding continues;
#   * CHECK true  - decoding stops, and the caller's STRING (aliased via
#     $_[1]) is replaced by the unconverted remainder, which starts with
#     the offending unit.  The remainder is empty only if everything was
#     converted.
#
# Previously the offending unit had already been removed when decoding
# stopped, so a failure on the very last unit left an empty remainder
# that callers testing its length took for success.
sub toUnicode
{
 my ($obj,$str,$chk) = @_;
 my $rep   = $obj->{'Rep'};
 my $touni = $obj->{'ToUni'};
 my $uni   = '';
 while (length($str))
  {
   # $seq collects the raw input of the current unit so that it can be
   # put back if it turns out to be unmappable.
   my $seq = substr($str,0,1,'');
   my $ch  = ord($seq);
   my $x;
   if (&$rep($ch) eq 'C')
    {
     $x = $touni->[0][$ch];
    }
   else
    {
     my $trail = substr($str,0,1,'');
     $seq .= $trail;
     $x = $touni->[$ch][ord($trail)];
    }
   unless (defined $x)
    {
     if ($chk)
      {
       $str = $seq . $str;
       last;
      }
     # What do we do here ?
     $x = '';
    }
   $uni .= $x;
  }
 $_[1] = $str if $chk;
 return $uni;
}

# fromUnicode(OBJ, UNI [, CHECK])
#
# Encodes the characters of UNI through the object's 'FmUni' map and
# returns the packed result; the 'Rep' selector supplies the pack
# template for each mapped value.
#
# Unmappable characters:
#   * CHECK false - the object's 'Def' value is emitted instead;
#   * CHECK true  - encoding stops, and the caller's UNI (aliased via
#     $_[1]) is replaced by the unconverted remainder, which starts with
#     the offending character.  The remainder is empty only if
#     everything was converted.
#
# Previously the offending character had already been removed when
# encoding stopped, so a failure on the very last character left an
# empty remainder that callers testing its length took for success.
sub fromUnicode
{
 my ($obj,$uni,$chk) = @_;
 my $fmuni = $obj->{'FmUni'};
 my $str   = '';
 my $def   = $obj->{'Def'};
 my $rep   = $obj->{'Rep'};
 while (length($uni))
  {
   my $ch = substr($uni,0,1,'');
   my $x  = $fmuni->{chr(ord($ch))};
   unless (defined $x)
    {
     if ($chk)
      {
       # Put the character back so the caller can see what failed.
       $uni = $ch . $uni;
       last;
      }
     $x = $def;
    }
   $str .= pack(&$rep($x),$x);
  }
 $_[1] = $uni if $chk;
 return $str;
}

package Encode::iso10646_1;
# Encoding is 16-bit network order Unicode
# Used for X font encodings

# Name under which this encoding is known.
sub name
{
 return 'iso10646-1';
}

# toUnicode(OBJ, STRING [, CHECK])
#
# Decodes STRING as a sequence of 16-bit big-endian ("network order")
# code units, one character per unit.  The result is passed to
# Encode::utf8_upgrade() and returned.
#
# A trailing single byte cannot form a code unit:
#   * CHECK true  - decoding stops before it and it is handed back as
#     the unconverted remainder in the caller's STRING (aliased via
#     $_[1]), so the truncation is visible to the caller;
#   * CHECK false - behaviour is unchanged from before: unpack yields
#     no value for the short read and the byte decodes to chr(0).
# Previously the CHECK remainder was always empty, so truncated input
# could never be reported.
sub toUnicode
{
 my ($obj,$str,$chk) = @_;
 my $uni   = '';
 while (length($str))
  {
   last if $chk && length($str) < 2;
   my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
   $uni .= chr($code);
  }
 $_[1] = $str if $chk;
 Encode::utf8_upgrade($uni);
 return $uni;
}

# fromUnicode(OBJ, UNI [, CHECK])
#
# Encodes each character of UNI as one 16-bit big-endian ("network
# order") code unit and returns the packed bytes.
#
# Every code point that fits in 16 bits (0 .. 0xFFFF) is encodable: that
# is the range of pack's 'n' template and the range this package's
# toUnicode produces.  The previous limit of 32768 rejected
# 0x8000 .. 0xFFFF, so text decoded by toUnicode could not be encoded
# again.
#
# Characters above 0xFFFF:
#   * CHECK false - a zero code unit is emitted instead;
#   * CHECK true  - encoding stops, and the caller's UNI (aliased via
#     $_[1]) is replaced by the unconverted remainder, which starts with
#     the offending character.  (Previously that character had already
#     been dropped, so a failure on the last character left an empty
#     remainder that looked like success.)
sub fromUnicode
{
 my ($obj,$uni,$chk) = @_;
 my $str   = '';
 while (length($uni))
  {
   my $ch = substr($uni,0,1,'');
   my $x  = ord($ch);
   unless ($x < 0x10000)
    {
     if ($chk)
      {
       # Put the character back so the caller can see what failed.
       $uni = $ch . $uni;
       last;
      }
     $x = 0;
    }
   $str .= pack('n',$x);
  }
 $_[1] = $uni if $chk;
 return $str;
}


package Encode::Escape;
use Carp;

# read(CLASS, FH, NAME)
#
# Builds an escape-sequence encoding description from the open handle
# FH.  Each line of the form "KEY VALUE" (KEY is the first run of
# non-whitespace) adds one entry to the object; in VALUE, one leading
# "{...}" group loses its braces and every "\xHH" (two lower-case hex
# digits) is replaced by the corresponding character.  The object
# starts out with Name => NAME and Num => 0, and is blessed into CLASS.
#
# Lines that do not have the KEY VALUE shape (e.g. blank lines) are
# skipped; they used to create an entry with an empty key.  A lexical
# line variable is used so the caller's $_ is left alone, which
# "while (<$fh>)" did not do.
sub read
{
 my ($class,$fh,$name) = @_;
 my %self = (Name => $name, Num => 0);
 while (defined(my $line = <$fh>))
  {
   my ($key,$val) = $line =~ /^(\S+)\s+(.*)$/;
   next unless defined $key;
   $val =~ s/^\{(.*?)\}/$1/g;
   $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
   $self{$key} = $val;
  }
 return bless \%self,$class;
}

# Name this escape encoding was loaded under (the object's 'Name' entry).
sub name
{
 my ($self) = @_;
 return $self->{'Name'};
}

# Decoding is not available for escape encodings yet: always croaks.
sub toUnicode
{
 Carp::croak('Not implemented yet');
}

# Encoding is not available for escape encodings yet: always croaks.
sub fromUnicode
{
 Carp::croak('Not implemented yet');
}

1;

__END__
Commit	Line	Data
2c674647	1	package Encode;
	2
	3	$VERSION = 0.01;
	4
	5	require DynaLoader;
	6	require Exporter;
	7
	8	@ISA = qw(Exporter DynaLoader);
	9
	10	@EXPORT_OK =
	11	qw(
	12	bytes_to_utf8
	13	utf8_to_bytes
	14	chars_to_utf8
	15	utf8_to_chars
	16	utf8_to_chars_check
	17	bytes_to_chars
	18	chars_to_bytes
	19	from_to
	20	is_utf8
	21	on_utf8
	22	off_utf8
	23	utf_to_utf
656753f8	24	encodings
a12c0f56	25	utf8_decode
	26	utf8_encode
	27	utf8_upgrade
	28	utf8_downgrade
2c674647	29	);
	30
	31	bootstrap Encode ();
	32
	33	=pod
	34
	35	=head1 NAME
	36
	37	Encode - character encodings
	38
	39	=head2 TERMINOLOGY
	40
	41	=over
	42
	43	=item *
	44
	45	I<char>: a character in the range 0..maxint (at least 2**32-1)
	46
	47	=item *
	48
	49	I<byte>: a character in the range 0..255
	50
	51	=back
	52
	53	The marker [INTERNAL] marks Internal Implementation Details, in
	54	general meant only for those who think they know what they are doing,
	55	and such details may change in future releases.
	56
	57	=head2 bytes
	58
	59	=over 4
	60
	61	=item *
	62
	63	bytes_to_utf8(STRING [, FROM])
	64
	65	The bytes in STRING are recoded in-place into UTF-8. If no FROM is
	66	specified the bytes are expected to be encoded in US-ASCII or ISO
	67	8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
	68	there's a failure.
	69
656753f8	70	[INTERNAL] Also the UTF-8 flag of STRING is turned on.
2c674647	71
	72	=item *
	73
	74	utf8_to_bytes(STRING [, TO [, CHECK]])
	75
	76	The UTF-8 in STRING is decoded in-place into bytes. If no TO encoding
	77	is specified the bytes are expected to be encoded in US-ASCII or ISO
	78	8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
	79	there's a failure.
	80
	81	What if there are characters > 255? What if the UTF-8 in STRING is
	82	malformed? See L</"Handling Malformed Data">.
	83
	84	[INTERNAL] The UTF-8 flag of STRING is not checked.
	85
	86	=back
	87
	88	=head2 chars
	89
	90	=over 4
	91
	92	=item *
	93
	94	chars_to_utf8(STRING)
	95
	96	The chars in STRING are encoded in-place into UTF-8. Returns the new
	97	size of STRING, or C<undef> if there's a failure.
	98
	99	No assumptions are made on the encoding of the chars. If you want to
	100	assume that the chars are Unicode and to trap illegal Unicode
	101	characters, you must use C<from_to('Unicode', ...)>.
	102
	103	[INTERNAL] Also the UTF-8 flag of STRING is turned on.
	104
	105	=over 4
	106
	107	=item *
	108
	109	utf8_to_chars(STRING)
	110
	111	The UTF-8 in STRING is decoded in-place into chars. Returns the new
656753f8	112	size of STRING, or C<undef> if there's a failure.
2c674647	113
	114	If the UTF-8 in STRING is malformed C<undef> is returned, and also an
	115	optional lexical warning (category utf8) is given.
	116
	117	[INTERNAL] The UTF-8 flag of STRING is not checked.
	118
	119	=item *
	120
	121	utf8_to_chars_check(STRING [, CHECK])
	122
	123	(Note that special naming of this interface since a two-argument
	124	utf8_to_chars() has different semantics.)
	125
	126	The UTF-8 in STRING is decoded in-place into chars. Returns the new
	127	size of STRING, or C<undef> if there is a failure.
	128
	129	If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
	130
	131	[INTERNAL] The UTF-8 flag of STRING is not checked.
	132
	133	=back
	134
	135	=head2 chars With Encoding
	136
	137	=over 4
	138
	139	=item *
	140
	141	chars_to_utf8(STRING, FROM [, CHECK])
	142
	143	The chars in STRING encoded in FROM are recoded in-place into UTF-8.
	144	Returns the new size of STRING, or C<undef> if there's a failure.
	145
	146	No assumptions are made on the encoding of the chars. If you want to
	147	assume that the chars are Unicode and to trap illegal Unicode
	148	characters, you must use C<from_to('Unicode', ...)>.
	149
	150	[INTERNAL] Also the UTF-8 flag of STRING is turned on.
	151
	152	=item *
	153
	154	utf8_to_chars(STRING, TO [, CHECK])
	155
	156	The UTF-8 in STRING is decoded in-place into chars encoded in TO.
	157	Returns the new size of STRING, or C<undef> if there's a failure.
	158
	159	If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
	160
	161	[INTERNAL] The UTF-8 flag of STRING is not checked.
	162
	163	=item *
	164
	165	bytes_to_chars(STRING, FROM [, CHECK])
	166
	167	The bytes in STRING encoded in FROM are recoded in-place into chars.
	168	Returns the new size of STRING in bytes, or C<undef> if there's a
	169	failure.
	170
	171	If the mapping is impossible? See L</"Handling Malformed Data">.
	172
	173	=item *
	174
	175	chars_to_bytes(STRING, TO [, CHECK])
	176
177	The chars in STRING are recoded in-place to bytes encoded in TO.
178	Returns the new size of STRING in bytes, or C<undef> if there's a
179	failure.
180
181	If the mapping is impossible? See L</"Handling Malformed Data">.
182
183	=item *
184
185	from_to(STRING, FROM, TO [, CHECK])
186
187	The chars in STRING encoded in FROM are recoded in-place into TO.
188	Returns the new size of STRING, or C<undef> if there's a failure.
189
190	If mapping between the encodings is impossible?
191	See L</"Handling Malformed Data">.
192
193	[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.
194
195	=back
196
197	=head2 Testing For UTF-8
198
199	=over 4
200
201	=item *
202
203	is_utf8(STRING [, CHECK])
204
205	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
206	If CHECK is true, also checks the data in STRING for being
207	well-formed UTF-8. Returns true if successful, false otherwise.
208
209	=back
210
211	=head2 Toggling UTF-8-ness
212
213	=over 4
214
215	=item *
216
217	on_utf8(STRING)
218
219	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
220	B<not> checked for being well-formed UTF-8. Do not use unless you
221	B<know> that the STRING is well-formed UTF-8. Returns the previous
222	state of the UTF-8 flag (so please don't test the return value as
223	I<not> success or failure), or C<undef> if STRING is not a string.
224
225	=item *
226
227	off_utf8(STRING)
228
229	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
230	Returns the previous state of the UTF-8 flag (so please don't test the
231	return value as I<not> success or failure), or C<undef> if STRING is
232	not a string.
233
234	=back
235
236	=head2 UTF-16 and UTF-32 Encodings
237
238	=over 4
239
240	=item *
241
242	utf_to_utf(STRING, FROM, TO [, CHECK])
243
244	The data in STRING is converted from Unicode Transfer Encoding FROM to
245	Unicode Transfer Encoding TO. Both FROM and TO may be any of the
246	following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):
247
248	tag meaning
249
250	'7' UTF-7
251	'8' UTF-8
252	'16be' UTF-16 big-endian
253	'16le' UTF-16 little-endian
254	'16' UTF-16 native-endian
255	'32be' UTF-32 big-endian
256	'32le' UTF-32 little-endian
257	'32' UTF-32 native-endian
258
259	UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
260	UCS-4, 32-bit or 4-byte chunks. Returns the new size of STRING, or
261	C<undef> is there's a failure.
262
263	If FROM is UTF-8 and the UTF-8 in STRING is malformed? See
264	L</"Handling Malformed Data">.
265
266	[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
267	STRING is not checked. If TO is UTF-8, also the UTF-8 flag of STRING is
268	turned on. Identical FROM and TO are fine.
269
270	=back
271
272	=head2 Handling Malformed Data
273
274	If CHECK is not set, C<undef> is returned. If the data is supposed to
275	be UTF-8, an optional lexical warning (category utf8) is given. If
276	CHECK is true but not a code reference, dies. If CHECK is a code
277	reference, it is called with the arguments
278
279	(MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)
280
281	Two return values are expected from the call: the string to be used in
282	the result string in place of the malformed section, and the length of
283	the malformed section in bytes.
284
285	=cut
286
287	sub bytes_to_utf8 {
288	&_bytes_to_utf8;
289	}
290
291	sub utf8_to_bytes {
292	&_utf8_to_bytes;
293	}
294
295	sub chars_to_utf8 {
296	&C_to_utf8;
297	}
298
299	sub utf8_to_chars {
300	&_utf8_to_chars;
301	}
302
303	sub utf8_to_chars_check {
304	&_utf8_to_chars_check;
305	}
306
307	sub bytes_to_chars {
308	&_bytes_to_chars;
309	}
310
311	sub chars_to_bytes {
312	&_chars_to_bytes;
313	}
314
2c674647	315	sub is_utf8 {
	316	&_is_utf8;
	317	}
	318
	319	sub on_utf8 {
	320	&_on_utf8;
	321	}
	322
	323	sub off_utf8 {
	324	&_off_utf8;
	325	}
	326
	327	sub utf_to_utf {
	328	&_utf_to_utf;
	329	}
	330
bf230f3d	331	use Carp;
bf230f3d	332
656753f8	333	sub from_to
	334	{
	335	my ($string,$from,$to,$check) = @_;
	336	my $f = __PACKAGE__->getEncoding($from);
bf230f3d	337	croak("Unknown encoding '$from'") unless $f;
656753f8	338	my $t = __PACKAGE__->getEncoding($to);
bf230f3d	339	croak("Unknown encoding '$to'") unless $t;
656753f8	340	my $uni = $f->toUnicode($string,$check);
bf230f3d	341	return undef if ($check && length($string));
656753f8	342	$string = $t->fromUnicode($uni,$check);
bf230f3d	343	return undef if ($check && length($uni));
656753f8	344	return length($_[0] = $string);
	345	}
	346
2f2b4ff2	347	# The global hash is declared in XS code
2f2b4ff2	348	$encoding{Unicode} = bless({},'Encode::Unicode');
9b37254d	349	$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
5345d506	350
656753f8	351	sub encodings
	352	{
	353	my ($class) = @_;
5345d506	354	foreach my $dir (@INC)
656753f8	355	{
5345d506	356	if (opendir(my $dh,"$dir/Encode"))
656753f8	357	{
5345d506	358	while (defined(my $name = readdir($dh)))
	359	{
	360	if ($name =~ /^(.*)\.enc$/)
	361	{
	362	next if exists $encoding{$1};
	363	$encoding{$1} = "$dir/$name";
	364	}
	365	}
	366	closedir($dh);
656753f8	367	}
5345d506	368	}
	369	return keys %encoding;
	370	}
	371
	372	sub loadEncoding
	373	{
	374	my ($class,$name,$file) = @_;
	375	if (open(my $fh,$file))
	376	{
	377	my $type;
	378	while (1)
	379	{
	380	my $line = <$fh>;
	381	$type = substr($line,0,1);
	382	last unless $type eq '#';
	383	}
	384	$class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
c8991b40	385	#warn "Loading $file";
5345d506	386	return $class->read($fh,$name,$type);
656753f8	387	}
	388	else
	389	{
5345d506	390	return undef;
656753f8	391	}
656753f8	392	}
656753f8	393
656753f8	394	sub getEncoding
	395	{
	396	my ($class,$name) = @_;
5345d506	397	my $enc;
5345d506	398	unless (ref($enc = $encoding{$name}))
656753f8	399	{
5345d506	400	$enc = $class->loadEncoding($name,$enc) if defined $enc;
5345d506	401	unless (ref($enc))
656753f8	402	{
5345d506	403	foreach my $dir (@INC)
656753f8	404	{
5345d506	405	last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
656753f8	406	}
87714904	407	}
5345d506	408	$encoding{$name} = $enc;
656753f8	409	}
5345d506	410	return $enc;
656753f8	411	}
	412
	413	package Encode::Unicode;
	414
9b37254d	415	# Dummy package that provides the encode interface but leaves data
a12c0f56	416	# as UTF-8 encoded. It is here so that from_to() works.
656753f8	417
	418	sub name { 'Unicode' }
	419
a12c0f56	420	sub toUnicode
	421	{
	422	my ($obj,$str,$chk) = @_;
	423	Encode::utf8_upgrade($str);
	424	$_[1] = '' if $chk;
	425	return $str;
	426	}
656753f8	427
a12c0f56	428	*fromUnicode = \&toUnicode;
656753f8	429
	430	package Encode::Table;
	431
	432	sub read
	433	{
	434	my ($class,$fh,$name,$type) = @_;
	435	my $rep = $class->can("rep_$type");
	436	my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
	437	my @touni;
	438	my %fmuni;
	439	my $count = 0;
	440	$def = hex($def);
656753f8	441	while ($pages--)
656753f8	442	{
87714904	443	my $line = <$fh>;
	444	chomp($line);
	445	my $page = hex($line);
656753f8	446	my @page;
	447	my $ch = $page * 256;
	448	for (my $i = 0; $i < 16; $i++)
	449	{
	450	my $line = <$fh>;
	451	for (my $j = 0; $j < 16; $j++)
	452	{
	453	my $val = hex(substr($line,0,4,''));
	454	if ($val \|\| !$ch)
	455	{
	456	my $uch = chr($val);
	457	push(@page,$uch);
87714904	458	$fmuni{$uch} = $ch;
656753f8	459	$count++;
	460	}
	461	else
	462	{
	463	push(@page,undef);
	464	}
	465	$ch++;
	466	}
	467	}
	468	$touni[$page] = \@page;
	469	}
	470
	471	return bless {Name => $name,
	472	Rep => $rep,
	473	ToUni => \@touni,
	474	FmUni => \%fmuni,
	475	Def => $def,
	476	Num => $count,
	477	},$class;
	478	}
	479
	480	sub name { shift->{'Name'} }
	481
	482	sub rep_S { 'C' }
	483
5dcbab34	484	sub rep_D { 'n' }
656753f8	485
5dcbab34	486	sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
656753f8	487
	488	sub representation
	489	{
	490	my ($obj,$ch) = @_;
	491	$ch = 0 unless @_ > 1;
	492	$obj-{'Rep'}->($ch);
	493	}
	494
	495	sub toUnicode
	496	{
bf230f3d	497	my ($obj,$str,$chk) = @_;
656753f8	498	my $rep = $obj->{'Rep'};
	499	my $touni = $obj->{'ToUni'};
	500	my $uni = '';
	501	while (length($str))
	502	{
	503	my $ch = ord(substr($str,0,1,''));
bf230f3d	504	my $x;
656753f8	505	if (&$rep($ch) eq 'C')
656753f8	506	{
bf230f3d	507	$x = $touni->[0][$ch];
656753f8	508	}
	509	else
	510	{
bf230f3d	511	$x = $touni->[$ch][ord(substr($str,0,1,''))];
656753f8	512	}
bf230f3d	513	unless (defined $x)
	514	{
	515	last if $chk;
	516	# What do we do here ?
	517	$x = '';
	518	}
	519	$uni .= $x;
656753f8	520	}
bf230f3d	521	$_[1] = $str if $chk;
656753f8	522	return $uni;
	523	}
	524
	525	sub fromUnicode
	526	{
bf230f3d	527	my ($obj,$uni,$chk) = @_;
656753f8	528	my $fmuni = $obj->{'FmUni'};
	529	my $str = '';
	530	my $def = $obj->{'Def'};
87714904	531	my $rep = $obj->{'Rep'};
656753f8	532	while (length($uni))
	533	{
	534	my $ch = substr($uni,0,1,'');
63eec5db	535	my $x = $fmuni->{chr(ord($ch))};
bf230f3d	536	unless (defined $x)
	537	{
	538	last if ($chk);
	539	$x = $def;
	540	}
87714904	541	$str .= pack(&$rep($x),$x);
	542	}
	543	$_[1] = $uni if $chk;
	544	return $str;
	545	}
	546
9b37254d	547	package Encode::iso10646_1;
	548	# Encoding is 16-bit network order Unicode
	549	# Used for X font encodings
87714904	550
	551	sub name { 'iso10646-1' }
	552
	553	sub toUnicode
	554	{
	555	my ($obj,$str,$chk) = @_;
	556	my $uni = '';
	557	while (length($str))
	558	{
5dcbab34	559	my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
87714904	560	$uni .= chr($code);
	561	}
	562	$_[1] = $str if $chk;
a12c0f56	563	Encode::utf8_upgrade($uni);
87714904	564	return $uni;
	565	}
	566
	567	sub fromUnicode
	568	{
	569	my ($obj,$uni,$chk) = @_;
	570	my $str = '';
	571	while (length($uni))
	572	{
	573	my $ch = substr($uni,0,1,'');
	574	my $x = ord($ch);
	575	unless ($x < 32768)
	576	{
	577	last if ($chk);
	578	$x = 0;
	579	}
5dcbab34	580	$str .= pack('n',$x);
656753f8	581	}
bf230f3d	582	$_[1] = $uni if $chk;
656753f8	583	return $str;
	584	}
	585
2f2b4ff2	586
656753f8	587	package Encode::Escape;
	588	use Carp;
	589
	590	sub read
	591	{
	592	my ($class,$fh,$name) = @_;
	593	my %self = (Name => $name, Num => 0);
	594	while (<$fh>)
	595	{
	596	my ($key,$val) = /^(\S+)\s+(.*)$/;
	597	$val =~ s/^\{(.*?)\}/$1/g;
	598	$val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
	599	$self{$key} = $val;
	600	}
	601	return bless \%self,$class;
	602	}
	603
	604	sub name { shift->{'Name'} }
	605
	606	sub toUnicode
	607	{
	608	croak("Not implemented yet");
	609	}
	610
	611	sub fromUnicode
	612	{
	613	croak("Not implemented yet");
	614	}
	615
	616	1;
	617
	618	__END__