# [p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;

$VERSION = 0.01;

require DynaLoader;
require Exporter;

# Exporter supplies import(); DynaLoader supplies the bootstrap() used below.
@ISA = qw(Exporter DynaLoader);

# Only @EXPORT_OK is populated here, so callers have to ask for each of these
# names explicitly in their "use Encode qw(...)" list.
@EXPORT_OK =
    qw(
       bytes_to_utf8
       utf8_to_bytes
       chars_to_utf8
       utf8_to_chars
       utf8_to_chars_check
       bytes_to_chars
       chars_to_bytes
       from_to
       is_utf8
       on_utf8
       off_utf8
       utf_to_utf
       encodings
      );

# Load the compiled part of the module.  The underscore-prefixed helpers that
# the wrappers below forward to are not defined in this file; presumably they
# come from this step -- confirm against the XS source.
bootstrap Encode ();

=pod

=head1 NAME

Encode - character encodings

=head2 TERMINOLOGY

=over

=item *

I<char>: a character in the range 0..maxint (at least 2**32-1)

=item *

I<byte>: a character in the range 0..255

=back

The marker [INTERNAL] marks Internal Implementation Details, in
general meant only for those who think they know what they are doing,
and such details may change in future releases.

=head2 bytes

=over 4

=item *

        bytes_to_utf8(STRING [, FROM])

The bytes in STRING are recoded in-place into UTF-8.  If no FROM is
specified the bytes are expected to be encoded in US-ASCII or ISO
8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
there's a failure.

[INTERNAL] Also the UTF-8 flag of STRING is turned on.

=item *

        utf8_to_bytes(STRING [, TO [, CHECK]])

The UTF-8 in STRING is decoded in-place into bytes.  If no TO encoding
is specified the bytes are expected to be encoded in US-ASCII or ISO
8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
there's a failure.

What if there are characters > 255?  What if the UTF-8 in STRING is
malformed?  See L</"Handling Malformed Data">.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=back

=head2 chars

=over 4

=item *

        chars_to_utf8(STRING)

The chars in STRING are encoded in-place into UTF-8.  Returns the new
size of STRING, or C<undef> if there's a failure.

No assumptions are made on the encoding of the chars.  If you want to
assume that the chars are Unicode and to trap illegal Unicode
characters, you must use C<from_to('Unicode', ...)>.

[INTERNAL] Also the UTF-8 flag of STRING is turned on.

=item *

        utf8_to_chars(STRING)

The UTF-8 in STRING is decoded in-place into chars.  Returns the new
size of STRING, or C<undef> if there's a failure.

If the UTF-8 in STRING is malformed C<undef> is returned, and also an
optional lexical warning (category utf8) is given.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=item *

        utf8_to_chars_check(STRING [, CHECK])

(Note the special naming of this interface: a two-argument
utf8_to_chars() has different semantics.)

The UTF-8 in STRING is decoded in-place into chars.  Returns the new
size of STRING, or C<undef> if there is a failure.

What if the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=back

=head2 chars With Encoding

=over 4

=item *

        chars_to_utf8(STRING, FROM [, CHECK])

The chars in STRING encoded in FROM are recoded in-place into UTF-8.
Returns the new size of STRING, or C<undef> if there's a failure.

No assumptions are made on the encoding of the chars.  If you want to
assume that the chars are Unicode and to trap illegal Unicode
characters, you must use C<from_to('Unicode', ...)>.

[INTERNAL] Also the UTF-8 flag of STRING is turned on.

=item *

        utf8_to_chars(STRING, TO [, CHECK])

The UTF-8 in STRING is decoded in-place into chars encoded in TO.
Returns the new size of STRING, or C<undef> if there's a failure.

What if the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=item *

	bytes_to_chars(STRING, FROM [, CHECK])

The bytes in STRING encoded in FROM are recoded in-place into chars.
Returns the new size of STRING in bytes, or C<undef> if there's a
failure.

What if the mapping is impossible?  See L</"Handling Malformed Data">.

=item *

	chars_to_bytes(STRING, TO [, CHECK])

The chars in STRING are recoded in-place to bytes encoded in TO.
Returns the new size of STRING in bytes, or C<undef> if there's a
failure.

What if the mapping is impossible?  See L</"Handling Malformed Data">.

=item *

        from_to(STRING, FROM, TO [, CHECK])

The chars in STRING encoded in FROM are recoded in-place into TO.
Returns the new size of STRING, or C<undef> if there's a failure.

What if mapping between the encodings is impossible?
See L</"Handling Malformed Data">.

[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.

=back

=head2 Testing For UTF-8

=over 4

=item *

        is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being
well-formed UTF-8.  Returns true if successful, false otherwise.

=back

=head2 Toggling UTF-8-ness

=over 4

=item *

        on_utf8(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item *

        off_utf8(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head2 UTF-16 and UTF-32 Encodings

=over 4

=item *

        utf_to_utf(STRING, FROM, TO [, CHECK])

The data in STRING is converted from Unicode Transfer Encoding FROM to
Unicode Transfer Encoding TO.  Both FROM and TO may be any of the
following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):

        tag             meaning

        '7'             UTF-7
        '8'             UTF-8
        '16be'          UTF-16 big-endian
        '16le'          UTF-16 little-endian
        '16'            UTF-16 native-endian
        '32be'          UTF-32 big-endian
        '32le'          UTF-32 little-endian
        '32'            UTF-32 native-endian

UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
UCS-4, 32-bit or 4-byte chunks.  Returns the new size of STRING, or
C<undef> if there's a failure.

What if FROM is UTF-8 and the UTF-8 in STRING is malformed?  See
L</"Handling Malformed Data">.

[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
STRING is not checked.  If TO is UTF-8, also the UTF-8 flag of STRING is
turned on.  Identical FROM and TO are fine.

=back

=head2 Handling Malformed Data

If CHECK is not set, C<undef> is returned.  If the data is supposed to
be UTF-8, an optional lexical warning (category utf8) is given.  If
CHECK is true but not a code reference, dies.  If CHECK is a code
reference, it is called with the arguments

	(MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)

Two return values are expected from the call: the string to be used in
the result string in place of the malformed section, and the length of
the malformed section in bytes.

=cut

# Public name for bytes_to_utf8(STRING [, FROM]); the contract is in the POD
# above.  The parenthesis-free "&name;" call passes this sub's own @_ along,
# so the helper sees (and can modify) the caller's STRING directly.
# _bytes_to_utf8 is not defined in this file (presumably XS -- confirm).
sub bytes_to_utf8 {
    return &_bytes_to_utf8;
}

# Public name for utf8_to_bytes(STRING [, TO [, CHECK]]); see the POD above.
# "&name;" without parentheses forwards the current @_ untouched, so STRING
# is still modified in place by the helper.
# _utf8_to_bytes is not defined in this file (presumably XS -- confirm).
sub utf8_to_bytes {
    return &_utf8_to_bytes;
}

# Public name for chars_to_utf8(STRING [, FROM [, CHECK]]); see the POD above.
# Every other wrapper in this file forwards to the helper that carries its own
# name with a leading underscore.  This one used to call C_to_utf8, a name
# that is not defined anywhere in this file, so it now follows the same
# pattern as its siblings.
# NOTE(review): assumes the compiled side provides _chars_to_utf8 -- confirm.
# "&name;" without parentheses forwards the current @_ untouched, so STRING
# is still modified in place by the helper.
sub chars_to_utf8 {
    return &_chars_to_utf8;
}

# Public name for utf8_to_chars(STRING [, TO [, CHECK]]); see the POD above.
# "&name;" without parentheses forwards the current @_ untouched, so STRING
# is still modified in place by the helper.
# _utf8_to_chars is not defined in this file (presumably XS -- confirm).
sub utf8_to_chars {
    return &_utf8_to_chars;
}

# Public name for utf8_to_chars_check(STRING [, CHECK]); see the POD above.
# "&name;" without parentheses forwards the current @_ untouched, so STRING
# is still modified in place by the helper.
# _utf8_to_chars_check is not defined in this file (presumably XS -- confirm).
sub utf8_to_chars_check {
    return &_utf8_to_chars_check;
}

# Public name for bytes_to_chars(STRING, FROM [, CHECK]); see the POD above.
# "&name;" without parentheses forwards the current @_ untouched, so STRING
# is still modified in place by the helper.
# _bytes_to_chars is not defined in this file (presumably XS -- confirm).
sub bytes_to_chars {
    return &_bytes_to_chars;
}

# Public name for chars_to_bytes(STRING, TO [, CHECK]); see the POD above.
# "&name;" without parentheses forwards the current @_ untouched, so STRING
# is still modified in place by the helper.
# _chars_to_bytes is not defined in this file (presumably XS -- confirm).
sub chars_to_bytes {
    return &_chars_to_bytes;
}

# Public name for is_utf8(STRING [, CHECK]); see the POD above.
# "&name;" without parentheses forwards the current @_ untouched.
# _is_utf8 is not defined in this file (presumably XS -- confirm).
sub is_utf8 {
    return &_is_utf8;
}

# Public name for on_utf8(STRING); see the POD above.
# "&name;" without parentheses forwards the current @_ untouched, so the
# helper operates on the caller's STRING itself.
# _on_utf8 is not defined in this file (presumably XS -- confirm).
sub on_utf8 {
    return &_on_utf8;
}

# Public name for off_utf8(STRING); see the POD above.
# "&name;" without parentheses forwards the current @_ untouched, so the
# helper operates on the caller's STRING itself.
# _off_utf8 is not defined in this file (presumably XS -- confirm).
sub off_utf8 {
    return &_off_utf8;
}

# Public name for utf_to_utf(STRING, FROM, TO [, CHECK]); see the POD above.
# "&name;" without parentheses forwards the current @_ untouched, so STRING
# is still modified in place by the helper.
# _utf_to_utf is not defined in this file (presumably XS -- confirm).
sub utf_to_utf {
    return &_utf_to_utf;
}

use Carp;

# from_to(STRING, FROM, TO [, CHECK])
#
# Recodes STRING in place from encoding FROM to encoding TO by going through
# an intermediate string produced by FROM's toUnicode() and consumed by TO's
# fromUnicode().
#
# Croaks when either encoding name cannot be resolved by getEncoding().
# With a true CHECK, anything a converter leaves behind in its input counts
# as a failure: undef is returned and the caller's STRING is left as it was.
# On success the caller's STRING is overwritten and its new length returned.
sub from_to
{
 my ($text,$from,$to,$check) = @_;

 my $source = __PACKAGE__->getEncoding($from)
   or croak("Unknown encoding '$from'");
 my $target = __PACKAGE__->getEncoding($to)
   or croak("Unknown encoding '$to'");

 # In CHECK mode each converter writes the unconverted remainder back into
 # the variable it was handed, hence the length() tests on the inputs.
 my $unicode = $source->toUnicode($text,$check);
 if ($check && length($text))
  {
   return undef;
  }

 my $encoded = $target->fromUnicode($unicode,$check);
 if ($check && length($unicode))
  {
   return undef;
  }

 # $_[0] is an alias for the caller's variable; this is the in-place update.
 $_[0] = $encoded;
 return length($_[0]);
}

# The global hash is declared in XS code
# Register the two encodings that are implemented in Perl later in this file.
# getEncoding() hands back any %encoding entry that is already a reference,
# so these names never go through loadEncoding().
$encoding{Unicode}    = bless({},'Encode::Unicode');
$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');

# encodings(CLASS)
#
# Scans the "Encode" subdirectory of every @INC entry for "*.enc" files and
# records each newly found encoding name in %encoding, mapped to the path of
# its definition file.  Names that are already present (loaded objects or
# earlier finds) are left alone, so the first @INC entry wins.
#
# Returns the list of all keys of %encoding.
sub encodings
{
 my ($class) = @_;
 foreach my $dir (@INC)
  {
   # @INC may hold hooks (references); those are not directories.
   next if ref($dir);
   my $encdir = "$dir/Encode";
   opendir(my $dh,$encdir) or next;
   while (defined(my $name = readdir($dh)))
    {
     next unless $name =~ /^(.*)\.enc$/;
     next if exists $encoding{$1};
     # Store the full path including the "Encode" directory that was
     # scanned; getEncoding() passes this value straight to loadEncoding().
     $encoding{$1} = "$encdir/$name";
    }
   closedir($dh);
  }
 return keys %encoding;
}

# loadEncoding(CLASS, NAME, FILE)
#
# Opens FILE read-only, skips leading lines that start with '#', and uses the
# first character of the next line as the encoding type.  Type 'E' is handed
# to CLASS::Escape->read(), every other type to CLASS::Table->read(); the
# reader receives the still-open handle, NAME and the type character.
#
# Returns whatever the reader returns, or undef when FILE cannot be opened.
sub loadEncoding
{
 my ($class,$name,$file) = @_;

 # Three-argument open: FILE is always treated as a plain file name, never
 # as a mode prefix or a pipe specification.
 open(my $fh,'<',$file) or return undef;

 # $type stays '' when the file holds nothing but comment lines.
 my $type = '';
 while (defined(my $line = <$fh>))
  {
   my $first = substr($line,0,1);
   next if $first eq '#';
   $type = $first;
   last;
  }

 $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
 # NOTE(review): looks like leftover trace output -- confirm before removing.
 warn "Loading $file";
 return $class->read($fh,$name,$type);
}

# getEncoding(CLASS, NAME)
#
# Returns the encoding object registered under NAME, loading it on demand.
# A %encoding entry that is already a reference is returned as is.  A defined
# non-reference entry is treated as a file path and given to loadEncoding();
# if that does not yield an object, "Encode/NAME.enc" is tried under each
# @INC entry in turn.
#
# Only successfully loaded objects are cached.  A failed lookup removes any
# entry for NAME so that encodings() does not report a name that cannot be
# loaded; the (false) result of the last attempt is returned.
sub getEncoding
{
 my ($class,$name) = @_;
 my $enc = $encoding{$name};
 return $enc if ref($enc);

 $enc = $class->loadEncoding($name,$enc) if defined $enc;
 unless (ref($enc))
  {
   foreach my $dir (@INC)
    {
     last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
    }
  }

 if (ref($enc))
  {
   $encoding{$name} = $enc;
  }
 else
  {
   delete $encoding{$name};
  }
 return $enc;
}

package Encode::Unicode;

# Dummy package that provides the encode interface but leaves data
# as UTF-8 encoded. It is here so that from_to() can be given 'Unicode' as
# either end of a conversion and call toUnicode()/fromUnicode() on it just
# like on any other encoding object.

sub name { 'Unicode' }

# toUnicode(STRING [, CHECK]) -- returns STRING unchanged.
# With a true CHECK the caller's variable is emptied: from_to() interprets
# whatever is left in it as input that could not be converted, and a
# pass-through conversion never leaves anything behind.
sub toUnicode
{
 my ($obj,$str,$chk) = @_;
 $_[1] = '' if $chk;
 return $str;
}

# fromUnicode(STRING [, CHECK]) -- returns STRING unchanged; CHECK is handled
# exactly as in toUnicode().
sub fromUnicode
{
 my ($obj,$uni,$chk) = @_;
 $_[1] = '' if $chk;
 return $uni;
}

package Encode::Table;

# read(CLASS, FH, NAME, TYPE)
#
# Builds a table-driven encoding object from the open handle FH.
#
# The next line of FH holds three whitespace-separated fields: the default
# code (hex), a field that is not used here, and the number of pages.  Each
# page consists of one line with the page number in hex followed by 16 lines
# of 16 four-digit hex values, i.e. one value per code (page * 256 + offset).
# A value of zero marks an unmapped code, except for code 0 itself.
#
# The returned object (blessed into CLASS) carries:
#   Name   NAME as given
#   Rep    code reference for CLASS's "rep_TYPE" method (undef if none)
#   ToUni  array of pages, each an array of 256 characters or undef
#   FmUni  hash from character to code
#   Def    default code, as a number
#   Num    number of mapped codes
sub read
{
 my ($class,$fh,$name,$type) = @_;
 my $rep = $class->can("rep_$type");

 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 $def = hex($def);

 my (@touni,%fmuni);
 my $count = 0;

 while ($pages--)
  {
   my $header = <$fh>;
   chomp($header);
   my $page = hex($header);
   my $code = $page * 256;
   my @chars;

   foreach my $row (1 .. 16)
    {
     my $cells = <$fh>;
     foreach my $col (1 .. 16)
      {
       # Peel the next four hex digits off the front of the line.
       my $val = hex(substr($cells,0,4,''));
       my $uch;
       if ($val || !$code)
        {
         $uch = chr($val);
         $fmuni{$uch} = $code;
         $count++;
        }
       # $uch is still undef here for an unmapped code.
       push(@chars,$uch);
       $code++;
      }
    }
   $touni[$page] = \@chars;
  }

 my %self = (
             Name  => $name,
             Rep   => $rep,
             ToUni => \@touni,
             FmUni => \%fmuni,
             Def   => $def,
             Num   => $count,
            );
 return bless \%self,$class;
}

# Returns the value stored under the object's 'Name' key (set by read()).
sub name
{
 my ($obj) = @_;
 return $obj->{'Name'};
}

# rep_TYPE helpers: read() picks one of these by the table's type letter and
# stores it as the object's 'Rep'.  Each returns the pack() template used for
# a code: 'C' (one unsigned byte) or 'n' (16 bits, big-endian).

# Type S: every code is packed with 'C'.
sub rep_S { return 'C' }

# Type D: every code is packed with 'n'.
sub rep_D { return 'n' }

# Type M: codes above 255 are packed with 'n', all others with 'C'.
sub rep_M
{
 return 'n' if $_[0] > 255;
 return 'C';
}

# representation(OBJ [, CODE])
#
# Returns the pack() template that the object's 'Rep' callback reports for
# CODE.  CODE defaults to 0 when it is not supplied.
sub representation
{
 my ($obj,$ch) = @_;
 $ch = 0 unless @_ > 1;
 # The callback is a plain code reference stored in the object hash.
 return $obj->{'Rep'}->($ch);
}

# toUnicode(OBJ, STRING [, CHECK])
#
# Decodes STRING through the object's 'ToUni' page tables and returns the
# resulting characters.  For each leading byte the 'Rep' callback decides
# whether it stands alone ('C', looked up in page 0) or selects a page whose
# entry is indexed by the following byte.
#
# Unmapped input: without CHECK it is dropped and decoding continues.  With a
# true CHECK decoding stops in front of it and the caller's STRING is
# replaced by the unconsumed remainder, which starts with the offending
# byte(s); an empty remainder therefore means everything was converted.
sub toUnicode
{
 my ($obj,$str,$chk) = @_;
 my $rep   = $obj->{'Rep'};
 my $touni = $obj->{'ToUni'};
 my $uni   = '';
 while (length($str))
  {
   # Look at the input first; it is only removed once we know it is
   # either convertible or to be skipped.
   my $ch  = ord(substr($str,0,1));
   my $len = 1;
   my $x;
   if (&$rep($ch) eq 'C')
    {
     $x = $touni->[0][$ch];
    }
   else
    {
     $len = 2;
     # A missing second byte is looked up as offset 0.
     my $page = $touni->[$ch];
     $x = $page->[ord(substr($str,1,1))] if $page;
    }
   unless (defined $x)
    {
     last if $chk;
     # What do we do here ?
     $x = '';
    }
   substr($str,0,$len,'');
   $uni .= $x;
  }
 $_[1] = $str if $chk;
 return $uni;
}

# fromUnicode(OBJ, STRING [, CHECK])
#
# Encodes the characters of STRING through the object's 'FmUni' map and
# returns the packed result; the 'Rep' callback supplies the pack() template
# for each code.
#
# Unmapped characters: without CHECK the object's 'Def' code is emitted in
# their place.  With a true CHECK encoding stops in front of the first one
# and the caller's STRING is replaced by the unconsumed remainder, which
# starts with that character; an empty remainder means full success.
sub fromUnicode
{
 my ($obj,$uni,$chk) = @_;
 my $fmuni = $obj->{'FmUni'};
 my $str   = '';
 my $def   = $obj->{'Def'};
 my $rep   = $obj->{'Rep'};
 while (length($uni))
  {
   # Look at the character first; it is only removed once we know it is
   # either convertible or to be replaced by the default.
   my $ch = substr($uni,0,1);
   # NOTE(review): chr(ord()) kept from the original lookup -- presumably it
   # normalises the hash key; confirm before simplifying.
   my $x  = $fmuni->{chr(ord($ch))};
   unless (defined $x)
    {
     last if ($chk);
     $x = $def;
    }
   substr($uni,0,1,'');
   $str .= pack(&$rep($x),$x);
  }
 $_[1] = $uni if $chk;
 return $str;
}

package Encode::iso10646_1;
# Encoding is 16-bit network order Unicode
# Used for X font encodings

sub name { 'iso10646-1' }

# toUnicode(OBJ, STRING [, CHECK])
#
# Decodes STRING as a sequence of 16-bit big-endian code units and returns
# one character per unit.  With a true CHECK the caller's STRING is replaced
# by the unconsumed remainder; a lone trailing byte is not a complete unit
# and is left there.  Without CHECK a lone trailing byte decodes as chr(0).
sub toUnicode
{
 my ($obj,$str,$chk) = @_;
 my $uni   = '';
 while (length($str))
  {
   last if ($chk && length($str) < 2);
   my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
   $uni .= chr($code);
  }
 $_[1] = $str if $chk;
 return $uni;
}

# fromUnicode(OBJ, STRING [, CHECK])
#
# Encodes each character of STRING as one 16-bit big-endian code unit.
# Every code point that fits in 16 bits is accepted, matching what
# toUnicode() above can produce.  Larger code points are written as 0
# without CHECK; with a true CHECK encoding stops in front of the first one
# and the caller's STRING is replaced by the remainder starting with it.
sub fromUnicode
{
 my ($obj,$uni,$chk) = @_;
 my $str   = '';
 while (length($uni))
  {
   # Inspect before consuming so a rejected character stays in $uni.
   my $x = ord(substr($uni,0,1));
   unless ($x <= 0xFFFF)
    {
     last if ($chk);
     $x = 0;
    }
   substr($uni,0,1,'');
   $str .= pack('n',$x);
  }
 $_[1] = $uni if $chk;
 return $str;
}


package Encode::Escape;
use Carp;

# read(CLASS, FH, NAME)
#
# Builds an escape-style encoding object from the open handle FH.  Every
# remaining line of the form "KEY VALUE" becomes an entry of the object:
# one leading "{...}" group in VALUE has its braces removed, and each
# "\xHH" (two lower-case hex digits) is replaced by the byte it names.
# Lines that do not have that form (e.g. blank lines) are skipped.
# The object also carries Name => NAME and Num => 0.
sub read
{
 my ($class,$fh,$name) = @_;
 my %self = (Name => $name, Num => 0);
 # Read into a lexical so the caller's $_ is left alone.
 while (defined(my $line = <$fh>))
  {
   next unless $line =~ /^(\S+)\s+(.*)$/;
   my ($key,$val) = ($1,$2);
   $val =~ s/^\{(.*?)\}/$1/g;
   $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
   $self{$key} = $val;
  }
 return bless \%self,$class;
}

sub name { shift->{'Name'} }

# Conversion through escape-style encodings is a placeholder: both
# directions croak.
sub toUnicode
{
 croak("Not implemented yet");
}

sub fromUnicode
{
 croak("Not implemented yet");
}

1;

__END__
Commit	Line	Data
2c674647	1	package Encode;
	2
	3	$VERSION = 0.01;
	4
	5	require DynaLoader;
	6	require Exporter;
	7
	8	@ISA = qw(Exporter DynaLoader);
	9
	10	@EXPORT_OK =
	11	qw(
	12	bytes_to_utf8
	13	utf8_to_bytes
	14	chars_to_utf8
	15	utf8_to_chars
	16	utf8_to_chars_check
	17	bytes_to_chars
	18	chars_to_bytes
	19	from_to
	20	is_utf8
	21	on_utf8
	22	off_utf8
	23	utf_to_utf
656753f8	24	encodings
2c674647	25	);
	26
	27	bootstrap Encode ();
	28
	29	=pod
	30
	31	=head1 NAME
	32
	33	Encode - character encodings
	34
	35	=head2 TERMINOLOGY
	36
	37	=over
	38
	39	=item *
	40
	41	I<char>: a character in the range 0..maxint (at least 2**32-1)
	42
	43	=item *
	44
	45	I<byte>: a character in the range 0..255
	46
	47	=back
	48
	49	The marker [INTERNAL] marks Internal Implementation Details, in
	50	general meant only for those who think they know what they are doing,
	51	and such details may change in future releases.
	52
	53	=head2 bytes
	54
	55	=over 4
	56
	57	=item *
	58
	59	bytes_to_utf8(STRING [, FROM])
	60
	61	The bytes in STRING are recoded in-place into UTF-8. If no FROM is
	62	specified the bytes are expected to be encoded in US-ASCII or ISO
	63	8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
	64	there's a failure.
	65
656753f8	66	[INTERNAL] Also the UTF-8 flag of STRING is turned on.
2c674647	67
	68	=item *
	69
	70	utf8_to_bytes(STRING [, TO [, CHECK]])
	71
	72	The UTF-8 in STRING is decoded in-place into bytes. If no TO encoding
	73	is specified the bytes are expected to be encoded in US-ASCII or ISO
	74	8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
	75	there's a failure.
	76
	77	What if there are characters > 255? What if the UTF-8 in STRING is
	78	malformed? See L</"Handling Malformed Data">.
	79
	80	[INTERNAL] The UTF-8 flag of STRING is not checked.
	81
	82	=back
	83
	84	=head2 chars
	85
	86	=over 4
	87
	88	=item *
	89
	90	chars_to_utf8(STRING)
	91
	92	The chars in STRING are encoded in-place into UTF-8. Returns the new
	93	size of STRING, or C<undef> if there's a failure.
	94
	95	No assumptions are made on the encoding of the chars. If you want to
	96	assume that the chars are Unicode and to trap illegal Unicode
	97	characters, you must use C<from_to('Unicode', ...)>.
	98
	99	[INTERNAL] Also the UTF-8 flag of STRING is turned on.
	100
	101	=over 4
	102
	103	=item *
	104
	105	utf8_to_chars(STRING)
	106
	107	The UTF-8 in STRING is decoded in-place into chars. Returns the new
656753f8	108	size of STRING, or C<undef> if there's a failure.
2c674647	109
	110	If the UTF-8 in STRING is malformed C<undef> is returned, and also an
	111	optional lexical warning (category utf8) is given.
	112
	113	[INTERNAL] The UTF-8 flag of STRING is not checked.
	114
	115	=item *
	116
	117	utf8_to_chars_check(STRING [, CHECK])
	118
	119	(Note that special naming of this interface since a two-argument
	120	utf8_to_chars() has different semantics.)
	121
	122	The UTF-8 in STRING is decoded in-place into chars. Returns the new
	123	size of STRING, or C<undef> if there is a failure.
	124
	125	If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
	126
	127	[INTERNAL] The UTF-8 flag of STRING is not checked.
	128
	129	=back
	130
	131	=head2 chars With Encoding
	132
	133	=over 4
	134
	135	=item *
	136
	137	chars_to_utf8(STRING, FROM [, CHECK])
	138
	139	The chars in STRING encoded in FROM are recoded in-place into UTF-8.
	140	Returns the new size of STRING, or C<undef> if there's a failure.
	141
	142	No assumptions are made on the encoding of the chars. If you want to
	143	assume that the chars are Unicode and to trap illegal Unicode
	144	characters, you must use C<from_to('Unicode', ...)>.
	145
	146	[INTERNAL] Also the UTF-8 flag of STRING is turned on.
	147
	148	=item *
	149
	150	utf8_to_chars(STRING, TO [, CHECK])
	151
	152	The UTF-8 in STRING is decoded in-place into chars encoded in TO.
	153	Returns the new size of STRING, or C<undef> if there's a failure.
	154
	155	If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
	156
	157	[INTERNAL] The UTF-8 flag of STRING is not checked.
	158
	159	=item *
	160
	161	bytes_to_chars(STRING, FROM [, CHECK])
	162
	163	The bytes in STRING encoded in FROM are recoded in-place into chars.
	164	Returns the new size of STRING in bytes, or C<undef> if there's a
	165	failure.
	166
	167	If the mapping is impossible? See L</"Handling Malformed Data">.
	168
	169	=item *
	170
	171	chars_to_bytes(STRING, TO [, CHECK])
	172
173	The chars in STRING are recoded in-place to bytes encoded in TO.
174	Returns the new size of STRING in bytes, or C<undef> if there's a
175	failure.
176
177	If the mapping is impossible? See L</"Handling Malformed Data">.
178
179	=item *
180
181	from_to(STRING, FROM, TO [, CHECK])
182
183	The chars in STRING encoded in FROM are recoded in-place into TO.
184	Returns the new size of STRING, or C<undef> if there's a failure.
185
186	If mapping between the encodings is impossible?
187	See L</"Handling Malformed Data">.
188
189	[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.
190
191	=back
192
193	=head2 Testing For UTF-8
194
195	=over 4
196
197	=item *
198
199	is_utf8(STRING [, CHECK])
200
201	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
202	If CHECK is true, also checks the data in STRING for being
203	well-formed UTF-8. Returns true if successful, false otherwise.
204
205	=back
206
207	=head2 Toggling UTF-8-ness
208
209	=over 4
210
211	=item *
212
213	on_utf8(STRING)
214
215	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
216	B<not> checked for being well-formed UTF-8. Do not use unless you
217	B<know> that the STRING is well-formed UTF-8. Returns the previous
218	state of the UTF-8 flag (so please don't test the return value as
219	I<not> success or failure), or C<undef> if STRING is not a string.
220
221	=item *
222
223	off_utf8(STRING)
224
225	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
226	Returns the previous state of the UTF-8 flag (so please don't test the
227	return value as I<not> success or failure), or C<undef> if STRING is
228	not a string.
229
230	=back
231
232	=head2 UTF-16 and UTF-32 Encodings
233
234	=over 4
235
236	=item *
237
238	utf_to_utf(STRING, FROM, TO [, CHECK])
239
240	The data in STRING is converted from Unicode Transfer Encoding FROM to
241	Unicode Transfer Encoding TO. Both FROM and TO may be any of the
242	following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):
243
244	tag meaning
245
246	'7' UTF-7
247	'8' UTF-8
248	'16be' UTF-16 big-endian
249	'16le' UTF-16 little-endian
250	'16' UTF-16 native-endian
251	'32be' UTF-32 big-endian
252	'32le' UTF-32 little-endian
253	'32' UTF-32 native-endian
254
255	UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
256	UCS-4, 32-bit or 4-byte chunks. Returns the new size of STRING, or
257	C<undef> is there's a failure.
258
259	If FROM is UTF-8 and the UTF-8 in STRING is malformed? See
260	L</"Handling Malformed Data">.
261
262	[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
263	STRING is not checked. If TO is UTF-8, also the UTF-8 flag of STRING is
264	turned on. Identical FROM and TO are fine.
265
266	=back
267
268	=head2 Handling Malformed Data
269
270	If CHECK is not set, C<undef> is returned. If the data is supposed to
271	be UTF-8, an optional lexical warning (category utf8) is given. If
272	CHECK is true but not a code reference, dies. If CHECK is a code
273	reference, it is called with the arguments
274
275	(MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)
276
277	Two return values are expected from the call: the string to be used in
278	the result string in place of the malformed section, and the length of
279	the malformed section in bytes.
280
281	=cut
282
283	sub bytes_to_utf8 {
284	&_bytes_to_utf8;
285	}
286
287	sub utf8_to_bytes {
288	&_utf8_to_bytes;
289	}
290
291	sub chars_to_utf8 {
292	&C_to_utf8;
293	}
294
295	sub utf8_to_chars {
296	&_utf8_to_chars;
297	}
298
299	sub utf8_to_chars_check {
300	&_utf8_to_chars_check;
301	}
302
303	sub bytes_to_chars {
304	&_bytes_to_chars;
305	}
306
307	sub chars_to_bytes {
308	&_chars_to_bytes;
309	}
310
2c674647	311	sub is_utf8 {
	312	&_is_utf8;
	313	}
	314
	315	sub on_utf8 {
	316	&_on_utf8;
	317	}
	318
	319	sub off_utf8 {
	320	&_off_utf8;
	321	}
	322
	323	sub utf_to_utf {
	324	&_utf_to_utf;
	325	}
	326
bf230f3d	327	use Carp;
bf230f3d	328
656753f8	329	sub from_to
	330	{
	331	my ($string,$from,$to,$check) = @_;
	332	my $f = __PACKAGE__->getEncoding($from);
bf230f3d	333	croak("Unknown encoding '$from'") unless $f;
656753f8	334	my $t = __PACKAGE__->getEncoding($to);
bf230f3d	335	croak("Unknown encoding '$to'") unless $t;
656753f8	336	my $uni = $f->toUnicode($string,$check);
bf230f3d	337	return undef if ($check && length($string));
656753f8	338	$string = $t->fromUnicode($uni,$check);
bf230f3d	339	return undef if ($check && length($uni));
656753f8	340	return length($_[0] = $string);
	341	}
	342
2f2b4ff2	343	# The global hash is declared in XS code
2f2b4ff2	344	$encoding{Unicode} = bless({},'Encode::Unicode');
9b37254d	345	$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
5345d506	346
656753f8	347	sub encodings
	348	{
	349	my ($class) = @_;
5345d506	350	foreach my $dir (@INC)
656753f8	351	{
5345d506	352	if (opendir(my $dh,"$dir/Encode"))
656753f8	353	{
5345d506	354	while (defined(my $name = readdir($dh)))
	355	{
	356	if ($name =~ /^(.*)\.enc$/)
	357	{
	358	next if exists $encoding{$1};
	359	$encoding{$1} = "$dir/$name";
	360	}
	361	}
	362	closedir($dh);
656753f8	363	}
5345d506	364	}
	365	return keys %encoding;
	366	}
	367
	368	sub loadEncoding
	369	{
	370	my ($class,$name,$file) = @_;
	371	if (open(my $fh,$file))
	372	{
	373	my $type;
	374	while (1)
	375	{
	376	my $line = <$fh>;
	377	$type = substr($line,0,1);
	378	last unless $type eq '#';
	379	}
	380	$class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
2f2b4ff2	381	warn "Loading $file";
5345d506	382	return $class->read($fh,$name,$type);
656753f8	383	}
	384	else
	385	{
5345d506	386	return undef;
656753f8	387	}
656753f8	388	}
656753f8	389
656753f8	390	sub getEncoding
	391	{
	392	my ($class,$name) = @_;
5345d506	393	my $enc;
5345d506	394	unless (ref($enc = $encoding{$name}))
656753f8	395	{
5345d506	396	$enc = $class->loadEncoding($name,$enc) if defined $enc;
5345d506	397	unless (ref($enc))
656753f8	398	{
5345d506	399	foreach my $dir (@INC)
656753f8	400	{
5345d506	401	last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
656753f8	402	}
87714904	403	}
5345d506	404	$encoding{$name} = $enc;
656753f8	405	}
5345d506	406	return $enc;
656753f8	407	}
	408
	409	package Encode::Unicode;
	410
9b37254d	411	# Dummy package that provides the encode interface but leaves data
9b37254d	412	# as UTF-8 encoded. It is here so that from_to()
656753f8	413
	414	sub name { 'Unicode' }
	415
	416	sub toUnicode { $_[1] }
	417
	418	sub fromUnicode { $_[1] }
	419
	420	package Encode::Table;
	421
	422	sub read
	423	{
	424	my ($class,$fh,$name,$type) = @_;
	425	my $rep = $class->can("rep_$type");
	426	my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
	427	my @touni;
	428	my %fmuni;
	429	my $count = 0;
	430	$def = hex($def);
656753f8	431	while ($pages--)
656753f8	432	{
87714904	433	my $line = <$fh>;
	434	chomp($line);
	435	my $page = hex($line);
656753f8	436	my @page;
	437	my $ch = $page * 256;
	438	for (my $i = 0; $i < 16; $i++)
	439	{
	440	my $line = <$fh>;
	441	for (my $j = 0; $j < 16; $j++)
	442	{
	443	my $val = hex(substr($line,0,4,''));
	444	if ($val \|\| !$ch)
	445	{
	446	my $uch = chr($val);
	447	push(@page,$uch);
87714904	448	$fmuni{$uch} = $ch;
656753f8	449	$count++;
	450	}
	451	else
	452	{
	453	push(@page,undef);
	454	}
	455	$ch++;
	456	}
	457	}
	458	$touni[$page] = \@page;
	459	}
	460
	461	return bless {Name => $name,
	462	Rep => $rep,
	463	ToUni => \@touni,
	464	FmUni => \%fmuni,
	465	Def => $def,
	466	Num => $count,
	467	},$class;
	468	}
	469
	470	sub name { shift->{'Name'} }
	471
	472	sub rep_S { 'C' }
	473
5dcbab34	474	sub rep_D { 'n' }
656753f8	475
5dcbab34	476	sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
656753f8	477
	478	sub representation
	479	{
	480	my ($obj,$ch) = @_;
	481	$ch = 0 unless @_ > 1;
	482	$obj-{'Rep'}->($ch);
	483	}
	484
	485	sub toUnicode
	486	{
bf230f3d	487	my ($obj,$str,$chk) = @_;
656753f8	488	my $rep = $obj->{'Rep'};
	489	my $touni = $obj->{'ToUni'};
	490	my $uni = '';
	491	while (length($str))
	492	{
	493	my $ch = ord(substr($str,0,1,''));
bf230f3d	494	my $x;
656753f8	495	if (&$rep($ch) eq 'C')
656753f8	496	{
bf230f3d	497	$x = $touni->[0][$ch];
656753f8	498	}
	499	else
	500	{
bf230f3d	501	$x = $touni->[$ch][ord(substr($str,0,1,''))];
656753f8	502	}
bf230f3d	503	unless (defined $x)
	504	{
	505	last if $chk;
	506	# What do we do here ?
	507	$x = '';
	508	}
	509	$uni .= $x;
656753f8	510	}
bf230f3d	511	$_[1] = $str if $chk;
656753f8	512	return $uni;
	513	}
	514
	515	sub fromUnicode
	516	{
bf230f3d	517	my ($obj,$uni,$chk) = @_;
656753f8	518	my $fmuni = $obj->{'FmUni'};
	519	my $str = '';
	520	my $def = $obj->{'Def'};
87714904	521	my $rep = $obj->{'Rep'};
656753f8	522	while (length($uni))
	523	{
	524	my $ch = substr($uni,0,1,'');
63eec5db	525	my $x = $fmuni->{chr(ord($ch))};
bf230f3d	526	unless (defined $x)
	527	{
	528	last if ($chk);
	529	$x = $def;
	530	}
87714904	531	$str .= pack(&$rep($x),$x);
	532	}
	533	$_[1] = $uni if $chk;
	534	return $str;
	535	}
	536
9b37254d	537	package Encode::iso10646_1;
	538	# Encoding is 16-bit network order Unicode
	539	# Used for X font encodings
87714904	540
	541	sub name { 'iso10646-1' }
	542
	543	sub toUnicode
	544	{
	545	my ($obj,$str,$chk) = @_;
	546	my $uni = '';
	547	while (length($str))
	548	{
5dcbab34	549	my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
87714904	550	$uni .= chr($code);
	551	}
	552	$_[1] = $str if $chk;
	553	return $uni;
	554	}
	555
	556	sub fromUnicode
	557	{
	558	my ($obj,$uni,$chk) = @_;
	559	my $str = '';
	560	while (length($uni))
	561	{
	562	my $ch = substr($uni,0,1,'');
	563	my $x = ord($ch);
	564	unless ($x < 32768)
	565	{
	566	last if ($chk);
	567	$x = 0;
	568	}
5dcbab34	569	$str .= pack('n',$x);
656753f8	570	}
bf230f3d	571	$_[1] = $uni if $chk;
656753f8	572	return $str;
	573	}
	574
2f2b4ff2	575
656753f8	576	package Encode::Escape;
	577	use Carp;
	578
	579	sub read
	580	{
	581	my ($class,$fh,$name) = @_;
	582	my %self = (Name => $name, Num => 0);
	583	while (<$fh>)
	584	{
	585	my ($key,$val) = /^(\S+)\s+(.*)$/;
	586	$val =~ s/^\{(.*?)\}/$1/g;
	587	$val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
	588	$self{$key} = $val;
	589	}
	590	return bless \%self,$class;
	591	}
	592
	593	sub name { shift->{'Name'} }
	594
	595	sub toUnicode
	596	{
	597	croak("Not implemented yet");
	598	}
	599
	600	sub fromUnicode
	601	{
	602	croak("Not implemented yet");
	603	}
	604
	605	1;
	606
	607	__END__