[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm

package Encode;

$VERSION = 0.01;

require DynaLoader;
require Exporter;

@ISA = qw(Exporter DynaLoader);

@EXPORT_OK =
    qw(
       bytes_to_utf8
       utf8_to_bytes
       chars_to_utf8
       utf8_to_chars
       utf8_to_chars_check
       bytes_to_chars
       chars_to_bytes
       from_to
       is_utf8
       on_utf8
       off_utf8
       utf_to_utf
       encodings
       utf8_decode
       utf8_encode
       utf8_upgrade
       utf8_downgrade
      );

bootstrap Encode ();

=pod

=head1 NAME

Encode - character encodings

=head2 TERMINOLOGY

=over

=item *

I<char>: a character in the range 0..maxint (at least 2**32-1)

=item *

I<byte>: a character in the range 0..255

=back

The marker [INTERNAL] marks Internal Implementation Details, in
general meant only for those who think they know what they are doing,
and such details may change in future releases.

=head2 bytes

=over 4

=item *

        bytes_to_utf8(STRING [, FROM])

The bytes in STRING are recoded in-place into UTF-8.  If no FROM is
specified the bytes are expected to be encoded in US-ASCII or ISO
8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
there's a failure.

[INTERNAL] Also the UTF-8 flag of STRING is turned on.

=item *

        utf8_to_bytes(STRING [, TO [, CHECK]])

The UTF-8 in STRING is decoded in-place into bytes.  If no TO encoding
is specified the bytes are expected to be encoded in US-ASCII or ISO
8859-1 (Latin 1).  Returns the new size of STRING, or C<undef> if
there's a failure.

What if there are characters > 255?  What if the UTF-8 in STRING is
malformed?  See L</"Handling Malformed Data">.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=back

=head2 chars

=over 4

=item *

        chars_to_utf8(STRING)

The chars in STRING are encoded in-place into UTF-8.  Returns the new
size of STRING, or C<undef> if there's a failure.

No assumptions are made on the encoding of the chars.  If you want to
assume that the chars are Unicode and to trap illegal Unicode
characters, you must use C<from_to('Unicode', ...)>.

[INTERNAL] Also the UTF-8 flag of STRING is turned on.

=over 4

=item *

        utf8_to_chars(STRING)

The UTF-8 in STRING is decoded in-place into chars.  Returns the new
size of STRING, or C<undef> if there's a failure.

If the UTF-8 in STRING is malformed C<undef> is returned, and also an
optional lexical warning (category utf8) is given.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=item *

        utf8_to_chars_check(STRING [, CHECK])

(Note that special naming of this interface since a two-argument
utf8_to_chars() has different semantics.)

The UTF-8 in STRING is decoded in-place into chars.  Returns the new
size of STRING, or C<undef> if there is a failure.

If the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=back

=head2 chars With Encoding

=over 4

=item *

        chars_to_utf8(STRING, FROM [, CHECK])

The chars in STRING encoded in FROM are recoded in-place into UTF-8.
Returns the new size of STRING, or C<undef> if there's a failure.

No assumptions are made on the encoding of the chars.  If you want to
assume that the chars are Unicode and to trap illegal Unicode
characters, you must use C<from_to('Unicode', ...)>.

[INTERNAL] Also the UTF-8 flag of STRING is turned on.

=item *

        utf8_to_chars(STRING, TO [, CHECK])

The UTF-8 in STRING is decoded in-place into chars encoded in TO.
Returns the new size of STRING, or C<undef> if there's a failure.

If the UTF-8 in STRING is malformed?  See L</"Handling Malformed Data">.

[INTERNAL] The UTF-8 flag of STRING is not checked.

=item *

	bytes_to_chars(STRING, FROM [, CHECK])

The bytes in STRING encoded in FROM are recoded in-place into chars.
Returns the new size of STRING in bytes, or C<undef> if there's a
failure.

If the mapping is impossible?  See L</"Handling Malformed Data">.

=item *

	chars_to_bytes(STRING, TO [, CHECK])

The chars in STRING are recoded in-place to bytes encoded in TO.
Returns the new size of STRING in bytes, or C<undef> if there's a
failure.

If the mapping is impossible?  See L</"Handling Malformed Data">.

=item *

        from_to(STRING, FROM, TO [, CHECK])

The chars in STRING encoded in FROM are recoded in-place into TO.
Returns the new size of STRING, or C<undef> if there's a failure.

If mapping between the encodings is impossible?
See L</"Handling Malformed Data">.

[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.

=back

=head2 Testing For UTF-8

=over 4

=item *

        is_utf8(STRING [, CHECK])

[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being
well-formed UTF-8.  Returns true if successful, false otherwise.

=back

=head2 Toggling UTF-8-ness

=over 4

=item *

        on_utf8(STRING)

[INTERNAL] Turn on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.

=item *

        off_utf8(STRING)

[INTERNAL] Turn off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
return value as I<not> success or failure), or C<undef> if STRING is
not a string.

=back

=head2 UTF-16 and UTF-32 Encodings

=over 4

=item *

        utf_to_utf(STRING, FROM, TO [, CHECK])

The data in STRING is converted from Unicode Transfer Encoding FROM to
Unicode Transfer Encoding TO.  Both FROM and TO may be any of the
following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):

        tag             meaning

        '7'             UTF-7
        '8'             UTF-8
        '16be'          UTF-16 big-endian
        '16le'          UTF-16 little-endian
        '16'            UTF-16 native-endian
        '32be'          UTF-32 big-endian
        '32le'          UTF-32 little-endian
        '32'            UTF-32 native-endian

UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
UCS-4, 32-bit or 4-byte chunks.  Returns the new size of STRING, or
C<undef> is there's a failure.

If FROM is UTF-8 and the UTF-8 in STRING is malformed?  See
L</"Handling Malformed Data">.

[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
STRING is not checked.  If TO is UTF-8, also the UTF-8 flag of STRING is
turned on.  Identical FROM and TO are fine.

=back

=head2 Handling Malformed Data

If CHECK is not set, C<undef> is returned.  If the data is supposed to
be UTF-8, an optional lexical warning (category utf8) is given.  If
CHECK is true but not a code reference, dies.  If CHECK is a code
reference, it is called with the arguments

	(MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)

Two return values are expected from the call: the string to be used in
the result string in place of the malformed section, and the length of
the malformed section in bytes.

=cut

sub bytes_to_utf8 {
    &_bytes_to_utf8;
}

sub utf8_to_bytes {
    &_utf8_to_bytes;
}

sub chars_to_utf8 {
    &C_to_utf8;
}

sub utf8_to_chars {
    &_utf8_to_chars;
}

sub utf8_to_chars_check {
    &_utf8_to_chars_check;
}

sub bytes_to_chars {
    &_bytes_to_chars;
}

sub chars_to_bytes {
    &_chars_to_bytes;
}

sub is_utf8 {
    &_is_utf8;
}

sub on_utf8 {
    &_on_utf8;
}

sub off_utf8 {
    &_off_utf8;
}

sub utf_to_utf {
    &_utf_to_utf;
}

use Carp;

sub from_to
{
 my ($string,$from,$to,$check) = @_;
 my $f = __PACKAGE__->getEncoding($from);
 croak("Unknown encoding '$from'") unless $f;
 my $t = __PACKAGE__->getEncoding($to);
 croak("Unknown encoding '$to'") unless $t;
 my $uni = $f->toUnicode($string,$check);
 return undef if ($check && length($string));
 $string = $t->fromUnicode($uni,$check);
 return undef if ($check && length($uni));
 return length($_[0] = $string);
}

# The global hash is declared in XS code
$encoding{Unicode}    = bless({},'Encode::Unicode');
$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');

sub encodings
{
 my ($class) = @_;
 foreach my $dir (@INC)
  {
   if (opendir(my $dh,"$dir/Encode"))
    {
     while (defined(my $name = readdir($dh)))
      {
       if ($name =~ /^(.*)\.enc$/)
        {
         next if exists $encoding{$1};
         $encoding{$1} = "$dir/$name";
        }
      }
     closedir($dh);
    }
  }
 return keys %encoding;
}

sub loadEncoding
{
 my ($class,$name,$file) = @_;
 if (open(my $fh,$file))
  {
   my $type;
   while (1)
    {
     my $line = <$fh>;
     $type = substr($line,0,1);
     last unless $type eq '#';
    }
   $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
   #warn "Loading $file";
   return $class->read($fh,$name,$type);
  }
 else
  {
   return undef;
  }
}

sub getEncoding
{
 my ($class,$name) = @_;
 my $enc;
 unless (ref($enc = $encoding{$name}))
  {
   $enc = $class->loadEncoding($name,$enc) if defined $enc;
   unless (ref($enc))
    {
     foreach my $dir (@INC)
      {
       last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
      }
    }
   $encoding{$name} = $enc;
  }
 return $enc;
}

package Encode::Unicode;

# Dummy package that provides the encode interface but leaves data
# as UTF-8 encoded. It is here so that from_to() works.

sub name { 'Unicode' }

sub toUnicode
{
 my ($obj,$str,$chk) = @_;
 Encode::utf8_upgrade($str);
 $_[1] = '' if $chk;
 return $str;
}

*fromUnicode = \&toUnicode;

package Encode::Table;

sub read
{
 my ($class,$fh,$name,$type) = @_;
 my $rep = $class->can("rep_$type");
 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 my @touni;
 my %fmuni;
 my $count = 0;
 $def = hex($def);
 while ($pages--)
  {
   my $line = <$fh>;
   chomp($line);
   my $page = hex($line);
   my @page;
   my $ch = $page * 256;
   for (my $i = 0; $i < 16; $i++)
    {
     my $line = <$fh>;
     for (my $j = 0; $j < 16; $j++)
      {
       my $val = hex(substr($line,0,4,''));
       if ($val || !$ch)
        {
         my $uch = chr($val);
         push(@page,$uch);
         $fmuni{$uch} = $ch;
         $count++;
        }
       else
        {
         push(@page,undef);
        }
       $ch++;
      }
    }
   $touni[$page] = \@page;
  }

 return bless {Name  => $name,
               Rep   => $rep,
               ToUni => \@touni,
               FmUni => \%fmuni,
               Def   => $def,
               Num   => $count,
              },$class;
}

sub name { shift->{'Name'} }

sub rep_S { 'C' }

sub rep_D { 'n' }

sub rep_M { ($_[0] > 255) ? 'n' : 'C' }

sub representation
{
 my ($obj,$ch) = @_;
 $ch = 0 unless @_ > 1;
 $obj-{'Rep'}->($ch);
}

sub toUnicode
{
 my ($obj,$str,$chk) = @_;
 my $rep   = $obj->{'Rep'};
 my $touni = $obj->{'ToUni'};
 my $uni   = '';
 while (length($str))
  {
   my $ch = ord(substr($str,0,1,''));
   my $x;
   if (&$rep($ch) eq 'C')
    {
     $x = $touni->[0][$ch];
    }
   else
    {
     $x = $touni->[$ch][ord(substr($str,0,1,''))];
    }
   unless (defined $x)
    {
     last if $chk;
     # What do we do here ?
     $x = '';
    }
   $uni .= $x;
  }
 $_[1] = $str if $chk;
 return $uni;
}

sub fromUnicode
{
 my ($obj,$uni,$chk) = @_;
 my $fmuni = $obj->{'FmUni'};
 my $str   = '';
 my $def   = $obj->{'Def'};
 my $rep   = $obj->{'Rep'};
 while (length($uni))
  {
   my $ch = substr($uni,0,1,'');
   my $x  = $fmuni->{chr(ord($ch))};
   unless (defined $x)
    {
     last if ($chk);
     $x = $def;
    }
   $str .= pack(&$rep($x),$x);
  }
 $_[1] = $uni if $chk;
 return $str;
}

package Encode::iso10646_1;
# Encoding is 16-bit network order Unicode
# Used for X font encodings

sub name { 'iso10646-1' }

sub toUnicode
{
 my ($obj,$str,$chk) = @_;
 my $uni   = '';
 while (length($str))
  {
   my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
   $uni .= chr($code);
  }
 $_[1] = $str if $chk;
 Encode::utf8_upgrade($uni);
 return $uni;
}

sub fromUnicode
{
 my ($obj,$uni,$chk) = @_;
 my $str   = '';
 while (length($uni))
  {
   my $ch = substr($uni,0,1,'');
   my $x  = ord($ch);
   unless ($x < 32768)
    {
     last if ($chk);
     $x = 0;
    }
   $str .= pack('n',$x);
  }
 $_[1] = $uni if $chk;
 return $str;
}


package Encode::Escape;
use Carp;

sub read
{
 my ($class,$fh,$name) = @_;
 my %self = (Name => $name, Num => 0);
 while (<$fh>)
  {
   my ($key,$val) = /^(\S+)\s+(.*)$/;
   $val =~ s/^\{(.*?)\}/$1/g;
   $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
   $self{$key} = $val;
  }
 return bless \%self,$class;
}

sub name { shift->{'Name'} }

sub toUnicode
{
 croak("Not implemented yet");
}

sub fromUnicode
{
 croak("Not implemented yet");
}

1;

__END__
Commit	Line	Data
22d4bb9c	1	package Encode;
	2
	3	$VERSION = 0.01;
	4
	5	require DynaLoader;
	6	require Exporter;
	7
	8	@ISA = qw(Exporter DynaLoader);
	9
	10	@EXPORT_OK =
	11	qw(
	12	bytes_to_utf8
	13	utf8_to_bytes
	14	chars_to_utf8
	15	utf8_to_chars
	16	utf8_to_chars_check
	17	bytes_to_chars
	18	chars_to_bytes
	19	from_to
	20	is_utf8
	21	on_utf8
	22	off_utf8
	23	utf_to_utf
	24	encodings
0e06870b	25	utf8_decode
	26	utf8_encode
	27	utf8_upgrade
	28	utf8_downgrade
22d4bb9c	29	);
	30
	31	bootstrap Encode ();
	32
	33	=pod
	34
	35	=head1 NAME
	36
	37	Encode - character encodings
	38
	39	=head2 TERMINOLOGY
	40
	41	=over
	42
	43	=item *
	44
	45	I<char>: a character in the range 0..maxint (at least 2**32-1)
	46
	47	=item *
	48
	49	I<byte>: a character in the range 0..255
	50
	51	=back
	52
	53	The marker [INTERNAL] marks Internal Implementation Details, in
	54	general meant only for those who think they know what they are doing,
	55	and such details may change in future releases.
	56
	57	=head2 bytes
	58
	59	=over 4
	60
	61	=item *
	62
	63	bytes_to_utf8(STRING [, FROM])
	64
	65	The bytes in STRING are recoded in-place into UTF-8. If no FROM is
	66	specified the bytes are expected to be encoded in US-ASCII or ISO
	67	8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
	68	there's a failure.
	69
	70	[INTERNAL] Also the UTF-8 flag of STRING is turned on.
	71
	72	=item *
	73
	74	utf8_to_bytes(STRING [, TO [, CHECK]])
	75
	76	The UTF-8 in STRING is decoded in-place into bytes. If no TO encoding
	77	is specified the bytes are expected to be encoded in US-ASCII or ISO
	78	8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
	79	there's a failure.
	80
	81	What if there are characters > 255? What if the UTF-8 in STRING is
	82	malformed? See L</"Handling Malformed Data">.
	83
	84	[INTERNAL] The UTF-8 flag of STRING is not checked.
	85
	86	=back
	87
	88	=head2 chars
	89
	90	=over 4
	91
	92	=item *
93
94	chars_to_utf8(STRING)
95
96	The chars in STRING are encoded in-place into UTF-8. Returns the new
97	size of STRING, or C<undef> if there's a failure.
98
99	No assumptions are made on the encoding of the chars. If you want to
100	assume that the chars are Unicode and to trap illegal Unicode
101	characters, you must use C<from_to('Unicode', ...)>.
102
103	[INTERNAL] Also the UTF-8 flag of STRING is turned on.
104
105	=over 4
106
107	=item *
108
109	utf8_to_chars(STRING)
110
111	The UTF-8 in STRING is decoded in-place into chars. Returns the new
112	size of STRING, or C<undef> if there's a failure.
113
114	If the UTF-8 in STRING is malformed C<undef> is returned, and also an
115	optional lexical warning (category utf8) is given.
116
117	[INTERNAL] The UTF-8 flag of STRING is not checked.
118
119	=item *
120
121	utf8_to_chars_check(STRING [, CHECK])
122
123	(Note that special naming of this interface since a two-argument
124	utf8_to_chars() has different semantics.)
125
126	The UTF-8 in STRING is decoded in-place into chars. Returns the new
127	size of STRING, or C<undef> if there is a failure.
128
129	If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
130
131	[INTERNAL] The UTF-8 flag of STRING is not checked.
132
133	=back
134
135	=head2 chars With Encoding
136
137	=over 4
138
139	=item *
140
141	chars_to_utf8(STRING, FROM [, CHECK])
142
143	The chars in STRING encoded in FROM are recoded in-place into UTF-8.
144	Returns the new size of STRING, or C<undef> if there's a failure.
145
146	No assumptions are made on the encoding of the chars. If you want to
147	assume that the chars are Unicode and to trap illegal Unicode
148	characters, you must use C<from_to('Unicode', ...)>.
149
150	[INTERNAL] Also the UTF-8 flag of STRING is turned on.
151
152	=item *
153
154	utf8_to_chars(STRING, TO [, CHECK])
155
156	The UTF-8 in STRING is decoded in-place into chars encoded in TO.
157	Returns the new size of STRING, or C<undef> if there's a failure.
158
159	If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
160
161	[INTERNAL] The UTF-8 flag of STRING is not checked.
162
163	=item *
164
165	bytes_to_chars(STRING, FROM [, CHECK])
166
167	The bytes in STRING encoded in FROM are recoded in-place into chars.
168	Returns the new size of STRING in bytes, or C<undef> if there's a
169	failure.
170
171	If the mapping is impossible? See L</"Handling Malformed Data">.
172
173	=item *
174
175	chars_to_bytes(STRING, TO [, CHECK])
176
177	The chars in STRING are recoded in-place to bytes encoded in TO.
178	Returns the new size of STRING in bytes, or C<undef> if there's a
179	failure.
180
181	If the mapping is impossible? See L</"Handling Malformed Data">.
182
183	=item *
184
185	from_to(STRING, FROM, TO [, CHECK])
186
187	The chars in STRING encoded in FROM are recoded in-place into TO.
188	Returns the new size of STRING, or C<undef> if there's a failure.
189
190	If mapping between the encodings is impossible?
191	See L</"Handling Malformed Data">.
192
193	[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.
194
195	=back
196
197	=head2 Testing For UTF-8
198
199	=over 4
200
201	=item *
202
203	is_utf8(STRING [, CHECK])
204
205	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
206	If CHECK is true, also checks the data in STRING for being
207	well-formed UTF-8. Returns true if successful, false otherwise.
208
209	=back
210
211	=head2 Toggling UTF-8-ness
212
213	=over 4
214
215	=item *
216
217	on_utf8(STRING)
218
219	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
220	B<not> checked for being well-formed UTF-8. Do not use unless you
221	B<know> that the STRING is well-formed UTF-8. Returns the previous
222	state of the UTF-8 flag (so please don't test the return value as
223	I<not> success or failure), or C<undef> if STRING is not a string.
224
225	=item *
226
227	off_utf8(STRING)
228
229	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
230	Returns the previous state of the UTF-8 flag (so please don't test the
231	return value as I<not> success or failure), or C<undef> if STRING is
232	not a string.
233
234	=back
235
236	=head2 UTF-16 and UTF-32 Encodings
237
238	=over 4
239
240	=item *
241
242	utf_to_utf(STRING, FROM, TO [, CHECK])
243
244	The data in STRING is converted from Unicode Transfer Encoding FROM to
245	Unicode Transfer Encoding TO. Both FROM and TO may be any of the
246	following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):
247
248	tag meaning
249
250	'7' UTF-7
251	'8' UTF-8
252	'16be' UTF-16 big-endian
253	'16le' UTF-16 little-endian
254	'16' UTF-16 native-endian
255	'32be' UTF-32 big-endian
256	'32le' UTF-32 little-endian
257	'32' UTF-32 native-endian
258
259	UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
260	UCS-4, 32-bit or 4-byte chunks. Returns the new size of STRING, or
261	C<undef> is there's a failure.
262
263	If FROM is UTF-8 and the UTF-8 in STRING is malformed? See
264	L</"Handling Malformed Data">.
265
266	[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
267	STRING is not checked. If TO is UTF-8, also the UTF-8 flag of STRING is
268	turned on. Identical FROM and TO are fine.
269
270	=back
271
272	=head2 Handling Malformed Data
273
274	If CHECK is not set, C<undef> is returned. If the data is supposed to
275	be UTF-8, an optional lexical warning (category utf8) is given. If
276	CHECK is true but not a code reference, dies. If CHECK is a code
277	reference, it is called with the arguments
278
279	(MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)
280
281	Two return values are expected from the call: the string to be used in
282	the result string in place of the malformed section, and the length of
283	the malformed section in bytes.
284
285	=cut
286
287	sub bytes_to_utf8 {
288	&_bytes_to_utf8;
289	}
290
291	sub utf8_to_bytes {
292	&_utf8_to_bytes;
293	}
294
295	sub chars_to_utf8 {
296	&C_to_utf8;
297	}
298
299	sub utf8_to_chars {
300	&_utf8_to_chars;
301	}
302
303	sub utf8_to_chars_check {
304	&_utf8_to_chars_check;
305	}
306
307	sub bytes_to_chars {
308	&_bytes_to_chars;
309	}
310
311	sub chars_to_bytes {
312	&_chars_to_bytes;
313	}
314
315	sub is_utf8 {
316	&_is_utf8;
317	}
318
319	sub on_utf8 {
320	&_on_utf8;
321	}
322
323	sub off_utf8 {
324	&_off_utf8;
325	}
326
327	sub utf_to_utf {
328	&_utf_to_utf;
329	}
330
331	use Carp;
332
333	sub from_to
334	{
335	my ($string,$from,$to,$check) = @_;
336	my $f = __PACKAGE__->getEncoding($from);
337	croak("Unknown encoding '$from'") unless $f;
338	my $t = __PACKAGE__->getEncoding($to);
339	croak("Unknown encoding '$to'") unless $t;
340	my $uni = $f->toUnicode($string,$check);
341	return undef if ($check && length($string));
342	$string = $t->fromUnicode($uni,$check);
343	return undef if ($check && length($uni));
344	return length($_[0] = $string);
345	}
346
0e06870b	347	# The global hash is declared in XS code
	348	$encoding{Unicode} = bless({},'Encode::Unicode');
	349	$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
22d4bb9c	350
	351	sub encodings
	352	{
	353	my ($class) = @_;
	354	foreach my $dir (@INC)
	355	{
	356	if (opendir(my $dh,"$dir/Encode"))
	357	{
	358	while (defined(my $name = readdir($dh)))
	359	{
	360	if ($name =~ /^(.*)\.enc$/)
	361	{
	362	next if exists $encoding{$1};
	363	$encoding{$1} = "$dir/$name";
	364	}
	365	}
	366	closedir($dh);
	367	}
	368	}
	369	return keys %encoding;
	370	}
	371
	372	sub loadEncoding
	373	{
	374	my ($class,$name,$file) = @_;
	375	if (open(my $fh,$file))
	376	{
	377	my $type;
	378	while (1)
	379	{
	380	my $line = <$fh>;
	381	$type = substr($line,0,1);
	382	last unless $type eq '#';
	383	}
	384	$class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
0e06870b	385	#warn "Loading $file";
22d4bb9c	386	return $class->read($fh,$name,$type);
	387	}
	388	else
	389	{
	390	return undef;
	391	}
	392	}
	393
	394	sub getEncoding
	395	{
	396	my ($class,$name) = @_;
	397	my $enc;
	398	unless (ref($enc = $encoding{$name}))
	399	{
	400	$enc = $class->loadEncoding($name,$enc) if defined $enc;
	401	unless (ref($enc))
	402	{
	403	foreach my $dir (@INC)
	404	{
	405	last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
	406	}
	407	}
	408	$encoding{$name} = $enc;
	409	}
	410	return $enc;
	411	}
	412
	413	package Encode::Unicode;
	414
0e06870b	415	# Dummy package that provides the encode interface but leaves data
0e06870b	416	# as UTF-8 encoded. It is here so that from_to() works.
22d4bb9c	417
	418	sub name { 'Unicode' }
	419
0e06870b	420	sub toUnicode
	421	{
	422	my ($obj,$str,$chk) = @_;
	423	Encode::utf8_upgrade($str);
	424	$_[1] = '' if $chk;
	425	return $str;
	426	}
22d4bb9c	427
0e06870b	428	*fromUnicode = \&toUnicode;
22d4bb9c	429
	430	package Encode::Table;
	431
	432	sub read
	433	{
	434	my ($class,$fh,$name,$type) = @_;
	435	my $rep = $class->can("rep_$type");
	436	my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
	437	my @touni;
	438	my %fmuni;
	439	my $count = 0;
	440	$def = hex($def);
	441	while ($pages--)
	442	{
	443	my $line = <$fh>;
	444	chomp($line);
	445	my $page = hex($line);
	446	my @page;
	447	my $ch = $page * 256;
	448	for (my $i = 0; $i < 16; $i++)
	449	{
	450	my $line = <$fh>;
	451	for (my $j = 0; $j < 16; $j++)
	452	{
	453	my $val = hex(substr($line,0,4,''));
	454	if ($val \|\| !$ch)
	455	{
	456	my $uch = chr($val);
	457	push(@page,$uch);
	458	$fmuni{$uch} = $ch;
	459	$count++;
	460	}
	461	else
	462	{
	463	push(@page,undef);
	464	}
	465	$ch++;
	466	}
	467	}
	468	$touni[$page] = \@page;
	469	}
	470
	471	return bless {Name => $name,
	472	Rep => $rep,
	473	ToUni => \@touni,
	474	FmUni => \%fmuni,
	475	Def => $def,
	476	Num => $count,
	477	},$class;
	478	}
	479
	480	sub name { shift->{'Name'} }
	481
	482	sub rep_S { 'C' }
	483
	484	sub rep_D { 'n' }
	485
	486	sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
	487
	488	sub representation
	489	{
	490	my ($obj,$ch) = @_;
	491	$ch = 0 unless @_ > 1;
	492	$obj-{'Rep'}->($ch);
493	}
494
495	sub toUnicode
496	{
497	my ($obj,$str,$chk) = @_;
498	my $rep = $obj->{'Rep'};
499	my $touni = $obj->{'ToUni'};
500	my $uni = '';
501	while (length($str))
502	{
503	my $ch = ord(substr($str,0,1,''));
504	my $x;
505	if (&$rep($ch) eq 'C')
506	{
507	$x = $touni->[0][$ch];
508	}
509	else
510	{
511	$x = $touni->[$ch][ord(substr($str,0,1,''))];
512	}
513	unless (defined $x)
514	{
515	last if $chk;
516	# What do we do here ?
517	$x = '';
518	}
519	$uni .= $x;
520	}
521	$_[1] = $str if $chk;
522	return $uni;
523	}
524
525	sub fromUnicode
526	{
527	my ($obj,$uni,$chk) = @_;
528	my $fmuni = $obj->{'FmUni'};
529	my $str = '';
530	my $def = $obj->{'Def'};
531	my $rep = $obj->{'Rep'};
532	while (length($uni))
533	{
534	my $ch = substr($uni,0,1,'');
535	my $x = $fmuni->{chr(ord($ch))};
536	unless (defined $x)
537	{
538	last if ($chk);
539	$x = $def;
540	}
541	$str .= pack(&$rep($x),$x);
542	}
543	$_[1] = $uni if $chk;
544	return $str;
545	}
546
0e06870b	547	package Encode::iso10646_1;
	548	# Encoding is 16-bit network order Unicode
	549	# Used for X font encodings
22d4bb9c	550
	551	sub name { 'iso10646-1' }
	552
	553	sub toUnicode
	554	{
	555	my ($obj,$str,$chk) = @_;
	556	my $uni = '';
	557	while (length($str))
	558	{
	559	my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
	560	$uni .= chr($code);
	561	}
	562	$_[1] = $str if $chk;
0e06870b	563	Encode::utf8_upgrade($uni);
22d4bb9c	564	return $uni;
	565	}
	566
	567	sub fromUnicode
	568	{
	569	my ($obj,$uni,$chk) = @_;
	570	my $str = '';
	571	while (length($uni))
	572	{
	573	my $ch = substr($uni,0,1,'');
	574	my $x = ord($ch);
	575	unless ($x < 32768)
	576	{
	577	last if ($chk);
	578	$x = 0;
	579	}
	580	$str .= pack('n',$x);
	581	}
	582	$_[1] = $uni if $chk;
	583	return $str;
	584	}
	585
0e06870b	586
22d4bb9c	587	package Encode::Escape;
	588	use Carp;
	589
	590	sub read
	591	{
	592	my ($class,$fh,$name) = @_;
	593	my %self = (Name => $name, Num => 0);
	594	while (<$fh>)
	595	{
	596	my ($key,$val) = /^(\S+)\s+(.*)$/;
	597	$val =~ s/^\{(.*?)\}/$1/g;
	598	$val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
	599	$self{$key} = $val;
	600	}
	601	return bless \%self,$class;
	602	}
	603
	604	sub name { shift->{'Name'} }
	605
	606	sub toUnicode
	607	{
	608	croak("Not implemented yet");
	609	}
	610
	611	sub fromUnicode
	612	{
	613	croak("Not implemented yet");
	614	}
	615
	616	1;
	617
	618	__END__