Start of support of ICU-style .ucm files:
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
2
3$VERSION = 0.01;
4
5require DynaLoader;
6require Exporter;
7
8@ISA = qw(Exporter DynaLoader);
9
10@EXPORT_OK =
11 qw(
12 bytes_to_utf8
13 utf8_to_bytes
14 chars_to_utf8
15 utf8_to_chars
16 utf8_to_chars_check
17 bytes_to_chars
18 chars_to_bytes
19 from_to
20 is_utf8
21 on_utf8
22 off_utf8
23 utf_to_utf
656753f8 24 encodings
2c674647 25 );
26
27bootstrap Encode ();
28
29=pod
30
31=head1 NAME
32
33Encode - character encodings
34
35=head2 TERMINOLOGY
36
37=over
38
39=item *
40
41I<char>: a character in the range 0..maxint (at least 2**32-1)
42
43=item *
44
45I<byte>: a character in the range 0..255
46
47=back
48
49The marker [INTERNAL] marks Internal Implementation Details, in
50general meant only for those who think they know what they are doing,
51and such details may change in future releases.
52
53=head2 bytes
54
55=over 4
56
57=item *
58
59 bytes_to_utf8(STRING [, FROM])
60
61The bytes in STRING are recoded in-place into UTF-8. If no FROM is
62specified the bytes are expected to be encoded in US-ASCII or ISO
638859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
64there's a failure.
65
656753f8 66[INTERNAL] Also the UTF-8 flag of STRING is turned on.
2c674647 67
68=item *
69
70 utf8_to_bytes(STRING [, TO [, CHECK]])
71
72The UTF-8 in STRING is decoded in-place into bytes. If no TO encoding
73is specified the bytes are expected to be encoded in US-ASCII or ISO
748859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
75there's a failure.
76
77What if there are characters > 255? What if the UTF-8 in STRING is
78malformed? See L</"Handling Malformed Data">.
79
80[INTERNAL] The UTF-8 flag of STRING is not checked.
81
82=back
83
84=head2 chars
85
86=over 4
87
88=item *
89
90 chars_to_utf8(STRING)
91
92The chars in STRING are encoded in-place into UTF-8. Returns the new
93size of STRING, or C<undef> if there's a failure.
94
95No assumptions are made on the encoding of the chars. If you want to
96assume that the chars are Unicode and to trap illegal Unicode
97characters, you must use C<from_to('Unicode', ...)>.
98
99[INTERNAL] Also the UTF-8 flag of STRING is turned on.
100
101=over 4
102
103=item *
104
105 utf8_to_chars(STRING)
106
107The UTF-8 in STRING is decoded in-place into chars. Returns the new
656753f8 108size of STRING, or C<undef> if there's a failure.
2c674647 109
110If the UTF-8 in STRING is malformed C<undef> is returned, and also an
111optional lexical warning (category utf8) is given.
112
113[INTERNAL] The UTF-8 flag of STRING is not checked.
114
115=item *
116
117 utf8_to_chars_check(STRING [, CHECK])
118
119(Note that special naming of this interface since a two-argument
120utf8_to_chars() has different semantics.)
121
122The UTF-8 in STRING is decoded in-place into chars. Returns the new
123size of STRING, or C<undef> if there is a failure.
124
125If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
126
127[INTERNAL] The UTF-8 flag of STRING is not checked.
128
129=back
130
131=head2 chars With Encoding
132
133=over 4
134
135=item *
136
137 chars_to_utf8(STRING, FROM [, CHECK])
138
139The chars in STRING encoded in FROM are recoded in-place into UTF-8.
140Returns the new size of STRING, or C<undef> if there's a failure.
141
142No assumptions are made on the encoding of the chars. If you want to
143assume that the chars are Unicode and to trap illegal Unicode
144characters, you must use C<from_to('Unicode', ...)>.
145
146[INTERNAL] Also the UTF-8 flag of STRING is turned on.
147
148=item *
149
150 utf8_to_chars(STRING, TO [, CHECK])
151
152The UTF-8 in STRING is decoded in-place into chars encoded in TO.
153Returns the new size of STRING, or C<undef> if there's a failure.
154
155If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
156
157[INTERNAL] The UTF-8 flag of STRING is not checked.
158
159=item *
160
161 bytes_to_chars(STRING, FROM [, CHECK])
162
163The bytes in STRING encoded in FROM are recoded in-place into chars.
164Returns the new size of STRING in bytes, or C<undef> if there's a
165failure.
166
167If the mapping is impossible? See L</"Handling Malformed Data">.
168
169=item *
170
171 chars_to_bytes(STRING, TO [, CHECK])
172
173The chars in STRING are recoded in-place to bytes encoded in TO.
174Returns the new size of STRING in bytes, or C<undef> if there's a
175failure.
176
177If the mapping is impossible? See L</"Handling Malformed Data">.
178
179=item *
180
181 from_to(STRING, FROM, TO [, CHECK])
182
183The chars in STRING encoded in FROM are recoded in-place into TO.
184Returns the new size of STRING, or C<undef> if there's a failure.
185
186If mapping between the encodings is impossible?
187See L</"Handling Malformed Data">.
188
189[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.
190
191=back
192
193=head2 Testing For UTF-8
194
195=over 4
196
197=item *
198
199 is_utf8(STRING [, CHECK])
200
201[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
202If CHECK is true, also checks the data in STRING for being
203well-formed UTF-8. Returns true if successful, false otherwise.
204
205=back
206
207=head2 Toggling UTF-8-ness
208
209=over 4
210
211=item *
212
213 on_utf8(STRING)
214
215[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
216B<not> checked for being well-formed UTF-8. Do not use unless you
217B<know> that the STRING is well-formed UTF-8. Returns the previous
218state of the UTF-8 flag (so please don't test the return value as
219I<not> success or failure), or C<undef> if STRING is not a string.
220
221=item *
222
223 off_utf8(STRING)
224
225[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
226Returns the previous state of the UTF-8 flag (so please don't test the
227return value as I<not> success or failure), or C<undef> if STRING is
228not a string.
229
230=back
231
232=head2 UTF-16 and UTF-32 Encodings
233
234=over 4
235
236=item *
237
238 utf_to_utf(STRING, FROM, TO [, CHECK])
239
240The data in STRING is converted from Unicode Transfer Encoding FROM to
241Unicode Transfer Encoding TO. Both FROM and TO may be any of the
242following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):
243
244 tag meaning
245
246 '7' UTF-7
247 '8' UTF-8
248 '16be' UTF-16 big-endian
249 '16le' UTF-16 little-endian
250 '16' UTF-16 native-endian
251 '32be' UTF-32 big-endian
252 '32le' UTF-32 little-endian
253 '32' UTF-32 native-endian
254
255UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
256UCS-4, 32-bit or 4-byte chunks. Returns the new size of STRING, or
257C<undef> is there's a failure.
258
259If FROM is UTF-8 and the UTF-8 in STRING is malformed? See
260L</"Handling Malformed Data">.
261
262[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
263STRING is not checked. If TO is UTF-8, also the UTF-8 flag of STRING is
264turned on. Identical FROM and TO are fine.
265
266=back
267
268=head2 Handling Malformed Data
269
270If CHECK is not set, C<undef> is returned. If the data is supposed to
271be UTF-8, an optional lexical warning (category utf8) is given. If
272CHECK is true but not a code reference, dies. If CHECK is a code
273reference, it is called with the arguments
274
275 (MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)
276
277Two return values are expected from the call: the string to be used in
278the result string in place of the malformed section, and the length of
279the malformed section in bytes.
280
281=cut
282
283sub bytes_to_utf8 {
284 &_bytes_to_utf8;
285}
286
287sub utf8_to_bytes {
288 &_utf8_to_bytes;
289}
290
291sub chars_to_utf8 {
292 &C_to_utf8;
293}
294
295sub utf8_to_chars {
296 &_utf8_to_chars;
297}
298
299sub utf8_to_chars_check {
300 &_utf8_to_chars_check;
301}
302
303sub bytes_to_chars {
304 &_bytes_to_chars;
305}
306
307sub chars_to_bytes {
308 &_chars_to_bytes;
309}
310
2c674647 311sub is_utf8 {
312 &_is_utf8;
313}
314
315sub on_utf8 {
316 &_on_utf8;
317}
318
319sub off_utf8 {
320 &_off_utf8;
321}
322
323sub utf_to_utf {
324 &_utf_to_utf;
325}
326
bf230f3d 327use Carp;
328
656753f8 329sub from_to
330{
331 my ($string,$from,$to,$check) = @_;
332 my $f = __PACKAGE__->getEncoding($from);
bf230f3d 333 croak("Unknown encoding '$from'") unless $f;
656753f8 334 my $t = __PACKAGE__->getEncoding($to);
bf230f3d 335 croak("Unknown encoding '$to'") unless $t;
656753f8 336 my $uni = $f->toUnicode($string,$check);
bf230f3d 337 return undef if ($check && length($string));
656753f8 338 $string = $t->fromUnicode($uni,$check);
bf230f3d 339 return undef if ($check && length($uni));
656753f8 340 return length($_[0] = $string);
341}
342
2f2b4ff2 343# The global hash is declared in XS code
344$encoding{Unicode} = bless({},'Encode::Unicode');
9b37254d 345$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
5345d506 346
656753f8 347sub encodings
348{
349 my ($class) = @_;
5345d506 350 foreach my $dir (@INC)
656753f8 351 {
5345d506 352 if (opendir(my $dh,"$dir/Encode"))
656753f8 353 {
5345d506 354 while (defined(my $name = readdir($dh)))
355 {
356 if ($name =~ /^(.*)\.enc$/)
357 {
358 next if exists $encoding{$1};
359 $encoding{$1} = "$dir/$name";
360 }
361 }
362 closedir($dh);
656753f8 363 }
5345d506 364 }
365 return keys %encoding;
366}
367
368sub loadEncoding
369{
370 my ($class,$name,$file) = @_;
371 if (open(my $fh,$file))
372 {
373 my $type;
374 while (1)
375 {
376 my $line = <$fh>;
377 $type = substr($line,0,1);
378 last unless $type eq '#';
379 }
380 $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
2f2b4ff2 381 warn "Loading $file";
5345d506 382 return $class->read($fh,$name,$type);
656753f8 383 }
384 else
385 {
5345d506 386 return undef;
656753f8 387 }
656753f8 388}
389
656753f8 390sub getEncoding
391{
392 my ($class,$name) = @_;
5345d506 393 my $enc;
394 unless (ref($enc = $encoding{$name}))
656753f8 395 {
5345d506 396 $enc = $class->loadEncoding($name,$enc) if defined $enc;
397 unless (ref($enc))
656753f8 398 {
5345d506 399 foreach my $dir (@INC)
656753f8 400 {
5345d506 401 last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
656753f8 402 }
87714904 403 }
5345d506 404 $encoding{$name} = $enc;
656753f8 405 }
5345d506 406 return $enc;
656753f8 407}
408
409package Encode::Unicode;
410
9b37254d 411# Dummy package that provides the encode interface but leaves data
412# as UTF-8 encoded. It is here so that from_to()
656753f8 413
414sub name { 'Unicode' }
415
416sub toUnicode { $_[1] }
417
418sub fromUnicode { $_[1] }
419
420package Encode::Table;
421
422sub read
423{
424 my ($class,$fh,$name,$type) = @_;
425 my $rep = $class->can("rep_$type");
426 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
427 my @touni;
428 my %fmuni;
429 my $count = 0;
430 $def = hex($def);
656753f8 431 while ($pages--)
432 {
87714904 433 my $line = <$fh>;
434 chomp($line);
435 my $page = hex($line);
656753f8 436 my @page;
437 my $ch = $page * 256;
438 for (my $i = 0; $i < 16; $i++)
439 {
440 my $line = <$fh>;
441 for (my $j = 0; $j < 16; $j++)
442 {
443 my $val = hex(substr($line,0,4,''));
444 if ($val || !$ch)
445 {
446 my $uch = chr($val);
447 push(@page,$uch);
87714904 448 $fmuni{$uch} = $ch;
656753f8 449 $count++;
450 }
451 else
452 {
453 push(@page,undef);
454 }
455 $ch++;
456 }
457 }
458 $touni[$page] = \@page;
459 }
460
461 return bless {Name => $name,
462 Rep => $rep,
463 ToUni => \@touni,
464 FmUni => \%fmuni,
465 Def => $def,
466 Num => $count,
467 },$class;
468}
469
470sub name { shift->{'Name'} }
471
472sub rep_S { 'C' }
473
5dcbab34 474sub rep_D { 'n' }
656753f8 475
5dcbab34 476sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
656753f8 477
478sub representation
479{
480 my ($obj,$ch) = @_;
481 $ch = 0 unless @_ > 1;
482 $obj-{'Rep'}->($ch);
483}
484
485sub toUnicode
486{
bf230f3d 487 my ($obj,$str,$chk) = @_;
656753f8 488 my $rep = $obj->{'Rep'};
489 my $touni = $obj->{'ToUni'};
490 my $uni = '';
491 while (length($str))
492 {
493 my $ch = ord(substr($str,0,1,''));
bf230f3d 494 my $x;
656753f8 495 if (&$rep($ch) eq 'C')
496 {
bf230f3d 497 $x = $touni->[0][$ch];
656753f8 498 }
499 else
500 {
bf230f3d 501 $x = $touni->[$ch][ord(substr($str,0,1,''))];
656753f8 502 }
bf230f3d 503 unless (defined $x)
504 {
505 last if $chk;
506 # What do we do here ?
507 $x = '';
508 }
509 $uni .= $x;
656753f8 510 }
bf230f3d 511 $_[1] = $str if $chk;
656753f8 512 return $uni;
513}
514
515sub fromUnicode
516{
bf230f3d 517 my ($obj,$uni,$chk) = @_;
656753f8 518 my $fmuni = $obj->{'FmUni'};
519 my $str = '';
520 my $def = $obj->{'Def'};
87714904 521 my $rep = $obj->{'Rep'};
656753f8 522 while (length($uni))
523 {
524 my $ch = substr($uni,0,1,'');
63eec5db 525 my $x = $fmuni->{chr(ord($ch))};
bf230f3d 526 unless (defined $x)
527 {
528 last if ($chk);
529 $x = $def;
530 }
87714904 531 $str .= pack(&$rep($x),$x);
532 }
533 $_[1] = $uni if $chk;
534 return $str;
535}
536
9b37254d 537package Encode::iso10646_1;
538# Encoding is 16-bit network order Unicode
539# Used for X font encodings
87714904 540
541sub name { 'iso10646-1' }
542
543sub toUnicode
544{
545 my ($obj,$str,$chk) = @_;
546 my $uni = '';
547 while (length($str))
548 {
5dcbab34 549 my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
87714904 550 $uni .= chr($code);
551 }
552 $_[1] = $str if $chk;
553 return $uni;
554}
555
556sub fromUnicode
557{
558 my ($obj,$uni,$chk) = @_;
559 my $str = '';
560 while (length($uni))
561 {
562 my $ch = substr($uni,0,1,'');
563 my $x = ord($ch);
564 unless ($x < 32768)
565 {
566 last if ($chk);
567 $x = 0;
568 }
5dcbab34 569 $str .= pack('n',$x);
656753f8 570 }
bf230f3d 571 $_[1] = $uni if $chk;
656753f8 572 return $str;
573}
574
2f2b4ff2 575
656753f8 576package Encode::Escape;
577use Carp;
578
579sub read
580{
581 my ($class,$fh,$name) = @_;
582 my %self = (Name => $name, Num => 0);
583 while (<$fh>)
584 {
585 my ($key,$val) = /^(\S+)\s+(.*)$/;
586 $val =~ s/^\{(.*?)\}/$1/g;
587 $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
588 $self{$key} = $val;
589 }
590 return bless \%self,$class;
591}
592
593sub name { shift->{'Name'} }
594
595sub toUnicode
596{
597 croak("Not implemented yet");
598}
599
600sub fromUnicode
601{
602 croak("Not implemented yet");
603}
604
6051;
606
607__END__