Once again syncing after too long an absence
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.pm
CommitLineData
22d4bb9c 1package Encode;
2
3$VERSION = 0.01;
4
5require DynaLoader;
6require Exporter;
7
8@ISA = qw(Exporter DynaLoader);
9
10@EXPORT_OK =
11 qw(
12 bytes_to_utf8
13 utf8_to_bytes
14 chars_to_utf8
15 utf8_to_chars
16 utf8_to_chars_check
17 bytes_to_chars
18 chars_to_bytes
19 from_to
20 is_utf8
21 on_utf8
22 off_utf8
23 utf_to_utf
24 encodings
0e06870b 25 utf8_decode
26 utf8_encode
27 utf8_upgrade
28 utf8_downgrade
22d4bb9c 29 );
30
31bootstrap Encode ();
32
33=pod
34
35=head1 NAME
36
37Encode - character encodings
38
39=head2 TERMINOLOGY
40
41=over
42
43=item *
44
45I<char>: a character in the range 0..maxint (at least 2**32-1)
46
47=item *
48
49I<byte>: a character in the range 0..255
50
51=back
52
53The marker [INTERNAL] marks Internal Implementation Details, in
54general meant only for those who think they know what they are doing,
55and such details may change in future releases.
56
57=head2 bytes
58
59=over 4
60
61=item *
62
63 bytes_to_utf8(STRING [, FROM])
64
65The bytes in STRING are recoded in-place into UTF-8. If no FROM is
66specified the bytes are expected to be encoded in US-ASCII or ISO
678859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
68there's a failure.
69
70[INTERNAL] Also the UTF-8 flag of STRING is turned on.
71
72=item *
73
74 utf8_to_bytes(STRING [, TO [, CHECK]])
75
76The UTF-8 in STRING is decoded in-place into bytes. If no TO encoding
77is specified the bytes are expected to be encoded in US-ASCII or ISO
788859-1 (Latin 1). Returns the new size of STRING, or C<undef> if
79there's a failure.
80
81What if there are characters > 255? What if the UTF-8 in STRING is
82malformed? See L</"Handling Malformed Data">.
83
84[INTERNAL] The UTF-8 flag of STRING is not checked.
85
86=back
87
88=head2 chars
89
90=over 4
91
92=item *
93
94 chars_to_utf8(STRING)
95
96The chars in STRING are encoded in-place into UTF-8. Returns the new
97size of STRING, or C<undef> if there's a failure.
98
99No assumptions are made on the encoding of the chars. If you want to
100assume that the chars are Unicode and to trap illegal Unicode
101characters, you must use C<from_to('Unicode', ...)>.
102
103[INTERNAL] Also the UTF-8 flag of STRING is turned on.
104
105=over 4
106
107=item *
108
109 utf8_to_chars(STRING)
110
111The UTF-8 in STRING is decoded in-place into chars. Returns the new
112size of STRING, or C<undef> if there's a failure.
113
114If the UTF-8 in STRING is malformed C<undef> is returned, and also an
115optional lexical warning (category utf8) is given.
116
117[INTERNAL] The UTF-8 flag of STRING is not checked.
118
119=item *
120
121 utf8_to_chars_check(STRING [, CHECK])
122
123(Note that special naming of this interface since a two-argument
124utf8_to_chars() has different semantics.)
125
126The UTF-8 in STRING is decoded in-place into chars. Returns the new
127size of STRING, or C<undef> if there is a failure.
128
129If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
130
131[INTERNAL] The UTF-8 flag of STRING is not checked.
132
133=back
134
135=head2 chars With Encoding
136
137=over 4
138
139=item *
140
141 chars_to_utf8(STRING, FROM [, CHECK])
142
143The chars in STRING encoded in FROM are recoded in-place into UTF-8.
144Returns the new size of STRING, or C<undef> if there's a failure.
145
146No assumptions are made on the encoding of the chars. If you want to
147assume that the chars are Unicode and to trap illegal Unicode
148characters, you must use C<from_to('Unicode', ...)>.
149
150[INTERNAL] Also the UTF-8 flag of STRING is turned on.
151
152=item *
153
154 utf8_to_chars(STRING, TO [, CHECK])
155
156The UTF-8 in STRING is decoded in-place into chars encoded in TO.
157Returns the new size of STRING, or C<undef> if there's a failure.
158
159If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">.
160
161[INTERNAL] The UTF-8 flag of STRING is not checked.
162
163=item *
164
165 bytes_to_chars(STRING, FROM [, CHECK])
166
167The bytes in STRING encoded in FROM are recoded in-place into chars.
168Returns the new size of STRING in bytes, or C<undef> if there's a
169failure.
170
171If the mapping is impossible? See L</"Handling Malformed Data">.
172
173=item *
174
175 chars_to_bytes(STRING, TO [, CHECK])
176
177The chars in STRING are recoded in-place to bytes encoded in TO.
178Returns the new size of STRING in bytes, or C<undef> if there's a
179failure.
180
181If the mapping is impossible? See L</"Handling Malformed Data">.
182
183=item *
184
185 from_to(STRING, FROM, TO [, CHECK])
186
187The chars in STRING encoded in FROM are recoded in-place into TO.
188Returns the new size of STRING, or C<undef> if there's a failure.
189
190If mapping between the encodings is impossible?
191See L</"Handling Malformed Data">.
192
193[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on.
194
195=back
196
197=head2 Testing For UTF-8
198
199=over 4
200
201=item *
202
203 is_utf8(STRING [, CHECK])
204
205[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
206If CHECK is true, also checks the data in STRING for being
207well-formed UTF-8. Returns true if successful, false otherwise.
208
209=back
210
211=head2 Toggling UTF-8-ness
212
213=over 4
214
215=item *
216
217 on_utf8(STRING)
218
219[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
220B<not> checked for being well-formed UTF-8. Do not use unless you
221B<know> that the STRING is well-formed UTF-8. Returns the previous
222state of the UTF-8 flag (so please don't test the return value as
223I<not> success or failure), or C<undef> if STRING is not a string.
224
225=item *
226
227 off_utf8(STRING)
228
229[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
230Returns the previous state of the UTF-8 flag (so please don't test the
231return value as I<not> success or failure), or C<undef> if STRING is
232not a string.
233
234=back
235
236=head2 UTF-16 and UTF-32 Encodings
237
238=over 4
239
240=item *
241
242 utf_to_utf(STRING, FROM, TO [, CHECK])
243
244The data in STRING is converted from Unicode Transfer Encoding FROM to
245Unicode Transfer Encoding TO. Both FROM and TO may be any of the
246following tags (case-insensitive, with or without 'utf' or 'utf-' prefix):
247
248 tag meaning
249
250 '7' UTF-7
251 '8' UTF-8
252 '16be' UTF-16 big-endian
253 '16le' UTF-16 little-endian
254 '16' UTF-16 native-endian
255 '32be' UTF-32 big-endian
256 '32le' UTF-32 little-endian
257 '32' UTF-32 native-endian
258
259UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as
260UCS-4, 32-bit or 4-byte chunks. Returns the new size of STRING, or
261C<undef> is there's a failure.
262
263If FROM is UTF-8 and the UTF-8 in STRING is malformed? See
264L</"Handling Malformed Data">.
265
266[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of
267STRING is not checked. If TO is UTF-8, also the UTF-8 flag of STRING is
268turned on. Identical FROM and TO are fine.
269
270=back
271
272=head2 Handling Malformed Data
273
274If CHECK is not set, C<undef> is returned. If the data is supposed to
275be UTF-8, an optional lexical warning (category utf8) is given. If
276CHECK is true but not a code reference, dies. If CHECK is a code
277reference, it is called with the arguments
278
279 (MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR)
280
281Two return values are expected from the call: the string to be used in
282the result string in place of the malformed section, and the length of
283the malformed section in bytes.
284
285=cut
286
287sub bytes_to_utf8 {
288 &_bytes_to_utf8;
289}
290
291sub utf8_to_bytes {
292 &_utf8_to_bytes;
293}
294
295sub chars_to_utf8 {
296 &C_to_utf8;
297}
298
299sub utf8_to_chars {
300 &_utf8_to_chars;
301}
302
303sub utf8_to_chars_check {
304 &_utf8_to_chars_check;
305}
306
307sub bytes_to_chars {
308 &_bytes_to_chars;
309}
310
311sub chars_to_bytes {
312 &_chars_to_bytes;
313}
314
315sub is_utf8 {
316 &_is_utf8;
317}
318
319sub on_utf8 {
320 &_on_utf8;
321}
322
323sub off_utf8 {
324 &_off_utf8;
325}
326
327sub utf_to_utf {
328 &_utf_to_utf;
329}
330
331use Carp;
332
333sub from_to
334{
335 my ($string,$from,$to,$check) = @_;
336 my $f = __PACKAGE__->getEncoding($from);
337 croak("Unknown encoding '$from'") unless $f;
338 my $t = __PACKAGE__->getEncoding($to);
339 croak("Unknown encoding '$to'") unless $t;
340 my $uni = $f->toUnicode($string,$check);
341 return undef if ($check && length($string));
342 $string = $t->fromUnicode($uni,$check);
343 return undef if ($check && length($uni));
344 return length($_[0] = $string);
345}
346
0e06870b 347# The global hash is declared in XS code
348$encoding{Unicode} = bless({},'Encode::Unicode');
349$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
22d4bb9c 350
351sub encodings
352{
353 my ($class) = @_;
354 foreach my $dir (@INC)
355 {
356 if (opendir(my $dh,"$dir/Encode"))
357 {
358 while (defined(my $name = readdir($dh)))
359 {
360 if ($name =~ /^(.*)\.enc$/)
361 {
362 next if exists $encoding{$1};
363 $encoding{$1} = "$dir/$name";
364 }
365 }
366 closedir($dh);
367 }
368 }
369 return keys %encoding;
370}
371
372sub loadEncoding
373{
374 my ($class,$name,$file) = @_;
375 if (open(my $fh,$file))
376 {
377 my $type;
378 while (1)
379 {
380 my $line = <$fh>;
381 $type = substr($line,0,1);
382 last unless $type eq '#';
383 }
384 $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
0e06870b 385 #warn "Loading $file";
22d4bb9c 386 return $class->read($fh,$name,$type);
387 }
388 else
389 {
390 return undef;
391 }
392}
393
394sub getEncoding
395{
396 my ($class,$name) = @_;
397 my $enc;
398 unless (ref($enc = $encoding{$name}))
399 {
400 $enc = $class->loadEncoding($name,$enc) if defined $enc;
401 unless (ref($enc))
402 {
403 foreach my $dir (@INC)
404 {
405 last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
406 }
407 }
408 $encoding{$name} = $enc;
409 }
410 return $enc;
411}
412
413package Encode::Unicode;
414
0e06870b 415# Dummy package that provides the encode interface but leaves data
416# as UTF-8 encoded. It is here so that from_to() works.
22d4bb9c 417
418sub name { 'Unicode' }
419
0e06870b 420sub toUnicode
421{
422 my ($obj,$str,$chk) = @_;
423 Encode::utf8_upgrade($str);
424 $_[1] = '' if $chk;
425 return $str;
426}
22d4bb9c 427
0e06870b 428*fromUnicode = \&toUnicode;
22d4bb9c 429
430package Encode::Table;
431
432sub read
433{
434 my ($class,$fh,$name,$type) = @_;
435 my $rep = $class->can("rep_$type");
436 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
437 my @touni;
438 my %fmuni;
439 my $count = 0;
440 $def = hex($def);
441 while ($pages--)
442 {
443 my $line = <$fh>;
444 chomp($line);
445 my $page = hex($line);
446 my @page;
447 my $ch = $page * 256;
448 for (my $i = 0; $i < 16; $i++)
449 {
450 my $line = <$fh>;
451 for (my $j = 0; $j < 16; $j++)
452 {
453 my $val = hex(substr($line,0,4,''));
454 if ($val || !$ch)
455 {
456 my $uch = chr($val);
457 push(@page,$uch);
458 $fmuni{$uch} = $ch;
459 $count++;
460 }
461 else
462 {
463 push(@page,undef);
464 }
465 $ch++;
466 }
467 }
468 $touni[$page] = \@page;
469 }
470
471 return bless {Name => $name,
472 Rep => $rep,
473 ToUni => \@touni,
474 FmUni => \%fmuni,
475 Def => $def,
476 Num => $count,
477 },$class;
478}
479
480sub name { shift->{'Name'} }
481
482sub rep_S { 'C' }
483
484sub rep_D { 'n' }
485
486sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
487
488sub representation
489{
490 my ($obj,$ch) = @_;
491 $ch = 0 unless @_ > 1;
492 $obj-{'Rep'}->($ch);
493}
494
495sub toUnicode
496{
497 my ($obj,$str,$chk) = @_;
498 my $rep = $obj->{'Rep'};
499 my $touni = $obj->{'ToUni'};
500 my $uni = '';
501 while (length($str))
502 {
503 my $ch = ord(substr($str,0,1,''));
504 my $x;
505 if (&$rep($ch) eq 'C')
506 {
507 $x = $touni->[0][$ch];
508 }
509 else
510 {
511 $x = $touni->[$ch][ord(substr($str,0,1,''))];
512 }
513 unless (defined $x)
514 {
515 last if $chk;
516 # What do we do here ?
517 $x = '';
518 }
519 $uni .= $x;
520 }
521 $_[1] = $str if $chk;
522 return $uni;
523}
524
525sub fromUnicode
526{
527 my ($obj,$uni,$chk) = @_;
528 my $fmuni = $obj->{'FmUni'};
529 my $str = '';
530 my $def = $obj->{'Def'};
531 my $rep = $obj->{'Rep'};
532 while (length($uni))
533 {
534 my $ch = substr($uni,0,1,'');
535 my $x = $fmuni->{chr(ord($ch))};
536 unless (defined $x)
537 {
538 last if ($chk);
539 $x = $def;
540 }
541 $str .= pack(&$rep($x),$x);
542 }
543 $_[1] = $uni if $chk;
544 return $str;
545}
546
0e06870b 547package Encode::iso10646_1;
548# Encoding is 16-bit network order Unicode
549# Used for X font encodings
22d4bb9c 550
551sub name { 'iso10646-1' }
552
553sub toUnicode
554{
555 my ($obj,$str,$chk) = @_;
556 my $uni = '';
557 while (length($str))
558 {
559 my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
560 $uni .= chr($code);
561 }
562 $_[1] = $str if $chk;
0e06870b 563 Encode::utf8_upgrade($uni);
22d4bb9c 564 return $uni;
565}
566
567sub fromUnicode
568{
569 my ($obj,$uni,$chk) = @_;
570 my $str = '';
571 while (length($uni))
572 {
573 my $ch = substr($uni,0,1,'');
574 my $x = ord($ch);
575 unless ($x < 32768)
576 {
577 last if ($chk);
578 $x = 0;
579 }
580 $str .= pack('n',$x);
581 }
582 $_[1] = $uni if $chk;
583 return $str;
584}
585
0e06870b 586
22d4bb9c 587package Encode::Escape;
588use Carp;
589
590sub read
591{
592 my ($class,$fh,$name) = @_;
593 my %self = (Name => $name, Num => 0);
594 while (<$fh>)
595 {
596 my ($key,$val) = /^(\S+)\s+(.*)$/;
597 $val =~ s/^\{(.*?)\}/$1/g;
598 $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
599 $self{$key} = $val;
600 }
601 return bless \%self,$class;
602}
603
604sub name { shift->{'Name'} }
605
606sub toUnicode
607{
608 croak("Not implemented yet");
609}
610
611sub fromUnicode
612{
613 croak("Not implemented yet");
614}
615
6161;
617
618__END__