Upgrade to Encode 1.56, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Unicode / Unicode.pm
CommitLineData
f2a2953c 1package Encode::Unicode;
2
df1df145 3use strict;
f2a2953c 4use warnings;
5
0ab8f81e 6our $VERSION = do { my @r = (q$Revision: 1.34 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
f2a2953c 7
85982a32 8use XSLoader;
9XSLoader::load(__PACKAGE__,$VERSION);
df1df145 10
f2a2953c 11#
12# Object Generator 8 transcoders all at once!
13#
df1df145 14
f2a2953c 15require Encode;
16for my $name (qw(UTF-16 UTF-16BE UTF-16LE
17 UTF-32 UTF-32BE UTF-32LE
18 UCS-2BE UCS-2LE))
df1df145 19{
f2a2953c 20 my ($size, $endian, $ucs2, $mask);
21 $name =~ /^(\w+)-(\d+)(\w*)$/o;
22 if ($ucs2 = ($1 eq 'UCS')){
23 $size = 2;
24 }else{
25 $size = $2/8;
df1df145 26 }
f2a2953c 27 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
28 $size == 4 and $endian = uc($endian);
29
30 $Encode::Encoding{$name} =
31 bless {
32 Name => $name,
33 size => $size,
34 endian => $endian,
35 ucs2 => $ucs2,
c731e18e 36 } => __PACKAGE__;
f2a2953c 37
df1df145 38}
39
f2a2953c 40sub name { shift->{'Name'} }
aae85ceb 41sub new_sequence
42{
43 my $self = shift;
44 # Return the original if endian known
45 return $self if ($self->{endian});
46 # Return a clone
47 return bless {%$self},ref($self);
48}
49
0ab8f81e 50sub needs_lines { 0 };
51
52sub perlio_ok {
53 exists $INC{"PerlIO/encoding.pm"} or return 0;
54 return 1;
55}
56
f2a2953c 57
58#
0ab8f81e 59# three implementations of (en|de)code exist. The XS version is the
60# fastest. *_modern uses an array and *_classic sticks with substr.
61# *_classic is much slower but more memory conservative.
62# *_xs is the default.
f2a2953c 63
64sub set_transcoder{
65 no warnings qw(redefine);
66 my $type = shift;
aae85ceb 67 if ($type eq "xs"){
68 *decode = \&decode_xs;
69 *encode = \&encode_xs;
70 }elsif($type eq "modern"){
f2a2953c 71 *decode = \&decode_modern;
72 *encode = \&encode_modern;
73 }elsif($type eq "classic"){
74 *decode = \&decode_classic;
75 *encode = \&encode_classic;
76 }else{
fcb875d4 77 require Carp;
aae85ceb 78 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
f2a2953c 79 }
80}
81
aae85ceb 82set_transcoder("xs");
f2a2953c 83
84#
85982a32 85# Aux. subs & constants
86#
87
88sub FBCHAR(){ 0xFFFd }
89sub BOM_BE(){ 0xFeFF }
90sub BOM16LE(){ 0xFFFe }
91sub BOM32LE(){ 0xFFFe0000 }
92
93sub valid_ucs2($){
94 return
95 (0 <= $_[0] && $_[0] < 0xD800)
96 || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
97}
98
99sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
100sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
101sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
102
103sub ensurrogate($){
104 use integer; # we have divisions
105 my $uni = shift;
106 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
107 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
108 return ($hi, $lo);
109}
110
111sub desurrogate($$){
112 my ($hi, $lo) = @_;
113 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
114}
115
116sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
117
118#
f2a2953c 119# *_modern are much faster but guzzle more memory
120#
121
aae85ceb 122sub decode_modern($$;$)
df1df145 123{
f2a2953c 124 my ($obj, $str, $chk ) = @_;
125 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
126
127 # warn "$size, $endian, $ucs2";
128 $endian ||= BOMB($size, substr($str, 0, $size, ''))
129 or poisoned2death($obj, "Where's the BOM?");
130 my $mask = Mask->{$size};
131 my $utf8 = '';
132 my @ord = unpack("$endian*", $str);
133 undef $str; # to conserve memory
134 while (@ord){
135 my $ord = shift @ord;
136 unless ($size == 4 or valid_ucs2($ord &= $mask)){
137 if ($ucs2){
fcb875d4 138 $chk and
f2a2953c 139 poisoned2death($obj, "no surrogates allowed", $ord);
140 shift @ord; # skip the next one as well
141 $ord = FBCHAR;
142 }else{
143 unless (isHiSurrogate($ord)){
144 poisoned2death($obj, "Malformed HI surrogate", $ord);
145 }
146 my $lo = shift @ord;
147 unless (isLoSurrogate($lo &= $mask)){
148 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
149 }
150 $ord = desurrogate($ord, $lo);
151 }
152 }
153 $utf8 .= chr($ord);
df1df145 154 }
f2a2953c 155 utf8::upgrade($utf8);
156 return $utf8;
157}
158
aae85ceb 159sub encode_modern($$;$)
f2a2953c 160{
161 my ($obj, $utf8, $chk) = @_;
162 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
163 my @str = ();
164 unless ($endian){
165 $endian = ($size == 4) ? 'N' : 'n';
166 push @str, BOM_BE;
167 }
168 my @ord = unpack("U*", $utf8);
169 undef $utf8; # to conserve memory
170 for my $ord (@ord){
171 unless ($size == 4 or valid_ucs2($ord)) {
172 unless(issurrogate($ord)){
173 if ($ucs2){
fcb875d4 174 $chk and
f2a2953c 175 poisoned2death($obj, "code point too high", $ord);
176
177 push @str, FBCHAR;
178 }else{
fcb875d4 179
f2a2953c 180 push @str, ensurrogate($ord);
181 }
182 }else{ # not supposed to happen
183 push @str, FBCHAR;
184 }
185 }else{
186 push @str, $ord;
187 }
188 }
189 return pack("$endian*", @str);
190}
191
192#
193# *_classic are slower but more memory conservative
194#
195
aae85ceb 196sub decode_classic($$;$)
f2a2953c 197{
198 my ($obj, $str, $chk ) = @_;
199 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
200
201 # warn "$size, $endian, $ucs2";
202 $endian ||= BOMB($size, substr($str, 0, $size, ''))
203 or poisoned2death($obj, "Where's the BOM?");
204 my $mask = Mask->{$size};
205 my $utf8 = '';
206 my @ord = unpack("$endian*", $str);
207 while (length($str)){
208 my $ord = unpack($endian, substr($str, 0, $size, ''));
209 unless ($size == 4 or valid_ucs2($ord &= $mask)){
210 if ($ucs2){
fcb875d4 211 $chk and
f2a2953c 212 poisoned2death($obj, "no surrogates allowed", $ord);
213 substr($str,0,$size,''); # skip the next one as well
214 $ord = FBCHAR;
215 }else{
216 unless (isHiSurrogate($ord)){
217 poisoned2death($obj, "Malformed HI surrogate", $ord);
218 }
219 my $lo = unpack($endian ,substr($str,0,$size,''));
220 unless (isLoSurrogate($lo &= $mask)){
221 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
222 }
223 $ord = desurrogate($ord, $lo);
224 }
225 }
226 $utf8 .= chr($ord);
227 }
228 utf8::upgrade($utf8);
229 return $utf8;
230}
231
aae85ceb 232sub encode_classic($$;$)
f2a2953c 233{
234 my ($obj, $utf8, $chk) = @_;
235 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
236 # warn join ", ", $size, $ucs2, $endian, $mask;
237 my $str = '';
238 unless ($endian){
239 $endian = ($size == 4) ? 'N' : 'n';
240 $str .= pack($endian, BOM_BE);
241 }
242 while (length($utf8)){
243 my $ord = ord(substr($utf8,0,1,''));
244 unless ($size == 4 or valid_ucs2($ord)) {
245 unless(issurrogate($ord)){
246 if ($ucs2){
fcb875d4 247 $chk and
f2a2953c 248 poisoned2death($obj, "code point too high", $ord);
249 $str .= pack($endian, FBCHAR);
250 }else{
251 $str .= pack($endian.2, ensurrogate($ord));
252 }
253 }else{ # not supposed to happen
254 $str .= pack($endian, FBCHAR);
255 }
256 }else{
257 $str .= pack($endian, $ord);
258 }
259 }
260 return $str;
261}
262
263sub BOMB {
264 my ($size, $bom) = @_;
265 my $N = $size == 2 ? 'n' : 'N';
266 my $ord = unpack($N, $bom);
fcb875d4 267 return ($ord eq BOM_BE) ? $N :
f2a2953c 268 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
269}
270
271sub poisoned2death{
272 my $obj = shift;
273 my $msg = shift;
274 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
275 require Carp;
276 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
df1df145 277}
278
2791;
280__END__
67d7b5ef 281
282=head1 NAME
283
0ab8f81e 284Encode::Unicode -- Various Unicode Transformation Formats
67d7b5ef 285
286=cut
f2a2953c 287
288=head1 SYNOPSIS
289
fcb875d4 290 use Encode qw/encode decode/;
f2a2953c 291 $ucs2 = encode("UCS-2BE", $utf8);
292 $utf8 = decode("UCS-2BE", $ucs2);
293
294=head1 ABSTRACT
295
296This module implements all Character Encoding Schemes of Unicode that
297are officially documented by Unicode Consortium (except, of course,
298for UTF-8, which is a native format in perl).
299
300=over 4
301
302=item L<http://www.unicode.org/glossary/> says:
303
304I<Character Encoding Scheme> A character encoding form plus byte
305serialization. There are seven character encoding schemes in Unicode:
306UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
307
308=item Quick Reference
309
310 Decodes from ord(N) Encodes chr(N) to...
311 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
312 ---------------+-----------------+------------------------------
313 UCS-2BE 2 N N is bogus Not Available
314 UCS-2LE 2 N N bogus Not Available
315 UTF-16 2/4 Y Y is S.P S.P BE/LE
316 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
317 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
318 UTF-32 4 Y - is bogus As is BE/LE
0ab8f81e 319 UTF-32BE 4 N - bogus As is 0x0001abcd
320 UTF-32LE 4 N - bogus As is 0xcdab0100
f2a2953c 321 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
322 ---------------+-----------------+------------------------------
323
324=back
325
326=head1 Size, Endianness, and BOM
327
0ab8f81e 328You can categorize these CES by 3 criteria: size of each character,
329endianness, and Byte Order Mark.
f2a2953c 330
0ab8f81e 331=head2 by size
f2a2953c 332
333UCS-2 is a fixed-length encoding with each character taking 16 bits.
0ab8f81e 334It B<does not> support I<surrogate pairs>. When a surrogate pair
335is encountered during decode(), its place is filled with \x{FFFD}
336if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a
337character whose ord value is larger than 0xFFFF is encountered,
338its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
339croaks if I<CHECK> is 1.
340
341UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
f2a2953c 342When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
0ab8f81e 343following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
344form a character. Bogus surrogates result in death. When \x{10000}
345or above is encountered during encode(), it C<ensurrogate>s them and
346pushes the surrogate pair to the output stream.
f2a2953c 347
348UTF-32 is a fixed-length encoding with each character taking 32 bits.
0ab8f81e 349Since it is 32-bit, there is no need for I<surrogate pairs>.
f2a2953c 350
0ab8f81e 351=head2 by endianness
f2a2953c 352
0ab8f81e 353The first (and now failed) goal of Unicode was to map all character
354repertoires into a fixed-length integer so that programmers are happy.
355Since each character is either a I<short> or I<long> in C, you have to
356pay attention to the endianness of each platform when you pass data
357to one another.
f2a2953c 358
359Anything marked as BE is Big Endian (or network byte order) and LE is
0ab8f81e 360Little Endian (aka VAX byte order). For anything not marked either
361BE or LE, a character called Byte Order Mark (BOM) indicating the
362endianness is prepended to the string.
f2a2953c 363
364=over 4
365
fcb875d4 366=item BOM as integer when fetched in network byte order
f2a2953c 367
fcb875d4 368 16 32 bits/char
369 -------------------------
370 BE 0xFeFF 0x0000FeFF
371 LE 0xFFeF 0xFFFe0000
372 -------------------------
f2a2953c 373
374=back
fcb875d4 375
0ab8f81e 376This modules handles the BOM as follows.
f2a2953c 377
378=over 4
379
380=item *
381
382When BE or LE is explicitly stated as the name of encoding, BOM is
0ab8f81e 383simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
f2a2953c 384
385=item *
386
0ab8f81e 387When BE or LE is omitted during decode(), it checks if BOM is at the
388beginning of the string; if one is found, the endianness is set to
389what the BOM says. If no BOM is found, the routine dies.
f2a2953c 390
391=item *
392
393When BE or LE is omitted during encode(), it returns a BE-encoded
394string with BOM prepended. So when you want to encode a whole text
0ab8f81e 395file, make sure you encode() the whole text at once, not line by line
396or each line, not file, will have a BOM prepended.
f2a2953c 397
398=item *
399
0ab8f81e 400C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE.
f2a2953c 401UCS-2 is already registered by IANA and others that way.
402
fdd579e2 403=back
f2a2953c 404
fcb875d4 405=head1 Surrogate Pairs
f2a2953c 406
fcb875d4 407To say the least, surrogate pairs were the biggest mistake of the
408Unicode Consortium. But according to the late Douglas Adams in I<The
409Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
410Universe was created. This has made a lot of people very angry and
411been widely regarded as a bad move>. Their mistake was not of this
412magnitude so let's forgive them.
f2a2953c 413
414(I don't dare make any comparison with Unicode Consortium and the
c731e18e 415Vogons here ;) Or, comparing Encode to Babel Fish is completely
416appropriate -- if you can only stick this into your ear :)
f2a2953c 417
0ab8f81e 418Surrogate pairs were born when the Unicode Consortium finally
fcb875d4 419admitted that 16 bits were not big enough to hold all the world's
0ab8f81e 420character repertoires. But they already made UCS-2 16-bit. What
f2a2953c 421do we do?
422
0ab8f81e 423Back then, the range 0xD800-0xDFFF was not allocated. Let's split
424that range in half and use the first half to represent the C<upper
425half of a character> and the second half to represent the C<lower
426half of a character>. That way, you can represent 1024 * 1024 =
4271048576 more characters. Now we can store character ranges up to
428\x{10ffff} even with 16-bit encodings. This pair of half-character is
429now called a I<surrogate pair> and UTF-16 is the name of the encoding
430that embraces them.
f2a2953c 431
448e90bb 432Here is a formula to ensurrogate a Unicode character \x{10000} and
f2a2953c 433above;
434
435 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
436 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
437
438And to desurrogate;
439
440 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
441
fcb875d4 442Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
443perl does not prohibit the use of characters within this range. To perl,
444every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
445
446 (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
0ab8f81e 447 integer support!
f2a2953c 448
449=head1 SEE ALSO
450
fdd579e2 451L<Encode>, L<http://www.unicode.org/glossary/>,
f2a2953c 452
fdd579e2 453RFC 2781 L<http://rfc.net/rfc2781.html>,
454
455L<http://www.unicode.org/unicode/faq/utf_bom.html>
456
fcb875d4 457Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
458by Larry Wall, Tom Christiansen, Jon Orwant;
459O'Reilly & Associates; ISBN 0-596-00027-8
460
fdd579e2 461=cut