Upgrade to Encode 1.61, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Unicode / Unicode.pm
CommitLineData
f2a2953c 1package Encode::Unicode;
2
df1df145 3use strict;
f2a2953c 4use warnings;
5
011b2d2f 6our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
f2a2953c 7
85982a32 8use XSLoader;
9XSLoader::load(__PACKAGE__,$VERSION);
df1df145 10
f2a2953c 11#
12# Object Generator 8 transcoders all at once!
13#
df1df145 14
f2a2953c 15require Encode;
16for my $name (qw(UTF-16 UTF-16BE UTF-16LE
17 UTF-32 UTF-32BE UTF-32LE
18 UCS-2BE UCS-2LE))
df1df145 19{
f2a2953c 20 my ($size, $endian, $ucs2, $mask);
21 $name =~ /^(\w+)-(\d+)(\w*)$/o;
22 if ($ucs2 = ($1 eq 'UCS')){
23 $size = 2;
24 }else{
25 $size = $2/8;
df1df145 26 }
f2a2953c 27 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
28 $size == 4 and $endian = uc($endian);
29
30 $Encode::Encoding{$name} =
31 bless {
32 Name => $name,
33 size => $size,
34 endian => $endian,
35 ucs2 => $ucs2,
c731e18e 36 } => __PACKAGE__;
f2a2953c 37
df1df145 38}
39
f2a2953c 40sub name { shift->{'Name'} }
aae85ceb 41sub new_sequence
42{
43 my $self = shift;
44 # Return the original if endian known
45 return $self if ($self->{endian});
46 # Return a clone
47 return bless {%$self},ref($self);
48}
49
0ab8f81e 50sub needs_lines { 0 };
51
52sub perlio_ok {
011b2d2f 53 eval{ require PerlIO::encoding };
54 if ($@){
55 return 0;
56 }else{
57 return 1;
58 }
0ab8f81e 59}
60
f2a2953c 61
62#
0ab8f81e 63# three implementations of (en|de)code exist. The XS version is the
64# fastest. *_modern uses an array and *_classic sticks with substr.
65# *_classic is much slower but more memory conservative.
66# *_xs is the default.
f2a2953c 67
68sub set_transcoder{
69 no warnings qw(redefine);
70 my $type = shift;
aae85ceb 71 if ($type eq "xs"){
72 *decode = \&decode_xs;
73 *encode = \&encode_xs;
74 }elsif($type eq "modern"){
f2a2953c 75 *decode = \&decode_modern;
76 *encode = \&encode_modern;
77 }elsif($type eq "classic"){
78 *decode = \&decode_classic;
79 *encode = \&encode_classic;
80 }else{
fcb875d4 81 require Carp;
aae85ceb 82 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
f2a2953c 83 }
84}
85
aae85ceb 86set_transcoder("xs");
f2a2953c 87
88#
85982a32 89# Aux. subs & constants
90#
91
92sub FBCHAR(){ 0xFFFd }
93sub BOM_BE(){ 0xFeFF }
94sub BOM16LE(){ 0xFFFe }
95sub BOM32LE(){ 0xFFFe0000 }
96
97sub valid_ucs2($){
98 return
99 (0 <= $_[0] && $_[0] < 0xD800)
100 || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
101}
102
103sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
104sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
105sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
106
107sub ensurrogate($){
108 use integer; # we have divisions
109 my $uni = shift;
110 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
111 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
112 return ($hi, $lo);
113}
114
115sub desurrogate($$){
116 my ($hi, $lo) = @_;
117 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
118}
119
120sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
121
122#
f2a2953c 123# *_modern are much faster but guzzle more memory
124#
125
aae85ceb 126sub decode_modern($$;$)
df1df145 127{
f2a2953c 128 my ($obj, $str, $chk ) = @_;
129 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
130
131 # warn "$size, $endian, $ucs2";
132 $endian ||= BOMB($size, substr($str, 0, $size, ''))
133 or poisoned2death($obj, "Where's the BOM?");
134 my $mask = Mask->{$size};
135 my $utf8 = '';
136 my @ord = unpack("$endian*", $str);
137 undef $str; # to conserve memory
138 while (@ord){
139 my $ord = shift @ord;
140 unless ($size == 4 or valid_ucs2($ord &= $mask)){
141 if ($ucs2){
fcb875d4 142 $chk and
f2a2953c 143 poisoned2death($obj, "no surrogates allowed", $ord);
144 shift @ord; # skip the next one as well
145 $ord = FBCHAR;
146 }else{
147 unless (isHiSurrogate($ord)){
148 poisoned2death($obj, "Malformed HI surrogate", $ord);
149 }
150 my $lo = shift @ord;
151 unless (isLoSurrogate($lo &= $mask)){
152 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
153 }
154 $ord = desurrogate($ord, $lo);
155 }
156 }
157 $utf8 .= chr($ord);
df1df145 158 }
f2a2953c 159 utf8::upgrade($utf8);
160 return $utf8;
161}
162
aae85ceb 163sub encode_modern($$;$)
f2a2953c 164{
165 my ($obj, $utf8, $chk) = @_;
166 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
167 my @str = ();
168 unless ($endian){
169 $endian = ($size == 4) ? 'N' : 'n';
170 push @str, BOM_BE;
171 }
172 my @ord = unpack("U*", $utf8);
173 undef $utf8; # to conserve memory
174 for my $ord (@ord){
175 unless ($size == 4 or valid_ucs2($ord)) {
176 unless(issurrogate($ord)){
177 if ($ucs2){
fcb875d4 178 $chk and
f2a2953c 179 poisoned2death($obj, "code point too high", $ord);
180
181 push @str, FBCHAR;
182 }else{
fcb875d4 183
f2a2953c 184 push @str, ensurrogate($ord);
185 }
186 }else{ # not supposed to happen
187 push @str, FBCHAR;
188 }
189 }else{
190 push @str, $ord;
191 }
192 }
193 return pack("$endian*", @str);
194}
195
196#
197# *_classic are slower but more memory conservative
198#
199
aae85ceb 200sub decode_classic($$;$)
f2a2953c 201{
202 my ($obj, $str, $chk ) = @_;
203 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
204
205 # warn "$size, $endian, $ucs2";
206 $endian ||= BOMB($size, substr($str, 0, $size, ''))
207 or poisoned2death($obj, "Where's the BOM?");
208 my $mask = Mask->{$size};
209 my $utf8 = '';
210 my @ord = unpack("$endian*", $str);
211 while (length($str)){
212 my $ord = unpack($endian, substr($str, 0, $size, ''));
213 unless ($size == 4 or valid_ucs2($ord &= $mask)){
214 if ($ucs2){
fcb875d4 215 $chk and
f2a2953c 216 poisoned2death($obj, "no surrogates allowed", $ord);
217 substr($str,0,$size,''); # skip the next one as well
218 $ord = FBCHAR;
219 }else{
220 unless (isHiSurrogate($ord)){
221 poisoned2death($obj, "Malformed HI surrogate", $ord);
222 }
223 my $lo = unpack($endian ,substr($str,0,$size,''));
224 unless (isLoSurrogate($lo &= $mask)){
225 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
226 }
227 $ord = desurrogate($ord, $lo);
228 }
229 }
230 $utf8 .= chr($ord);
231 }
232 utf8::upgrade($utf8);
233 return $utf8;
234}
235
aae85ceb 236sub encode_classic($$;$)
f2a2953c 237{
238 my ($obj, $utf8, $chk) = @_;
239 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
240 # warn join ", ", $size, $ucs2, $endian, $mask;
241 my $str = '';
242 unless ($endian){
243 $endian = ($size == 4) ? 'N' : 'n';
244 $str .= pack($endian, BOM_BE);
245 }
246 while (length($utf8)){
247 my $ord = ord(substr($utf8,0,1,''));
248 unless ($size == 4 or valid_ucs2($ord)) {
249 unless(issurrogate($ord)){
250 if ($ucs2){
fcb875d4 251 $chk and
f2a2953c 252 poisoned2death($obj, "code point too high", $ord);
253 $str .= pack($endian, FBCHAR);
254 }else{
255 $str .= pack($endian.2, ensurrogate($ord));
256 }
257 }else{ # not supposed to happen
258 $str .= pack($endian, FBCHAR);
259 }
260 }else{
261 $str .= pack($endian, $ord);
262 }
263 }
264 return $str;
265}
266
267sub BOMB {
268 my ($size, $bom) = @_;
269 my $N = $size == 2 ? 'n' : 'N';
270 my $ord = unpack($N, $bom);
fcb875d4 271 return ($ord eq BOM_BE) ? $N :
f2a2953c 272 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
273}
274
275sub poisoned2death{
276 my $obj = shift;
277 my $msg = shift;
278 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
279 require Carp;
280 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
df1df145 281}
282
2831;
284__END__
67d7b5ef 285
286=head1 NAME
287
0ab8f81e 288Encode::Unicode -- Various Unicode Transformation Formats
67d7b5ef 289
290=cut
f2a2953c 291
292=head1 SYNOPSIS
293
fcb875d4 294 use Encode qw/encode decode/;
f2a2953c 295 $ucs2 = encode("UCS-2BE", $utf8);
296 $utf8 = decode("UCS-2BE", $ucs2);
297
298=head1 ABSTRACT
299
300This module implements all Character Encoding Schemes of Unicode that
301are officially documented by Unicode Consortium (except, of course,
302for UTF-8, which is a native format in perl).
303
304=over 4
305
306=item L<http://www.unicode.org/glossary/> says:
307
308I<Character Encoding Scheme> A character encoding form plus byte
309serialization. There are seven character encoding schemes in Unicode:
310UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
311
312=item Quick Reference
313
314 Decodes from ord(N) Encodes chr(N) to...
315 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
316 ---------------+-----------------+------------------------------
317 UCS-2BE 2 N N is bogus Not Available
318 UCS-2LE 2 N N bogus Not Available
319 UTF-16 2/4 Y Y is S.P S.P BE/LE
320 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
321 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
322 UTF-32 4 Y - is bogus As is BE/LE
0ab8f81e 323 UTF-32BE 4 N - bogus As is 0x0001abcd
324 UTF-32LE 4 N - bogus As is 0xcdab0100
f2a2953c 325 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
326 ---------------+-----------------+------------------------------
327
328=back
329
330=head1 Size, Endianness, and BOM
331
0ab8f81e 332You can categorize these CES by 3 criteria: size of each character,
333endianness, and Byte Order Mark.
f2a2953c 334
0ab8f81e 335=head2 by size
f2a2953c 336
337UCS-2 is a fixed-length encoding with each character taking 16 bits.
0ab8f81e 338It B<does not> support I<surrogate pairs>. When a surrogate pair
339is encountered during decode(), its place is filled with \x{FFFD}
340if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a
341character whose ord value is larger than 0xFFFF is encountered,
342its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
343croaks if I<CHECK> is 1.
344
345UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
f2a2953c 346When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
0ab8f81e 347following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
348form a character. Bogus surrogates result in death. When \x{10000}
349or above is encountered during encode(), it C<ensurrogate>s them and
350pushes the surrogate pair to the output stream.
f2a2953c 351
352UTF-32 is a fixed-length encoding with each character taking 32 bits.
0ab8f81e 353Since it is 32-bit, there is no need for I<surrogate pairs>.
f2a2953c 354
0ab8f81e 355=head2 by endianness
f2a2953c 356
0ab8f81e 357The first (and now failed) goal of Unicode was to map all character
358repertoires into a fixed-length integer so that programmers are happy.
359Since each character is either a I<short> or I<long> in C, you have to
360pay attention to the endianness of each platform when you pass data
361to one another.
f2a2953c 362
363Anything marked as BE is Big Endian (or network byte order) and LE is
0ab8f81e 364Little Endian (aka VAX byte order). For anything not marked either
365BE or LE, a character called Byte Order Mark (BOM) indicating the
366endianness is prepended to the string.
f2a2953c 367
368=over 4
369
fcb875d4 370=item BOM as integer when fetched in network byte order
f2a2953c 371
fcb875d4 372 16 32 bits/char
373 -------------------------
374 BE 0xFeFF 0x0000FeFF
375 LE 0xFFeF 0xFFFe0000
376 -------------------------
f2a2953c 377
378=back
fcb875d4 379
0ab8f81e 380This modules handles the BOM as follows.
f2a2953c 381
382=over 4
383
384=item *
385
386When BE or LE is explicitly stated as the name of encoding, BOM is
0ab8f81e 387simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
f2a2953c 388
389=item *
390
0ab8f81e 391When BE or LE is omitted during decode(), it checks if BOM is at the
392beginning of the string; if one is found, the endianness is set to
393what the BOM says. If no BOM is found, the routine dies.
f2a2953c 394
395=item *
396
397When BE or LE is omitted during encode(), it returns a BE-encoded
398string with BOM prepended. So when you want to encode a whole text
0ab8f81e 399file, make sure you encode() the whole text at once, not line by line
400or each line, not file, will have a BOM prepended.
f2a2953c 401
402=item *
403
0ab8f81e 404C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE.
f2a2953c 405UCS-2 is already registered by IANA and others that way.
406
fdd579e2 407=back
f2a2953c 408
fcb875d4 409=head1 Surrogate Pairs
f2a2953c 410
fcb875d4 411To say the least, surrogate pairs were the biggest mistake of the
412Unicode Consortium. But according to the late Douglas Adams in I<The
413Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
414Universe was created. This has made a lot of people very angry and
415been widely regarded as a bad move>. Their mistake was not of this
416magnitude so let's forgive them.
f2a2953c 417
418(I don't dare make any comparison with Unicode Consortium and the
c731e18e 419Vogons here ;) Or, comparing Encode to Babel Fish is completely
420appropriate -- if you can only stick this into your ear :)
f2a2953c 421
0ab8f81e 422Surrogate pairs were born when the Unicode Consortium finally
fcb875d4 423admitted that 16 bits were not big enough to hold all the world's
0ab8f81e 424character repertoires. But they already made UCS-2 16-bit. What
f2a2953c 425do we do?
426
0ab8f81e 427Back then, the range 0xD800-0xDFFF was not allocated. Let's split
428that range in half and use the first half to represent the C<upper
429half of a character> and the second half to represent the C<lower
430half of a character>. That way, you can represent 1024 * 1024 =
4311048576 more characters. Now we can store character ranges up to
432\x{10ffff} even with 16-bit encodings. This pair of half-character is
433now called a I<surrogate pair> and UTF-16 is the name of the encoding
434that embraces them.
f2a2953c 435
448e90bb 436Here is a formula to ensurrogate a Unicode character \x{10000} and
f2a2953c 437above;
438
439 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
440 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
441
442And to desurrogate;
443
444 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
445
fcb875d4 446Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
447perl does not prohibit the use of characters within this range. To perl,
448every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
449
450 (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
0ab8f81e 451 integer support!
f2a2953c 452
453=head1 SEE ALSO
454
fdd579e2 455L<Encode>, L<http://www.unicode.org/glossary/>,
f2a2953c 456
fdd579e2 457RFC 2781 L<http://rfc.net/rfc2781.html>,
458
459L<http://www.unicode.org/unicode/faq/utf_bom.html>
460
fcb875d4 461Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
462by Larry Wall, Tom Christiansen, Jon Orwant;
463O'Reilly & Associates; ISBN 0-596-00027-8
464
fdd579e2 465=cut