Various tweaks to Encode
[p5sagit/p5-mst-13.2.git] / ext / Encode / Unicode / Unicode.pm
CommitLineData
f2a2953c 1package Encode::Unicode;
2
df1df145 3use strict;
f2a2953c 4use warnings;
5
85982a32 6our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
f2a2953c 7
85982a32 8use XSLoader;
9XSLoader::load(__PACKAGE__,$VERSION);
df1df145 10
f2a2953c 11#
12# Object Generator 8 transcoders all at once!
13#
df1df145 14
f2a2953c 15require Encode;
16for my $name (qw(UTF-16 UTF-16BE UTF-16LE
17 UTF-32 UTF-32BE UTF-32LE
18 UCS-2BE UCS-2LE))
df1df145 19{
f2a2953c 20 my ($size, $endian, $ucs2, $mask);
21 $name =~ /^(\w+)-(\d+)(\w*)$/o;
22 if ($ucs2 = ($1 eq 'UCS')){
23 $size = 2;
24 }else{
25 $size = $2/8;
df1df145 26 }
f2a2953c 27 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
28 $size == 4 and $endian = uc($endian);
29
30 $Encode::Encoding{$name} =
31 bless {
32 Name => $name,
33 size => $size,
34 endian => $endian,
35 ucs2 => $ucs2,
c731e18e 36 } => __PACKAGE__;
f2a2953c 37
df1df145 38}
39
f2a2953c 40sub name { shift->{'Name'} }
aae85ceb 41sub new_sequence
42{
43 my $self = shift;
44 # Return the original if endian known
45 return $self if ($self->{endian});
46 # Return a clone
47 return bless {%$self},ref($self);
48}
49
f2a2953c 50
51#
aae85ceb 52# three implementation of (en|de)code exist. XS version is the fastest.
53# *_modern use # an array and *_classic stick with substr. *_classic is
54# much slower but more memory conservative. *_xs is default.
f2a2953c 55
56sub set_transcoder{
57 no warnings qw(redefine);
58 my $type = shift;
aae85ceb 59 if ($type eq "xs"){
60 *decode = \&decode_xs;
61 *encode = \&encode_xs;
62 }elsif($type eq "modern"){
f2a2953c 63 *decode = \&decode_modern;
64 *encode = \&encode_modern;
65 }elsif($type eq "classic"){
66 *decode = \&decode_classic;
67 *encode = \&encode_classic;
68 }else{
fcb875d4 69 require Carp;
aae85ceb 70 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
f2a2953c 71 }
72}
73
aae85ceb 74set_transcoder("xs");
f2a2953c 75
76#
85982a32 77# Aux. subs & constants
78#
79
80sub FBCHAR(){ 0xFFFd }
81sub BOM_BE(){ 0xFeFF }
82sub BOM16LE(){ 0xFFFe }
83sub BOM32LE(){ 0xFFFe0000 }
84
85sub valid_ucs2($){
86 return
87 (0 <= $_[0] && $_[0] < 0xD800)
88 || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
89}
90
91sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
92sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
93sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
94
95sub ensurrogate($){
96 use integer; # we have divisions
97 my $uni = shift;
98 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
99 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
100 return ($hi, $lo);
101}
102
103sub desurrogate($$){
104 my ($hi, $lo) = @_;
105 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
106}
107
108sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
109
110#
f2a2953c 111# *_modern are much faster but guzzle more memory
112#
113
aae85ceb 114sub decode_modern($$;$)
df1df145 115{
f2a2953c 116 my ($obj, $str, $chk ) = @_;
117 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
118
119 # warn "$size, $endian, $ucs2";
120 $endian ||= BOMB($size, substr($str, 0, $size, ''))
121 or poisoned2death($obj, "Where's the BOM?");
122 my $mask = Mask->{$size};
123 my $utf8 = '';
124 my @ord = unpack("$endian*", $str);
125 undef $str; # to conserve memory
126 while (@ord){
127 my $ord = shift @ord;
128 unless ($size == 4 or valid_ucs2($ord &= $mask)){
129 if ($ucs2){
fcb875d4 130 $chk and
f2a2953c 131 poisoned2death($obj, "no surrogates allowed", $ord);
132 shift @ord; # skip the next one as well
133 $ord = FBCHAR;
134 }else{
135 unless (isHiSurrogate($ord)){
136 poisoned2death($obj, "Malformed HI surrogate", $ord);
137 }
138 my $lo = shift @ord;
139 unless (isLoSurrogate($lo &= $mask)){
140 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
141 }
142 $ord = desurrogate($ord, $lo);
143 }
144 }
145 $utf8 .= chr($ord);
df1df145 146 }
f2a2953c 147 utf8::upgrade($utf8);
148 return $utf8;
149}
150
aae85ceb 151sub encode_modern($$;$)
f2a2953c 152{
153 my ($obj, $utf8, $chk) = @_;
154 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
155 my @str = ();
156 unless ($endian){
157 $endian = ($size == 4) ? 'N' : 'n';
158 push @str, BOM_BE;
159 }
160 my @ord = unpack("U*", $utf8);
161 undef $utf8; # to conserve memory
162 for my $ord (@ord){
163 unless ($size == 4 or valid_ucs2($ord)) {
164 unless(issurrogate($ord)){
165 if ($ucs2){
fcb875d4 166 $chk and
f2a2953c 167 poisoned2death($obj, "code point too high", $ord);
168
169 push @str, FBCHAR;
170 }else{
fcb875d4 171
f2a2953c 172 push @str, ensurrogate($ord);
173 }
174 }else{ # not supposed to happen
175 push @str, FBCHAR;
176 }
177 }else{
178 push @str, $ord;
179 }
180 }
181 return pack("$endian*", @str);
182}
183
184#
185# *_classic are slower but more memory conservative
186#
187
aae85ceb 188sub decode_classic($$;$)
f2a2953c 189{
190 my ($obj, $str, $chk ) = @_;
191 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
192
193 # warn "$size, $endian, $ucs2";
194 $endian ||= BOMB($size, substr($str, 0, $size, ''))
195 or poisoned2death($obj, "Where's the BOM?");
196 my $mask = Mask->{$size};
197 my $utf8 = '';
198 my @ord = unpack("$endian*", $str);
199 while (length($str)){
200 my $ord = unpack($endian, substr($str, 0, $size, ''));
201 unless ($size == 4 or valid_ucs2($ord &= $mask)){
202 if ($ucs2){
fcb875d4 203 $chk and
f2a2953c 204 poisoned2death($obj, "no surrogates allowed", $ord);
205 substr($str,0,$size,''); # skip the next one as well
206 $ord = FBCHAR;
207 }else{
208 unless (isHiSurrogate($ord)){
209 poisoned2death($obj, "Malformed HI surrogate", $ord);
210 }
211 my $lo = unpack($endian ,substr($str,0,$size,''));
212 unless (isLoSurrogate($lo &= $mask)){
213 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
214 }
215 $ord = desurrogate($ord, $lo);
216 }
217 }
218 $utf8 .= chr($ord);
219 }
220 utf8::upgrade($utf8);
221 return $utf8;
222}
223
aae85ceb 224sub encode_classic($$;$)
f2a2953c 225{
226 my ($obj, $utf8, $chk) = @_;
227 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
228 # warn join ", ", $size, $ucs2, $endian, $mask;
229 my $str = '';
230 unless ($endian){
231 $endian = ($size == 4) ? 'N' : 'n';
232 $str .= pack($endian, BOM_BE);
233 }
234 while (length($utf8)){
235 my $ord = ord(substr($utf8,0,1,''));
236 unless ($size == 4 or valid_ucs2($ord)) {
237 unless(issurrogate($ord)){
238 if ($ucs2){
fcb875d4 239 $chk and
f2a2953c 240 poisoned2death($obj, "code point too high", $ord);
241 $str .= pack($endian, FBCHAR);
242 }else{
243 $str .= pack($endian.2, ensurrogate($ord));
244 }
245 }else{ # not supposed to happen
246 $str .= pack($endian, FBCHAR);
247 }
248 }else{
249 $str .= pack($endian, $ord);
250 }
251 }
252 return $str;
253}
254
255sub BOMB {
256 my ($size, $bom) = @_;
257 my $N = $size == 2 ? 'n' : 'N';
258 my $ord = unpack($N, $bom);
fcb875d4 259 return ($ord eq BOM_BE) ? $N :
f2a2953c 260 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
261}
262
263sub poisoned2death{
264 my $obj = shift;
265 my $msg = shift;
266 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
267 require Carp;
268 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
df1df145 269}
270
2711;
272__END__
67d7b5ef 273
274=head1 NAME
275
f2a2953c 276Encode::Unicode -- Various Unicode Transform Format
67d7b5ef 277
278=cut
f2a2953c 279
280=head1 SYNOPSIS
281
fcb875d4 282 use Encode qw/encode decode/;
f2a2953c 283 $ucs2 = encode("UCS-2BE", $utf8);
284 $utf8 = decode("UCS-2BE", $ucs2);
285
286=head1 ABSTRACT
287
288This module implements all Character Encoding Schemes of Unicode that
289are officially documented by Unicode Consortium (except, of course,
290for UTF-8, which is a native format in perl).
291
292=over 4
293
294=item L<http://www.unicode.org/glossary/> says:
295
296I<Character Encoding Scheme> A character encoding form plus byte
297serialization. There are seven character encoding schemes in Unicode:
298UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
299
300=item Quick Reference
301
302 Decodes from ord(N) Encodes chr(N) to...
303 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
304 ---------------+-----------------+------------------------------
305 UCS-2BE 2 N N is bogus Not Available
306 UCS-2LE 2 N N bogus Not Available
307 UTF-16 2/4 Y Y is S.P S.P BE/LE
308 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
309 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
310 UTF-32 4 Y - is bogus As is BE/LE
311 UTF-32BE 4 N - bogus As is 0x0010abcd
312 UTF-32LE 4 N - bogus As is 0xcdab1000
313 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
314 ---------------+-----------------+------------------------------
315
316=back
317
318=head1 Size, Endianness, and BOM
319
320You can categorize these CES by 3 criteria; Size of each character,
321Endianness, and Byte Order Mark.
322
323=head2 by Size
324
325UCS-2 is a fixed-length encoding with each character taking 16 bits.
fcb875d4 326It B<does not> support I<Surrogate Pairs>. When a surrogate pair is
327encountered during decode(), its place is filled with \xFFFD without
328I<CHECK> or croaks if I<CHECK>. When a character whose ord value is
329larger than 0xFFFF is encountered, it uses 0xFFFD without I<CHECK> or
330croaks if <CHECK>.
f2a2953c 331
fcb875d4 332UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pairs>.
f2a2953c 333When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
fcb875d4 334following low surrogate (0xDC00-0xDFFF), C<desurrogate>s them to form a
f2a2953c 335character. Bogus surrogates result in death. When \x{10000} or above
fcb875d4 336is encountered during encode(), it C<ensurrogate>s them and pushes the
f2a2953c 337surrogate pair to the output stream.
338
339UTF-32 is a fixed-length encoding with each character taking 32 bits.
fcb875d4 340Since it is 32-bit there is no need for I<Surrogate Pairs>.
f2a2953c 341
342=head2 by Endianness
343
344First (and now failed) goal of Unicode was to map all character
fcb875d4 345repertories into a fixed-length integer so programmers are happy.
f2a2953c 346Since each character is either I<short> or I<long> in C, you have to
347put endianness of each platform when you pass data to one another.
348
349Anything marked as BE is Big Endian (or network byte order) and LE is
350Little Endian (aka VAX byte order). For anything without, a character
351called Byte Order Mark (BOM) is prepended to the head of string.
352
353=over 4
354
fcb875d4 355=item BOM as integer when fetched in network byte order
f2a2953c 356
fcb875d4 357 16 32 bits/char
358 -------------------------
359 BE 0xFeFF 0x0000FeFF
360 LE 0xFFeF 0xFFFe0000
361 -------------------------
f2a2953c 362
363=back
fcb875d4 364
f2a2953c 365This modules handles BOM as follows.
366
367=over 4
368
369=item *
370
371When BE or LE is explicitly stated as the name of encoding, BOM is
372simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
373
374=item *
375
376When BE or LE is omitted during decode(), it checks if BOM is in the
377beginning of the string and if found endianness is set to what BOM
fcb875d4 378says. If not found, dies.
f2a2953c 379
380=item *
381
382When BE or LE is omitted during encode(), it returns a BE-encoded
383string with BOM prepended. So when you want to encode a whole text
384file, make sure you encode() by whole text, not line by line or each
385line, not file, is prepended with BOMs.
386
387=item *
388
389C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE.
390UCS-2 is already registered by IANA and others that way.
391
fdd579e2 392=back
f2a2953c 393
fcb875d4 394=head1 Surrogate Pairs
f2a2953c 395
fcb875d4 396To say the least, surrogate pairs were the biggest mistake of the
397Unicode Consortium. But according to the late Douglas Adams in I<The
398Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
399Universe was created. This has made a lot of people very angry and
400been widely regarded as a bad move>. Their mistake was not of this
401magnitude so let's forgive them.
f2a2953c 402
403(I don't dare make any comparison with Unicode Consortium and the
c731e18e 404Vogons here ;) Or, comparing Encode to Babel Fish is completely
405appropriate -- if you can only stick this into your ear :)
f2a2953c 406
fcb875d4 407Surrogate pairs were born when Unicode Consortium finally
408admitted that 16 bits were not big enough to hold all the world's
409character repertoire. But they have already made UCS-2 16-bit. What
f2a2953c 410do we do?
411
412Back then 0xD800-0xDFFF was not allocated. Let's split them half and
413use the first half to represent C<upper half of a character> and the
414latter C<lower half of a character>. That way you can represent 1024
415* 1024 = 1048576 more characters. Now we can store character ranges
416up to \x{10ffff} even with 16-bit encodings. This pair of
417half-character is now called a I<Surrogate Pair> and UTF-16 is the
fcb875d4 418name of the encoding that embraces them.
f2a2953c 419
448e90bb 420Here is a formula to ensurrogate a Unicode character \x{10000} and
f2a2953c 421above;
422
423 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
424 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
425
426And to desurrogate;
427
428 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
429
fcb875d4 430Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
431perl does not prohibit the use of characters within this range. To perl,
432every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
433
434 (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
435 integer support! (**)
436
437 (**) Is anything beyond \x{11_0000} still Unicode :?
f2a2953c 438
439=head1 SEE ALSO
440
fdd579e2 441L<Encode>, L<http://www.unicode.org/glossary/>,
f2a2953c 442
fdd579e2 443RFC 2781 L<http://rfc.net/rfc2781.html>,
444
445L<http://www.unicode.org/unicode/faq/utf_bom.html>
446
fcb875d4 447Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
448by Larry Wall, Tom Christiansen, Jon Orwant;
449O'Reilly & Associates; ISBN 0-596-00027-8
450
fdd579e2 451=cut