Upgrade to Encode 1.32, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Unicode.pm
CommitLineData
f2a2953c 1package Encode::Unicode;
2
df1df145 3use strict;
f2a2953c 4use warnings;
5
fcb875d4 6our $VERSION = do { my @r = (q$Revision: 1.29 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
f2a2953c 7
8#
9# Aux. subs & constants
10#
11
12sub FBCHAR(){ 0xFFFd }
13sub BOM_BE(){ 0xFeFF }
14sub BOM16LE(){ 0xFFFe }
fdd579e2 15sub BOM32LE(){ 0xFFFe0000 }
f2a2953c 16
17sub valid_ucs2($){
fcb875d4 18 return
19 (0 <= $_[0] && $_[0] < 0xD800)
20 || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
f2a2953c 21}
22
23sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
24sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
25sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
26
27sub ensurrogate($){
28 use integer; # we have divisions
29 my $uni = shift;
30 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
31 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
32 return ($hi, $lo);
33}
34
35sub desurrogate($$){
36 my ($hi, $lo) = @_;
37 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
38}
ee981de6 39
f2a2953c 40sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
df1df145 41
f2a2953c 42#
43# Object Generator 8 transcoders all at once!
44#
df1df145 45
f2a2953c 46require Encode;
47for my $name (qw(UTF-16 UTF-16BE UTF-16LE
48 UTF-32 UTF-32BE UTF-32LE
49 UCS-2BE UCS-2LE))
df1df145 50{
f2a2953c 51 my ($size, $endian, $ucs2, $mask);
52 $name =~ /^(\w+)-(\d+)(\w*)$/o;
53 if ($ucs2 = ($1 eq 'UCS')){
54 $size = 2;
55 }else{
56 $size = $2/8;
df1df145 57 }
f2a2953c 58 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
59 $size == 4 and $endian = uc($endian);
60
61 $Encode::Encoding{$name} =
62 bless {
63 Name => $name,
64 size => $size,
65 endian => $endian,
66 ucs2 => $ucs2,
c731e18e 67 } => __PACKAGE__;
f2a2953c 68
df1df145 69}
70
f2a2953c 71sub name { shift->{'Name'} }
72sub new_sequence { $_[0] };
73
74#
fcb875d4 75# two implementation of (en|de)code exist. *_modern use
76# an array and *_classic stick with substr. *_classic is much
77# slower but more memory conservative. *_modern is default.
f2a2953c 78
79sub set_transcoder{
80 no warnings qw(redefine);
81 my $type = shift;
82 if ($type eq "modern"){
83 *decode = \&decode_modern;
84 *encode = \&encode_modern;
85 }elsif($type eq "classic"){
86 *decode = \&decode_classic;
87 *encode = \&encode_classic;
88 }else{
fcb875d4 89 require Carp;
f2a2953c 90 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)";
91 }
92}
93
94set_transcoder("modern");
95
96#
97# *_modern are much faster but guzzle more memory
98#
99
100sub decode_modern
df1df145 101{
f2a2953c 102 my ($obj, $str, $chk ) = @_;
103 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
104
105 # warn "$size, $endian, $ucs2";
106 $endian ||= BOMB($size, substr($str, 0, $size, ''))
107 or poisoned2death($obj, "Where's the BOM?");
108 my $mask = Mask->{$size};
109 my $utf8 = '';
110 my @ord = unpack("$endian*", $str);
111 undef $str; # to conserve memory
112 while (@ord){
113 my $ord = shift @ord;
114 unless ($size == 4 or valid_ucs2($ord &= $mask)){
115 if ($ucs2){
fcb875d4 116 $chk and
f2a2953c 117 poisoned2death($obj, "no surrogates allowed", $ord);
118 shift @ord; # skip the next one as well
119 $ord = FBCHAR;
120 }else{
121 unless (isHiSurrogate($ord)){
122 poisoned2death($obj, "Malformed HI surrogate", $ord);
123 }
124 my $lo = shift @ord;
125 unless (isLoSurrogate($lo &= $mask)){
126 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
127 }
128 $ord = desurrogate($ord, $lo);
129 }
130 }
131 $utf8 .= chr($ord);
df1df145 132 }
f2a2953c 133 utf8::upgrade($utf8);
134 return $utf8;
135}
136
137sub encode_modern
138{
139 my ($obj, $utf8, $chk) = @_;
140 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
141 my @str = ();
142 unless ($endian){
143 $endian = ($size == 4) ? 'N' : 'n';
144 push @str, BOM_BE;
145 }
146 my @ord = unpack("U*", $utf8);
147 undef $utf8; # to conserve memory
148 for my $ord (@ord){
149 unless ($size == 4 or valid_ucs2($ord)) {
150 unless(issurrogate($ord)){
151 if ($ucs2){
fcb875d4 152 $chk and
f2a2953c 153 poisoned2death($obj, "code point too high", $ord);
154
155 push @str, FBCHAR;
156 }else{
fcb875d4 157
f2a2953c 158 push @str, ensurrogate($ord);
159 }
160 }else{ # not supposed to happen
161 push @str, FBCHAR;
162 }
163 }else{
164 push @str, $ord;
165 }
166 }
167 return pack("$endian*", @str);
168}
169
170#
171# *_classic are slower but more memory conservative
172#
173
174sub decode_classic
175{
176 my ($obj, $str, $chk ) = @_;
177 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
178
179 # warn "$size, $endian, $ucs2";
180 $endian ||= BOMB($size, substr($str, 0, $size, ''))
181 or poisoned2death($obj, "Where's the BOM?");
182 my $mask = Mask->{$size};
183 my $utf8 = '';
184 my @ord = unpack("$endian*", $str);
185 while (length($str)){
186 my $ord = unpack($endian, substr($str, 0, $size, ''));
187 unless ($size == 4 or valid_ucs2($ord &= $mask)){
188 if ($ucs2){
fcb875d4 189 $chk and
f2a2953c 190 poisoned2death($obj, "no surrogates allowed", $ord);
191 substr($str,0,$size,''); # skip the next one as well
192 $ord = FBCHAR;
193 }else{
194 unless (isHiSurrogate($ord)){
195 poisoned2death($obj, "Malformed HI surrogate", $ord);
196 }
197 my $lo = unpack($endian ,substr($str,0,$size,''));
198 unless (isLoSurrogate($lo &= $mask)){
199 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
200 }
201 $ord = desurrogate($ord, $lo);
202 }
203 }
204 $utf8 .= chr($ord);
205 }
206 utf8::upgrade($utf8);
207 return $utf8;
208}
209
210sub encode_classic
211{
212 my ($obj, $utf8, $chk) = @_;
213 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
214 # warn join ", ", $size, $ucs2, $endian, $mask;
215 my $str = '';
216 unless ($endian){
217 $endian = ($size == 4) ? 'N' : 'n';
218 $str .= pack($endian, BOM_BE);
219 }
220 while (length($utf8)){
221 my $ord = ord(substr($utf8,0,1,''));
222 unless ($size == 4 or valid_ucs2($ord)) {
223 unless(issurrogate($ord)){
224 if ($ucs2){
fcb875d4 225 $chk and
f2a2953c 226 poisoned2death($obj, "code point too high", $ord);
227 $str .= pack($endian, FBCHAR);
228 }else{
229 $str .= pack($endian.2, ensurrogate($ord));
230 }
231 }else{ # not supposed to happen
232 $str .= pack($endian, FBCHAR);
233 }
234 }else{
235 $str .= pack($endian, $ord);
236 }
237 }
238 return $str;
239}
240
241sub BOMB {
242 my ($size, $bom) = @_;
243 my $N = $size == 2 ? 'n' : 'N';
244 my $ord = unpack($N, $bom);
fcb875d4 245 return ($ord eq BOM_BE) ? $N :
f2a2953c 246 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
247}
248
249sub poisoned2death{
250 my $obj = shift;
251 my $msg = shift;
252 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
253 require Carp;
254 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
df1df145 255}
256
2571;
258__END__
67d7b5ef 259
260=head1 NAME
261
f2a2953c 262Encode::Unicode -- Various Unicode Transform Format
67d7b5ef 263
264=cut
f2a2953c 265
266=head1 SYNOPSIS
267
fcb875d4 268 use Encode qw/encode decode/;
f2a2953c 269 $ucs2 = encode("UCS-2BE", $utf8);
270 $utf8 = decode("UCS-2BE", $ucs2);
271
272=head1 ABSTRACT
273
274This module implements all Character Encoding Schemes of Unicode that
275are officially documented by Unicode Consortium (except, of course,
276for UTF-8, which is a native format in perl).
277
278=over 4
279
280=item L<http://www.unicode.org/glossary/> says:
281
282I<Character Encoding Scheme> A character encoding form plus byte
283serialization. There are seven character encoding schemes in Unicode:
284UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
285
286=item Quick Reference
287
288 Decodes from ord(N) Encodes chr(N) to...
289 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
290 ---------------+-----------------+------------------------------
291 UCS-2BE 2 N N is bogus Not Available
292 UCS-2LE 2 N N bogus Not Available
293 UTF-16 2/4 Y Y is S.P S.P BE/LE
294 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
295 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
296 UTF-32 4 Y - is bogus As is BE/LE
297 UTF-32BE 4 N - bogus As is 0x0010abcd
298 UTF-32LE 4 N - bogus As is 0xcdab1000
299 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
300 ---------------+-----------------+------------------------------
301
302=back
303
304=head1 Size, Endianness, and BOM
305
306You can categorize these CES by 3 criteria; Size of each character,
307Endianness, and Byte Order Mark.
308
309=head2 by Size
310
311UCS-2 is a fixed-length encoding with each character taking 16 bits.
fcb875d4 312It B<does not> support I<Surrogate Pairs>. When a surrogate pair is
313encountered during decode(), its place is filled with \xFFFD without
314I<CHECK> or croaks if I<CHECK>. When a character whose ord value is
315larger than 0xFFFF is encountered, it uses 0xFFFD without I<CHECK> or
316croaks if <CHECK>.
f2a2953c 317
fcb875d4 318UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pairs>.
f2a2953c 319When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
fcb875d4 320following low surrogate (0xDC00-0xDFFF), C<desurrogate>s them to form a
f2a2953c 321character. Bogus surrogates result in death. When \x{10000} or above
fcb875d4 322is encountered during encode(), it C<ensurrogate>s them and pushes the
f2a2953c 323surrogate pair to the output stream.
324
325UTF-32 is a fixed-length encoding with each character taking 32 bits.
fcb875d4 326Since it is 32-bit there is no need for I<Surrogate Pairs>.
f2a2953c 327
328=head2 by Endianness
329
330First (and now failed) goal of Unicode was to map all character
fcb875d4 331repertories into a fixed-length integer so programmers are happy.
f2a2953c 332Since each character is either I<short> or I<long> in C, you have to
333put endianness of each platform when you pass data to one another.
334
335Anything marked as BE is Big Endian (or network byte order) and LE is
336Little Endian (aka VAX byte order). For anything without, a character
337called Byte Order Mark (BOM) is prepended to the head of string.
338
339=over 4
340
fcb875d4 341=item BOM as integer when fetched in network byte order
f2a2953c 342
fcb875d4 343 16 32 bits/char
344 -------------------------
345 BE 0xFeFF 0x0000FeFF
346 LE 0xFFeF 0xFFFe0000
347 -------------------------
f2a2953c 348
349=back
fcb875d4 350
f2a2953c 351This modules handles BOM as follows.
352
353=over 4
354
355=item *
356
357When BE or LE is explicitly stated as the name of encoding, BOM is
358simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
359
360=item *
361
362When BE or LE is omitted during decode(), it checks if BOM is in the
363beginning of the string and if found endianness is set to what BOM
fcb875d4 364says. If not found, dies.
f2a2953c 365
366=item *
367
368When BE or LE is omitted during encode(), it returns a BE-encoded
369string with BOM prepended. So when you want to encode a whole text
370file, make sure you encode() by whole text, not line by line or each
371line, not file, is prepended with BOMs.
372
373=item *
374
375C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE.
376UCS-2 is already registered by IANA and others that way.
377
fdd579e2 378=back
f2a2953c 379
fcb875d4 380=head1 Surrogate Pairs
f2a2953c 381
fcb875d4 382To say the least, surrogate pairs were the biggest mistake of the
383Unicode Consortium. But according to the late Douglas Adams in I<The
384Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
385Universe was created. This has made a lot of people very angry and
386been widely regarded as a bad move>. Their mistake was not of this
387magnitude so let's forgive them.
f2a2953c 388
389(I don't dare make any comparison with Unicode Consortium and the
c731e18e 390Vogons here ;) Or, comparing Encode to Babel Fish is completely
391appropriate -- if you can only stick this into your ear :)
f2a2953c 392
fcb875d4 393Surrogate pairs were born when Unicode Consortium finally
394admitted that 16 bits were not big enough to hold all the world's
395character repertoire. But they have already made UCS-2 16-bit. What
f2a2953c 396do we do?
397
398Back then 0xD800-0xDFFF was not allocated. Let's split them half and
399use the first half to represent C<upper half of a character> and the
400latter C<lower half of a character>. That way you can represent 1024
401* 1024 = 1048576 more characters. Now we can store character ranges
402up to \x{10ffff} even with 16-bit encodings. This pair of
403half-character is now called a I<Surrogate Pair> and UTF-16 is the
fcb875d4 404name of the encoding that embraces them.
f2a2953c 405
406Here is a fomula to ensurrogate a Unicode character \x{10000} and
407above;
408
409 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
410 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
411
412And to desurrogate;
413
414 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
415
fcb875d4 416Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
417perl does not prohibit the use of characters within this range. To perl,
418every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
419
420 (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
421 integer support! (**)
422
423 (**) Is anything beyond \x{11_0000} still Unicode :?
f2a2953c 424
425=head1 SEE ALSO
426
fdd579e2 427L<Encode>, L<http://www.unicode.org/glossary/>,
f2a2953c 428
fdd579e2 429RFC 2781 L<http://rfc.net/rfc2781.html>,
430
431L<http://www.unicode.org/unicode/faq/utf_bom.html>
432
fcb875d4 433Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
434by Larry Wall, Tom Christiansen, Jon Orwant;
435O'Reilly & Associates; ISBN 0-596-00027-8
436
fdd579e2 437=cut