CPAN.pm sync
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Unicode.pm
CommitLineData
f2a2953c 1package Encode::Unicode;
2
df1df145 3use strict;
f2a2953c 4use warnings;
5
aae85ceb 6our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
f2a2953c 7
8#
9# Aux. subs & constants
10#
11
12sub FBCHAR(){ 0xFFFd }
13sub BOM_BE(){ 0xFeFF }
14sub BOM16LE(){ 0xFFFe }
fdd579e2 15sub BOM32LE(){ 0xFFFe0000 }
f2a2953c 16
17sub valid_ucs2($){
fcb875d4 18 return
19 (0 <= $_[0] && $_[0] < 0xD800)
20 || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
f2a2953c 21}
22
23sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
24sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
25sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
26
27sub ensurrogate($){
28 use integer; # we have divisions
29 my $uni = shift;
30 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
31 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
32 return ($hi, $lo);
33}
34
35sub desurrogate($$){
36 my ($hi, $lo) = @_;
37 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
38}
ee981de6 39
f2a2953c 40sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
df1df145 41
f2a2953c 42#
43# Object Generator 8 transcoders all at once!
44#
df1df145 45
f2a2953c 46require Encode;
47for my $name (qw(UTF-16 UTF-16BE UTF-16LE
48 UTF-32 UTF-32BE UTF-32LE
49 UCS-2BE UCS-2LE))
df1df145 50{
f2a2953c 51 my ($size, $endian, $ucs2, $mask);
52 $name =~ /^(\w+)-(\d+)(\w*)$/o;
53 if ($ucs2 = ($1 eq 'UCS')){
54 $size = 2;
55 }else{
56 $size = $2/8;
df1df145 57 }
f2a2953c 58 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
59 $size == 4 and $endian = uc($endian);
60
61 $Encode::Encoding{$name} =
62 bless {
63 Name => $name,
64 size => $size,
65 endian => $endian,
66 ucs2 => $ucs2,
c731e18e 67 } => __PACKAGE__;
f2a2953c 68
df1df145 69}
70
f2a2953c 71sub name { shift->{'Name'} }
aae85ceb 72sub new_sequence
73{
74 my $self = shift;
75 # Return the original if endian known
76 return $self if ($self->{endian});
77 # Return a clone
78 return bless {%$self},ref($self);
79}
80
f2a2953c 81
82#
aae85ceb 83# three implementation of (en|de)code exist. XS version is the fastest.
84# *_modern use # an array and *_classic stick with substr. *_classic is
85# much slower but more memory conservative. *_xs is default.
f2a2953c 86
87sub set_transcoder{
88 no warnings qw(redefine);
89 my $type = shift;
aae85ceb 90 if ($type eq "xs"){
91 *decode = \&decode_xs;
92 *encode = \&encode_xs;
93 }elsif($type eq "modern"){
f2a2953c 94 *decode = \&decode_modern;
95 *encode = \&encode_modern;
96 }elsif($type eq "classic"){
97 *decode = \&decode_classic;
98 *encode = \&encode_classic;
99 }else{
fcb875d4 100 require Carp;
aae85ceb 101 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
f2a2953c 102 }
103}
104
aae85ceb 105set_transcoder("xs");
f2a2953c 106
107#
108# *_modern are much faster but guzzle more memory
109#
110
aae85ceb 111sub decode_modern($$;$)
df1df145 112{
f2a2953c 113 my ($obj, $str, $chk ) = @_;
114 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
115
116 # warn "$size, $endian, $ucs2";
117 $endian ||= BOMB($size, substr($str, 0, $size, ''))
118 or poisoned2death($obj, "Where's the BOM?");
119 my $mask = Mask->{$size};
120 my $utf8 = '';
121 my @ord = unpack("$endian*", $str);
122 undef $str; # to conserve memory
123 while (@ord){
124 my $ord = shift @ord;
125 unless ($size == 4 or valid_ucs2($ord &= $mask)){
126 if ($ucs2){
fcb875d4 127 $chk and
f2a2953c 128 poisoned2death($obj, "no surrogates allowed", $ord);
129 shift @ord; # skip the next one as well
130 $ord = FBCHAR;
131 }else{
132 unless (isHiSurrogate($ord)){
133 poisoned2death($obj, "Malformed HI surrogate", $ord);
134 }
135 my $lo = shift @ord;
136 unless (isLoSurrogate($lo &= $mask)){
137 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
138 }
139 $ord = desurrogate($ord, $lo);
140 }
141 }
142 $utf8 .= chr($ord);
df1df145 143 }
f2a2953c 144 utf8::upgrade($utf8);
145 return $utf8;
146}
147
aae85ceb 148sub encode_modern($$;$)
f2a2953c 149{
150 my ($obj, $utf8, $chk) = @_;
151 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
152 my @str = ();
153 unless ($endian){
154 $endian = ($size == 4) ? 'N' : 'n';
155 push @str, BOM_BE;
156 }
157 my @ord = unpack("U*", $utf8);
158 undef $utf8; # to conserve memory
159 for my $ord (@ord){
160 unless ($size == 4 or valid_ucs2($ord)) {
161 unless(issurrogate($ord)){
162 if ($ucs2){
fcb875d4 163 $chk and
f2a2953c 164 poisoned2death($obj, "code point too high", $ord);
165
166 push @str, FBCHAR;
167 }else{
fcb875d4 168
f2a2953c 169 push @str, ensurrogate($ord);
170 }
171 }else{ # not supposed to happen
172 push @str, FBCHAR;
173 }
174 }else{
175 push @str, $ord;
176 }
177 }
178 return pack("$endian*", @str);
179}
180
181#
182# *_classic are slower but more memory conservative
183#
184
aae85ceb 185sub decode_classic($$;$)
f2a2953c 186{
187 my ($obj, $str, $chk ) = @_;
188 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
189
190 # warn "$size, $endian, $ucs2";
191 $endian ||= BOMB($size, substr($str, 0, $size, ''))
192 or poisoned2death($obj, "Where's the BOM?");
193 my $mask = Mask->{$size};
194 my $utf8 = '';
195 my @ord = unpack("$endian*", $str);
196 while (length($str)){
197 my $ord = unpack($endian, substr($str, 0, $size, ''));
198 unless ($size == 4 or valid_ucs2($ord &= $mask)){
199 if ($ucs2){
fcb875d4 200 $chk and
f2a2953c 201 poisoned2death($obj, "no surrogates allowed", $ord);
202 substr($str,0,$size,''); # skip the next one as well
203 $ord = FBCHAR;
204 }else{
205 unless (isHiSurrogate($ord)){
206 poisoned2death($obj, "Malformed HI surrogate", $ord);
207 }
208 my $lo = unpack($endian ,substr($str,0,$size,''));
209 unless (isLoSurrogate($lo &= $mask)){
210 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
211 }
212 $ord = desurrogate($ord, $lo);
213 }
214 }
215 $utf8 .= chr($ord);
216 }
217 utf8::upgrade($utf8);
218 return $utf8;
219}
220
aae85ceb 221sub encode_classic($$;$)
f2a2953c 222{
223 my ($obj, $utf8, $chk) = @_;
224 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
225 # warn join ", ", $size, $ucs2, $endian, $mask;
226 my $str = '';
227 unless ($endian){
228 $endian = ($size == 4) ? 'N' : 'n';
229 $str .= pack($endian, BOM_BE);
230 }
231 while (length($utf8)){
232 my $ord = ord(substr($utf8,0,1,''));
233 unless ($size == 4 or valid_ucs2($ord)) {
234 unless(issurrogate($ord)){
235 if ($ucs2){
fcb875d4 236 $chk and
f2a2953c 237 poisoned2death($obj, "code point too high", $ord);
238 $str .= pack($endian, FBCHAR);
239 }else{
240 $str .= pack($endian.2, ensurrogate($ord));
241 }
242 }else{ # not supposed to happen
243 $str .= pack($endian, FBCHAR);
244 }
245 }else{
246 $str .= pack($endian, $ord);
247 }
248 }
249 return $str;
250}
251
252sub BOMB {
253 my ($size, $bom) = @_;
254 my $N = $size == 2 ? 'n' : 'N';
255 my $ord = unpack($N, $bom);
fcb875d4 256 return ($ord eq BOM_BE) ? $N :
f2a2953c 257 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
258}
259
260sub poisoned2death{
261 my $obj = shift;
262 my $msg = shift;
263 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
264 require Carp;
265 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
df1df145 266}
267
2681;
269__END__
67d7b5ef 270
271=head1 NAME
272
f2a2953c 273Encode::Unicode -- Various Unicode Transform Format
67d7b5ef 274
275=cut
f2a2953c 276
277=head1 SYNOPSIS
278
fcb875d4 279 use Encode qw/encode decode/;
f2a2953c 280 $ucs2 = encode("UCS-2BE", $utf8);
281 $utf8 = decode("UCS-2BE", $ucs2);
282
283=head1 ABSTRACT
284
285This module implements all Character Encoding Schemes of Unicode that
286are officially documented by Unicode Consortium (except, of course,
287for UTF-8, which is a native format in perl).
288
289=over 4
290
291=item L<http://www.unicode.org/glossary/> says:
292
293I<Character Encoding Scheme> A character encoding form plus byte
294serialization. There are seven character encoding schemes in Unicode:
295UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
296
297=item Quick Reference
298
299 Decodes from ord(N) Encodes chr(N) to...
300 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
301 ---------------+-----------------+------------------------------
302 UCS-2BE 2 N N is bogus Not Available
303 UCS-2LE 2 N N bogus Not Available
304 UTF-16 2/4 Y Y is S.P S.P BE/LE
305 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
306 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
307 UTF-32 4 Y - is bogus As is BE/LE
308 UTF-32BE 4 N - bogus As is 0x0010abcd
309 UTF-32LE 4 N - bogus As is 0xcdab1000
310 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
311 ---------------+-----------------+------------------------------
312
313=back
314
315=head1 Size, Endianness, and BOM
316
317You can categorize these CES by 3 criteria; Size of each character,
318Endianness, and Byte Order Mark.
319
320=head2 by Size
321
322UCS-2 is a fixed-length encoding with each character taking 16 bits.
fcb875d4 323It B<does not> support I<Surrogate Pairs>. When a surrogate pair is
324encountered during decode(), its place is filled with \xFFFD without
325I<CHECK> or croaks if I<CHECK>. When a character whose ord value is
326larger than 0xFFFF is encountered, it uses 0xFFFD without I<CHECK> or
327croaks if <CHECK>.
f2a2953c 328
fcb875d4 329UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pairs>.
f2a2953c 330When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
fcb875d4 331following low surrogate (0xDC00-0xDFFF), C<desurrogate>s them to form a
f2a2953c 332character. Bogus surrogates result in death. When \x{10000} or above
fcb875d4 333is encountered during encode(), it C<ensurrogate>s them and pushes the
f2a2953c 334surrogate pair to the output stream.
335
336UTF-32 is a fixed-length encoding with each character taking 32 bits.
fcb875d4 337Since it is 32-bit there is no need for I<Surrogate Pairs>.
f2a2953c 338
339=head2 by Endianness
340
341First (and now failed) goal of Unicode was to map all character
fcb875d4 342repertories into a fixed-length integer so programmers are happy.
f2a2953c 343Since each character is either I<short> or I<long> in C, you have to
344put endianness of each platform when you pass data to one another.
345
346Anything marked as BE is Big Endian (or network byte order) and LE is
347Little Endian (aka VAX byte order). For anything without, a character
348called Byte Order Mark (BOM) is prepended to the head of string.
349
350=over 4
351
fcb875d4 352=item BOM as integer when fetched in network byte order
f2a2953c 353
fcb875d4 354 16 32 bits/char
355 -------------------------
356 BE 0xFeFF 0x0000FeFF
357 LE 0xFFeF 0xFFFe0000
358 -------------------------
f2a2953c 359
360=back
fcb875d4 361
f2a2953c 362This modules handles BOM as follows.
363
364=over 4
365
366=item *
367
368When BE or LE is explicitly stated as the name of encoding, BOM is
369simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
370
371=item *
372
373When BE or LE is omitted during decode(), it checks if BOM is in the
374beginning of the string and if found endianness is set to what BOM
fcb875d4 375says. If not found, dies.
f2a2953c 376
377=item *
378
379When BE or LE is omitted during encode(), it returns a BE-encoded
380string with BOM prepended. So when you want to encode a whole text
381file, make sure you encode() by whole text, not line by line or each
382line, not file, is prepended with BOMs.
383
384=item *
385
386C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE.
387UCS-2 is already registered by IANA and others that way.
388
fdd579e2 389=back
f2a2953c 390
fcb875d4 391=head1 Surrogate Pairs
f2a2953c 392
fcb875d4 393To say the least, surrogate pairs were the biggest mistake of the
394Unicode Consortium. But according to the late Douglas Adams in I<The
395Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
396Universe was created. This has made a lot of people very angry and
397been widely regarded as a bad move>. Their mistake was not of this
398magnitude so let's forgive them.
f2a2953c 399
400(I don't dare make any comparison with Unicode Consortium and the
c731e18e 401Vogons here ;) Or, comparing Encode to Babel Fish is completely
402appropriate -- if you can only stick this into your ear :)
f2a2953c 403
fcb875d4 404Surrogate pairs were born when Unicode Consortium finally
405admitted that 16 bits were not big enough to hold all the world's
406character repertoire. But they have already made UCS-2 16-bit. What
f2a2953c 407do we do?
408
409Back then 0xD800-0xDFFF was not allocated. Let's split them half and
410use the first half to represent C<upper half of a character> and the
411latter C<lower half of a character>. That way you can represent 1024
412* 1024 = 1048576 more characters. Now we can store character ranges
413up to \x{10ffff} even with 16-bit encodings. This pair of
414half-character is now called a I<Surrogate Pair> and UTF-16 is the
fcb875d4 415name of the encoding that embraces them.
f2a2953c 416
448e90bb 417Here is a formula to ensurrogate a Unicode character \x{10000} and
f2a2953c 418above;
419
420 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
421 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
422
423And to desurrogate;
424
425 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
426
fcb875d4 427Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
428perl does not prohibit the use of characters within this range. To perl,
429every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
430
431 (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
432 integer support! (**)
433
434 (**) Is anything beyond \x{11_0000} still Unicode :?
f2a2953c 435
436=head1 SEE ALSO
437
fdd579e2 438L<Encode>, L<http://www.unicode.org/glossary/>,
f2a2953c 439
fdd579e2 440RFC 2781 L<http://rfc.net/rfc2781.html>,
441
442L<http://www.unicode.org/unicode/faq/utf_bom.html>
443
fcb875d4 444Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
445by Larry Wall, Tom Christiansen, Jon Orwant;
446O'Reilly & Associates; ISBN 0-596-00027-8
447
fdd579e2 448=cut