Upgrade to Encode 1.26, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / lib / Encode / Unicode.pm
CommitLineData
f2a2953c 1package Encode::Unicode;
2
df1df145 3use strict;
f2a2953c 4use warnings;
5
6our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
7
8#
9# Aux. subs & constants
10#
11
12sub FBCHAR(){ 0xFFFd }
13sub BOM_BE(){ 0xFeFF }
14sub BOM16LE(){ 0xFFFe }
15sub BOM32LE(){ 0xFeFF0000 }
16
17sub valid_ucs2($){
18 if ($_[0] < 0xD800){
19 return $_[0] > 0;
20 }else{
21 return ($_[0] > 0xDFFFF && $_[0] <= 0xFFFF);
22 }
23}
24
25sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
26sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
27sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
28
29sub ensurrogate($){
30 use integer; # we have divisions
31 my $uni = shift;
32 my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
33 my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
34 return ($hi, $lo);
35}
36
37sub desurrogate($$){
38 my ($hi, $lo) = @_;
39 return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
40}
ee981de6 41
f2a2953c 42sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
df1df145 43
f2a2953c 44#
45# Object Generator 8 transcoders all at once!
46#
df1df145 47
f2a2953c 48require Encode;
49for my $name (qw(UTF-16 UTF-16BE UTF-16LE
50 UTF-32 UTF-32BE UTF-32LE
51 UCS-2BE UCS-2LE))
df1df145 52{
f2a2953c 53 my ($size, $endian, $ucs2, $mask);
54 $name =~ /^(\w+)-(\d+)(\w*)$/o;
55 if ($ucs2 = ($1 eq 'UCS')){
56 $size = 2;
57 }else{
58 $size = $2/8;
df1df145 59 }
f2a2953c 60 $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ;
61 $size == 4 and $endian = uc($endian);
62
63 $Encode::Encoding{$name} =
64 bless {
65 Name => $name,
66 size => $size,
67 endian => $endian,
68 ucs2 => $ucs2,
69 }, __PACKAGE__;
70
df1df145 71}
72
f2a2953c 73sub name { shift->{'Name'} }
74sub new_sequence { $_[0] };
75
76#
77# the two implementation of (en|de)code exist. *_modern use
78# array and *_classic stick with substr. *_classic is much
79# slower but more memory conservative. *_moder is default.
80
81sub set_transcoder{
82 no warnings qw(redefine);
83 my $type = shift;
84 if ($type eq "modern"){
85 *decode = \&decode_modern;
86 *encode = \&encode_modern;
87 }elsif($type eq "classic"){
88 *decode = \&decode_classic;
89 *encode = \&encode_classic;
90 }else{
91 require Carp;
92 Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)";
93 }
94}
95
96set_transcoder("modern");
97
98#
99# *_modern are much faster but guzzle more memory
100#
101
102sub decode_modern
df1df145 103{
f2a2953c 104 my ($obj, $str, $chk ) = @_;
105 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
106
107 # warn "$size, $endian, $ucs2";
108 $endian ||= BOMB($size, substr($str, 0, $size, ''))
109 or poisoned2death($obj, "Where's the BOM?");
110 my $mask = Mask->{$size};
111 my $utf8 = '';
112 my @ord = unpack("$endian*", $str);
113 undef $str; # to conserve memory
114 while (@ord){
115 my $ord = shift @ord;
116 unless ($size == 4 or valid_ucs2($ord &= $mask)){
117 if ($ucs2){
118 $chk and
119 poisoned2death($obj, "no surrogates allowed", $ord);
120 shift @ord; # skip the next one as well
121 $ord = FBCHAR;
122 }else{
123 unless (isHiSurrogate($ord)){
124 poisoned2death($obj, "Malformed HI surrogate", $ord);
125 }
126 my $lo = shift @ord;
127 unless (isLoSurrogate($lo &= $mask)){
128 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
129 }
130 $ord = desurrogate($ord, $lo);
131 }
132 }
133 $utf8 .= chr($ord);
df1df145 134 }
f2a2953c 135 utf8::upgrade($utf8);
136 return $utf8;
137}
138
139sub encode_modern
140{
141 my ($obj, $utf8, $chk) = @_;
142 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
143 my @str = ();
144 unless ($endian){
145 $endian = ($size == 4) ? 'N' : 'n';
146 push @str, BOM_BE;
147 }
148 my @ord = unpack("U*", $utf8);
149 undef $utf8; # to conserve memory
150 for my $ord (@ord){
151 unless ($size == 4 or valid_ucs2($ord)) {
152 unless(issurrogate($ord)){
153 if ($ucs2){
154 $chk and
155 poisoned2death($obj, "code point too high", $ord);
156
157 push @str, FBCHAR;
158 }else{
159
160 push @str, ensurrogate($ord);
161 }
162 }else{ # not supposed to happen
163 push @str, FBCHAR;
164 }
165 }else{
166 push @str, $ord;
167 }
168 }
169 return pack("$endian*", @str);
170}
171
172#
173# *_classic are slower but more memory conservative
174#
175
176sub decode_classic
177{
178 my ($obj, $str, $chk ) = @_;
179 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
180
181 # warn "$size, $endian, $ucs2";
182 $endian ||= BOMB($size, substr($str, 0, $size, ''))
183 or poisoned2death($obj, "Where's the BOM?");
184 my $mask = Mask->{$size};
185 my $utf8 = '';
186 my @ord = unpack("$endian*", $str);
187 while (length($str)){
188 my $ord = unpack($endian, substr($str, 0, $size, ''));
189 unless ($size == 4 or valid_ucs2($ord &= $mask)){
190 if ($ucs2){
191 $chk and
192 poisoned2death($obj, "no surrogates allowed", $ord);
193 substr($str,0,$size,''); # skip the next one as well
194 $ord = FBCHAR;
195 }else{
196 unless (isHiSurrogate($ord)){
197 poisoned2death($obj, "Malformed HI surrogate", $ord);
198 }
199 my $lo = unpack($endian ,substr($str,0,$size,''));
200 unless (isLoSurrogate($lo &= $mask)){
201 poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
202 }
203 $ord = desurrogate($ord, $lo);
204 }
205 }
206 $utf8 .= chr($ord);
207 }
208 utf8::upgrade($utf8);
209 return $utf8;
210}
211
212sub encode_classic
213{
214 my ($obj, $utf8, $chk) = @_;
215 my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
216 # warn join ", ", $size, $ucs2, $endian, $mask;
217 my $str = '';
218 unless ($endian){
219 $endian = ($size == 4) ? 'N' : 'n';
220 $str .= pack($endian, BOM_BE);
221 }
222 while (length($utf8)){
223 my $ord = ord(substr($utf8,0,1,''));
224 unless ($size == 4 or valid_ucs2($ord)) {
225 unless(issurrogate($ord)){
226 if ($ucs2){
227 $chk and
228 poisoned2death($obj, "code point too high", $ord);
229 $str .= pack($endian, FBCHAR);
230 }else{
231 $str .= pack($endian.2, ensurrogate($ord));
232 }
233 }else{ # not supposed to happen
234 $str .= pack($endian, FBCHAR);
235 }
236 }else{
237 $str .= pack($endian, $ord);
238 }
239 }
240 return $str;
241}
242
243sub BOMB {
244 my ($size, $bom) = @_;
245 my $N = $size == 2 ? 'n' : 'N';
246 my $ord = unpack($N, $bom);
247 return ($ord eq BOM_BE) ? $N :
248 ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
249}
250
251sub poisoned2death{
252 my $obj = shift;
253 my $msg = shift;
254 my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
255 require Carp;
256 Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
df1df145 257}
258
2591;
260__END__
67d7b5ef 261
262=head1 NAME
263
f2a2953c 264Encode::Unicode -- Various Unicode Transform Format
67d7b5ef 265
266=cut
f2a2953c 267
268=head1 SYNOPSIS
269
270 use Encode qw/encode decode/;
271 $ucs2 = encode("UCS-2BE", $utf8);
272 $utf8 = decode("UCS-2BE", $ucs2);
273
274=head1 ABSTRACT
275
276This module implements all Character Encoding Schemes of Unicode that
277are officially documented by Unicode Consortium (except, of course,
278for UTF-8, which is a native format in perl).
279
280=over 4
281
282=item L<http://www.unicode.org/glossary/> says:
283
284I<Character Encoding Scheme> A character encoding form plus byte
285serialization. There are seven character encoding schemes in Unicode:
286UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
287
288=item Quick Reference
289
290 Decodes from ord(N) Encodes chr(N) to...
291 octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
292 ---------------+-----------------+------------------------------
293 UCS-2BE 2 N N is bogus Not Available
294 UCS-2LE 2 N N bogus Not Available
295 UTF-16 2/4 Y Y is S.P S.P BE/LE
296 UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
297 UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
298 UTF-32 4 Y - is bogus As is BE/LE
299 UTF-32BE 4 N - bogus As is 0x0010abcd
300 UTF-32LE 4 N - bogus As is 0xcdab1000
301 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
302 ---------------+-----------------+------------------------------
303
304=back
305
306=head1 Size, Endianness, and BOM
307
308You can categorize these CES by 3 criteria; Size of each character,
309Endianness, and Byte Order Mark.
310
311=head2 by Size
312
313UCS-2 is a fixed-length encoding with each character taking 16 bits.
314It B<does not> support I<Surrogate Pair>. When surrogate pair is
315encountered during decode(), it fills its place with \xFFFD without
316I<CHECK> or croaks if I<CHECK>. When a character which ord value is
317larger than 0xFFFF, it uses 0xFFFD without I<CHECK> or croaks if
318<CHECK>.
319
320UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pair>.
321When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
322following low surrogate (0xDC00-0xDFFF), C<desurrogate> them to form a
323character. Bogus surrogates result in death. When \x{10000} or above
324is encountered during encode(), it C<ensurrogate>s them and push the
325surrogate pair to the output stream.
326
327UTF-32 is a fixed-length encoding with each character taking 32 bits.
328Since it is 32-bit there is no need for I<Surrogate Pair>.
329
330=head2 by Endianness
331
332First (and now failed) goal of Unicode was to map all character
333repartories into a fixed-length integer so programmers are happy.
334Since each character is either I<short> or I<long> in C, you have to
335put endianness of each platform when you pass data to one another.
336
337Anything marked as BE is Big Endian (or network byte order) and LE is
338Little Endian (aka VAX byte order). For anything without, a character
339called Byte Order Mark (BOM) is prepended to the head of string.
340
341=over 4
342
343=item BOM as integer
344
345 16 32 bits/char
346-------------------------
347BE 0xFeFF 0x0000FeFF
348LE 0xFFeF 0xFeFF0000
349-------------------------
350
351=back
352
353This modules handles BOM as follows.
354
355=over 4
356
357=item *
358
359When BE or LE is explicitly stated as the name of encoding, BOM is
360simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
361
362=item *
363
364When BE or LE is omitted during decode(), it checks if BOM is in the
365beginning of the string and if found endianness is set to what BOM
366says. if not found, dies.
367
368=item *
369
370When BE or LE is omitted during encode(), it returns a BE-encoded
371string with BOM prepended. So when you want to encode a whole text
372file, make sure you encode() by whole text, not line by line or each
373line, not file, is prepended with BOMs.
374
375=item *
376
377C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE.
378UCS-2 is already registered by IANA and others that way.
379
380
381=head1 The Surrogate Pair
382
383To say the least, surrogate pair was the biggest mistake by Unicode
384Consortium. I don't give a darn if they admit it or not. But
385according to late Douglas Adams in I<The Hitchhiker's Guide to the
386Galaxy> Triology, C<First the Universe was created and it was a bad
387move>. Their mistake was not this magnitude so let's forgive them.
388
389(I don't dare make any comparison with Unicode Consortium and the
390Vogols here :)
391
392A surrogate pair was born when Unicode Consortium had finally
393admitted that 16 bit was not big enough to hold all the world's
394character repartorie. But they have already made UCS-2 16-bit. What
395do we do?
396
397Back then 0xD800-0xDFFF was not allocated. Let's split them half and
398use the first half to represent C<upper half of a character> and the
399latter C<lower half of a character>. That way you can represent 1024
400* 1024 = 1048576 more characters. Now we can store character ranges
401up to \x{10ffff} even with 16-bit encodings. This pair of
402half-character is now called a I<Surrogate Pair> and UTF-16 is the
403name of encoding that embraces them.
404
405Here is a fomula to ensurrogate a Unicode character \x{10000} and
406above;
407
408 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
409 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
410
411And to desurrogate;
412
413 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
414
415Note this move has made \x{D800}-\x{DFFF} forbidden zone but perl
416does not prohibit them for uses.
417
418=head1 SEE ALSO
419
420L<Encode>, L<http://www.unicode.org/glossary/>
421
422=back