Commit | Line | Data |
f2a2953c |
1 | package Encode::Unicode; |
2 | |
df1df145 |
3 | use strict; |
f2a2953c |
4 | use warnings; |
a0d8a30e |
5 | no warnings 'redefine'; |
f2a2953c |
6 | |
b2deda17 |
7 | our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
f2a2953c |
8 | |
85982a32 |
9 | use XSLoader; |
d1256cb1 |
10 | XSLoader::load( __PACKAGE__, $VERSION ); |
df1df145 |
11 | |
f2a2953c |
12 | # |
13 | # Object generator: builds all 8 transcoders at once! |
14 | # |
df1df145 |
15 | |
f2a2953c |
16 | require Encode; |
10c5ecbb |
17 | |
d1256cb1 |
18 | our %BOM_Unknown = map { $_ => 1 } qw(UTF-16 UTF-32); |
a0d8a30e |
19 | |
d1256cb1 |
20 | for my $name ( |
21 | qw(UTF-16 UTF-16BE UTF-16LE |
22 | UTF-32 UTF-32BE UTF-32LE |
23 | UCS-2BE UCS-2LE) |
24 | ) |
df1df145 |
25 | { |
d1256cb1 |
26 | my ( $size, $endian, $ucs2, $mask ); |
f2a2953c |
27 | $name =~ /^(\w+)-(\d+)(\w*)$/o; |
d1256cb1 |
28 | if ( $ucs2 = ( $1 eq 'UCS' ) ) { |
29 | $size = 2; |
df1df145 |
30 | } |
d1256cb1 |
31 | else { |
32 | $size = $2 / 8; |
33 | } |
34 | $endian = ( $3 eq 'BE' ) ? 'n' : ( $3 eq 'LE' ) ? 'v' : ''; |
f2a2953c |
35 | $size == 4 and $endian = uc($endian); |
36 | |
d1256cb1 |
37 | $Encode::Encoding{$name} = bless { |
38 | Name => $name, |
39 | size => $size, |
40 | endian => $endian, |
41 | ucs2 => $ucs2, |
42 | } => __PACKAGE__; |
df1df145 |
43 | } |
44 | |
10c5ecbb |
45 | use base qw(Encode::Encoding); |
f2a2953c |
46 | |
d1256cb1 |
47 | sub renew { |
a0d8a30e |
48 | my $self = shift; |
d1256cb1 |
49 | $BOM_Unknown{ $self->name } or return $self; |
50 | my $clone = bless {%$self} => ref($self); |
51 | $clone->{renewed}++; # so the caller knows it is renewed. |
a0d8a30e |
52 | return $clone; |
f2a2953c |
53 | } |
54 | |
a0d8a30e |
55 | # There used to be a perl implementation of (en|de)code, but now that the |
56 | # XS version is ripe, the perl version has been zapped for optimal speed |
f2a2953c |
57 | |
a0d8a30e |
58 | *decode = \&decode_xs; |
59 | *encode = \&encode_xs; |
df1df145 |
60 | |
61 | 1; |
62 | __END__ |
67d7b5ef |
63 | |
64 | =head1 NAME |
65 | |
0ab8f81e |
66 | Encode::Unicode -- Various Unicode Transformation Formats |
67d7b5ef |
67 | |
68 | =cut |
f2a2953c |
69 | |
70 | =head1 SYNOPSIS |
71 | |
40bed538 |
72 | use Encode qw/encode decode/; |
f2a2953c |
73 | $ucs2 = encode("UCS-2BE", $utf8); |
74 | $utf8 = decode("UCS-2BE", $ucs2); |
75 | |
76 | =head1 ABSTRACT |
77 | |
78 | This module implements all Character Encoding Schemes of Unicode that |
79 | are officially documented by Unicode Consortium (except, of course, |
80 | for UTF-8, which is a native format in perl). |
81 | |
82 | =over 4 |
83 | |
84 | =item L<http://www.unicode.org/glossary/> says: |
85 | |
86 | I<Character Encoding Scheme> A character encoding form plus byte |
1485817e |
87 | serialization. There are seven character encoding schemes in Unicode: |
11067275 |
88 | UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and |
1485817e |
89 | UTF-32LE (UCS-4LE), and UTF-7. |
90 | |
91 | Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, it is not part of |
92 | Unicode's Character Encoding Scheme. It is separately implemented in |
93 | Encode::Unicode::UTF7. For details see L<Encode::Unicode::UTF7>. |
f2a2953c |
94 | |
95 | =item Quick Reference |
96 | |
97 | Decodes from ord(N) Encodes chr(N) to... |
98 | octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} == |
99 | ---------------+-----------------+------------------------------ |
370462a2 |
100 | UCS-2BE 2 N N is bogus Not Available |
f2a2953c |
101 | UCS-2LE 2 N N bogus Not Available |
102 | UTF-16 2/4 Y Y is S.P S.P BE/LE |
103 | UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd |
370462a2 |
104 | UTF-16LE 2/4 N Y S.P S.P 0x2ad8,0xcddf |
105 | UTF-32 4 Y - is bogus As is BE/LE |
106 | UTF-32BE 4 N - bogus As is 0x0001abcd |
107 | UTF-32LE 4 N - bogus As is 0xcdab0100 |
f2a2953c |
108 | UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\xaf\x8d |
109 | ---------------+-----------------+------------------------------ |
110 | |
111 | =back |
112 | |
113 | =head1 Size, Endianness, and BOM |
114 | |
0ab8f81e |
115 | You can categorize these CES by 3 criteria: size of each character, |
116 | endianness, and Byte Order Mark. |
f2a2953c |
117 | |
0ab8f81e |
118 | =head2 by size |
f2a2953c |
119 | |
120 | UCS-2 is a fixed-length encoding with each character taking 16 bits. |
0ab8f81e |
121 | It B<does not> support I<surrogate pairs>. When a surrogate pair |
122 | is encountered during decode(), its place is filled with \x{FFFD} |
123 | if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a |
124 | character whose ord value is larger than 0xFFFF is encountered, |
125 | its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine |
126 | croaks if I<CHECK> is 1. |
127 | |
128 | UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>. |
f2a2953c |
129 | When it encounters a high surrogate (0xD800-0xDBFF), it fetches the |
0ab8f81e |
130 | following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to |
131 | form a character. Bogus surrogates result in death. When \x{10000} |
132 | or above is encountered during encode(), it C<ensurrogate>s them and |
133 | pushes the surrogate pair to the output stream. |
f2a2953c |
134 | |
11067275 |
135 | UTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits. |
0ab8f81e |
136 | Since it is 32-bit, there is no need for I<surrogate pairs>. |
f2a2953c |
137 | |
0ab8f81e |
138 | =head2 by endianness |
f2a2953c |
139 | |
0ab8f81e |
140 | The first (and now failed) goal of Unicode was to map all character |
141 | repertoires into a fixed-length integer so that programmers are happy. |
142 | Since each character is either a I<short> or I<long> in C, you have to |
143 | pay attention to the endianness of each platform when you pass data |
144 | to one another. |
f2a2953c |
145 | |
146 | Anything marked as BE is Big Endian (or network byte order) and LE is |
0ab8f81e |
147 | Little Endian (aka VAX byte order). For anything not marked either |
148 | BE or LE, a character called Byte Order Mark (BOM) indicating the |
149 | endianness is prepended to the string. |
f2a2953c |
150 | |
7237418a |
151 | CAVEAT: Though BOM in utf8 (\xEF\xBB\xBF) is valid, it is meaningless |
152 | and as of this writing the Encode suite just leaves it as is (\x{FeFF}). |
153 | |
f2a2953c |
154 | =over 4 |
155 | |
fcb875d4 |
156 | =item BOM as integer when fetched in network byte order |
f2a2953c |
157 | |
fcb875d4 |
158 | 16 32 bits/char |
159 | ------------------------- |
160 | BE 0xFeFF 0x0000FeFF |
742555bd |
161 | LE 0xFFFe 0xFFFe0000 |
fcb875d4 |
162 | ------------------------- |
f2a2953c |
163 | |
164 | =back |
151b5d36 |
165 | |
0ab8f81e |
166 | This module handles the BOM as follows. |
f2a2953c |
167 | |
168 | =over 4 |
169 | |
170 | =item * |
171 | |
172 | When BE or LE is explicitly stated as the name of encoding, BOM is |
0ab8f81e |
173 | simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE). |
f2a2953c |
174 | |
175 | =item * |
176 | |
0ab8f81e |
177 | When BE or LE is omitted during decode(), it checks if BOM is at the |
178 | beginning of the string; if one is found, the endianness is set to |
179 | what the BOM says. If no BOM is found, the routine dies. |
f2a2953c |
180 | |
181 | =item * |
182 | |
183 | When BE or LE is omitted during encode(), it returns a BE-encoded |
184 | string with BOM prepended. So when you want to encode a whole text |
0ab8f81e |
185 | file, make sure you encode() the whole text at once, not line by line; |
186 | otherwise each line, not the file, will have a BOM prepended. |
f2a2953c |
187 | |
188 | =item * |
189 | |
0ab8f81e |
190 | C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE. |
f2a2953c |
191 | UCS-2 is already registered by IANA and others that way. |
192 | |
fdd579e2 |
193 | =back |
f2a2953c |
194 | |
fcb875d4 |
195 | =head1 Surrogate Pairs |
f2a2953c |
196 | |
fcb875d4 |
197 | To say the least, surrogate pairs were the biggest mistake of the |
198 | Unicode Consortium. But according to the late Douglas Adams in I<The |
199 | Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the |
200 | Universe was created. This has made a lot of people very angry and |
201 | been widely regarded as a bad move>. Their mistake was not of this |
202 | magnitude so let's forgive them. |
f2a2953c |
203 | |
204 | (I don't dare make any comparison between the Unicode Consortium and the |
c731e18e |
205 | Vogons here ;) Or, comparing Encode to Babel Fish is completely |
206 | appropriate -- if you can only stick this into your ear :) |
f2a2953c |
207 | |
0ab8f81e |
208 | Surrogate pairs were born when the Unicode Consortium finally |
fcb875d4 |
209 | admitted that 16 bits were not big enough to hold all the world's |
0ab8f81e |
210 | character repertoires. But they already made UCS-2 16-bit. What |
f2a2953c |
211 | do we do? |
212 | |
0ab8f81e |
213 | Back then, the range 0xD800-0xDFFF was not allocated. Let's split |
214 | that range in half and use the first half to represent the C<upper |
215 | half of a character> and the second half to represent the C<lower |
216 | half of a character>. That way, you can represent 1024 * 1024 = |
217 | 1048576 more characters. Now we can store character ranges up to |
218 | \x{10ffff} even with 16-bit encodings. This pair of half-characters is |
219 | now called a I<surrogate pair> and UTF-16 is the name of the encoding |
220 | that embraces them. |
f2a2953c |
221 | |
448e90bb |
222 | Here is a formula to ensurrogate a Unicode character \x{10000} and |
f2a2953c |
223 | above; |
224 | |
225 | $hi = int(($uni - 0x10000) / 0x400) + 0xD800; |
226 | $lo = ($uni - 0x10000) % 0x400 + 0xDC00; |
227 | |
228 | And to desurrogate; |
229 | |
230 | $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); |
231 | |
fcb875d4 |
232 | Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but |
40bed538 |
233 | perl does not prohibit the use of characters within this range. To perl, |
fcb875d4 |
234 | every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>. |
235 | |
236 | (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit |
0ab8f81e |
237 | integer support! |
f2a2953c |
238 | |
f9d05ba3 |
239 | =head1 Error Checking |
240 | |
241 | Unlike most encodings which accept various ways to handle errors, |
242 | Unicode encodings simply croak. |
243 | |
40bed538 |
244 | % perl -MEncode -e'$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \ |
245 | -e'Encode::from_to($_, "utf16","shift_jis", 0); print' |
f9d05ba3 |
246 | UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184. |
40bed538 |
247 | % perl -MEncode -e'$a = "BOM missing"' \ |
248 | -e' Encode::from_to($a, "utf16", "shift_jis", 0); print' |
f9d05ba3 |
249 | UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184. |
250 | |
251 | Unlike other encodings where mappings are not one-to-one against |
252 | Unicode, UTFs are supposed to map 100% against one another. So Encode |
253 | is more strict on UTFs. |
254 | |
255 | Consider that "division by zero" of Encode :) |
256 | |
f2a2953c |
257 | =head1 SEE ALSO |
258 | |
1485817e |
259 | L<Encode>, L<Encode::Unicode::UTF7>, L<http://www.unicode.org/glossary/>, |
11067275 |
260 | L<http://www.unicode.org/unicode/faq/utf_bom.html>, |
f2a2953c |
261 | |
b2deda17 |
262 | RFC 2781 L<http://www.ietf.org/rfc/rfc2781.txt>, |
fdd579e2 |
263 | |
11067275 |
264 | The whole Unicode standard L<http://www.unicode.org/unicode/uni2book/u2.html> |
fdd579e2 |
265 | |
fcb875d4 |
266 | Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)> |
40bed538 |
267 | by Larry Wall, Tom Christiansen, Jon Orwant; |
fcb875d4 |
268 | O'Reilly & Associates; ISBN 0-596-00027-8 |
269 | |
fdd579e2 |
270 | =cut |