Commit | Line | Data |
85982a32 |
1 | /* |
a37eaad4 |
2 | $Id: Unicode.xs,v 2.5 2009/02/01 13:14:41 dankogai Exp dankogai $ |
85982a32 |
3 | */ |
4 | |
5 | #define PERL_NO_GET_CONTEXT |
6 | #include "EXTERN.h" |
7 | #include "perl.h" |
8 | #include "XSUB.h" |
6d1c0808 |
9 | #define U8 U8 |
10 | #include "../Encode/encode.h" |
85982a32 |
11 | |
12 | #define FBCHAR 0xFFFd |
13 | #define BOM_BE 0xFeFF |
14 | #define BOM16LE 0xFFFe |
15 | #define BOM32LE 0xFFFe0000 |
16 | #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) |
17 | #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) |
18 | #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) |
19 | #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) |
20 | |
64bc6d54 |
21 | #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ |
22 | |
23 | /* Avoid wasting too much space in the result buffer */ |
a37eaad4 |
24 | /* static void */ |
25 | /* shrink_buffer(SV *result) */ |
26 | /* { */ |
27 | /* if (SvLEN(result) > 42 + SvCUR(result)) { */ |
28 | /* char *buf; */ |
29 | /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */ |
30 | /* New(0, buf, len, char); */ |
31 | /* Copy(SvPVX(result), buf, len, char); */ |
32 | /* Safefree(SvPVX(result)); */ |
33 | /* SvPV_set(result, buf); */ |
34 | /* SvLEN_set(result, len); */ |
35 | /* } */ |
36 | /* } */ |
37 | |
38 | #define shrink_buffer(result) { \ |
39 | if (SvLEN(result) > 42 + SvCUR(result)) { \ |
40 | char *newpv; \ |
41 | STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \ |
42 | New(0, newpv, newlen, char); \ |
43 | Copy(SvPVX(result), newpv, newlen, char); \ |
44 | Safefree(SvPVX(result)); \ |
45 | SvPV_set(result, newpv); \ |
46 | SvLEN_set(result, newlen); \ |
47 | } \ |
64bc6d54 |
48 | } |
49 | |
85982a32 |
50 | static UV |
64bc6d54 |
51 | enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) |
85982a32 |
52 | { |
53 | U8 *s = *sp; |
54 | UV v = 0; |
55 | if (s+size > e) { |
64bc6d54 |
56 | croak("Partial character %c",(char) endian); |
85982a32 |
57 | } |
58 | switch(endian) { |
59 | case 'N': |
64bc6d54 |
60 | v = *s++; |
61 | v = (v << 8) | *s++; |
85982a32 |
62 | case 'n': |
64bc6d54 |
63 | v = (v << 8) | *s++; |
64 | v = (v << 8) | *s++; |
65 | break; |
85982a32 |
66 | case 'V': |
67 | case 'v': |
64bc6d54 |
68 | v |= *s++; |
69 | v |= (*s++ << 8); |
70 | if (endian == 'v') |
71 | break; |
72 | v |= (*s++ << 16); |
73 | v |= (*s++ << 24); |
74 | break; |
85982a32 |
75 | default: |
64bc6d54 |
76 | croak("Unknown endian %c",(char) endian); |
77 | break; |
85982a32 |
78 | } |
79 | *sp = s; |
80 | return v; |
81 | } |
82 | |
83 | void |
64bc6d54 |
84 | enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) |
85982a32 |
85 | { |
64bc6d54 |
86 | U8 *d = (U8 *) SvPV_nolen(result); |
87 | |
85982a32 |
88 | switch(endian) { |
89 | case 'v': |
90 | case 'V': |
64bc6d54 |
91 | d += SvCUR(result); |
92 | SvCUR_set(result,SvCUR(result)+size); |
93 | while (size--) { |
94 | *d++ = (U8)(value & 0xFF); |
95 | value >>= 8; |
96 | } |
97 | break; |
85982a32 |
98 | case 'n': |
99 | case 'N': |
64bc6d54 |
100 | SvCUR_set(result,SvCUR(result)+size); |
101 | d += SvCUR(result); |
102 | while (size--) { |
103 | *--d = (U8)(value & 0xFF); |
104 | value >>= 8; |
105 | } |
106 | break; |
85982a32 |
107 | default: |
64bc6d54 |
108 | croak("Unknown endian %c",(char) endian); |
109 | break; |
85982a32 |
110 | } |
111 | } |
112 | |
113 | MODULE = Encode::Unicode PACKAGE = Encode::Unicode |
114 | |
6d1c0808 |
115 | PROTOTYPES: DISABLE |
116 | |
a0d8a30e |
117 | #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ |
118 | *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) |
119 | |
85982a32 |
120 | void |
6d1c0808 |
121 | decode_xs(obj, str, check = 0) |
85982a32 |
122 | SV * obj |
123 | SV * str |
6d1c0808 |
124 | IV check |
85982a32 |
125 | CODE: |
126 | { |
64bc6d54 |
127 | U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); |
128 | int size = SvIV(attr("size", 4)); |
129 | int ucs2 = -1; /* only needed in the event of surrogate pairs */ |
130 | SV *result = newSVpvn("",0); |
131 | STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ |
85982a32 |
132 | STRLEN ulen; |
64bc6d54 |
133 | STRLEN resultbuflen; |
134 | U8 *resultbuf; |
85982a32 |
135 | U8 *s = (U8 *)SvPVbyte(str,ulen); |
136 | U8 *e = (U8 *)SvEND(str); |
64bc6d54 |
137 | /* Optimise for the common case of being called from PerlIOEncode_fill() |
138 | with a standard length buffer. In this case the result SV's buffer is |
139 | only used temporarily, so we can afford to allocate the maximum needed |
140 | and not care about unused space. */ |
141 | const bool temp_result = (ulen == PERLIO_BUFSIZ); |
142 | |
85982a32 |
143 | ST(0) = sv_2mortal(result); |
144 | SvUTF8_on(result); |
145 | |
146 | if (!endian && s+size <= e) { |
64bc6d54 |
147 | UV bom; |
148 | endian = (size == 4) ? 'N' : 'n'; |
149 | bom = enc_unpack(aTHX_ &s,e,size,endian); |
150 | if (bom != BOM_BE) { |
151 | if (bom == BOM16LE) { |
152 | endian = 'v'; |
153 | } |
154 | else if (bom == BOM32LE) { |
155 | endian = 'V'; |
156 | } |
157 | else { |
158 | croak("%"SVf":Unrecognised BOM %"UVxf, |
159 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
160 | bom); |
161 | } |
162 | } |
a0d8a30e |
163 | #if 1 |
64bc6d54 |
164 | /* Update endian for next sequence */ |
165 | if (SvTRUE(attr("renewed", 7))) { |
166 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); |
167 | } |
85982a32 |
168 | #endif |
169 | } |
0a8c69ed |
170 | |
64bc6d54 |
171 | if (temp_result) { |
172 | resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; |
173 | } else { |
174 | /* Preallocate the buffer to the minimum possible space required. */ |
175 | resultbuflen = ulen/usize + UTF8_MAXLEN + 1; |
d1256cb1 |
176 | } |
64bc6d54 |
177 | resultbuf = (U8 *) SvGROW(result, resultbuflen); |
0a8c69ed |
178 | |
64bc6d54 |
179 | while (s < e && s+size <= e) { |
180 | UV ord = enc_unpack(aTHX_ &s,e,size,endian); |
181 | U8 *d; |
182 | if (issurrogate(ord)) { |
183 | if (ucs2 == -1) { |
184 | ucs2 = SvTRUE(attr("ucs2", 4)); |
185 | } |
186 | if (ucs2 || size == 4) { |
187 | if (check) { |
188 | croak("%"SVf":no surrogates allowed %"UVxf, |
189 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
190 | ord); |
191 | } |
192 | if (s+size <= e) { |
193 | /* skip the next one as well */ |
194 | enc_unpack(aTHX_ &s,e,size,endian); |
195 | } |
196 | ord = FBCHAR; |
197 | } |
198 | else { |
199 | UV lo; |
200 | if (!isHiSurrogate(ord)) { |
201 | if (check) { |
202 | croak("%"SVf":Malformed HI surrogate %"UVxf, |
203 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
204 | ord); |
205 | } |
206 | else { |
207 | ord = FBCHAR; |
208 | } |
209 | } |
210 | else { |
211 | if (s+size > e) { |
212 | /* Partial character */ |
213 | s -= size; /* back up to 1st half */ |
214 | break; /* And exit loop */ |
215 | } |
216 | lo = enc_unpack(aTHX_ &s,e,size,endian); |
217 | if (!isLoSurrogate(lo)) { |
218 | if (check) { |
219 | croak("%"SVf":Malformed LO surrogate %"UVxf, |
220 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
221 | ord); |
222 | } |
223 | else { |
224 | ord = FBCHAR; |
225 | } |
226 | } |
227 | else { |
228 | ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); |
229 | } |
230 | } |
231 | } |
232 | } |
233 | |
234 | if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { |
235 | if (check) { |
236 | croak("%"SVf":Unicode character %"UVxf" is illegal", |
237 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
238 | ord); |
239 | } else { |
240 | ord = FBCHAR; |
241 | } |
242 | } |
243 | |
244 | if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { |
245 | /* Do not allocate >8Mb more than the minimum needed. |
246 | This prevents allocating too much in the rogue case of a large |
247 | input consisting initially of long sequence uft8-byte unicode |
248 | chars followed by single utf8-byte chars. */ |
249 | STRLEN remaining = (e - s)/usize; |
250 | STRLEN max_alloc = remaining + (8*1024*1024); |
251 | STRLEN est_alloc = remaining * UTF8_MAXLEN; |
252 | STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ |
253 | (est_alloc > max_alloc ? max_alloc : est_alloc); |
254 | resultbuf = (U8 *) SvGROW(result, newlen); |
255 | resultbuflen = SvLEN(result); |
256 | } |
257 | |
258 | d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 0); |
259 | SvCUR_set(result, d - (U8 *)SvPVX(result)); |
6d1c0808 |
260 | } |
64bc6d54 |
261 | |
d1256cb1 |
262 | if (s < e) { |
64bc6d54 |
263 | /* unlikely to happen because it's fixed-length -- dankogai */ |
264 | if (check & ENCODE_WARN_ON_ERR) { |
265 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", |
266 | *hv_fetch((HV *)SvRV(obj),"Name",4,0)); |
267 | } |
d1256cb1 |
268 | } |
64bc6d54 |
269 | if (check && !(check & ENCODE_LEAVE_SRC)) { |
270 | if (s < e) { |
271 | Move(s,SvPVX(str),e-s,U8); |
272 | SvCUR_set(str,(e-s)); |
273 | } |
274 | else { |
275 | SvCUR_set(str,0); |
276 | } |
277 | *SvEND(str) = '\0'; |
85982a32 |
278 | } |
64bc6d54 |
279 | |
280 | if (!temp_result) |
281 | shrink_buffer(result); |
282 | |
85982a32 |
283 | XSRETURN(1); |
284 | } |
285 | |
286 | void |
6d1c0808 |
287 | encode_xs(obj, utf8, check = 0) |
288 | SV * obj |
85982a32 |
289 | SV * utf8 |
6d1c0808 |
290 | IV check |
85982a32 |
291 | CODE: |
292 | { |
64bc6d54 |
293 | U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); |
294 | const int size = SvIV(attr("size", 4)); |
295 | int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ |
296 | const STRLEN usize = (size > 0 ? size : 1); |
297 | SV *result = newSVpvn("", 0); |
85982a32 |
298 | STRLEN ulen; |
64bc6d54 |
299 | U8 *s = (U8 *) SvPVutf8(utf8, ulen); |
300 | const U8 *e = (U8 *) SvEND(utf8); |
301 | /* Optimise for the common case of being called from PerlIOEncode_flush() |
302 | with a standard length buffer. In this case the result SV's buffer is |
303 | only used temporarily, so we can afford to allocate the maximum needed |
304 | and not care about unused space. */ |
305 | const bool temp_result = (ulen == PERLIO_BUFSIZ); |
306 | |
85982a32 |
307 | ST(0) = sv_2mortal(result); |
64bc6d54 |
308 | |
309 | /* Preallocate the result buffer to the maximum possible size. |
310 | ie. assume each UTF8 byte is 1 character. |
311 | Then shrink the result's buffer if necesary at the end. */ |
312 | SvGROW(result, ((ulen+1) * usize)); |
313 | |
85982a32 |
314 | if (!endian) { |
64bc6d54 |
315 | endian = (size == 4) ? 'N' : 'n'; |
316 | enc_pack(aTHX_ result,size,endian,BOM_BE); |
a0d8a30e |
317 | #if 1 |
64bc6d54 |
318 | /* Update endian for next sequence */ |
319 | if (SvTRUE(attr("renewed", 7))) { |
320 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); |
321 | } |
85982a32 |
322 | #endif |
323 | } |
324 | while (s < e && s+UTF8SKIP(s) <= e) { |
64bc6d54 |
325 | STRLEN len; |
326 | UV ord = utf8n_to_uvuni(s, e-s, &len, 0); |
327 | s += len; |
328 | if (size != 4 && invalid_ucs2(ord)) { |
329 | if (!issurrogate(ord)) { |
330 | if (ucs2 == -1) { |
331 | ucs2 = SvTRUE(attr("ucs2", 4)); |
332 | } |
333 | if (ucs2) { |
334 | if (check) { |
335 | croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", |
336 | *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); |
337 | } |
338 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
339 | } else { |
340 | UV hi = ((ord - 0x10000) >> 10) + 0xD800; |
341 | UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; |
342 | enc_pack(aTHX_ result,size,endian,hi); |
343 | enc_pack(aTHX_ result,size,endian,lo); |
344 | } |
345 | } |
346 | else { |
347 | /* not supposed to happen */ |
348 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
349 | } |
350 | } |
351 | else { |
352 | enc_pack(aTHX_ result,size,endian,ord); |
353 | } |
85982a32 |
354 | } |
6d1c0808 |
355 | if (s < e) { |
64bc6d54 |
356 | /* UTF-8 partial char happens often on PerlIO. |
357 | Since this is okay and normal, we do not warn. |
358 | But this is critical when you choose to LEAVE_SRC |
359 | in which case we die */ |
360 | if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { |
361 | Perl_croak(aTHX_ "%"SVf":partial character is not allowed " |
362 | "when CHECK = 0x%" UVuf, |
363 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); |
364 | } |
d1256cb1 |
365 | } |
64bc6d54 |
366 | if (check && !(check & ENCODE_LEAVE_SRC)) { |
367 | if (s < e) { |
368 | Move(s,SvPVX(utf8),e-s,U8); |
369 | SvCUR_set(utf8,(e-s)); |
370 | } |
371 | else { |
372 | SvCUR_set(utf8,0); |
373 | } |
374 | *SvEND(utf8) = '\0'; |
d1256cb1 |
375 | } |
64bc6d54 |
376 | |
377 | if (!temp_result) |
378 | shrink_buffer(result); |
379 | |
85982a32 |
380 | XSRETURN(1); |
381 | } |