Support $! stringification of socket error codes on Windows.
[p5sagit/p5-mst-13.2.git] / cpan / Encode / Unicode / Unicode.xs
CommitLineData
85982a32 1/*
40bed538 2 $Id: Unicode.xs,v 2.5 2009/02/01 13:14:41 dankogai Exp $
85982a32 3 */
4
5#define PERL_NO_GET_CONTEXT
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
6d1c0808 9#define U8 U8
10#include "../Encode/encode.h"
85982a32 11
12#define FBCHAR 0xFFFd
13#define BOM_BE 0xFeFF
14#define BOM16LE 0xFFFe
15#define BOM32LE 0xFFFe0000
16#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
20
64bc6d54 21#define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
22
23/* Avoid wasting too much space in the result buffer */
a37eaad4 24/* static void */
25/* shrink_buffer(SV *result) */
26/* { */
27/* if (SvLEN(result) > 42 + SvCUR(result)) { */
28/* char *buf; */
29/* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
30/* New(0, buf, len, char); */
31/* Copy(SvPVX(result), buf, len, char); */
32/* Safefree(SvPVX(result)); */
33/* SvPV_set(result, buf); */
34/* SvLEN_set(result, len); */
35/* } */
36/* } */
37
38#define shrink_buffer(result) { \
39 if (SvLEN(result) > 42 + SvCUR(result)) { \
40 char *newpv; \
41 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
42 New(0, newpv, newlen, char); \
43 Copy(SvPVX(result), newpv, newlen, char); \
44 Safefree(SvPVX(result)); \
45 SvPV_set(result, newpv); \
46 SvLEN_set(result, newlen); \
47 } \
64bc6d54 48}
49
85982a32 50static UV
64bc6d54 51enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
85982a32 52{
53 U8 *s = *sp;
54 UV v = 0;
55 if (s+size > e) {
64bc6d54 56 croak("Partial character %c",(char) endian);
85982a32 57 }
58 switch(endian) {
59 case 'N':
64bc6d54 60 v = *s++;
61 v = (v << 8) | *s++;
85982a32 62 case 'n':
64bc6d54 63 v = (v << 8) | *s++;
64 v = (v << 8) | *s++;
65 break;
85982a32 66 case 'V':
67 case 'v':
64bc6d54 68 v |= *s++;
69 v |= (*s++ << 8);
70 if (endian == 'v')
71 break;
72 v |= (*s++ << 16);
73 v |= (*s++ << 24);
74 break;
85982a32 75 default:
64bc6d54 76 croak("Unknown endian %c",(char) endian);
77 break;
85982a32 78 }
79 *sp = s;
80 return v;
81}
82
83void
64bc6d54 84enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
85982a32 85{
64bc6d54 86 U8 *d = (U8 *) SvPV_nolen(result);
87
85982a32 88 switch(endian) {
89 case 'v':
90 case 'V':
64bc6d54 91 d += SvCUR(result);
92 SvCUR_set(result,SvCUR(result)+size);
93 while (size--) {
94 *d++ = (U8)(value & 0xFF);
95 value >>= 8;
96 }
97 break;
85982a32 98 case 'n':
99 case 'N':
64bc6d54 100 SvCUR_set(result,SvCUR(result)+size);
101 d += SvCUR(result);
102 while (size--) {
103 *--d = (U8)(value & 0xFF);
104 value >>= 8;
105 }
106 break;
85982a32 107 default:
64bc6d54 108 croak("Unknown endian %c",(char) endian);
109 break;
85982a32 110 }
111}
112
113MODULE = Encode::Unicode PACKAGE = Encode::Unicode
114
6d1c0808 115PROTOTYPES: DISABLE
116
a0d8a30e 117#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
118 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
119
85982a32 120void
6d1c0808 121decode_xs(obj, str, check = 0)
85982a32 122SV * obj
123SV * str
6d1c0808 124IV check
85982a32 125CODE:
126{
64bc6d54 127 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
128 int size = SvIV(attr("size", 4));
129 int ucs2 = -1; /* only needed in the event of surrogate pairs */
130 SV *result = newSVpvn("",0);
131 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
85982a32 132 STRLEN ulen;
64bc6d54 133 STRLEN resultbuflen;
134 U8 *resultbuf;
85982a32 135 U8 *s = (U8 *)SvPVbyte(str,ulen);
136 U8 *e = (U8 *)SvEND(str);
64bc6d54 137 /* Optimise for the common case of being called from PerlIOEncode_fill()
138 with a standard length buffer. In this case the result SV's buffer is
139 only used temporarily, so we can afford to allocate the maximum needed
140 and not care about unused space. */
141 const bool temp_result = (ulen == PERLIO_BUFSIZ);
142
85982a32 143 ST(0) = sv_2mortal(result);
144 SvUTF8_on(result);
145
146 if (!endian && s+size <= e) {
64bc6d54 147 UV bom;
148 endian = (size == 4) ? 'N' : 'n';
149 bom = enc_unpack(aTHX_ &s,e,size,endian);
150 if (bom != BOM_BE) {
151 if (bom == BOM16LE) {
152 endian = 'v';
153 }
154 else if (bom == BOM32LE) {
155 endian = 'V';
156 }
157 else {
158 croak("%"SVf":Unrecognised BOM %"UVxf,
159 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
160 bom);
161 }
162 }
a0d8a30e 163#if 1
64bc6d54 164 /* Update endian for next sequence */
165 if (SvTRUE(attr("renewed", 7))) {
166 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
167 }
85982a32 168#endif
169 }
0a8c69ed 170
64bc6d54 171 if (temp_result) {
172 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
173 } else {
174 /* Preallocate the buffer to the minimum possible space required. */
175 resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
d1256cb1 176 }
64bc6d54 177 resultbuf = (U8 *) SvGROW(result, resultbuflen);
0a8c69ed 178
64bc6d54 179 while (s < e && s+size <= e) {
180 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
181 U8 *d;
182 if (issurrogate(ord)) {
183 if (ucs2 == -1) {
184 ucs2 = SvTRUE(attr("ucs2", 4));
185 }
186 if (ucs2 || size == 4) {
187 if (check) {
188 croak("%"SVf":no surrogates allowed %"UVxf,
189 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
190 ord);
191 }
192 if (s+size <= e) {
193 /* skip the next one as well */
194 enc_unpack(aTHX_ &s,e,size,endian);
195 }
196 ord = FBCHAR;
197 }
198 else {
199 UV lo;
200 if (!isHiSurrogate(ord)) {
201 if (check) {
202 croak("%"SVf":Malformed HI surrogate %"UVxf,
203 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
204 ord);
205 }
206 else {
207 ord = FBCHAR;
208 }
209 }
210 else {
211 if (s+size > e) {
212 /* Partial character */
213 s -= size; /* back up to 1st half */
214 break; /* And exit loop */
215 }
216 lo = enc_unpack(aTHX_ &s,e,size,endian);
217 if (!isLoSurrogate(lo)) {
218 if (check) {
219 croak("%"SVf":Malformed LO surrogate %"UVxf,
220 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
221 ord);
222 }
223 else {
224 ord = FBCHAR;
225 }
226 }
227 else {
228 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
229 }
230 }
231 }
232 }
233
234 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
235 if (check) {
236 croak("%"SVf":Unicode character %"UVxf" is illegal",
237 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
238 ord);
239 } else {
240 ord = FBCHAR;
241 }
242 }
243
244 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
245 /* Do not allocate >8Mb more than the minimum needed.
246 This prevents allocating too much in the rogue case of a large
247 input consisting initially of long sequence uft8-byte unicode
248 chars followed by single utf8-byte chars. */
249 STRLEN remaining = (e - s)/usize;
250 STRLEN max_alloc = remaining + (8*1024*1024);
251 STRLEN est_alloc = remaining * UTF8_MAXLEN;
252 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
253 (est_alloc > max_alloc ? max_alloc : est_alloc);
254 resultbuf = (U8 *) SvGROW(result, newlen);
255 resultbuflen = SvLEN(result);
256 }
257
258 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 0);
259 SvCUR_set(result, d - (U8 *)SvPVX(result));
6d1c0808 260 }
64bc6d54 261
d1256cb1 262 if (s < e) {
64bc6d54 263 /* unlikely to happen because it's fixed-length -- dankogai */
264 if (check & ENCODE_WARN_ON_ERR) {
265 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
266 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
267 }
d1256cb1 268 }
64bc6d54 269 if (check && !(check & ENCODE_LEAVE_SRC)) {
270 if (s < e) {
271 Move(s,SvPVX(str),e-s,U8);
272 SvCUR_set(str,(e-s));
273 }
274 else {
275 SvCUR_set(str,0);
276 }
277 *SvEND(str) = '\0';
85982a32 278 }
64bc6d54 279
280 if (!temp_result)
281 shrink_buffer(result);
282
85982a32 283 XSRETURN(1);
284}
285
286void
6d1c0808 287encode_xs(obj, utf8, check = 0)
288SV * obj
85982a32 289SV * utf8
6d1c0808 290IV check
85982a32 291CODE:
292{
64bc6d54 293 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
294 const int size = SvIV(attr("size", 4));
295 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
296 const STRLEN usize = (size > 0 ? size : 1);
297 SV *result = newSVpvn("", 0);
85982a32 298 STRLEN ulen;
64bc6d54 299 U8 *s = (U8 *) SvPVutf8(utf8, ulen);
300 const U8 *e = (U8 *) SvEND(utf8);
301 /* Optimise for the common case of being called from PerlIOEncode_flush()
302 with a standard length buffer. In this case the result SV's buffer is
303 only used temporarily, so we can afford to allocate the maximum needed
304 and not care about unused space. */
305 const bool temp_result = (ulen == PERLIO_BUFSIZ);
306
85982a32 307 ST(0) = sv_2mortal(result);
64bc6d54 308
309 /* Preallocate the result buffer to the maximum possible size.
310 ie. assume each UTF8 byte is 1 character.
311 Then shrink the result's buffer if necesary at the end. */
312 SvGROW(result, ((ulen+1) * usize));
313
85982a32 314 if (!endian) {
64bc6d54 315 endian = (size == 4) ? 'N' : 'n';
316 enc_pack(aTHX_ result,size,endian,BOM_BE);
a0d8a30e 317#if 1
64bc6d54 318 /* Update endian for next sequence */
319 if (SvTRUE(attr("renewed", 7))) {
320 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
321 }
85982a32 322#endif
323 }
324 while (s < e && s+UTF8SKIP(s) <= e) {
64bc6d54 325 STRLEN len;
326 UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
327 s += len;
328 if (size != 4 && invalid_ucs2(ord)) {
329 if (!issurrogate(ord)) {
330 if (ucs2 == -1) {
331 ucs2 = SvTRUE(attr("ucs2", 4));
332 }
333 if (ucs2) {
334 if (check) {
335 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
336 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
337 }
338 enc_pack(aTHX_ result,size,endian,FBCHAR);
339 } else {
340 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
341 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
342 enc_pack(aTHX_ result,size,endian,hi);
343 enc_pack(aTHX_ result,size,endian,lo);
344 }
345 }
346 else {
347 /* not supposed to happen */
348 enc_pack(aTHX_ result,size,endian,FBCHAR);
349 }
350 }
351 else {
352 enc_pack(aTHX_ result,size,endian,ord);
353 }
85982a32 354 }
6d1c0808 355 if (s < e) {
64bc6d54 356 /* UTF-8 partial char happens often on PerlIO.
357 Since this is okay and normal, we do not warn.
358 But this is critical when you choose to LEAVE_SRC
359 in which case we die */
360 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
361 Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
362 "when CHECK = 0x%" UVuf,
363 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
364 }
d1256cb1 365 }
64bc6d54 366 if (check && !(check & ENCODE_LEAVE_SRC)) {
367 if (s < e) {
368 Move(s,SvPVX(utf8),e-s,U8);
369 SvCUR_set(utf8,(e-s));
370 }
371 else {
372 SvCUR_set(utf8,0);
373 }
374 *SvEND(utf8) = '\0';
d1256cb1 375 }
64bc6d54 376
377 if (!temp_result)
378 shrink_buffer(result);
379
85982a32 380 XSRETURN(1);
381}