Upgrade DB_File to 1.818.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Unicode / Unicode.xs
CommitLineData
85982a32 1/*
64bc6d54 2 $Id: Unicode.xs,v 2.4 2009/01/21 22:55:07 dankogai Exp dankogai $
85982a32 3 */
4
5#define PERL_NO_GET_CONTEXT
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
6d1c0808 9#define U8 U8
10#include "../Encode/encode.h"
85982a32 11
12#define FBCHAR 0xFFFd
13#define BOM_BE 0xFeFF
14#define BOM16LE 0xFFFe
15#define BOM32LE 0xFFFe0000
16#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
20
64bc6d54 21#define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
22
23/* Avoid wasting too much space in the result buffer */
24static void
25shrink_buffer(SV *result)
26{
27 if (SvLEN(result) > 42 + SvCUR(result)) {
28 char *buf;
29 STRLEN datalen = 1 + SvCUR(result); /* include the NUL byte */
30 STRLEN buflen = PERL_STRLEN_ROUNDUP(datalen);
31 Newx(buf, buflen, char);
32 Copy(SvPVX(result), buf, datalen, char);
33 Safefree(SvPVX(result));
34 SvPV_set(result, buf);
35 SvLEN_set(result, buflen);
36 }
37}
38
85982a32 39static UV
64bc6d54 40enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
85982a32 41{
42 U8 *s = *sp;
43 UV v = 0;
44 if (s+size > e) {
64bc6d54 45 croak("Partial character %c",(char) endian);
85982a32 46 }
47 switch(endian) {
48 case 'N':
64bc6d54 49 v = *s++;
50 v = (v << 8) | *s++;
85982a32 51 case 'n':
64bc6d54 52 v = (v << 8) | *s++;
53 v = (v << 8) | *s++;
54 break;
85982a32 55 case 'V':
56 case 'v':
64bc6d54 57 v |= *s++;
58 v |= (*s++ << 8);
59 if (endian == 'v')
60 break;
61 v |= (*s++ << 16);
62 v |= (*s++ << 24);
63 break;
85982a32 64 default:
64bc6d54 65 croak("Unknown endian %c",(char) endian);
66 break;
85982a32 67 }
68 *sp = s;
69 return v;
70}
71
72void
64bc6d54 73enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
85982a32 74{
64bc6d54 75 U8 *d = (U8 *) SvPV_nolen(result);
76
85982a32 77 switch(endian) {
78 case 'v':
79 case 'V':
64bc6d54 80 d += SvCUR(result);
81 SvCUR_set(result,SvCUR(result)+size);
82 while (size--) {
83 *d++ = (U8)(value & 0xFF);
84 value >>= 8;
85 }
86 break;
85982a32 87 case 'n':
88 case 'N':
64bc6d54 89 SvCUR_set(result,SvCUR(result)+size);
90 d += SvCUR(result);
91 while (size--) {
92 *--d = (U8)(value & 0xFF);
93 value >>= 8;
94 }
95 break;
85982a32 96 default:
64bc6d54 97 croak("Unknown endian %c",(char) endian);
98 break;
85982a32 99 }
100}
101
102MODULE = Encode::Unicode PACKAGE = Encode::Unicode
103
6d1c0808 104PROTOTYPES: DISABLE
105
a0d8a30e 106#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
107 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
108
85982a32 109void
6d1c0808 110decode_xs(obj, str, check = 0)
85982a32 111SV * obj
112SV * str
6d1c0808 113IV check
85982a32 114CODE:
115{
64bc6d54 116 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
117 int size = SvIV(attr("size", 4));
118 int ucs2 = -1; /* only needed in the event of surrogate pairs */
119 SV *result = newSVpvn("",0);
120 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
85982a32 121 STRLEN ulen;
64bc6d54 122 STRLEN resultbuflen;
123 U8 *resultbuf;
85982a32 124 U8 *s = (U8 *)SvPVbyte(str,ulen);
125 U8 *e = (U8 *)SvEND(str);
64bc6d54 126 /* Optimise for the common case of being called from PerlIOEncode_fill()
127 with a standard length buffer. In this case the result SV's buffer is
128 only used temporarily, so we can afford to allocate the maximum needed
129 and not care about unused space. */
130 const bool temp_result = (ulen == PERLIO_BUFSIZ);
131
85982a32 132 ST(0) = sv_2mortal(result);
133 SvUTF8_on(result);
134
135 if (!endian && s+size <= e) {
64bc6d54 136 UV bom;
137 endian = (size == 4) ? 'N' : 'n';
138 bom = enc_unpack(aTHX_ &s,e,size,endian);
139 if (bom != BOM_BE) {
140 if (bom == BOM16LE) {
141 endian = 'v';
142 }
143 else if (bom == BOM32LE) {
144 endian = 'V';
145 }
146 else {
147 croak("%"SVf":Unrecognised BOM %"UVxf,
148 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
149 bom);
150 }
151 }
a0d8a30e 152#if 1
64bc6d54 153 /* Update endian for next sequence */
154 if (SvTRUE(attr("renewed", 7))) {
155 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
156 }
85982a32 157#endif
158 }
0a8c69ed 159
64bc6d54 160 if (temp_result) {
161 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
162 } else {
163 /* Preallocate the buffer to the minimum possible space required. */
164 resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
d1256cb1 165 }
64bc6d54 166 resultbuf = (U8 *) SvGROW(result, resultbuflen);
0a8c69ed 167
64bc6d54 168 while (s < e && s+size <= e) {
169 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
170 U8 *d;
171 if (issurrogate(ord)) {
172 if (ucs2 == -1) {
173 ucs2 = SvTRUE(attr("ucs2", 4));
174 }
175 if (ucs2 || size == 4) {
176 if (check) {
177 croak("%"SVf":no surrogates allowed %"UVxf,
178 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
179 ord);
180 }
181 if (s+size <= e) {
182 /* skip the next one as well */
183 enc_unpack(aTHX_ &s,e,size,endian);
184 }
185 ord = FBCHAR;
186 }
187 else {
188 UV lo;
189 if (!isHiSurrogate(ord)) {
190 if (check) {
191 croak("%"SVf":Malformed HI surrogate %"UVxf,
192 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
193 ord);
194 }
195 else {
196 ord = FBCHAR;
197 }
198 }
199 else {
200 if (s+size > e) {
201 /* Partial character */
202 s -= size; /* back up to 1st half */
203 break; /* And exit loop */
204 }
205 lo = enc_unpack(aTHX_ &s,e,size,endian);
206 if (!isLoSurrogate(lo)) {
207 if (check) {
208 croak("%"SVf":Malformed LO surrogate %"UVxf,
209 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
210 ord);
211 }
212 else {
213 ord = FBCHAR;
214 }
215 }
216 else {
217 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
218 }
219 }
220 }
221 }
222
223 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
224 if (check) {
225 croak("%"SVf":Unicode character %"UVxf" is illegal",
226 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
227 ord);
228 } else {
229 ord = FBCHAR;
230 }
231 }
232
233 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
234 /* Do not allocate >8Mb more than the minimum needed.
235 This prevents allocating too much in the rogue case of a large
236 input consisting initially of long sequence uft8-byte unicode
237 chars followed by single utf8-byte chars. */
238 STRLEN remaining = (e - s)/usize;
239 STRLEN max_alloc = remaining + (8*1024*1024);
240 STRLEN est_alloc = remaining * UTF8_MAXLEN;
241 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
242 (est_alloc > max_alloc ? max_alloc : est_alloc);
243 resultbuf = (U8 *) SvGROW(result, newlen);
244 resultbuflen = SvLEN(result);
245 }
246
247 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 0);
248 SvCUR_set(result, d - (U8 *)SvPVX(result));
6d1c0808 249 }
64bc6d54 250
d1256cb1 251 if (s < e) {
64bc6d54 252 /* unlikely to happen because it's fixed-length -- dankogai */
253 if (check & ENCODE_WARN_ON_ERR) {
254 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
255 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
256 }
d1256cb1 257 }
64bc6d54 258 if (check && !(check & ENCODE_LEAVE_SRC)) {
259 if (s < e) {
260 Move(s,SvPVX(str),e-s,U8);
261 SvCUR_set(str,(e-s));
262 }
263 else {
264 SvCUR_set(str,0);
265 }
266 *SvEND(str) = '\0';
85982a32 267 }
64bc6d54 268
269 if (!temp_result)
270 shrink_buffer(result);
271
85982a32 272 XSRETURN(1);
273}
274
275void
6d1c0808 276encode_xs(obj, utf8, check = 0)
277SV * obj
85982a32 278SV * utf8
6d1c0808 279IV check
85982a32 280CODE:
281{
64bc6d54 282 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
283 const int size = SvIV(attr("size", 4));
284 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
285 const STRLEN usize = (size > 0 ? size : 1);
286 SV *result = newSVpvn("", 0);
85982a32 287 STRLEN ulen;
64bc6d54 288 U8 *s = (U8 *) SvPVutf8(utf8, ulen);
289 const U8 *e = (U8 *) SvEND(utf8);
290 /* Optimise for the common case of being called from PerlIOEncode_flush()
291 with a standard length buffer. In this case the result SV's buffer is
292 only used temporarily, so we can afford to allocate the maximum needed
293 and not care about unused space. */
294 const bool temp_result = (ulen == PERLIO_BUFSIZ);
295
85982a32 296 ST(0) = sv_2mortal(result);
64bc6d54 297
298 /* Preallocate the result buffer to the maximum possible size.
299 ie. assume each UTF8 byte is 1 character.
300 Then shrink the result's buffer if necesary at the end. */
301 SvGROW(result, ((ulen+1) * usize));
302
85982a32 303 if (!endian) {
64bc6d54 304 endian = (size == 4) ? 'N' : 'n';
305 enc_pack(aTHX_ result,size,endian,BOM_BE);
a0d8a30e 306#if 1
64bc6d54 307 /* Update endian for next sequence */
308 if (SvTRUE(attr("renewed", 7))) {
309 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
310 }
85982a32 311#endif
312 }
313 while (s < e && s+UTF8SKIP(s) <= e) {
64bc6d54 314 STRLEN len;
315 UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
316 s += len;
317 if (size != 4 && invalid_ucs2(ord)) {
318 if (!issurrogate(ord)) {
319 if (ucs2 == -1) {
320 ucs2 = SvTRUE(attr("ucs2", 4));
321 }
322 if (ucs2) {
323 if (check) {
324 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
325 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
326 }
327 enc_pack(aTHX_ result,size,endian,FBCHAR);
328 } else {
329 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
330 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
331 enc_pack(aTHX_ result,size,endian,hi);
332 enc_pack(aTHX_ result,size,endian,lo);
333 }
334 }
335 else {
336 /* not supposed to happen */
337 enc_pack(aTHX_ result,size,endian,FBCHAR);
338 }
339 }
340 else {
341 enc_pack(aTHX_ result,size,endian,ord);
342 }
85982a32 343 }
6d1c0808 344 if (s < e) {
64bc6d54 345 /* UTF-8 partial char happens often on PerlIO.
346 Since this is okay and normal, we do not warn.
347 But this is critical when you choose to LEAVE_SRC
348 in which case we die */
349 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
350 Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
351 "when CHECK = 0x%" UVuf,
352 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
353 }
d1256cb1 354 }
64bc6d54 355 if (check && !(check & ENCODE_LEAVE_SRC)) {
356 if (s < e) {
357 Move(s,SvPVX(utf8),e-s,U8);
358 SvCUR_set(utf8,(e-s));
359 }
360 else {
361 SvCUR_set(utf8,0);
362 }
363 *SvEND(utf8) = '\0';
d1256cb1 364 }
64bc6d54 365
366 if (!temp_result)
367 shrink_buffer(result);
368
85982a32 369 XSRETURN(1);
370}