2 $Id: Unicode.xs,v 2.5 2009/02/01 13:14:41 dankogai Exp $
5 #define PERL_NO_GET_CONTEXT
10 #include "../Encode/encode.h"
14 #define BOM16LE 0xFFFe
15 #define BOM32LE 0xFFFe0000
16 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
21 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
23 /* Avoid wasting too much space in the result buffer */
25 /* shrink_buffer(SV *result) */
27 /* if (SvLEN(result) > 42 + SvCUR(result)) { */
29 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
30 /* New(0, buf, len, char); */
31 /* Copy(SvPVX(result), buf, len, char); */
32 /* Safefree(SvPVX(result)); */
33 /* SvPV_set(result, buf); */
34 /* SvLEN_set(result, len); */
38 #define shrink_buffer(result) { \
39 if (SvLEN(result) > 42 + SvCUR(result)) { \
41 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
42 New(0, newpv, newlen, char); \
43 Copy(SvPVX(result), newpv, newlen, char); \
44 Safefree(SvPVX(result)); \
45 SvPV_set(result, newpv); \
46 SvLEN_set(result, newlen); \
51 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
56 croak("Partial character %c",(char) endian);
76 croak("Unknown endian %c",(char) endian);
84 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
86 U8 *d = (U8 *) SvPV_nolen(result);
92 SvCUR_set(result,SvCUR(result)+size);
94 *d++ = (U8)(value & 0xFF);
100 SvCUR_set(result,SvCUR(result)+size);
103 *--d = (U8)(value & 0xFF);
108 croak("Unknown endian %c",(char) endian);
113 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
117 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
118 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
121 decode_xs(obj, str, check = 0)
127 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
128 int size = SvIV(attr("size", 4));
129 int ucs2 = -1; /* only needed in the event of surrogate pairs */
130 SV *result = newSVpvn("",0);
131 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
135 U8 *s = (U8 *)SvPVbyte(str,ulen);
136 U8 *e = (U8 *)SvEND(str);
137 /* Optimise for the common case of being called from PerlIOEncode_fill()
138 with a standard length buffer. In this case the result SV's buffer is
139 only used temporarily, so we can afford to allocate the maximum needed
140 and not care about unused space. */
141 const bool temp_result = (ulen == PERLIO_BUFSIZ);
143 ST(0) = sv_2mortal(result);
146 if (!endian && s+size <= e) {
148 endian = (size == 4) ? 'N' : 'n';
149 bom = enc_unpack(aTHX_ &s,e,size,endian);
151 if (bom == BOM16LE) {
154 else if (bom == BOM32LE) {
158 croak("%"SVf":Unrecognised BOM %"UVxf,
159 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
164 /* Update endian for next sequence */
165 if (SvTRUE(attr("renewed", 7))) {
166 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
172 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
174 /* Preallocate the buffer to the minimum possible space required. */
175 resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
177 resultbuf = (U8 *) SvGROW(result, resultbuflen);
179 while (s < e && s+size <= e) {
180 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
182 if (issurrogate(ord)) {
184 ucs2 = SvTRUE(attr("ucs2", 4));
186 if (ucs2 || size == 4) {
188 croak("%"SVf":no surrogates allowed %"UVxf,
189 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
193 /* skip the next one as well */
194 enc_unpack(aTHX_ &s,e,size,endian);
200 if (!isHiSurrogate(ord)) {
202 croak("%"SVf":Malformed HI surrogate %"UVxf,
203 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
212 /* Partial character */
213 s -= size; /* back up to 1st half */
214 break; /* And exit loop */
216 lo = enc_unpack(aTHX_ &s,e,size,endian);
217 if (!isLoSurrogate(lo)) {
219 croak("%"SVf":Malformed LO surrogate %"UVxf,
220 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
228 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
234 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
236 croak("%"SVf":Unicode character %"UVxf" is illegal",
237 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
244 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
245 /* Do not allocate >8Mb more than the minimum needed.
246 This prevents allocating too much in the rogue case of a large
247 input consisting initially of long sequence uft8-byte unicode
248 chars followed by single utf8-byte chars. */
249 STRLEN remaining = (e - s)/usize;
250 STRLEN max_alloc = remaining + (8*1024*1024);
251 STRLEN est_alloc = remaining * UTF8_MAXLEN;
252 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
253 (est_alloc > max_alloc ? max_alloc : est_alloc);
254 resultbuf = (U8 *) SvGROW(result, newlen);
255 resultbuflen = SvLEN(result);
258 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 0);
259 SvCUR_set(result, d - (U8 *)SvPVX(result));
263 /* unlikely to happen because it's fixed-length -- dankogai */
264 if (check & ENCODE_WARN_ON_ERR) {
265 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
266 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
269 if (check && !(check & ENCODE_LEAVE_SRC)) {
271 Move(s,SvPVX(str),e-s,U8);
272 SvCUR_set(str,(e-s));
281 shrink_buffer(result);
287 encode_xs(obj, utf8, check = 0)
293 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
294 const int size = SvIV(attr("size", 4));
295 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
296 const STRLEN usize = (size > 0 ? size : 1);
297 SV *result = newSVpvn("", 0);
299 U8 *s = (U8 *) SvPVutf8(utf8, ulen);
300 const U8 *e = (U8 *) SvEND(utf8);
301 /* Optimise for the common case of being called from PerlIOEncode_flush()
302 with a standard length buffer. In this case the result SV's buffer is
303 only used temporarily, so we can afford to allocate the maximum needed
304 and not care about unused space. */
305 const bool temp_result = (ulen == PERLIO_BUFSIZ);
307 ST(0) = sv_2mortal(result);
309 /* Preallocate the result buffer to the maximum possible size.
310 ie. assume each UTF8 byte is 1 character.
311 Then shrink the result's buffer if necesary at the end. */
312 SvGROW(result, ((ulen+1) * usize));
315 endian = (size == 4) ? 'N' : 'n';
316 enc_pack(aTHX_ result,size,endian,BOM_BE);
318 /* Update endian for next sequence */
319 if (SvTRUE(attr("renewed", 7))) {
320 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
324 while (s < e && s+UTF8SKIP(s) <= e) {
326 UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
328 if (size != 4 && invalid_ucs2(ord)) {
329 if (!issurrogate(ord)) {
331 ucs2 = SvTRUE(attr("ucs2", 4));
335 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
336 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
338 enc_pack(aTHX_ result,size,endian,FBCHAR);
340 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
341 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
342 enc_pack(aTHX_ result,size,endian,hi);
343 enc_pack(aTHX_ result,size,endian,lo);
347 /* not supposed to happen */
348 enc_pack(aTHX_ result,size,endian,FBCHAR);
352 enc_pack(aTHX_ result,size,endian,ord);
356 /* UTF-8 partial char happens often on PerlIO.
357 Since this is okay and normal, we do not warn.
358 But this is critical when you choose to LEAVE_SRC
359 in which case we die */
360 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
361 Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
362 "when CHECK = 0x%" UVuf,
363 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
366 if (check && !(check & ENCODE_LEAVE_SRC)) {
368 Move(s,SvPVX(utf8),e-s,U8);
369 SvCUR_set(utf8,(e-s));
378 shrink_buffer(result);