Commit | Line | Data |
85982a32 |
1 | /* |
d1256cb1 |
2 | $Id: Unicode.xs,v 2.3 2006/05/03 18:24:10 dankogai Exp $ |
85982a32 |
3 | */ |
4 | |
5 | #define PERL_NO_GET_CONTEXT |
6 | #include "EXTERN.h" |
7 | #include "perl.h" |
8 | #include "XSUB.h" |
6d1c0808 |
9 | #define U8 U8 |
10 | #include "../Encode/encode.h" |
85982a32 |
11 | |
12 | #define FBCHAR 0xFFFd |
13 | #define BOM_BE 0xFeFF |
14 | #define BOM16LE 0xFFFe |
15 | #define BOM32LE 0xFFFe0000 |
16 | #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) |
17 | #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) |
18 | #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) |
19 | #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) |
20 | |
21 | static UV |
22 | enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) |
23 | { |
24 | U8 *s = *sp; |
25 | UV v = 0; |
26 | if (s+size > e) { |
d1256cb1 |
27 | croak("Partial character %c",(char) endian); |
85982a32 |
28 | } |
29 | switch(endian) { |
30 | case 'N': |
d1256cb1 |
31 | v = *s++; |
32 | v = (v << 8) | *s++; |
85982a32 |
33 | case 'n': |
d1256cb1 |
34 | v = (v << 8) | *s++; |
35 | v = (v << 8) | *s++; |
36 | break; |
85982a32 |
37 | case 'V': |
38 | case 'v': |
d1256cb1 |
39 | v |= *s++; |
40 | v |= (*s++ << 8); |
41 | if (endian == 'v') |
42 | break; |
43 | v |= (*s++ << 16); |
44 | v |= (*s++ << 24); |
45 | break; |
85982a32 |
46 | default: |
d1256cb1 |
47 | croak("Unknown endian %c",(char) endian); |
48 | break; |
85982a32 |
49 | } |
50 | *sp = s; |
51 | return v; |
52 | } |
53 | |
54 | void |
55 | enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) |
56 | { |
c2cbba7d |
57 | U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1); |
85982a32 |
58 | switch(endian) { |
59 | case 'v': |
60 | case 'V': |
d1256cb1 |
61 | d += SvCUR(result); |
62 | SvCUR_set(result,SvCUR(result)+size); |
63 | while (size--) { |
64 | *d++ = (U8)(value & 0xFF); |
65 | value >>= 8; |
66 | } |
67 | break; |
85982a32 |
68 | case 'n': |
69 | case 'N': |
d1256cb1 |
70 | SvCUR_set(result,SvCUR(result)+size); |
71 | d += SvCUR(result); |
72 | while (size--) { |
73 | *--d = (U8)(value & 0xFF); |
74 | value >>= 8; |
75 | } |
76 | break; |
85982a32 |
77 | default: |
d1256cb1 |
78 | croak("Unknown endian %c",(char) endian); |
79 | break; |
85982a32 |
80 | } |
81 | } |
82 | |
83 | MODULE = Encode::Unicode PACKAGE = Encode::Unicode |
84 | |
6d1c0808 |
85 | PROTOTYPES: DISABLE |
86 | |
a0d8a30e |
87 | #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ |
88 | *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) |
89 | |
85982a32 |
90 | void |
6d1c0808 |
91 | decode_xs(obj, str, check = 0) |
85982a32 |
92 | SV * obj |
93 | SV * str |
6d1c0808 |
94 | IV check |
85982a32 |
95 | CODE: |
96 | { |
a0d8a30e |
97 | U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); |
98 | int size = SvIV(attr("size", 4)); |
99 | int ucs2 = SvTRUE(attr("ucs2", 4)); |
cc836e95 |
100 | int renewed = SvTRUE(attr("renewed", 7)); |
a0d8a30e |
101 | SV *result = newSVpvn("",0); |
85982a32 |
102 | STRLEN ulen; |
103 | U8 *s = (U8 *)SvPVbyte(str,ulen); |
104 | U8 *e = (U8 *)SvEND(str); |
105 | ST(0) = sv_2mortal(result); |
106 | SvUTF8_on(result); |
107 | |
108 | if (!endian && s+size <= e) { |
d1256cb1 |
109 | UV bom; |
110 | endian = (size == 4) ? 'N' : 'n'; |
111 | bom = enc_unpack(aTHX_ &s,e,size,endian); |
85982a32 |
112 | if (bom != BOM_BE) { |
d1256cb1 |
113 | if (bom == BOM16LE) { |
114 | endian = 'v'; |
115 | } |
116 | else if (bom == BOM32LE) { |
117 | endian = 'V'; |
118 | } |
119 | else { |
120 | croak("%"SVf":Unrecognised BOM %"UVxf, |
0f7c507f |
121 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
d1256cb1 |
122 | bom); |
123 | } |
124 | } |
a0d8a30e |
125 | #if 1 |
d1256cb1 |
126 | /* Update endian for next sequence */ |
127 | if (renewed) { |
128 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); |
129 | } |
85982a32 |
130 | #endif |
131 | } |
132 | while (s < e && s+size <= e) { |
d1256cb1 |
133 | UV ord = enc_unpack(aTHX_ &s,e,size,endian); |
134 | U8 *d; |
135 | if (issurrogate(ord)) { |
136 | if (ucs2 || size == 4) { |
137 | if (check) { |
138 | croak("%"SVf":no surrogates allowed %"UVxf, |
139 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
140 | ord); |
141 | } |
142 | if (s+size <= e) { |
85982a32 |
143 | /* skip the next one as well */ |
d1256cb1 |
144 | enc_unpack(aTHX_ &s,e,size,endian); |
145 | } |
146 | ord = FBCHAR; |
147 | } |
148 | else { |
149 | UV lo; |
150 | if (!isHiSurrogate(ord)) { |
151 | if (check) { |
152 | croak("%"SVf":Malformed HI surrogate %"UVxf, |
153 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
154 | ord); |
155 | } |
156 | else { |
157 | ord = FBCHAR; |
158 | } |
159 | } |
160 | else { |
161 | if (s+size > e) { |
162 | /* Partial character */ |
163 | s -= size; /* back up to 1st half */ |
164 | break; /* And exit loop */ |
165 | } |
166 | lo = enc_unpack(aTHX_ &s,e,size,endian); |
167 | if (!isLoSurrogate(lo)){ |
168 | if (check) { |
169 | croak("%"SVf":Malformed LO surrogate %"UVxf, |
170 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
171 | ord); |
172 | } |
173 | else { |
174 | ord = FBCHAR; |
175 | } |
176 | } |
177 | else { |
178 | ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); |
179 | } |
180 | } |
181 | } |
182 | } |
0a8c69ed |
183 | |
d1256cb1 |
184 | if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { |
185 | if (check) { |
186 | croak("%"SVf":Unicode character %"UVxf" is illegal", |
187 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
188 | ord); |
189 | } else { |
190 | ord = FBCHAR; |
191 | } |
192 | } |
0a8c69ed |
193 | |
d1256cb1 |
194 | d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); |
195 | d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); |
196 | SvCUR_set(result,d - (U8 *)SvPVX(result)); |
85982a32 |
197 | } |
6d1c0808 |
198 | if (s < e) { |
d1256cb1 |
199 | /* unlikely to happen because it's fixed-length -- dankogai */ |
200 | if (check & ENCODE_WARN_ON_ERR){ |
201 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", |
202 | *hv_fetch((HV *)SvRV(obj),"Name",4,0)); |
203 | } |
6d1c0808 |
204 | } |
205 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
d1256cb1 |
206 | if (s < e) { |
207 | Move(s,SvPVX(str),e-s,U8); |
208 | SvCUR_set(str,(e-s)); |
209 | } |
210 | else { |
211 | SvCUR_set(str,0); |
212 | } |
213 | *SvEND(str) = '\0'; |
85982a32 |
214 | } |
215 | XSRETURN(1); |
216 | } |
217 | |
218 | void |
6d1c0808 |
219 | encode_xs(obj, utf8, check = 0) |
220 | SV * obj |
85982a32 |
221 | SV * utf8 |
6d1c0808 |
222 | IV check |
85982a32 |
223 | CODE: |
224 | { |
a0d8a30e |
225 | U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); |
226 | int size = SvIV(attr("size", 4)); |
227 | int ucs2 = SvTRUE(attr("ucs2", 4)); |
cc836e95 |
228 | int renewed = SvTRUE(attr("renewed", 7)); |
a0d8a30e |
229 | SV *result = newSVpvn("",0); |
85982a32 |
230 | STRLEN ulen; |
231 | U8 *s = (U8 *)SvPVutf8(utf8,ulen); |
232 | U8 *e = (U8 *)SvEND(utf8); |
233 | ST(0) = sv_2mortal(result); |
234 | if (!endian) { |
d1256cb1 |
235 | endian = (size == 4) ? 'N' : 'n'; |
236 | enc_pack(aTHX_ result,size,endian,BOM_BE); |
a0d8a30e |
237 | #if 1 |
d1256cb1 |
238 | /* Update endian for next sequence */ |
239 | if (renewed){ |
240 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); |
241 | } |
85982a32 |
242 | #endif |
243 | } |
244 | while (s < e && s+UTF8SKIP(s) <= e) { |
d1256cb1 |
245 | STRLEN len; |
246 | UV ord = utf8n_to_uvuni(s, e-s, &len, 0); |
85982a32 |
247 | s += len; |
d1256cb1 |
248 | if (size != 4 && invalid_ucs2(ord)) { |
249 | if (!issurrogate(ord)){ |
250 | if (ucs2) { |
251 | if (check) { |
252 | croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", |
253 | *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); |
254 | } |
255 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
256 | }else{ |
257 | UV hi = ((ord - 0x10000) >> 10) + 0xD800; |
258 | UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; |
259 | enc_pack(aTHX_ result,size,endian,hi); |
260 | enc_pack(aTHX_ result,size,endian,lo); |
261 | } |
262 | } |
263 | else { |
264 | /* not supposed to happen */ |
265 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
266 | } |
267 | } |
268 | else { |
269 | enc_pack(aTHX_ result,size,endian,ord); |
270 | } |
85982a32 |
271 | } |
6d1c0808 |
272 | if (s < e) { |
d1256cb1 |
273 | /* UTF-8 partial char happens often on PerlIO. |
274 | Since this is okay and normal, we do not warn. |
275 | But this is critical when you choose to LEAVE_SRC |
276 | in which case we die */ |
277 | if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){ |
278 | Perl_croak(aTHX_ "%"SVf":partial character is not allowed " |
279 | "when CHECK = 0x%" UVuf, |
280 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); |
281 | } |
282 | |
6d1c0808 |
283 | } |
284 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
d1256cb1 |
285 | if (s < e) { |
286 | Move(s,SvPVX(utf8),e-s,U8); |
287 | SvCUR_set(utf8,(e-s)); |
288 | } |
289 | else { |
290 | SvCUR_set(utf8,0); |
291 | } |
292 | *SvEND(utf8) = '\0'; |
47dd3999 |
293 | } |
85982a32 |
294 | XSRETURN(1); |
295 | } |
296 | |