[PATCH] encoding and open pragmas
[p5sagit/p5-mst-13.2.git] / ext / Encode / Unicode / Unicode.xs
CommitLineData
85982a32 1/*
7237418a 2 $Id: Unicode.xs,v 2.0 2004/05/16 20:55:16 dankogai Exp $
85982a32 3 */
4
5#define PERL_NO_GET_CONTEXT
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
6d1c0808 9#define U8 U8
10#include "../Encode/encode.h"
85982a32 11
12#define FBCHAR 0xFFFd
13#define BOM_BE 0xFeFF
14#define BOM16LE 0xFFFe
15#define BOM32LE 0xFFFe0000
16#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
20
21static UV
22enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
23{
24 U8 *s = *sp;
25 UV v = 0;
26 if (s+size > e) {
27 croak("Partial character %c",(char) endian);
28 }
29 switch(endian) {
30 case 'N':
31 v = *s++;
32 v = (v << 8) | *s++;
33 case 'n':
34 v = (v << 8) | *s++;
35 v = (v << 8) | *s++;
36 break;
37 case 'V':
38 case 'v':
39 v |= *s++;
40 v |= (*s++ << 8);
41 if (endian == 'v')
42 break;
43 v |= (*s++ << 16);
44 v |= (*s++ << 24);
45 break;
46 default:
47 croak("Unknown endian %c",(char) endian);
48 break;
49 }
50 *sp = s;
51 return v;
52}
53
54void
55enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
56{
c2cbba7d 57 U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1);
85982a32 58 switch(endian) {
59 case 'v':
60 case 'V':
61 d += SvCUR(result);
62 SvCUR_set(result,SvCUR(result)+size);
63 while (size--) {
7c436af3 64 *d++ = (U8)(value & 0xFF);
85982a32 65 value >>= 8;
66 }
67 break;
68 case 'n':
69 case 'N':
70 SvCUR_set(result,SvCUR(result)+size);
71 d += SvCUR(result);
72 while (size--) {
7c436af3 73 *--d = (U8)(value & 0xFF);
85982a32 74 value >>= 8;
75 }
76 break;
77 default:
78 croak("Unknown endian %c",(char) endian);
79 break;
80 }
81}
82
83MODULE = Encode::Unicode PACKAGE = Encode::Unicode
84
6d1c0808 85PROTOTYPES: DISABLE
86
a0d8a30e 87#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
88 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
89
85982a32 90void
6d1c0808 91decode_xs(obj, str, check = 0)
85982a32 92SV * obj
93SV * str
6d1c0808 94IV check
85982a32 95CODE:
96{
a0d8a30e 97 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
98 int size = SvIV(attr("size", 4));
99 int ucs2 = SvTRUE(attr("ucs2", 4));
100 int clone = SvTRUE(attr("clone", 5));
101 SV *result = newSVpvn("",0);
85982a32 102 STRLEN ulen;
103 U8 *s = (U8 *)SvPVbyte(str,ulen);
104 U8 *e = (U8 *)SvEND(str);
105 ST(0) = sv_2mortal(result);
106 SvUTF8_on(result);
107
108 if (!endian && s+size <= e) {
109 UV bom;
110 endian = (size == 4) ? 'N' : 'n';
111 bom = enc_unpack(aTHX_ &s,e,size,endian);
112 if (bom != BOM_BE) {
113 if (bom == BOM16LE) {
114 endian = 'v';
115 }
116 else if (bom == BOM32LE) {
117 endian = 'V';
118 }
119 else {
315b3302 120 croak("%"SVf":Unrecognised BOM %"UVxf,
0f7c507f 121 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
85982a32 122 bom);
123 }
124 }
a0d8a30e 125#if 1
126 /* Update endian for next sequence */
127 if (clone) {
128 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
129 }
85982a32 130#endif
131 }
132 while (s < e && s+size <= e) {
133 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
134 U8 *d;
135 if (size != 4 && invalid_ucs2(ord)) {
136 if (ucs2) {
6d1c0808 137 if (check) {
0f7c507f 138 croak("%"SVf":no surrogates allowed %"UVxf,
139 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
85982a32 140 ord);
141 }
142 if (s+size <= e) {
143 /* skip the next one as well */
6d1c0808 144 enc_unpack(aTHX_ &s,e,size,endian);
85982a32 145 }
146 ord = FBCHAR;
147 }
148 else {
149 UV lo;
150 if (!isHiSurrogate(ord)) {
0f7c507f 151 croak("%"SVf":Malformed HI surrogate %"UVxf,
152 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
85982a32 153 ord);
154 }
155 if (s+size > e) {
156 /* Partial character */
157 s -= size; /* back up to 1st half */
158 break; /* And exit loop */
159 }
160 lo = enc_unpack(aTHX_ &s,e,size,endian);
161 if (!isLoSurrogate(lo)){
0f7c507f 162 croak("%"SVf":Malformed LO surrogate %"UVxf,
163 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
85982a32 164 ord);
165 }
166 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
167 }
168 }
169 d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
170 d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
171 SvCUR_set(result,d - (U8 *)SvPVX(result));
172 }
6d1c0808 173 if (s < e) {
47dd3999 174 /* unlikely to happen because it's fixed-length -- dankogai */
175 if (check & ENCODE_WARN_ON_ERR){
0f7c507f 176 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
177 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
47dd3999 178 }
6d1c0808 179 }
180 if (check && !(check & ENCODE_LEAVE_SRC)){
181 if (s < e) {
85982a32 182 Move(s,SvPVX(str),e-s,U8);
183 SvCUR_set(str,(e-s));
184 }
185 else {
186 SvCUR_set(str,0);
187 }
188 *SvEND(str) = '\0';
189 }
190 XSRETURN(1);
191}
192
193void
6d1c0808 194encode_xs(obj, utf8, check = 0)
195SV * obj
85982a32 196SV * utf8
6d1c0808 197IV check
85982a32 198CODE:
199{
a0d8a30e 200 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
201 int size = SvIV(attr("size", 4));
202 int ucs2 = SvTRUE(attr("ucs2", 4));
203 int clone = SvTRUE(attr("clone", 5));
204 SV *result = newSVpvn("",0);
85982a32 205 STRLEN ulen;
206 U8 *s = (U8 *)SvPVutf8(utf8,ulen);
207 U8 *e = (U8 *)SvEND(utf8);
208 ST(0) = sv_2mortal(result);
209 if (!endian) {
210 endian = (size == 4) ? 'N' : 'n';
211 enc_pack(aTHX_ result,size,endian,BOM_BE);
a0d8a30e 212#if 1
213 /* Update endian for next sequence */
214 if (clone){
215 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
216 }
85982a32 217#endif
218 }
219 while (s < e && s+UTF8SKIP(s) <= e) {
220 STRLEN len;
221 UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
222 s += len;
223 if (size != 4 && invalid_ucs2(ord)) {
224 if (!issurrogate(ord)){
225 if (ucs2) {
6d1c0808 226 if (check) {
0f7c507f 227 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
228 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
85982a32 229 }
230 enc_pack(aTHX_ result,size,endian,FBCHAR);
231 }else{
232 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
233 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
234 enc_pack(aTHX_ result,size,endian,hi);
235 enc_pack(aTHX_ result,size,endian,lo);
236 }
237 }
238 else {
239 /* not supposed to happen */
240 enc_pack(aTHX_ result,size,endian,FBCHAR);
241 }
242 }
243 else {
244 enc_pack(aTHX_ result,size,endian,ord);
245 }
246 }
6d1c0808 247 if (s < e) {
47dd3999 248 /* UTF-8 partial char happens often on PerlIO.
249 Since this is okay and normal, we do not warn.
250 But this is critical when you choose to LEAVE_SRC
251 in which case we die */
252 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){
253 Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
254 "when CHECK = 0x%" UVuf,
255 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
256 }
257
6d1c0808 258 }
259 if (check && !(check & ENCODE_LEAVE_SRC)){
85982a32 260 if (s < e) {
85982a32 261 Move(s,SvPVX(utf8),e-s,U8);
262 SvCUR_set(utf8,(e-s));
263 }
264 else {
265 SvCUR_set(utf8,0);
266 }
267 *SvEND(utf8) = '\0';
47dd3999 268 }
85982a32 269 XSRETURN(1);
270}
271