Re: [PATCH 5.7.3 docs] The question deals with a bug that was fixed
[p5sagit/p5-mst-13.2.git] / ext / Encode / Unicode / Unicode.xs
CommitLineData
85982a32 1/*
2 $Id: Unicode.xs,v 1.2 2002/04/19 05:36:43 dankogai Exp $
3 */
4
5#define PERL_NO_GET_CONTEXT
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
9
10#define FBCHAR 0xFFFd
11#define BOM_BE 0xFeFF
12#define BOM16LE 0xFFFe
13#define BOM32LE 0xFFFe0000
14#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
15#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
16#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
17#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
18
19static UV
20enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
21{
22 U8 *s = *sp;
23 UV v = 0;
24 if (s+size > e) {
25 croak("Partial character %c",(char) endian);
26 }
27 switch(endian) {
28 case 'N':
29 v = *s++;
30 v = (v << 8) | *s++;
31 case 'n':
32 v = (v << 8) | *s++;
33 v = (v << 8) | *s++;
34 break;
35 case 'V':
36 case 'v':
37 v |= *s++;
38 v |= (*s++ << 8);
39 if (endian == 'v')
40 break;
41 v |= (*s++ << 16);
42 v |= (*s++ << 24);
43 break;
44 default:
45 croak("Unknown endian %c",(char) endian);
46 break;
47 }
48 *sp = s;
49 return v;
50}
51
52void
53enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
54{
55 U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size);
56 switch(endian) {
57 case 'v':
58 case 'V':
59 d += SvCUR(result);
60 SvCUR_set(result,SvCUR(result)+size);
61 while (size--) {
62 *d++ = value & 0xFF;
63 value >>= 8;
64 }
65 break;
66 case 'n':
67 case 'N':
68 SvCUR_set(result,SvCUR(result)+size);
69 d += SvCUR(result);
70 while (size--) {
71 *--d = value & 0xFF;
72 value >>= 8;
73 }
74 break;
75 default:
76 croak("Unknown endian %c",(char) endian);
77 break;
78 }
79}
80
81MODULE = Encode::Unicode PACKAGE = Encode::Unicode
82
83void
84decode_xs(obj, str, chk = &PL_sv_undef)
85SV * obj
86SV * str
87SV * chk
88CODE:
89{
90 int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
91 U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
92 int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
93 SV *result = newSVpvn("",0);
94 STRLEN ulen;
95 U8 *s = (U8 *)SvPVbyte(str,ulen);
96 U8 *e = (U8 *)SvEND(str);
97 ST(0) = sv_2mortal(result);
98 SvUTF8_on(result);
99
100 if (!endian && s+size <= e) {
101 UV bom;
102 endian = (size == 4) ? 'N' : 'n';
103 bom = enc_unpack(aTHX_ &s,e,size,endian);
104 if (bom != BOM_BE) {
105 if (bom == BOM16LE) {
106 endian = 'v';
107 }
108 else if (bom == BOM32LE) {
109 endian = 'V';
110 }
111 else {
112 croak("%s:Unregognised BOM %"UVxf,
113 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
114 bom);
115 }
116 }
117#if 0
118 /* Update endian for this sequence */
119 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
120#endif
121 }
122 while (s < e && s+size <= e) {
123 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
124 U8 *d;
125 if (size != 4 && invalid_ucs2(ord)) {
126 if (ucs2) {
127 if (SvTRUE(chk)) {
128 croak("%s:no surrogates allowed %"UVxf,
129 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
130 ord);
131 }
132 if (s+size <= e) {
133 /* skip the next one as well */
134 enc_unpack(aTHX_ &s,e,size,endian);
135 }
136 ord = FBCHAR;
137 }
138 else {
139 UV lo;
140 if (!isHiSurrogate(ord)) {
141 croak("%s:Malformed HI surrogate %"UVxf,
142 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
143 ord);
144 }
145 if (s+size > e) {
146 /* Partial character */
147 s -= size; /* back up to 1st half */
148 break; /* And exit loop */
149 }
150 lo = enc_unpack(aTHX_ &s,e,size,endian);
151 if (!isLoSurrogate(lo)){
152 croak("%s:Malformed LO surrogate %"UVxf,
153 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
154 ord);
155 }
156 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
157 }
158 }
159 d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
160 d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
161 SvCUR_set(result,d - (U8 *)SvPVX(result));
162 }
163 if (SvTRUE(chk)) {
164 if (s < e) {
165 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
166 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
167 Move(s,SvPVX(str),e-s,U8);
168 SvCUR_set(str,(e-s));
169 }
170 else {
171 SvCUR_set(str,0);
172 }
173 *SvEND(str) = '\0';
174 }
175 XSRETURN(1);
176}
177
178void
179encode_xs(obj, utf8, chk = &PL_sv_undef)
180 SV * obj
181SV * utf8
182SV * chk
183CODE:
184{
185 int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
186 U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
187 int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
188 SV *result = newSVpvn("",0);
189 STRLEN ulen;
190 U8 *s = (U8 *)SvPVutf8(utf8,ulen);
191 U8 *e = (U8 *)SvEND(utf8);
192 ST(0) = sv_2mortal(result);
193 if (!endian) {
194 endian = (size == 4) ? 'N' : 'n';
195 enc_pack(aTHX_ result,size,endian,BOM_BE);
196#if 0
197 /* Update endian for this sequence */
198 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
199#endif
200 }
201 while (s < e && s+UTF8SKIP(s) <= e) {
202 STRLEN len;
203 UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
204 s += len;
205 if (size != 4 && invalid_ucs2(ord)) {
206 if (!issurrogate(ord)){
207 if (ucs2) {
208 if (SvTRUE(chk)) {
209 croak("%s:code point \"\\x{"UVxf"}\" too high",
210 SvPV_nolen(
211 *hv_fetch((HV *)SvRV(obj),"Name",4,0))
212 ,ord);
213 }
214 enc_pack(aTHX_ result,size,endian,FBCHAR);
215 }else{
216 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
217 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
218 enc_pack(aTHX_ result,size,endian,hi);
219 enc_pack(aTHX_ result,size,endian,lo);
220 }
221 }
222 else {
223 /* not supposed to happen */
224 enc_pack(aTHX_ result,size,endian,FBCHAR);
225 }
226 }
227 else {
228 enc_pack(aTHX_ result,size,endian,ord);
229 }
230 }
231 if (SvTRUE(chk)) {
232 if (s < e) {
233 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
234 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
235 Move(s,SvPVX(utf8),e-s,U8);
236 SvCUR_set(utf8,(e-s));
237 }
238 else {
239 SvCUR_set(utf8,0);
240 }
241 *SvEND(utf8) = '\0';
242 }
243 XSRETURN(1);
244}
245