Commit | Line | Data |
85982a32 |
1 | /* |
2 | $Id: Unicode.xs,v 1.2 2002/04/19 05:36:43 dankogai Exp $ |
3 | */ |
4 | |
5 | #define PERL_NO_GET_CONTEXT |
6 | #include "EXTERN.h" |
7 | #include "perl.h" |
8 | #include "XSUB.h" |
9 | |
10 | #define FBCHAR 0xFFFd |
11 | #define BOM_BE 0xFeFF |
12 | #define BOM16LE 0xFFFe |
13 | #define BOM32LE 0xFFFe0000 |
14 | #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) |
15 | #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) |
16 | #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) |
17 | #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) |
18 | |
19 | static UV |
20 | enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) |
21 | { |
22 | U8 *s = *sp; |
23 | UV v = 0; |
24 | if (s+size > e) { |
25 | croak("Partial character %c",(char) endian); |
26 | } |
27 | switch(endian) { |
28 | case 'N': |
29 | v = *s++; |
30 | v = (v << 8) | *s++; |
31 | case 'n': |
32 | v = (v << 8) | *s++; |
33 | v = (v << 8) | *s++; |
34 | break; |
35 | case 'V': |
36 | case 'v': |
37 | v |= *s++; |
38 | v |= (*s++ << 8); |
39 | if (endian == 'v') |
40 | break; |
41 | v |= (*s++ << 16); |
42 | v |= (*s++ << 24); |
43 | break; |
44 | default: |
45 | croak("Unknown endian %c",(char) endian); |
46 | break; |
47 | } |
48 | *sp = s; |
49 | return v; |
50 | } |
51 | |
52 | void |
53 | enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) |
54 | { |
55 | U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size); |
56 | switch(endian) { |
57 | case 'v': |
58 | case 'V': |
59 | d += SvCUR(result); |
60 | SvCUR_set(result,SvCUR(result)+size); |
61 | while (size--) { |
62 | *d++ = value & 0xFF; |
63 | value >>= 8; |
64 | } |
65 | break; |
66 | case 'n': |
67 | case 'N': |
68 | SvCUR_set(result,SvCUR(result)+size); |
69 | d += SvCUR(result); |
70 | while (size--) { |
71 | *--d = value & 0xFF; |
72 | value >>= 8; |
73 | } |
74 | break; |
75 | default: |
76 | croak("Unknown endian %c",(char) endian); |
77 | break; |
78 | } |
79 | } |
80 | |
81 | MODULE = Encode::Unicode PACKAGE = Encode::Unicode |
82 | |
83 | void |
84 | decode_xs(obj, str, chk = &PL_sv_undef) |
85 | SV * obj |
86 | SV * str |
87 | SV * chk |
88 | CODE: |
89 | { |
90 | int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); |
91 | U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); |
92 | int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); |
93 | SV *result = newSVpvn("",0); |
94 | STRLEN ulen; |
95 | U8 *s = (U8 *)SvPVbyte(str,ulen); |
96 | U8 *e = (U8 *)SvEND(str); |
97 | ST(0) = sv_2mortal(result); |
98 | SvUTF8_on(result); |
99 | |
100 | if (!endian && s+size <= e) { |
101 | UV bom; |
102 | endian = (size == 4) ? 'N' : 'n'; |
103 | bom = enc_unpack(aTHX_ &s,e,size,endian); |
104 | if (bom != BOM_BE) { |
105 | if (bom == BOM16LE) { |
106 | endian = 'v'; |
107 | } |
108 | else if (bom == BOM32LE) { |
109 | endian = 'V'; |
110 | } |
111 | else { |
112 | croak("%s:Unregognised BOM %"UVxf, |
113 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)), |
114 | bom); |
115 | } |
116 | } |
117 | #if 0 |
118 | /* Update endian for this sequence */ |
119 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); |
120 | #endif |
121 | } |
122 | while (s < e && s+size <= e) { |
123 | UV ord = enc_unpack(aTHX_ &s,e,size,endian); |
124 | U8 *d; |
125 | if (size != 4 && invalid_ucs2(ord)) { |
126 | if (ucs2) { |
127 | if (SvTRUE(chk)) { |
128 | croak("%s:no surrogates allowed %"UVxf, |
129 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)), |
130 | ord); |
131 | } |
132 | if (s+size <= e) { |
133 | /* skip the next one as well */ |
134 | enc_unpack(aTHX_ &s,e,size,endian); |
135 | } |
136 | ord = FBCHAR; |
137 | } |
138 | else { |
139 | UV lo; |
140 | if (!isHiSurrogate(ord)) { |
141 | croak("%s:Malformed HI surrogate %"UVxf, |
142 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)), |
143 | ord); |
144 | } |
145 | if (s+size > e) { |
146 | /* Partial character */ |
147 | s -= size; /* back up to 1st half */ |
148 | break; /* And exit loop */ |
149 | } |
150 | lo = enc_unpack(aTHX_ &s,e,size,endian); |
151 | if (!isLoSurrogate(lo)){ |
152 | croak("%s:Malformed LO surrogate %"UVxf, |
153 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)), |
154 | ord); |
155 | } |
156 | ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); |
157 | } |
158 | } |
159 | d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); |
160 | d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); |
161 | SvCUR_set(result,d - (U8 *)SvPVX(result)); |
162 | } |
163 | if (SvTRUE(chk)) { |
164 | if (s < e) { |
165 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", |
166 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); |
167 | Move(s,SvPVX(str),e-s,U8); |
168 | SvCUR_set(str,(e-s)); |
169 | } |
170 | else { |
171 | SvCUR_set(str,0); |
172 | } |
173 | *SvEND(str) = '\0'; |
174 | } |
175 | XSRETURN(1); |
176 | } |
177 | |
178 | void |
179 | encode_xs(obj, utf8, chk = &PL_sv_undef) |
180 | SV * obj |
181 | SV * utf8 |
182 | SV * chk |
183 | CODE: |
184 | { |
185 | int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); |
186 | U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); |
187 | int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); |
188 | SV *result = newSVpvn("",0); |
189 | STRLEN ulen; |
190 | U8 *s = (U8 *)SvPVutf8(utf8,ulen); |
191 | U8 *e = (U8 *)SvEND(utf8); |
192 | ST(0) = sv_2mortal(result); |
193 | if (!endian) { |
194 | endian = (size == 4) ? 'N' : 'n'; |
195 | enc_pack(aTHX_ result,size,endian,BOM_BE); |
196 | #if 0 |
197 | /* Update endian for this sequence */ |
198 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); |
199 | #endif |
200 | } |
201 | while (s < e && s+UTF8SKIP(s) <= e) { |
202 | STRLEN len; |
203 | UV ord = utf8n_to_uvuni(s, e-s, &len, 0); |
204 | s += len; |
205 | if (size != 4 && invalid_ucs2(ord)) { |
206 | if (!issurrogate(ord)){ |
207 | if (ucs2) { |
208 | if (SvTRUE(chk)) { |
209 | croak("%s:code point \"\\x{"UVxf"}\" too high", |
210 | SvPV_nolen( |
211 | *hv_fetch((HV *)SvRV(obj),"Name",4,0)) |
212 | ,ord); |
213 | } |
214 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
215 | }else{ |
216 | UV hi = ((ord - 0x10000) >> 10) + 0xD800; |
217 | UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; |
218 | enc_pack(aTHX_ result,size,endian,hi); |
219 | enc_pack(aTHX_ result,size,endian,lo); |
220 | } |
221 | } |
222 | else { |
223 | /* not supposed to happen */ |
224 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
225 | } |
226 | } |
227 | else { |
228 | enc_pack(aTHX_ result,size,endian,ord); |
229 | } |
230 | } |
231 | if (SvTRUE(chk)) { |
232 | if (s < e) { |
233 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", |
234 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); |
235 | Move(s,SvPVX(utf8),e-s,U8); |
236 | SvCUR_set(utf8,(e-s)); |
237 | } |
238 | else { |
239 | SvCUR_set(utf8,0); |
240 | } |
241 | *SvEND(utf8) = '\0'; |
242 | } |
243 | XSRETURN(1); |
244 | } |
245 | |