2 $Id: Unicode.xs,v 1.4 2002/04/26 03:02:04 dankogai Exp $
5 #define PERL_NO_GET_CONTEXT
10 #include "../Encode/encode.h"
14 #define BOM16LE 0xFFFe
15 #define BOM32LE 0xFFFe0000
16 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
22 enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
27 croak("Partial character %c",(char) endian);
47 croak("Unknown endian %c",(char) endian);
55 enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
57 U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size);
62 SvCUR_set(result,SvCUR(result)+size);
64 *d++ = (U8)(value & 0xFF);
70 SvCUR_set(result,SvCUR(result)+size);
73 *--d = (U8)(value & 0xFF);
78 croak("Unknown endian %c",(char) endian);
83 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
88 decode_xs(obj, str, check = 0)
94 int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
95 U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
96 int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
97 SV *result = newSVpvn("",0);
99 U8 *s = (U8 *)SvPVbyte(str,ulen);
100 U8 *e = (U8 *)SvEND(str);
101 ST(0) = sv_2mortal(result);
104 if (!endian && s+size <= e) {
106 endian = (size == 4) ? 'N' : 'n';
107 bom = enc_unpack(aTHX_ &s,e,size,endian);
109 if (bom == BOM16LE) {
112 else if (bom == BOM32LE) {
116 croak("%s:Unregognised BOM %"UVxf,
117 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
122 /* Update endian for this sequence */
123 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
126 while (s < e && s+size <= e) {
127 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
129 if (size != 4 && invalid_ucs2(ord)) {
132 croak("%s:no surrogates allowed %"UVxf,
133 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
137 /* skip the next one as well */
138 enc_unpack(aTHX_ &s,e,size,endian);
144 if (!isHiSurrogate(ord)) {
145 croak("%s:Malformed HI surrogate %"UVxf,
146 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
150 /* Partial character */
151 s -= size; /* back up to 1st half */
152 break; /* And exit loop */
154 lo = enc_unpack(aTHX_ &s,e,size,endian);
155 if (!isLoSurrogate(lo)){
156 croak("%s:Malformed LO surrogate %"UVxf,
157 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
160 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
163 d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
164 d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
165 SvCUR_set(result,d - (U8 *)SvPVX(result));
168 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
169 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
171 if (check && !(check & ENCODE_LEAVE_SRC)){
173 Move(s,SvPVX(str),e-s,U8);
174 SvCUR_set(str,(e-s));
185 encode_xs(obj, utf8, check = 0)
191 int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
192 U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
193 int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
194 SV *result = newSVpvn("",0);
196 U8 *s = (U8 *)SvPVutf8(utf8,ulen);
197 U8 *e = (U8 *)SvEND(utf8);
198 ST(0) = sv_2mortal(result);
200 endian = (size == 4) ? 'N' : 'n';
201 enc_pack(aTHX_ result,size,endian,BOM_BE);
203 /* Update endian for this sequence */
204 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
207 while (s < e && s+UTF8SKIP(s) <= e) {
209 UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
211 if (size != 4 && invalid_ucs2(ord)) {
212 if (!issurrogate(ord)){
215 croak("%s:code point \"\\x{%"UVxf"}\" too high",
217 *hv_fetch((HV *)SvRV(obj),"Name",4,0))
220 enc_pack(aTHX_ result,size,endian,FBCHAR);
222 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
223 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
224 enc_pack(aTHX_ result,size,endian,hi);
225 enc_pack(aTHX_ result,size,endian,lo);
229 /* not supposed to happen */
230 enc_pack(aTHX_ result,size,endian,FBCHAR);
234 enc_pack(aTHX_ result,size,endian,ord);
238 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
239 SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
241 if (check && !(check & ENCODE_LEAVE_SRC)){
243 Move(s,SvPVX(utf8),e-s,U8);
244 SvCUR_set(utf8,(e-s));