Commit | Line | Data |
fcf2db38 |
1 | #define PERL_NO_GET_CONTEXT |
2c674647 |
2 | #include "EXTERN.h" |
3 | #include "perl.h" |
4 | #include "XSUB.h" |
2f2b4ff2 |
5 | #define U8 U8 |
6 | #include "encode.h" |
071db25d |
7 | #include "def_t.h" |
fcf2db38 |
8 | |
aae85ceb |
9 | #define FBCHAR 0xFFFd |
b2704119 |
10 | #define FBCHAR_UTF8 "\xEF\xBF\xBD" |
aae85ceb |
11 | #define BOM_BE 0xFeFF |
12 | #define BOM16LE 0xFFFe |
13 | #define BOM32LE 0xFFFe0000 |
aae85ceb |
14 | #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) |
15 | #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) |
16 | #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) |
b2704119 |
17 | #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) |
aae85ceb |
18 | |
19 | static UV |
20 | enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) |
21 | { |
22 | U8 *s = *sp; |
23 | UV v = 0; |
24 | if (s+size > e) { |
25 | croak("Partial character %c",(char) endian); |
26 | } |
27 | switch(endian) { |
28 | case 'N': |
29 | v = *s++; |
30 | v = (v << 8) | *s++; |
31 | case 'n': |
32 | v = (v << 8) | *s++; |
33 | v = (v << 8) | *s++; |
34 | break; |
35 | case 'V': |
36 | case 'v': |
37 | v |= *s++; |
38 | v |= (*s++ << 8); |
39 | if (endian == 'v') |
40 | break; |
41 | v |= (*s++ << 16); |
42 | v |= (*s++ << 24); |
43 | break; |
44 | default: |
45 | croak("Unknown endian %c",(char) endian); |
46 | break; |
47 | } |
48 | *sp = s; |
49 | return v; |
50 | } |
51 | |
52 | void |
53 | enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) |
54 | { |
b2704119 |
55 | U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size); |
aae85ceb |
56 | switch(endian) { |
57 | case 'v': |
58 | case 'V': |
59 | d += SvCUR(result); |
60 | SvCUR_set(result,SvCUR(result)+size); |
61 | while (size--) { |
62 | *d++ = value & 0xFF; |
63 | value >>= 8; |
64 | } |
65 | break; |
66 | case 'n': |
67 | case 'N': |
68 | SvCUR_set(result,SvCUR(result)+size); |
69 | d += SvCUR(result); |
70 | while (size--) { |
71 | *--d = value & 0xFF; |
72 | value >>= 8; |
73 | } |
74 | break; |
75 | default: |
76 | croak("Unknown endian %c",(char) endian); |
77 | break; |
78 | } |
79 | } |
80 | |
fcb875d4 |
81 | #define ENCODE_XS_PROFILE 0 /* set 1 or more to profile. |
a999c27c |
82 | t/encoding.t dumps core because of |
83 | Perl_warner and PerlIO don't work well */ |
39cf9a5e |
84 | |
85 | #define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate |
a999c27c |
86 | buffer size for encode_method(). |
87 | 1 is recommended. 2 restores NI-S original */ |
39cf9a5e |
88 | |
fcf2db38 |
89 | #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ |
2f5768b8 |
90 | Perl_croak(aTHX_ "panic_unimplemented"); \ |
4a83738a |
91 | return (y)0; /* fool picky compilers */ \ |
87714904 |
92 | } |
67e989fb |
93 | UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) |
aa0053b7 |
94 | UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) |
33af2bc7 |
95 | |
2f2b4ff2 |
96 | void |
aa0053b7 |
97 | Encode_XSEncoding(pTHX_ encode_t * enc) |
2f2b4ff2 |
98 | { |
aa0053b7 |
99 | dSP; |
100 | HV *stash = gv_stashpv("Encode::XS", TRUE); |
101 | SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash); |
102 | int i = 0; |
103 | PUSHMARK(sp); |
104 | XPUSHs(sv); |
105 | while (enc->name[i]) { |
106 | const char *name = enc->name[i++]; |
107 | XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); |
108 | } |
109 | PUTBACK; |
110 | call_pv("Encode::define_encoding", G_DISCARD); |
111 | SvREFCNT_dec(sv); |
2f2b4ff2 |
112 | } |
113 | |
aa0053b7 |
114 | void |
115 | call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) |
116 | { |
25f7d9d3 |
117 | /* Exists for breakpointing */ |
aa0053b7 |
118 | } |
67e989fb |
119 | |
2f2b4ff2 |
120 | static SV * |
aa0053b7 |
121 | encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, |
122 | int check) |
2f2b4ff2 |
123 | { |
aa0053b7 |
124 | STRLEN slen; |
125 | U8 *s = (U8 *) SvPV(src, slen); |
3aececda |
126 | STRLEN tlen = slen; |
127 | STRLEN ddone = 0; |
128 | STRLEN sdone = 0; |
39cf9a5e |
129 | |
3c49ab08 |
130 | /* We allocate slen+1. |
a999c27c |
131 | PerlIO dumps core if this value is smaller than this. */ |
3c49ab08 |
132 | SV *dst = sv_2mortal(newSV(slen+1)); |
aa0053b7 |
133 | if (slen) { |
0b3236bb |
134 | U8 *d = (U8 *) SvPVX(dst); |
135 | STRLEN dlen = SvLEN(dst)-1; |
aa0053b7 |
136 | int code; |
137 | while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) { |
3aececda |
138 | SvCUR_set(dst, dlen+ddone); |
284ee456 |
139 | SvPOK_only(dst); |
9b37254d |
140 | |
39cf9a5e |
141 | #if ENCODE_XS_PROFILE >= 3 |
142 | Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1); |
0b3236bb |
143 | #endif |
144 | |
145 | if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL) |
aa0053b7 |
146 | break; |
9b37254d |
147 | |
aa0053b7 |
148 | switch (code) { |
149 | case ENCODE_NOSPACE: |
a999c27c |
150 | { |
fcb875d4 |
151 | STRLEN more = 0; /* make sure you initialize! */ |
152 | STRLEN sleft; |
3aececda |
153 | sdone += slen; |
154 | ddone += dlen; |
39cf9a5e |
155 | sleft = tlen - sdone; |
fcb875d4 |
156 | #if ENCODE_XS_PROFILE >= 2 |
3c49ab08 |
157 | Perl_warn(aTHX_ |
fcb875d4 |
158 | "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", |
159 | more, sdone, sleft, SvLEN(dst)); |
160 | #endif |
161 | if (sdone != 0) { /* has src ever been processed ? */ |
39cf9a5e |
162 | #if ENCODE_XS_USEFP == 2 |
a999c27c |
163 | more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone |
164 | - SvLEN(dst); |
39cf9a5e |
165 | #elif ENCODE_XS_USEFP |
a999c27c |
166 | more = (1.0*SvLEN(dst)+1)/sdone * sleft; |
39cf9a5e |
167 | #else |
a999c27c |
168 | /* safe until SvLEN(dst) == MAX_INT/16 */ |
169 | more = (16*SvLEN(dst)+1)/sdone/16 * sleft; |
c98ca92f |
170 | #endif |
0b3236bb |
171 | } |
39cf9a5e |
172 | more += UTF8_MAXLEN; /* insurance policy */ |
173 | #if ENCODE_XS_PROFILE >= 2 |
3c49ab08 |
174 | Perl_warn(aTHX_ |
a999c27c |
175 | "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", |
176 | more, sdone, sleft, SvLEN(dst)); |
39cf9a5e |
177 | #endif |
178 | d = (U8 *) SvGROW(dst, SvLEN(dst) + more); |
179 | /* dst need to grow need MORE bytes! */ |
3aececda |
180 | if (ddone >= SvLEN(dst)) { |
181 | Perl_croak(aTHX_ "Destination couldn't be grown."); |
aa0053b7 |
182 | } |
3aececda |
183 | dlen = SvLEN(dst)-ddone-1; |
184 | d += ddone; |
185 | s += slen; |
186 | slen = tlen-sdone; |
187 | continue; |
39cf9a5e |
188 | } |
2f2b4ff2 |
189 | |
aa0053b7 |
190 | case ENCODE_NOREP: |
191 | if (dir == enc->f_utf8) { |
b2704119 |
192 | STRLEN clen; |
193 | UV ch = |
194 | utf8n_to_uvuni(s + slen, (SvCUR(src) - slen), |
195 | &clen, 0); |
196 | if (!check) { /* fallback char */ |
197 | sdone += slen + clen; |
198 | ddone += dlen + enc->replen; |
199 | sv_catpvn(dst, enc->rep, enc->replen); |
aa0053b7 |
200 | } |
b2704119 |
201 | else if (check == -1){ /* perlqq */ |
202 | SV* perlqq = |
203 | sv_2mortal(newSVpvf("\\x{%x}", ch)); |
204 | sdone += slen + clen; |
205 | ddone += dlen + SvLEN(perlqq); |
206 | sv_catsv(dst, perlqq); |
207 | } |
208 | else { |
209 | Perl_croak(aTHX_ |
210 | "\"\\N{U+%" UVxf |
211 | "}\" does not map to %s", ch, |
212 | enc->name[0]); |
aa0053b7 |
213 | } |
b2704119 |
214 | } |
215 | else { |
216 | if (!check){ /* fallback char */ |
217 | sdone += slen + 1; |
218 | ddone += dlen + strlen(FBCHAR_UTF8); |
219 | sv_catpv(dst, FBCHAR_UTF8); |
aa0053b7 |
220 | } |
b2704119 |
221 | else if (check == -1){ /* perlqq */ |
222 | SV* perlqq = |
223 | sv_2mortal(newSVpvf("\\x%02X", s[slen])); |
224 | sdone += slen + 1; |
225 | ddone += dlen + SvLEN(perlqq); |
226 | sv_catsv(dst, perlqq); |
227 | } |
aa0053b7 |
228 | else { |
b2704119 |
229 | /* UTF-8 is supposed to be "Universal" so should not |
230 | happen for real characters, but some encodings |
231 | have non-assigned codes which may occur. */ |
232 | Perl_croak(aTHX_ "%s \"\\x%02X\" " |
233 | "does not map to Unicode (%d)", |
234 | enc->name[0], (U8) s[slen], code); |
aa0053b7 |
235 | } |
b2704119 |
236 | } |
237 | dlen = SvCUR(dst); |
238 | d = SvPVX(dst) + dlen; |
239 | s = SvPVX(src) + sdone; |
240 | slen = tlen - sdone; |
241 | break; |
2f2b4ff2 |
242 | |
aa0053b7 |
243 | default: |
244 | Perl_croak(aTHX_ "Unexpected code %d converting %s %s", |
245 | code, (dir == enc->f_utf8) ? "to" : "from", |
246 | enc->name[0]); |
247 | return &PL_sv_undef; |
248 | } |
249 | } |
3aececda |
250 | SvCUR_set(dst, dlen+ddone); |
284ee456 |
251 | SvPOK_only(dst); |
aa0053b7 |
252 | if (check) { |
3aececda |
253 | sdone = SvCUR(src) - (slen+sdone); |
254 | if (sdone) { |
f54fca96 |
255 | #if 1 |
256 | /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly |
257 | SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0 |
258 | type SVs and sv_clear() calls it ... |
259 | */ |
64ffdd5e |
260 | sv_setpvn(src, (char*)s+slen, sdone); |
f54fca96 |
261 | #else |
3aececda |
262 | Move(s + slen, SvPVX(src), sdone , U8); |
f54fca96 |
263 | #endif |
aa0053b7 |
264 | } |
3aececda |
265 | SvCUR_set(src, sdone); |
aa0053b7 |
266 | } |
2f2b4ff2 |
267 | } |
aa0053b7 |
268 | else { |
0b3236bb |
269 | SvCUR_set(dst, 0); |
284ee456 |
270 | SvPOK_only(dst); |
2f2b4ff2 |
271 | } |
39cf9a5e |
272 | #if ENCODE_XS_PROFILE |
273 | if (SvCUR(dst) > SvCUR(src)){ |
3c49ab08 |
274 | Perl_warn(aTHX_ |
a999c27c |
275 | "SvLEN(dst)=%d, SvCUR(dst)=%d. " |
276 | "%d bytes unused(%f %%)\n", |
3c49ab08 |
277 | SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), |
a999c27c |
278 | (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); |
3c49ab08 |
279 | |
39cf9a5e |
280 | } |
3c49ab08 |
281 | #endif |
0b3236bb |
282 | *SvEND(dst) = '\0'; |
aa0053b7 |
283 | return dst; |
2f2b4ff2 |
284 | } |
285 | |
50d26985 |
286 | MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ |
2f2b4ff2 |
287 | |
288 | PROTOTYPES: ENABLE |
289 | |
290 | void |
0a95303c |
291 | Method_name(obj) |
292 | SV * obj |
293 | CODE: |
294 | { |
295 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
296 | ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); |
297 | XSRETURN(1); |
298 | } |
299 | |
300 | void |
b2704119 |
301 | Method_decode(obj,src,check = 0) |
2f2b4ff2 |
302 | SV * obj |
303 | SV * src |
b2704119 |
304 | int check |
2f2b4ff2 |
305 | CODE: |
306 | { |
307 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
308 | ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); |
309 | SvUTF8_on(ST(0)); |
310 | XSRETURN(1); |
311 | } |
312 | |
313 | void |
b2704119 |
314 | Method_encode(obj,src,check = 0) |
2f2b4ff2 |
315 | SV * obj |
316 | SV * src |
b2704119 |
317 | int check |
2f2b4ff2 |
318 | CODE: |
319 | { |
320 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
321 | sv_utf8_upgrade(src); |
322 | ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); |
323 | XSRETURN(1); |
324 | } |
325 | |
aae85ceb |
326 | MODULE = Encode PACKAGE = Encode::Unicode |
327 | |
328 | void |
329 | decode_xs(obj, str, chk = &PL_sv_undef) |
330 | SV * obj |
331 | SV * str |
332 | SV * chk |
333 | CODE: |
334 | { |
335 | int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); |
336 | U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); |
337 | int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); |
338 | SV *result = newSVpvn("",0); |
339 | STRLEN ulen; |
b2704119 |
340 | U8 *s = (U8 *)SvPVbyte(str,ulen); |
341 | U8 *e = (U8 *)SvEND(str); |
aae85ceb |
342 | ST(0) = sv_2mortal(result); |
343 | SvUTF8_on(result); |
344 | |
345 | if (!endian && s+size <= e) { |
346 | UV bom; |
347 | endian = (size == 4) ? 'N' : 'n'; |
348 | bom = enc_unpack(aTHX_ &s,e,size,endian); |
349 | if (bom != BOM_BE) { |
350 | if (bom == BOM16LE) { |
351 | endian = 'v'; |
352 | } |
353 | else if (bom == BOM32LE) { |
354 | endian = 'V'; |
355 | } |
356 | else { |
357 | croak("%s:Unregognised BOM %"UVxf, |
358 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),bom); |
359 | } |
360 | } |
361 | #if 0 |
362 | /* Update endian for this sequence */ |
363 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); |
364 | #endif |
365 | } |
366 | while (s < e && s+size <= e) { |
367 | UV ord = enc_unpack(aTHX_ &s,e,size,endian); |
368 | U8 *d; |
b2704119 |
369 | if (size != 4 && invalid_ucs2(ord)) { |
aae85ceb |
370 | if (ucs2) { |
371 | if (SvTRUE(chk)) { |
372 | croak("%s:no surrogates allowed %"UVxf, |
373 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); |
374 | } |
375 | if (s+size <= e) { |
376 | enc_unpack(aTHX_ &s,e,size,endian); /* skip the next one as well */ |
377 | } |
378 | ord = FBCHAR; |
379 | } |
380 | else { |
381 | UV lo; |
382 | if (!isHiSurrogate(ord)) { |
383 | croak("%s:Malformed HI surrogate %"UVxf, |
384 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); |
385 | } |
386 | if (s+size > e) { |
387 | /* Partial character */ |
388 | s -= size; /* back up to 1st half */ |
389 | break; /* And exit loop */ |
390 | } |
391 | lo = enc_unpack(aTHX_ &s,e,size,endian); |
392 | if (!isLoSurrogate(lo)){ |
393 | croak("%s:Malformed LO surrogate %"UVxf, |
394 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); |
395 | } |
396 | ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); |
397 | } |
398 | } |
399 | d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); |
400 | d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); |
401 | SvCUR_set(result,d - (U8 *)SvPVX(result)); |
402 | } |
403 | if (SvTRUE(chk)) { |
404 | if (s < e) { |
405 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", |
406 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); |
407 | Move(s,SvPVX(str),e-s,U8); |
408 | SvCUR_set(str,(e-s)); |
409 | } |
410 | else { |
411 | SvCUR_set(str,0); |
412 | } |
413 | *SvEND(str) = '\0'; |
414 | } |
415 | XSRETURN(1); |
416 | } |
417 | |
418 | void |
419 | encode_xs(obj, utf8, chk = &PL_sv_undef) |
420 | SV * obj |
421 | SV * utf8 |
422 | SV * chk |
423 | CODE: |
424 | { |
425 | int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); |
426 | U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); |
427 | int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); |
428 | SV *result = newSVpvn("",0); |
429 | STRLEN ulen; |
b2704119 |
430 | U8 *s = (U8 *)SvPVutf8(utf8,ulen); |
431 | U8 *e = (U8 *)SvEND(utf8); |
aae85ceb |
432 | ST(0) = sv_2mortal(result); |
433 | if (!endian) { |
434 | endian = (size == 4) ? 'N' : 'n'; |
435 | enc_pack(aTHX_ result,size,endian,BOM_BE); |
436 | #if 0 |
437 | /* Update endian for this sequence */ |
438 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); |
439 | #endif |
440 | } |
441 | while (s < e && s+UTF8SKIP(s) <= e) { |
442 | STRLEN len; |
443 | UV ord = utf8n_to_uvuni(s, e-s, &len, 0); |
444 | s += len; |
b2704119 |
445 | if (size != 4 && invalid_ucs2(ord)) { |
aae85ceb |
446 | if (!issurrogate(ord)){ |
447 | if (ucs2) { |
448 | if (SvTRUE(chk)) { |
449 | croak("%s:code point \"\\x{"UVxf"}\" too high", |
450 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); |
451 | } |
452 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
453 | }else{ |
454 | UV hi = ((ord - 0x10000) >> 10) + 0xD800; |
455 | UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; |
456 | enc_pack(aTHX_ result,size,endian,hi); |
457 | enc_pack(aTHX_ result,size,endian,lo); |
458 | } |
459 | } |
460 | else { |
461 | /* not supposed to happen */ |
462 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
463 | } |
464 | } |
465 | else { |
466 | enc_pack(aTHX_ result,size,endian,ord); |
467 | } |
468 | } |
469 | if (SvTRUE(chk)) { |
470 | if (s < e) { |
471 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", |
472 | SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); |
473 | Move(s,SvPVX(utf8),e-s,U8); |
474 | SvCUR_set(utf8,(e-s)); |
475 | } |
476 | else { |
477 | SvCUR_set(utf8,0); |
478 | } |
479 | *SvEND(utf8) = '\0'; |
480 | } |
481 | XSRETURN(1); |
482 | } |
483 | |
67e989fb |
484 | MODULE = Encode PACKAGE = Encode |
2c674647 |
485 | |
486 | PROTOTYPES: ENABLE |
487 | |
67e989fb |
488 | I32 |
2c674647 |
489 | _bytes_to_utf8(sv, ...) |
67e989fb |
490 | SV * sv |
2c674647 |
491 | CODE: |
67e989fb |
492 | { |
493 | SV * encoding = items == 2 ? ST(1) : Nullsv; |
494 | |
495 | if (encoding) |
496 | RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); |
497 | else { |
498 | STRLEN len; |
183a2d84 |
499 | U8* s = (U8*)SvPV(sv, len); |
67e989fb |
500 | U8* converted; |
501 | |
502 | converted = bytes_to_utf8(s, &len); /* This allocs */ |
183a2d84 |
503 | sv_setpvn(sv, (char *)converted, len); |
67e989fb |
504 | SvUTF8_on(sv); /* XXX Should we? */ |
505 | Safefree(converted); /* ... so free it */ |
506 | RETVAL = len; |
507 | } |
508 | } |
2c674647 |
509 | OUTPUT: |
67e989fb |
510 | RETVAL |
2c674647 |
511 | |
67e989fb |
512 | I32 |
2c674647 |
513 | _utf8_to_bytes(sv, ...) |
67e989fb |
514 | SV * sv |
2c674647 |
515 | CODE: |
67e989fb |
516 | { |
517 | SV * to = items > 1 ? ST(1) : Nullsv; |
518 | SV * check = items > 2 ? ST(2) : Nullsv; |
87714904 |
519 | |
67e989fb |
520 | if (to) |
521 | RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); |
522 | else { |
67e989fb |
523 | STRLEN len; |
b113ac0e |
524 | U8 *s = (U8*)SvPV(sv, len); |
67e989fb |
525 | |
9c5ffd7c |
526 | RETVAL = 0; |
67e989fb |
527 | if (SvTRUE(check)) { |
528 | /* Must do things the slow way */ |
529 | U8 *dest; |
87714904 |
530 | U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */ |
67e989fb |
531 | U8 *send = s + len; |
532 | |
533 | New(83, dest, len, U8); /* I think */ |
534 | |
535 | while (s < send) { |
536 | if (*s < 0x80) |
537 | *dest++ = *s++; |
538 | else { |
b113ac0e |
539 | STRLEN ulen; |
540 | UV uv = *s++; |
87714904 |
541 | |
67e989fb |
542 | /* Have to do it all ourselves because of error routine, |
543 | aargh. */ |
544 | if (!(uv & 0x40)) |
545 | goto failure; |
546 | if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } |
547 | else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } |
548 | else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } |
549 | else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } |
550 | else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } |
551 | else if (!(uv & 0x01)) { ulen = 7; uv = 0; } |
552 | else { ulen = 13; uv = 0; } |
87714904 |
553 | |
67e989fb |
554 | /* Note change to utf8.c variable naming, for variety */ |
555 | while (ulen--) { |
556 | if ((*s & 0xc0) != 0x80) |
557 | goto failure; |
87714904 |
558 | |
67e989fb |
559 | else |
560 | uv = (uv << 6) | (*s++ & 0x3f); |
87714904 |
561 | } |
67e989fb |
562 | if (uv > 256) { |
563 | failure: |
564 | call_failure(check, s, dest, src); |
565 | /* Now what happens? */ |
566 | } |
567 | *dest++ = (U8)uv; |
568 | } |
569 | } |
570 | } else |
571 | RETVAL = (utf8_to_bytes(s, &len) ? len : 0); |
572 | } |
2c674647 |
573 | } |
574 | OUTPUT: |
575 | RETVAL |
576 | |
2c674647 |
577 | bool |
b2704119 |
578 | is_utf8(sv, check = 0) |
4411f3b6 |
579 | SV * sv |
b2704119 |
580 | int check |
2c674647 |
581 | CODE: |
582 | { |
2eebba1d |
583 | if (SvGMAGICAL(sv)) /* it could be $1, for example */ |
584 | sv = newSVsv(sv); /* GMAGIG will be done */ |
2c674647 |
585 | if (SvPOK(sv)) { |
4411f3b6 |
586 | RETVAL = SvUTF8(sv) ? TRUE : FALSE; |
2c674647 |
587 | if (RETVAL && |
4411f3b6 |
588 | check && |
2c674647 |
589 | !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) |
590 | RETVAL = FALSE; |
591 | } else { |
592 | RETVAL = FALSE; |
593 | } |
2eebba1d |
594 | if (sv != ST(0)) |
595 | SvREFCNT_dec(sv); /* it was a temp copy */ |
2c674647 |
596 | } |
597 | OUTPUT: |
598 | RETVAL |
599 | |
600 | SV * |
4411f3b6 |
601 | _utf8_on(sv) |
2c674647 |
602 | SV * sv |
603 | CODE: |
604 | { |
605 | if (SvPOK(sv)) { |
87714904 |
606 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 |
607 | RETVAL = rsv; |
608 | SvUTF8_on(sv); |
609 | } else { |
610 | RETVAL = &PL_sv_undef; |
611 | } |
612 | } |
613 | OUTPUT: |
614 | RETVAL |
615 | |
616 | SV * |
4411f3b6 |
617 | _utf8_off(sv) |
2c674647 |
618 | SV * sv |
619 | CODE: |
620 | { |
621 | if (SvPOK(sv)) { |
87714904 |
622 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 |
623 | RETVAL = rsv; |
624 | SvUTF8_off(sv); |
625 | } else { |
626 | RETVAL = &PL_sv_undef; |
627 | } |
628 | } |
629 | OUTPUT: |
630 | RETVAL |
631 | |
33af2bc7 |
632 | BOOT: |
633 | { |
6a59c517 |
634 | #if defined(USE_PERLIO) && !defined(USE_SFIO) |
b2704119 |
635 | /* PerlIO_define_layer(aTHX_ &PerlIO_encode); */ |
33af2bc7 |
636 | #endif |
e7cbefb8 |
637 | #include "def_t.exh" |
33af2bc7 |
638 | } |