Commit | Line | Data |
85982a32 |
1 | /* |
56ff7374 |
2 | $Id: Encode.xs,v 2.5 2005/08/05 10:58:25 dankogai Exp dankogai $ |
85982a32 |
3 | */ |
4 | |
fcf2db38 |
5 | #define PERL_NO_GET_CONTEXT |
2c674647 |
6 | #include "EXTERN.h" |
7 | #include "perl.h" |
8 | #include "XSUB.h" |
2f2b4ff2 |
9 | #define U8 U8 |
10 | #include "encode.h" |
10c5ecbb |
11 | |
12 | # define PERLIO_MODNAME "PerlIO::encoding" |
0ab8f81e |
13 | # define PERLIO_FILENAME "PerlIO/encoding.pm" |
aae85ceb |
14 | |
85982a32 |
15 | /* set 1 or more to profile. t/encoding.t dumps core because of |
16 | Perl_warner and PerlIO don't work well */ |
c6a7db43 |
17 | #define ENCODE_XS_PROFILE 0 |
39cf9a5e |
18 | |
85982a32 |
19 | /* set 0 to disable floating point to calculate buffer size for |
20 | encode_method(). 1 is recommended. 2 restores NI-S original */ |
c6a7db43 |
21 | #define ENCODE_XS_USEFP 1 |
39cf9a5e |
22 | |
fcf2db38 |
23 | #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ |
2f5768b8 |
24 | Perl_croak(aTHX_ "panic_unimplemented"); \ |
4a83738a |
25 | return (y)0; /* fool picky compilers */ \ |
87714904 |
26 | } |
85982a32 |
27 | /**/ |
011b2d2f |
28 | |
7e9a885a |
29 | UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) |
30 | UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) |
33af2bc7 |
31 | |
7f0d54d7 |
32 | #define UTF8_ALLOW_STRICT 0 |
33 | #define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \ |
34 | ~(UTF8_ALLOW_CONTINUATION | \ |
35 | UTF8_ALLOW_NON_CONTINUATION | \ |
36 | UTF8_ALLOW_LONG)) |
37 | |
b0b300a3 |
38 | void |
aa0053b7 |
39 | Encode_XSEncoding(pTHX_ encode_t * enc) |
2f2b4ff2 |
40 | { |
aa0053b7 |
41 | dSP; |
42 | HV *stash = gv_stashpv("Encode::XS", TRUE); |
43 | SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash); |
44 | int i = 0; |
45 | PUSHMARK(sp); |
46 | XPUSHs(sv); |
47 | while (enc->name[i]) { |
48 | const char *name = enc->name[i++]; |
49 | XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); |
50 | } |
51 | PUTBACK; |
52 | call_pv("Encode::define_encoding", G_DISCARD); |
53 | SvREFCNT_dec(sv); |
2f2b4ff2 |
54 | } |
55 | |
aa0053b7 |
56 | void |
57 | call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) |
58 | { |
85982a32 |
59 | /* Exists for breakpointing */ |
aa0053b7 |
60 | } |
67e989fb |
61 | |
85982a32 |
62 | |
2fc614e0 |
63 | #define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" |
64 | #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" |
65 | |
2f2b4ff2 |
66 | static SV * |
aa0053b7 |
67 | encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, |
220e2d4e |
68 | int check, STRLEN * offset, SV * term, int * retcode) |
2f2b4ff2 |
69 | { |
aa0053b7 |
70 | STRLEN slen; |
71 | U8 *s = (U8 *) SvPV(src, slen); |
3aececda |
72 | STRLEN tlen = slen; |
73 | STRLEN ddone = 0; |
74 | STRLEN sdone = 0; |
39cf9a5e |
75 | |
3c49ab08 |
76 | /* We allocate slen+1. |
85982a32 |
77 | PerlIO dumps core if this value is smaller than this. */ |
3c49ab08 |
78 | SV *dst = sv_2mortal(newSV(slen+1)); |
85982a32 |
79 | U8 *d = (U8 *)SvPVX(dst); |
80 | STRLEN dlen = SvLEN(dst)-1; |
220e2d4e |
81 | int code = 0; |
82 | STRLEN trmlen = 0; |
cc7dbc11 |
83 | U8 *trm = term ? (U8*) SvPV(term, trmlen) : NULL; |
220e2d4e |
84 | |
85 | if (offset) { |
86 | s += *offset; |
6be7c101 |
87 | if (slen > *offset){ /* safeguard against slen overflow */ |
88 | slen -= *offset; |
89 | }else{ |
90 | slen = 0; |
91 | } |
220e2d4e |
92 | tlen = slen; |
93 | } |
85982a32 |
94 | |
6be7c101 |
95 | if (slen == 0){ |
85982a32 |
96 | SvCUR_set(dst, 0); |
97 | SvPOK_only(dst); |
98 | goto ENCODE_END; |
99 | } |
100 | |
220e2d4e |
101 | while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check, |
102 | trm, trmlen)) ) |
85982a32 |
103 | { |
104 | SvCUR_set(dst, dlen+ddone); |
105 | SvPOK_only(dst); |
0b3236bb |
106 | |
220e2d4e |
107 | if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || |
108 | code == ENCODE_FOUND_TERM) { |
85982a32 |
109 | break; |
110 | } |
111 | switch (code) { |
112 | case ENCODE_NOSPACE: |
113 | { |
114 | STRLEN more = 0; /* make sure you initialize! */ |
115 | STRLEN sleft; |
116 | sdone += slen; |
117 | ddone += dlen; |
118 | sleft = tlen - sdone; |
fcb875d4 |
119 | #if ENCODE_XS_PROFILE >= 2 |
85982a32 |
120 | Perl_warn(aTHX_ |
121 | "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", |
122 | more, sdone, sleft, SvLEN(dst)); |
fcb875d4 |
123 | #endif |
85982a32 |
124 | if (sdone != 0) { /* has src ever been processed ? */ |
39cf9a5e |
125 | #if ENCODE_XS_USEFP == 2 |
85982a32 |
126 | more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone |
127 | - SvLEN(dst); |
39cf9a5e |
128 | #elif ENCODE_XS_USEFP |
6e21dc91 |
129 | more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); |
39cf9a5e |
130 | #else |
85982a32 |
131 | /* safe until SvLEN(dst) == MAX_INT/16 */ |
132 | more = (16*SvLEN(dst)+1)/sdone/16 * sleft; |
39cf9a5e |
133 | #endif |
39cf9a5e |
134 | } |
85982a32 |
135 | more += UTF8_MAXLEN; /* insurance policy */ |
136 | d = (U8 *) SvGROW(dst, SvLEN(dst) + more); |
137 | /* dst need to grow need MORE bytes! */ |
138 | if (ddone >= SvLEN(dst)) { |
139 | Perl_croak(aTHX_ "Destination couldn't be grown."); |
140 | } |
141 | dlen = SvLEN(dst)-ddone-1; |
142 | d += ddone; |
143 | s += slen; |
144 | slen = tlen-sdone; |
145 | continue; |
146 | } |
147 | case ENCODE_NOREP: |
148 | /* encoding */ |
c6a7db43 |
149 | if (dir == enc->f_utf8) { |
85982a32 |
150 | STRLEN clen; |
151 | UV ch = |
3e952a88 |
152 | utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), |
b0b300a3 |
153 | &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); |
56ff7374 |
154 | /* if non-representable multibyte prefix at end of current buffer - break*/ |
155 | if (clen > tlen - sdone) break; |
85982a32 |
156 | if (check & ENCODE_DIE_ON_ERR) { |
2fc614e0 |
157 | Perl_croak(aTHX_ ERR_ENCODE_NOMAP, |
158 | (UV)ch, enc->name[0]); |
4089adc4 |
159 | return &PL_sv_undef; /* never reaches but be safe */ |
160 | } |
161 | if (check & ENCODE_WARN_ON_ERR){ |
162 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
2fc614e0 |
163 | ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); |
4089adc4 |
164 | } |
165 | if (check & ENCODE_RETURN_ON_ERR){ |
166 | goto ENCODE_SET_SRC; |
167 | } |
f9d05ba3 |
168 | if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ |
169 | SV* subchar = |
170 | newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : |
171 | check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : |
172 | "&#x%" UVxf ";", (UV)ch); |
4089adc4 |
173 | sdone += slen + clen; |
f9d05ba3 |
174 | ddone += dlen + SvCUR(subchar); |
175 | sv_catsv(dst, subchar); |
176 | SvREFCNT_dec(subchar); |
4089adc4 |
177 | } else { |
178 | /* fallback char */ |
179 | sdone += slen + clen; |
180 | ddone += dlen + enc->replen; |
181 | sv_catpvn(dst, (char*)enc->rep, enc->replen); |
c6a7db43 |
182 | } |
b2704119 |
183 | } |
85982a32 |
184 | /* decoding */ |
c6a7db43 |
185 | else { |
85982a32 |
186 | if (check & ENCODE_DIE_ON_ERR){ |
2fc614e0 |
187 | Perl_croak(aTHX_ ERR_DECODE_NOMAP, |
436c6dd3 |
188 | enc->name[0], (UV)s[slen]); |
4089adc4 |
189 | return &PL_sv_undef; /* never reaches but be safe */ |
190 | } |
191 | if (check & ENCODE_WARN_ON_ERR){ |
192 | Perl_warner( |
193 | aTHX_ packWARN(WARN_UTF8), |
2fc614e0 |
194 | ERR_DECODE_NOMAP, |
436c6dd3 |
195 | enc->name[0], (UV)s[slen]); |
4089adc4 |
196 | } |
197 | if (check & ENCODE_RETURN_ON_ERR){ |
198 | goto ENCODE_SET_SRC; |
199 | } |
200 | if (check & |
201 | (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ |
f9d05ba3 |
202 | SV* subchar = newSVpvf("\\x%02" UVXf, (UV)s[slen]); |
4089adc4 |
203 | sdone += slen + 1; |
f9d05ba3 |
204 | ddone += dlen + SvCUR(subchar); |
205 | sv_catsv(dst, subchar); |
206 | SvREFCNT_dec(subchar); |
4089adc4 |
207 | } else { |
208 | sdone += slen + 1; |
209 | ddone += dlen + strlen(FBCHAR_UTF8); |
210 | sv_catpv(dst, FBCHAR_UTF8); |
aa0053b7 |
211 | } |
b2704119 |
212 | } |
85982a32 |
213 | /* settle variables when fallback */ |
b0b300a3 |
214 | d = (U8 *)SvEND(dst); |
215 | dlen = SvLEN(dst) - ddone - 1; |
3e952a88 |
216 | s = (U8*)SvPVX(src) + sdone; |
b2704119 |
217 | slen = tlen - sdone; |
218 | break; |
2f2b4ff2 |
219 | |
85982a32 |
220 | default: |
221 | Perl_croak(aTHX_ "Unexpected code %d converting %s %s", |
222 | code, (dir == enc->f_utf8) ? "to" : "from", |
223 | enc->name[0]); |
224 | return &PL_sv_undef; |
aa0053b7 |
225 | } |
85982a32 |
226 | } |
227 | ENCODE_SET_SRC: |
ca777f1c |
228 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
229 | sdone = SvCUR(src) - (slen+sdone); |
85982a32 |
230 | if (sdone) { |
231 | sv_setpvn(src, (char*)s+slen, sdone); |
aa0053b7 |
232 | } |
85982a32 |
233 | SvCUR_set(src, sdone); |
2f2b4ff2 |
234 | } |
85982a32 |
235 | /* warn("check = 0x%X, code = 0x%d\n", check, code); */ |
c6a7db43 |
236 | |
85982a32 |
237 | SvCUR_set(dst, dlen+ddone); |
238 | SvPOK_only(dst); |
c6a7db43 |
239 | |
39cf9a5e |
240 | #if ENCODE_XS_PROFILE |
241 | if (SvCUR(dst) > SvCUR(src)){ |
85982a32 |
242 | Perl_warn(aTHX_ |
243 | "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", |
244 | SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), |
245 | (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); |
39cf9a5e |
246 | } |
3c49ab08 |
247 | #endif |
c6a7db43 |
248 | |
220e2d4e |
249 | if (offset) |
250 | *offset += sdone + slen; |
251 | |
85982a32 |
252 | ENCODE_END: |
0b3236bb |
253 | *SvEND(dst) = '\0'; |
220e2d4e |
254 | if (retcode) *retcode = code; |
aa0053b7 |
255 | return dst; |
2f2b4ff2 |
256 | } |
257 | |
7f0d54d7 |
258 | static bool |
259 | strict_utf8(pTHX_ SV* sv) |
260 | { |
261 | HV* hv; |
262 | SV** svp; |
263 | sv = SvRV(sv); |
264 | if (!sv || SvTYPE(sv) != SVt_PVHV) |
265 | return 0; |
266 | hv = (HV*)sv; |
267 | svp = hv_fetch(hv, "strict_utf8", 11, 0); |
268 | if (!svp) |
269 | return 0; |
270 | return SvTRUE(*svp); |
271 | } |
272 | |
273 | static U8* |
274 | process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check, |
275 | bool encode, bool strict, bool stop_at_partial) |
276 | { |
277 | UV uv; |
278 | STRLEN ulen; |
279 | |
280 | SvPOK_only(dst); |
281 | SvCUR_set(dst,0); |
282 | |
283 | while (s < e) { |
284 | if (UTF8_IS_INVARIANT(*s)) { |
285 | sv_catpvn(dst, (char *)s, 1); |
286 | s++; |
287 | continue; |
288 | } |
289 | |
290 | if (UTF8_IS_START(*s)) { |
291 | U8 skip = UTF8SKIP(s); |
292 | if ((s + skip) > e) { |
293 | /* Partial character */ |
294 | /* XXX could check that rest of bytes are UTF8_IS_CONTINUATION(ch) */ |
56ff7374 |
295 | if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) |
7f0d54d7 |
296 | break; |
297 | |
298 | goto malformed_byte; |
299 | } |
300 | |
301 | uv = utf8n_to_uvuni(s, e - s, &ulen, |
302 | UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT : |
303 | UTF8_ALLOW_NONSTRICT) |
304 | ); |
305 | #if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */ |
306 | if (strict && uv > PERL_UNICODE_MAX) |
307 | ulen = -1; |
308 | #endif |
309 | if (ulen == -1) { |
310 | if (strict) { |
311 | uv = utf8n_to_uvuni(s, e - s, &ulen, |
312 | UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT); |
313 | if (ulen == -1) |
314 | goto malformed_byte; |
315 | goto malformed; |
316 | } |
317 | goto malformed_byte; |
318 | } |
319 | |
320 | |
321 | /* Whole char is good */ |
322 | sv_catpvn(dst,(char *)s,skip); |
323 | s += skip; |
324 | continue; |
325 | } |
326 | |
327 | /* If we get here there is something wrong with alleged UTF-8 */ |
328 | malformed_byte: |
329 | uv = (UV)*s; |
330 | ulen = 1; |
331 | |
332 | malformed: |
333 | if (check & ENCODE_DIE_ON_ERR){ |
334 | if (encode) |
335 | Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8"); |
336 | else |
337 | Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv); |
338 | } |
339 | if (check & ENCODE_WARN_ON_ERR){ |
340 | if (encode) |
341 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
342 | ERR_ENCODE_NOMAP, uv, "utf8"); |
343 | else |
344 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
345 | ERR_DECODE_NOMAP, "utf8", uv); |
346 | } |
347 | if (check & ENCODE_RETURN_ON_ERR) { |
348 | break; |
349 | } |
350 | if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ |
351 | SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}"): |
352 | check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : |
353 | "&#x%" UVxf ";", uv); |
354 | sv_catsv(dst, subchar); |
355 | SvREFCNT_dec(subchar); |
356 | } else { |
357 | sv_catpv(dst, FBCHAR_UTF8); |
358 | } |
359 | s += ulen; |
360 | } |
361 | *SvEND(dst) = '\0'; |
362 | |
363 | return s; |
364 | } |
365 | |
366 | |
ab3374e4 |
367 | MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ |
368 | |
a0d8a30e |
369 | PROTOTYPES: DISABLE |
370 | |
371 | void |
b536bf57 |
372 | Method_decode_xs(obj,src,check = 0) |
ab3374e4 |
373 | SV * obj |
374 | SV * src |
375 | int check |
376 | CODE: |
377 | { |
378 | STRLEN slen; |
379 | U8 *s = (U8 *) SvPV(src, slen); |
380 | U8 *e = (U8 *) SvEND(src); |
b536bf57 |
381 | SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ |
cc836e95 |
382 | |
383 | /* |
7f0d54d7 |
384 | * PerlIO check -- we assume the object is of PerlIO if renewed |
cc836e95 |
385 | */ |
386 | int renewed = 0; |
387 | dSP; ENTER; SAVETMPS; |
388 | PUSHMARK(sp); |
389 | XPUSHs(obj); |
390 | PUTBACK; |
391 | if (call_method("renewed",G_SCALAR) == 1) { |
392 | SPAGAIN; |
393 | renewed = POPi; |
394 | PUTBACK; |
395 | #if 0 |
396 | fprintf(stderr, "renewed == %d\n", renewed); |
397 | #endif |
cc836e95 |
398 | } |
399 | FREETMPS; LEAVE; |
400 | /* end PerlIO check */ |
401 | |
ab3374e4 |
402 | if (SvUTF8(src)) { |
403 | s = utf8_to_bytes(s,&slen); |
404 | if (s) { |
405 | SvCUR_set(src,slen); |
406 | SvUTF8_off(src); |
407 | e = s+slen; |
408 | } |
409 | else { |
410 | croak("Cannot decode string with wide characters"); |
411 | } |
412 | } |
7f0d54d7 |
413 | |
414 | s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), renewed); |
ab3374e4 |
415 | |
416 | /* Clear out translated part of source unless asked not to */ |
417 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
418 | slen = e-s; |
419 | if (slen) { |
420 | sv_setpvn(src, (char*)s, slen); |
421 | } |
422 | SvCUR_set(src, slen); |
423 | } |
424 | SvUTF8_on(dst); |
425 | ST(0) = sv_2mortal(dst); |
426 | XSRETURN(1); |
427 | } |
428 | |
429 | void |
b536bf57 |
430 | Method_encode_xs(obj,src,check = 0) |
ab3374e4 |
431 | SV * obj |
432 | SV * src |
433 | int check |
434 | CODE: |
435 | { |
436 | STRLEN slen; |
437 | U8 *s = (U8 *) SvPV(src, slen); |
438 | U8 *e = (U8 *) SvEND(src); |
b536bf57 |
439 | SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ |
ab3374e4 |
440 | if (SvUTF8(src)) { |
7f0d54d7 |
441 | /* Already encoded */ |
442 | if (strict_utf8(aTHX_ obj)) { |
443 | s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0); |
444 | } |
445 | else { |
446 | /* trust it and just copy the octets */ |
447 | sv_setpvn(dst,(char *)s,(e-s)); |
448 | s = e; |
449 | } |
ab3374e4 |
450 | } |
451 | else { |
452 | /* Native bytes - can always encode */ |
b536bf57 |
453 | U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ |
ab3374e4 |
454 | while (s < e) { |
455 | UV uv = NATIVE_TO_UNI((UV) *s++); |
456 | if (UNI_IS_INVARIANT(uv)) |
457 | *d++ = (U8)UTF_TO_NATIVE(uv); |
458 | else { |
459 | *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); |
460 | *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); |
461 | } |
462 | } |
463 | SvCUR_set(dst, d- (U8 *)SvPVX(dst)); |
464 | *SvEND(dst) = '\0'; |
465 | } |
466 | |
467 | /* Clear out translated part of source unless asked not to */ |
468 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
469 | slen = e-s; |
470 | if (slen) { |
471 | sv_setpvn(src, (char*)s, slen); |
472 | } |
473 | SvCUR_set(src, slen); |
474 | } |
475 | SvPOK_only(dst); |
476 | SvUTF8_off(dst); |
477 | ST(0) = sv_2mortal(dst); |
478 | XSRETURN(1); |
479 | } |
480 | |
50d26985 |
481 | MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ |
2f2b4ff2 |
482 | |
483 | PROTOTYPES: ENABLE |
484 | |
485 | void |
a0d8a30e |
486 | Method_renew(obj) |
487 | SV * obj |
488 | CODE: |
489 | { |
490 | XSRETURN(1); |
491 | } |
492 | |
cc836e95 |
493 | int |
494 | Method_renewed(obj) |
495 | SV * obj |
496 | CODE: |
497 | RETVAL = 0; |
498 | OUTPUT: |
499 | RETVAL |
500 | |
a0d8a30e |
501 | void |
0a95303c |
502 | Method_name(obj) |
503 | SV * obj |
504 | CODE: |
85982a32 |
505 | { |
506 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
507 | ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); |
508 | XSRETURN(1); |
509 | } |
0a95303c |
510 | |
511 | void |
220e2d4e |
512 | Method_cat_decode(obj, dst, src, off, term, check = 0) |
513 | SV * obj |
514 | SV * dst |
515 | SV * src |
516 | SV * off |
517 | SV * term |
518 | int check |
519 | CODE: |
520 | { |
521 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
522 | STRLEN offset = (STRLEN)SvIV(off); |
523 | int code = 0; |
524 | if (SvUTF8(src)) { |
525 | sv_utf8_downgrade(src, FALSE); |
526 | } |
527 | sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, |
528 | &offset, term, &code)); |
b162af07 |
529 | SvIV_set(off, (IV)offset); |
220e2d4e |
530 | if (code == ENCODE_FOUND_TERM) { |
531 | ST(0) = &PL_sv_yes; |
532 | }else{ |
533 | ST(0) = &PL_sv_no; |
534 | } |
535 | XSRETURN(1); |
536 | } |
537 | |
538 | void |
b2704119 |
539 | Method_decode(obj,src,check = 0) |
2f2b4ff2 |
540 | SV * obj |
541 | SV * src |
b2704119 |
542 | int check |
2f2b4ff2 |
543 | CODE: |
aae85ceb |
544 | { |
85982a32 |
545 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
ab3374e4 |
546 | if (SvUTF8(src)) { |
547 | sv_utf8_downgrade(src, FALSE); |
548 | } |
220e2d4e |
549 | ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, |
550 | NULL, Nullsv, NULL); |
85982a32 |
551 | SvUTF8_on(ST(0)); |
aae85ceb |
552 | XSRETURN(1); |
553 | } |
554 | |
555 | void |
85982a32 |
556 | Method_encode(obj,src,check = 0) |
aae85ceb |
557 | SV * obj |
85982a32 |
558 | SV * src |
559 | int check |
aae85ceb |
560 | CODE: |
561 | { |
85982a32 |
562 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
563 | sv_utf8_upgrade(src); |
220e2d4e |
564 | ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, |
565 | NULL, Nullsv, NULL); |
aae85ceb |
566 | XSRETURN(1); |
567 | } |
568 | |
0ab8f81e |
569 | void |
570 | Method_needs_lines(obj) |
571 | SV * obj |
572 | CODE: |
573 | { |
b32afa7c |
574 | /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ |
0ab8f81e |
575 | ST(0) = &PL_sv_no; |
576 | XSRETURN(1); |
577 | } |
578 | |
579 | void |
580 | Method_perlio_ok(obj) |
581 | SV * obj |
582 | CODE: |
583 | { |
b32afa7c |
584 | /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ |
10c5ecbb |
585 | /* require_pv(PERLIO_FILENAME); */ |
586 | |
587 | eval_pv("require PerlIO::encoding", 0); |
588 | |
589 | if (SvTRUE(get_sv("@", 0))) { |
0ab8f81e |
590 | ST(0) = &PL_sv_no; |
10c5ecbb |
591 | }else{ |
592 | ST(0) = &PL_sv_yes; |
0ab8f81e |
593 | } |
594 | XSRETURN(1); |
595 | } |
596 | |
67e989fb |
597 | MODULE = Encode PACKAGE = Encode |
2c674647 |
598 | |
599 | PROTOTYPES: ENABLE |
600 | |
67e989fb |
601 | I32 |
2c674647 |
602 | _bytes_to_utf8(sv, ...) |
85982a32 |
603 | SV * sv |
604 | CODE: |
605 | { |
606 | SV * encoding = items == 2 ? ST(1) : Nullsv; |
c6a7db43 |
607 | |
85982a32 |
608 | if (encoding) |
609 | RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); |
610 | else { |
611 | STRLEN len; |
612 | U8* s = (U8*)SvPV(sv, len); |
613 | U8* converted; |
614 | |
615 | converted = bytes_to_utf8(s, &len); /* This allocs */ |
616 | sv_setpvn(sv, (char *)converted, len); |
617 | SvUTF8_on(sv); /* XXX Should we? */ |
618 | Safefree(converted); /* ... so free it */ |
619 | RETVAL = len; |
620 | } |
621 | } |
622 | OUTPUT: |
623 | RETVAL |
2c674647 |
624 | |
67e989fb |
625 | I32 |
2c674647 |
626 | _utf8_to_bytes(sv, ...) |
85982a32 |
627 | SV * sv |
628 | CODE: |
629 | { |
630 | SV * to = items > 1 ? ST(1) : Nullsv; |
631 | SV * check = items > 2 ? ST(2) : Nullsv; |
632 | |
633 | if (to) { |
634 | RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); |
635 | } else { |
636 | STRLEN len; |
637 | U8 *s = (U8*)SvPV(sv, len); |
638 | |
639 | RETVAL = 0; |
640 | if (SvTRUE(check)) { |
641 | /* Must do things the slow way */ |
642 | U8 *dest; |
643 | /* We need a copy to pass to check() */ |
c6a7db43 |
644 | U8 *src = (U8*)savepv((char *)s); |
85982a32 |
645 | U8 *send = s + len; |
646 | |
647 | New(83, dest, len, U8); /* I think */ |
648 | |
649 | while (s < send) { |
650 | if (*s < 0x80){ |
651 | *dest++ = *s++; |
652 | } else { |
653 | STRLEN ulen; |
654 | UV uv = *s++; |
655 | |
656 | /* Have to do it all ourselves because of error routine, |
657 | aargh. */ |
658 | if (!(uv & 0x40)){ goto failure; } |
659 | if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } |
660 | else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } |
661 | else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } |
662 | else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } |
663 | else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } |
664 | else if (!(uv & 0x01)) { ulen = 7; uv = 0; } |
665 | else { ulen = 13; uv = 0; } |
87714904 |
666 | |
85982a32 |
667 | /* Note change to utf8.c variable naming, for variety */ |
668 | while (ulen--) { |
c6a7db43 |
669 | if ((*s & 0xc0) != 0x80){ |
670 | goto failure; |
85982a32 |
671 | } else { |
672 | uv = (uv << 6) | (*s++ & 0x3f); |
673 | } |
87714904 |
674 | } |
67e989fb |
675 | if (uv > 256) { |
676 | failure: |
85982a32 |
677 | call_failure(check, s, dest, src); |
678 | /* Now what happens? */ |
67e989fb |
679 | } |
680 | *dest++ = (U8)uv; |
85982a32 |
681 | } |
682 | } |
683 | } else { |
684 | RETVAL = (utf8_to_bytes(s, &len) ? len : 0); |
2c674647 |
685 | } |
85982a32 |
686 | } |
687 | } |
688 | OUTPUT: |
689 | RETVAL |
2c674647 |
690 | |
2c674647 |
691 | bool |
b2704119 |
692 | is_utf8(sv, check = 0) |
4411f3b6 |
693 | SV * sv |
b2704119 |
694 | int check |
85982a32 |
695 | CODE: |
696 | { |
697 | if (SvGMAGICAL(sv)) /* it could be $1, for example */ |
698 | sv = newSVsv(sv); /* GMAGIG will be done */ |
699 | if (SvPOK(sv)) { |
700 | RETVAL = SvUTF8(sv) ? TRUE : FALSE; |
701 | if (RETVAL && |
702 | check && |
703 | !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) |
2c674647 |
704 | RETVAL = FALSE; |
85982a32 |
705 | } else { |
706 | RETVAL = FALSE; |
707 | } |
708 | if (sv != ST(0)) |
709 | SvREFCNT_dec(sv); /* it was a temp copy */ |
710 | } |
711 | OUTPUT: |
712 | RETVAL |
2c674647 |
713 | |
714 | SV * |
4411f3b6 |
715 | _utf8_on(sv) |
85982a32 |
716 | SV * sv |
717 | CODE: |
718 | { |
719 | if (SvPOK(sv)) { |
720 | SV *rsv = newSViv(SvUTF8(sv)); |
721 | RETVAL = rsv; |
722 | SvUTF8_on(sv); |
723 | } else { |
724 | RETVAL = &PL_sv_undef; |
725 | } |
726 | } |
727 | OUTPUT: |
728 | RETVAL |
2c674647 |
729 | |
730 | SV * |
4411f3b6 |
731 | _utf8_off(sv) |
85982a32 |
732 | SV * sv |
733 | CODE: |
734 | { |
735 | if (SvPOK(sv)) { |
736 | SV *rsv = newSViv(SvUTF8(sv)); |
737 | RETVAL = rsv; |
738 | SvUTF8_off(sv); |
739 | } else { |
740 | RETVAL = &PL_sv_undef; |
741 | } |
742 | } |
743 | OUTPUT: |
744 | RETVAL |
745 | |
85982a32 |
746 | int |
747 | DIE_ON_ERR() |
748 | CODE: |
749 | RETVAL = ENCODE_DIE_ON_ERR; |
750 | OUTPUT: |
751 | RETVAL |
752 | |
c6a7db43 |
753 | int |
85982a32 |
754 | WARN_ON_ERR() |
755 | CODE: |
756 | RETVAL = ENCODE_WARN_ON_ERR; |
757 | OUTPUT: |
758 | RETVAL |
759 | |
760 | int |
761 | LEAVE_SRC() |
762 | CODE: |
763 | RETVAL = ENCODE_LEAVE_SRC; |
764 | OUTPUT: |
765 | RETVAL |
766 | |
767 | int |
768 | RETURN_ON_ERR() |
769 | CODE: |
770 | RETVAL = ENCODE_RETURN_ON_ERR; |
771 | OUTPUT: |
772 | RETVAL |
773 | |
774 | int |
775 | PERLQQ() |
776 | CODE: |
777 | RETVAL = ENCODE_PERLQQ; |
778 | OUTPUT: |
779 | RETVAL |
780 | |
781 | int |
af1f55d9 |
782 | HTMLCREF() |
783 | CODE: |
784 | RETVAL = ENCODE_HTMLCREF; |
785 | OUTPUT: |
786 | RETVAL |
787 | |
788 | int |
789 | XMLCREF() |
790 | CODE: |
791 | RETVAL = ENCODE_XMLCREF; |
792 | OUTPUT: |
793 | RETVAL |
794 | |
795 | int |
56ff7374 |
796 | STOP_AT_PARTIAL() |
797 | CODE: |
798 | RETVAL = ENCODE_STOP_AT_PARTIAL; |
799 | OUTPUT: |
800 | RETVAL |
801 | |
802 | int |
85982a32 |
803 | FB_DEFAULT() |
804 | CODE: |
805 | RETVAL = ENCODE_FB_DEFAULT; |
806 | OUTPUT: |
807 | RETVAL |
808 | |
809 | int |
810 | FB_CROAK() |
811 | CODE: |
812 | RETVAL = ENCODE_FB_CROAK; |
813 | OUTPUT: |
814 | RETVAL |
815 | |
816 | int |
817 | FB_QUIET() |
818 | CODE: |
819 | RETVAL = ENCODE_FB_QUIET; |
820 | OUTPUT: |
821 | RETVAL |
822 | |
823 | int |
824 | FB_WARN() |
825 | CODE: |
826 | RETVAL = ENCODE_FB_WARN; |
827 | OUTPUT: |
828 | RETVAL |
829 | |
830 | int |
831 | FB_PERLQQ() |
832 | CODE: |
833 | RETVAL = ENCODE_FB_PERLQQ; |
834 | OUTPUT: |
835 | RETVAL |
2c674647 |
836 | |
af1f55d9 |
837 | int |
838 | FB_HTMLCREF() |
839 | CODE: |
840 | RETVAL = ENCODE_FB_HTMLCREF; |
841 | OUTPUT: |
842 | RETVAL |
843 | |
844 | int |
845 | FB_XMLCREF() |
846 | CODE: |
847 | RETVAL = ENCODE_FB_XMLCREF; |
848 | OUTPUT: |
849 | RETVAL |
850 | |
33af2bc7 |
851 | BOOT: |
852 | { |
85982a32 |
853 | #include "def_t.h" |
e7cbefb8 |
854 | #include "def_t.exh" |
33af2bc7 |
855 | } |