Commit | Line | Data |
85982a32 |
1 | /* |
b9627ca0 |
2 | $Id: Encode.xs,v 2.18 2009/11/26 09:23:59 dankogai Exp dankogai $ |
85982a32 |
3 | */ |
4 | |
fcf2db38 |
5 | #define PERL_NO_GET_CONTEXT |
2c674647 |
6 | #include "EXTERN.h" |
7 | #include "perl.h" |
8 | #include "XSUB.h" |
2f2b4ff2 |
9 | #define U8 U8 |
10 | #include "encode.h" |
10c5ecbb |
11 | |
12 | # define PERLIO_MODNAME "PerlIO::encoding" |
0ab8f81e |
13 | # define PERLIO_FILENAME "PerlIO/encoding.pm" |
aae85ceb |
14 | |
85982a32 |
15 | /* set 1 or more to profile. t/encoding.t dumps core because of |
16 | Perl_warner and PerlIO don't work well */ |
c6a7db43 |
17 | #define ENCODE_XS_PROFILE 0 |
39cf9a5e |
18 | |
85982a32 |
19 | /* set 0 to disable floating point to calculate buffer size for |
20 | encode_method(). 1 is recommended. 2 restores NI-S original */ |
c6a7db43 |
21 | #define ENCODE_XS_USEFP 1 |
39cf9a5e |
22 | |
fcf2db38 |
23 | #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ |
2f5768b8 |
24 | Perl_croak(aTHX_ "panic_unimplemented"); \ |
d1256cb1 |
25 | return (y)0; /* fool picky compilers */ \ |
87714904 |
26 | } |
85982a32 |
27 | /**/ |
011b2d2f |
28 | |
7e9a885a |
29 | UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) |
30 | UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) |
33af2bc7 |
31 | |
7f0d54d7 |
32 | #define UTF8_ALLOW_STRICT 0 |
33 | #define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \ |
34 | ~(UTF8_ALLOW_CONTINUATION | \ |
35 | UTF8_ALLOW_NON_CONTINUATION | \ |
36 | UTF8_ALLOW_LONG)) |
37 | |
b0b300a3 |
38 | void |
aa0053b7 |
39 | Encode_XSEncoding(pTHX_ encode_t * enc) |
2f2b4ff2 |
40 | { |
aa0053b7 |
41 | dSP; |
42 | HV *stash = gv_stashpv("Encode::XS", TRUE); |
43 | SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash); |
44 | int i = 0; |
45 | PUSHMARK(sp); |
46 | XPUSHs(sv); |
47 | while (enc->name[i]) { |
d1256cb1 |
48 | const char *name = enc->name[i++]; |
49 | XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); |
aa0053b7 |
50 | } |
51 | PUTBACK; |
52 | call_pv("Encode::define_encoding", G_DISCARD); |
53 | SvREFCNT_dec(sv); |
2f2b4ff2 |
54 | } |
55 | |
aa0053b7 |
56 | void |
57 | call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) |
58 | { |
85982a32 |
59 | /* Exists for breakpointing */ |
aa0053b7 |
60 | } |
67e989fb |
61 | |
85982a32 |
62 | |
2fc614e0 |
63 | #define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" |
64 | #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" |
65 | |
2f2b4ff2 |
66 | static SV * |
370462a2 |
67 | do_fallback_cb(pTHX_ UV ch, SV *fallback_cb) |
8e180e82 |
68 | { |
69 | dSP; |
70 | int argc; |
b2deda17 |
71 | SV *retval = newSVpv("",0); |
8e180e82 |
72 | ENTER; |
73 | SAVETMPS; |
74 | PUSHMARK(sp); |
75 | XPUSHs(sv_2mortal(newSVnv((UV)ch))); |
76 | PUTBACK; |
77 | argc = call_sv(fallback_cb, G_SCALAR); |
78 | SPAGAIN; |
79 | if (argc != 1){ |
370462a2 |
80 | croak("fallback sub must return scalar!"); |
8e180e82 |
81 | } |
b2deda17 |
82 | sv_catsv(retval, POPs); |
8e180e82 |
83 | PUTBACK; |
84 | FREETMPS; |
85 | LEAVE; |
86 | return retval; |
87 | } |
88 | |
89 | static SV * |
0629a5b3 |
90 | encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, |
370462a2 |
91 | int check, STRLEN * offset, SV * term, int * retcode, |
92 | SV *fallback_cb) |
2f2b4ff2 |
93 | { |
aa0053b7 |
94 | STRLEN slen; |
95 | U8 *s = (U8 *) SvPV(src, slen); |
3aececda |
96 | STRLEN tlen = slen; |
97 | STRLEN ddone = 0; |
98 | STRLEN sdone = 0; |
39cf9a5e |
99 | |
3c49ab08 |
100 | /* We allocate slen+1. |
85982a32 |
101 | PerlIO dumps core if this value is smaller than this. */ |
3c49ab08 |
102 | SV *dst = sv_2mortal(newSV(slen+1)); |
85982a32 |
103 | U8 *d = (U8 *)SvPVX(dst); |
104 | STRLEN dlen = SvLEN(dst)-1; |
220e2d4e |
105 | int code = 0; |
106 | STRLEN trmlen = 0; |
cc7dbc11 |
107 | U8 *trm = term ? (U8*) SvPV(term, trmlen) : NULL; |
220e2d4e |
108 | |
109 | if (offset) { |
110 | s += *offset; |
6be7c101 |
111 | if (slen > *offset){ /* safeguard against slen overflow */ |
d1256cb1 |
112 | slen -= *offset; |
6be7c101 |
113 | }else{ |
d1256cb1 |
114 | slen = 0; |
6be7c101 |
115 | } |
220e2d4e |
116 | tlen = slen; |
117 | } |
85982a32 |
118 | |
6be7c101 |
119 | if (slen == 0){ |
d1256cb1 |
120 | SvCUR_set(dst, 0); |
121 | SvPOK_only(dst); |
122 | goto ENCODE_END; |
85982a32 |
123 | } |
124 | |
220e2d4e |
125 | while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check, |
d1256cb1 |
126 | trm, trmlen)) ) |
85982a32 |
127 | { |
d1256cb1 |
128 | SvCUR_set(dst, dlen+ddone); |
129 | SvPOK_only(dst); |
130 | |
131 | if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || |
132 | code == ENCODE_FOUND_TERM) { |
133 | break; |
134 | } |
135 | switch (code) { |
136 | case ENCODE_NOSPACE: |
137 | { |
138 | STRLEN more = 0; /* make sure you initialize! */ |
139 | STRLEN sleft; |
140 | sdone += slen; |
141 | ddone += dlen; |
142 | sleft = tlen - sdone; |
fcb875d4 |
143 | #if ENCODE_XS_PROFILE >= 2 |
d1256cb1 |
144 | Perl_warn(aTHX_ |
145 | "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", |
146 | more, sdone, sleft, SvLEN(dst)); |
fcb875d4 |
147 | #endif |
d1256cb1 |
148 | if (sdone != 0) { /* has src ever been processed ? */ |
39cf9a5e |
149 | #if ENCODE_XS_USEFP == 2 |
d1256cb1 |
150 | more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone |
151 | - SvLEN(dst); |
39cf9a5e |
152 | #elif ENCODE_XS_USEFP |
d1256cb1 |
153 | more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); |
39cf9a5e |
154 | #else |
d1256cb1 |
155 | /* safe until SvLEN(dst) == MAX_INT/16 */ |
156 | more = (16*SvLEN(dst)+1)/sdone/16 * sleft; |
39cf9a5e |
157 | #endif |
d1256cb1 |
158 | } |
159 | more += UTF8_MAXLEN; /* insurance policy */ |
160 | d = (U8 *) SvGROW(dst, SvLEN(dst) + more); |
161 | /* dst need to grow need MORE bytes! */ |
162 | if (ddone >= SvLEN(dst)) { |
163 | Perl_croak(aTHX_ "Destination couldn't be grown."); |
164 | } |
165 | dlen = SvLEN(dst)-ddone-1; |
166 | d += ddone; |
167 | s += slen; |
168 | slen = tlen-sdone; |
169 | continue; |
170 | } |
171 | case ENCODE_NOREP: |
172 | /* encoding */ |
173 | if (dir == enc->f_utf8) { |
174 | STRLEN clen; |
175 | UV ch = |
176 | utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), |
177 | &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); |
178 | /* if non-representable multibyte prefix at end of current buffer - break*/ |
179 | if (clen > tlen - sdone) break; |
180 | if (check & ENCODE_DIE_ON_ERR) { |
181 | Perl_croak(aTHX_ ERR_ENCODE_NOMAP, |
182 | (UV)ch, enc->name[0]); |
183 | return &PL_sv_undef; /* never reaches but be safe */ |
184 | } |
185 | if (check & ENCODE_WARN_ON_ERR){ |
186 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
187 | ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); |
188 | } |
189 | if (check & ENCODE_RETURN_ON_ERR){ |
190 | goto ENCODE_SET_SRC; |
191 | } |
192 | if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ |
193 | SV* subchar = |
370462a2 |
194 | (fallback_cb != &PL_sv_undef) |
195 | ? do_fallback_cb(aTHX_ ch, fallback_cb) |
196 | : newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : |
d1256cb1 |
197 | check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : |
198 | "&#x%" UVxf ";", (UV)ch); |
b2deda17 |
199 | SvUTF8_off(subchar); /* make sure no decoded string gets in */ |
d1256cb1 |
200 | sdone += slen + clen; |
201 | ddone += dlen + SvCUR(subchar); |
202 | sv_catsv(dst, subchar); |
203 | SvREFCNT_dec(subchar); |
204 | } else { |
205 | /* fallback char */ |
206 | sdone += slen + clen; |
207 | ddone += dlen + enc->replen; |
208 | sv_catpvn(dst, (char*)enc->rep, enc->replen); |
209 | } |
210 | } |
211 | /* decoding */ |
212 | else { |
213 | if (check & ENCODE_DIE_ON_ERR){ |
214 | Perl_croak(aTHX_ ERR_DECODE_NOMAP, |
436c6dd3 |
215 | enc->name[0], (UV)s[slen]); |
d1256cb1 |
216 | return &PL_sv_undef; /* never reaches but be safe */ |
217 | } |
218 | if (check & ENCODE_WARN_ON_ERR){ |
219 | Perl_warner( |
220 | aTHX_ packWARN(WARN_UTF8), |
221 | ERR_DECODE_NOMAP, |
436c6dd3 |
222 | enc->name[0], (UV)s[slen]); |
d1256cb1 |
223 | } |
224 | if (check & ENCODE_RETURN_ON_ERR){ |
225 | goto ENCODE_SET_SRC; |
226 | } |
227 | if (check & |
228 | (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ |
229 | SV* subchar = |
370462a2 |
230 | (fallback_cb != &PL_sv_undef) |
231 | ? do_fallback_cb(aTHX_ (UV)s[slen], fallback_cb) |
232 | : newSVpvf("\\x%02" UVXf, (UV)s[slen]); |
d1256cb1 |
233 | sdone += slen + 1; |
234 | ddone += dlen + SvCUR(subchar); |
235 | sv_catsv(dst, subchar); |
236 | SvREFCNT_dec(subchar); |
237 | } else { |
238 | sdone += slen + 1; |
239 | ddone += dlen + strlen(FBCHAR_UTF8); |
240 | sv_catpv(dst, FBCHAR_UTF8); |
241 | } |
242 | } |
243 | /* settle variables when fallback */ |
244 | d = (U8 *)SvEND(dst); |
b0b300a3 |
245 | dlen = SvLEN(dst) - ddone - 1; |
d1256cb1 |
246 | s = (U8*)SvPVX(src) + sdone; |
247 | slen = tlen - sdone; |
248 | break; |
249 | |
250 | default: |
251 | Perl_croak(aTHX_ "Unexpected code %d converting %s %s", |
252 | code, (dir == enc->f_utf8) ? "to" : "from", |
253 | enc->name[0]); |
254 | return &PL_sv_undef; |
255 | } |
85982a32 |
256 | } |
257 | ENCODE_SET_SRC: |
ca777f1c |
258 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
d1256cb1 |
259 | sdone = SvCUR(src) - (slen+sdone); |
260 | if (sdone) { |
261 | sv_setpvn(src, (char*)s+slen, sdone); |
262 | } |
263 | SvCUR_set(src, sdone); |
2f2b4ff2 |
264 | } |
85982a32 |
265 | /* warn("check = 0x%X, code = 0x%d\n", check, code); */ |
c6a7db43 |
266 | |
85982a32 |
267 | SvCUR_set(dst, dlen+ddone); |
268 | SvPOK_only(dst); |
c6a7db43 |
269 | |
39cf9a5e |
270 | #if ENCODE_XS_PROFILE |
271 | if (SvCUR(dst) > SvCUR(src)){ |
d1256cb1 |
272 | Perl_warn(aTHX_ |
273 | "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", |
274 | SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), |
275 | (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); |
39cf9a5e |
276 | } |
3c49ab08 |
277 | #endif |
c6a7db43 |
278 | |
220e2d4e |
279 | if (offset) |
280 | *offset += sdone + slen; |
281 | |
85982a32 |
282 | ENCODE_END: |
0b3236bb |
283 | *SvEND(dst) = '\0'; |
220e2d4e |
284 | if (retcode) *retcode = code; |
aa0053b7 |
285 | return dst; |
2f2b4ff2 |
286 | } |
287 | |
7f0d54d7 |
288 | static bool |
289 | strict_utf8(pTHX_ SV* sv) |
290 | { |
291 | HV* hv; |
292 | SV** svp; |
293 | sv = SvRV(sv); |
294 | if (!sv || SvTYPE(sv) != SVt_PVHV) |
295 | return 0; |
296 | hv = (HV*)sv; |
297 | svp = hv_fetch(hv, "strict_utf8", 11, 0); |
298 | if (!svp) |
299 | return 0; |
300 | return SvTRUE(*svp); |
301 | } |
302 | |
303 | static U8* |
b9627ca0 |
304 | process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, |
7f0d54d7 |
305 | bool encode, bool strict, bool stop_at_partial) |
306 | { |
307 | UV uv; |
308 | STRLEN ulen; |
b9627ca0 |
309 | SV *fallback_cb; |
310 | int check; |
311 | |
312 | if (SvROK(check_sv)) { |
313 | /* croak("UTF-8 decoder doesn't support callback CHECK"); */ |
314 | fallback_cb = check_sv; |
315 | check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as perlqq */ |
316 | } |
317 | else { |
318 | fallback_cb = &PL_sv_undef; |
319 | check = SvIV(check_sv); |
320 | } |
7f0d54d7 |
321 | |
322 | SvPOK_only(dst); |
323 | SvCUR_set(dst,0); |
324 | |
325 | while (s < e) { |
326 | if (UTF8_IS_INVARIANT(*s)) { |
327 | sv_catpvn(dst, (char *)s, 1); |
328 | s++; |
329 | continue; |
330 | } |
331 | |
332 | if (UTF8_IS_START(*s)) { |
333 | U8 skip = UTF8SKIP(s); |
334 | if ((s + skip) > e) { |
335 | /* Partial character */ |
336 | /* XXX could check that rest of bytes are UTF8_IS_CONTINUATION(ch) */ |
56ff7374 |
337 | if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) |
7f0d54d7 |
338 | break; |
339 | |
340 | goto malformed_byte; |
341 | } |
342 | |
343 | uv = utf8n_to_uvuni(s, e - s, &ulen, |
344 | UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT : |
345 | UTF8_ALLOW_NONSTRICT) |
346 | ); |
347 | #if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */ |
d1256cb1 |
348 | if (strict && uv > PERL_UNICODE_MAX) |
1b9a72a9 |
349 | ulen = (STRLEN) -1; |
7f0d54d7 |
350 | #endif |
351 | if (ulen == -1) { |
352 | if (strict) { |
353 | uv = utf8n_to_uvuni(s, e - s, &ulen, |
354 | UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT); |
355 | if (ulen == -1) |
356 | goto malformed_byte; |
357 | goto malformed; |
358 | } |
359 | goto malformed_byte; |
360 | } |
361 | |
362 | |
363 | /* Whole char is good */ |
364 | sv_catpvn(dst,(char *)s,skip); |
365 | s += skip; |
366 | continue; |
367 | } |
368 | |
369 | /* If we get here there is something wrong with alleged UTF-8 */ |
370 | malformed_byte: |
371 | uv = (UV)*s; |
372 | ulen = 1; |
373 | |
374 | malformed: |
375 | if (check & ENCODE_DIE_ON_ERR){ |
376 | if (encode) |
377 | Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8"); |
378 | else |
379 | Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv); |
380 | } |
381 | if (check & ENCODE_WARN_ON_ERR){ |
382 | if (encode) |
383 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
384 | ERR_ENCODE_NOMAP, uv, "utf8"); |
385 | else |
386 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
387 | ERR_DECODE_NOMAP, "utf8", uv); |
388 | } |
389 | if (check & ENCODE_RETURN_ON_ERR) { |
390 | break; |
391 | } |
392 | if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ |
b9627ca0 |
393 | SV* subchar = |
394 | (fallback_cb != &PL_sv_undef) |
395 | ? do_fallback_cb(aTHX_ uv, fallback_cb) |
396 | : newSVpvf(check & ENCODE_PERLQQ |
397 | ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}") |
398 | : check & ENCODE_HTMLCREF ? "&#%" UVuf ";" |
399 | : "&#x%" UVxf ";", uv); |
400 | if (encode){ |
401 | SvUTF8_off(subchar); /* make sure no decoded string gets in */ |
402 | } |
7f0d54d7 |
403 | sv_catsv(dst, subchar); |
404 | SvREFCNT_dec(subchar); |
405 | } else { |
406 | sv_catpv(dst, FBCHAR_UTF8); |
407 | } |
408 | s += ulen; |
409 | } |
410 | *SvEND(dst) = '\0'; |
411 | |
412 | return s; |
413 | } |
414 | |
415 | |
ab3374e4 |
416 | MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ |
417 | |
a0d8a30e |
418 | PROTOTYPES: DISABLE |
419 | |
420 | void |
b2deda17 |
421 | Method_decode_xs(obj,src,check_sv = &PL_sv_no) |
ab3374e4 |
422 | SV * obj |
423 | SV * src |
b2deda17 |
424 | SV * check_sv |
e83d9f5f |
425 | PREINIT: |
426 | STRLEN slen; |
427 | U8 *s; |
428 | U8 *e; |
429 | SV *dst; |
430 | bool renewed = 0; |
b2deda17 |
431 | int check; |
ab3374e4 |
432 | CODE: |
433 | { |
e83d9f5f |
434 | dSP; ENTER; SAVETMPS; |
435 | if (src == &PL_sv_undef) src = newSV(0); |
436 | s = (U8 *) SvPV(src, slen); |
437 | e = (U8 *) SvEND(src); |
438 | dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ |
b9627ca0 |
439 | check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV(check_sv); |
cc836e95 |
440 | /* |
7f0d54d7 |
441 | * PerlIO check -- we assume the object is of PerlIO if renewed |
cc836e95 |
442 | */ |
cc836e95 |
443 | PUSHMARK(sp); |
444 | XPUSHs(obj); |
445 | PUTBACK; |
446 | if (call_method("renewed",G_SCALAR) == 1) { |
d1256cb1 |
447 | SPAGAIN; |
448 | renewed = (bool)POPi; |
449 | PUTBACK; |
cc836e95 |
450 | #if 0 |
d1256cb1 |
451 | fprintf(stderr, "renewed == %d\n", renewed); |
cc836e95 |
452 | #endif |
cc836e95 |
453 | } |
454 | FREETMPS; LEAVE; |
455 | /* end PerlIO check */ |
456 | |
ab3374e4 |
457 | if (SvUTF8(src)) { |
d1256cb1 |
458 | s = utf8_to_bytes(s,&slen); |
459 | if (s) { |
460 | SvCUR_set(src,slen); |
461 | SvUTF8_off(src); |
462 | e = s+slen; |
463 | } |
464 | else { |
465 | croak("Cannot decode string with wide characters"); |
466 | } |
ab3374e4 |
467 | } |
7f0d54d7 |
468 | |
b9627ca0 |
469 | s = process_utf8(aTHX_ dst, s, e, check_sv, 0, strict_utf8(aTHX_ obj), renewed); |
ab3374e4 |
470 | |
471 | /* Clear out translated part of source unless asked not to */ |
472 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
d1256cb1 |
473 | slen = e-s; |
474 | if (slen) { |
475 | sv_setpvn(src, (char*)s, slen); |
476 | } |
477 | SvCUR_set(src, slen); |
ab3374e4 |
478 | } |
479 | SvUTF8_on(dst); |
480 | ST(0) = sv_2mortal(dst); |
481 | XSRETURN(1); |
482 | } |
483 | |
484 | void |
b2deda17 |
485 | Method_encode_xs(obj,src,check_sv = &PL_sv_no) |
ab3374e4 |
486 | SV * obj |
487 | SV * src |
b2deda17 |
488 | SV * check_sv |
e83d9f5f |
489 | PREINIT: |
490 | STRLEN slen; |
491 | U8 *s; |
492 | U8 *e; |
493 | SV *dst; |
494 | bool renewed = 0; |
b2deda17 |
495 | int check; |
ab3374e4 |
496 | CODE: |
497 | { |
b9627ca0 |
498 | check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV(check_sv); |
e83d9f5f |
499 | if (src == &PL_sv_undef) src = newSV(0); |
500 | s = (U8 *) SvPV(src, slen); |
501 | e = (U8 *) SvEND(src); |
502 | dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ |
ab3374e4 |
503 | if (SvUTF8(src)) { |
d1256cb1 |
504 | /* Already encoded */ |
505 | if (strict_utf8(aTHX_ obj)) { |
b9627ca0 |
506 | s = process_utf8(aTHX_ dst, s, e, check_sv, 1, 1, 0); |
d1256cb1 |
507 | } |
7f0d54d7 |
508 | else { |
509 | /* trust it and just copy the octets */ |
510 | sv_setpvn(dst,(char *)s,(e-s)); |
d1256cb1 |
511 | s = e; |
7f0d54d7 |
512 | } |
ab3374e4 |
513 | } |
514 | else { |
515 | /* Native bytes - can always encode */ |
d1256cb1 |
516 | U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ |
ab3374e4 |
517 | while (s < e) { |
cf0d1c66 |
518 | UV uv = NATIVE_TO_UNI((UV) *s); |
519 | s++; /* Above expansion of NATIVE_TO_UNI() is safer this way. */ |
ab3374e4 |
520 | if (UNI_IS_INVARIANT(uv)) |
521 | *d++ = (U8)UTF_TO_NATIVE(uv); |
522 | else { |
523 | *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); |
524 | *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); |
525 | } |
d1256cb1 |
526 | } |
ab3374e4 |
527 | SvCUR_set(dst, d- (U8 *)SvPVX(dst)); |
528 | *SvEND(dst) = '\0'; |
529 | } |
530 | |
531 | /* Clear out translated part of source unless asked not to */ |
532 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
d1256cb1 |
533 | slen = e-s; |
534 | if (slen) { |
535 | sv_setpvn(src, (char*)s, slen); |
536 | } |
537 | SvCUR_set(src, slen); |
ab3374e4 |
538 | } |
539 | SvPOK_only(dst); |
540 | SvUTF8_off(dst); |
541 | ST(0) = sv_2mortal(dst); |
542 | XSRETURN(1); |
543 | } |
544 | |
50d26985 |
545 | MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ |
2f2b4ff2 |
546 | |
547 | PROTOTYPES: ENABLE |
548 | |
549 | void |
a0d8a30e |
550 | Method_renew(obj) |
551 | SV * obj |
552 | CODE: |
553 | { |
554 | XSRETURN(1); |
555 | } |
556 | |
cc836e95 |
557 | int |
558 | Method_renewed(obj) |
559 | SV * obj |
560 | CODE: |
561 | RETVAL = 0; |
562 | OUTPUT: |
563 | RETVAL |
564 | |
a0d8a30e |
565 | void |
0a95303c |
566 | Method_name(obj) |
567 | SV * obj |
568 | CODE: |
85982a32 |
569 | { |
570 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
571 | ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); |
572 | XSRETURN(1); |
573 | } |
0a95303c |
574 | |
575 | void |
370462a2 |
576 | Method_cat_decode(obj, dst, src, off, term, check_sv = &PL_sv_no) |
220e2d4e |
577 | SV * obj |
578 | SV * dst |
579 | SV * src |
580 | SV * off |
581 | SV * term |
370462a2 |
582 | SV * check_sv |
220e2d4e |
583 | CODE: |
584 | { |
370462a2 |
585 | int check; |
586 | SV *fallback_cb = &PL_sv_undef; |
220e2d4e |
587 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
588 | STRLEN offset = (STRLEN)SvIV(off); |
589 | int code = 0; |
590 | if (SvUTF8(src)) { |
591 | sv_utf8_downgrade(src, FALSE); |
592 | } |
370462a2 |
593 | if (SvROK(check_sv)){ |
594 | fallback_cb = check_sv; |
595 | check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ |
596 | }else{ |
597 | check = SvIV(check_sv); |
598 | } |
220e2d4e |
599 | sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, |
370462a2 |
600 | &offset, term, &code, fallback_cb)); |
b162af07 |
601 | SvIV_set(off, (IV)offset); |
220e2d4e |
602 | if (code == ENCODE_FOUND_TERM) { |
d1256cb1 |
603 | ST(0) = &PL_sv_yes; |
220e2d4e |
604 | }else{ |
d1256cb1 |
605 | ST(0) = &PL_sv_no; |
220e2d4e |
606 | } |
607 | XSRETURN(1); |
608 | } |
609 | |
610 | void |
8e180e82 |
611 | Method_decode(obj,src,check_sv = &PL_sv_no) |
2f2b4ff2 |
612 | SV * obj |
613 | SV * src |
8e180e82 |
614 | SV * check_sv |
2f2b4ff2 |
615 | CODE: |
aae85ceb |
616 | { |
8e180e82 |
617 | int check; |
370462a2 |
618 | SV *fallback_cb = &PL_sv_undef; |
85982a32 |
619 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
ab3374e4 |
620 | if (SvUTF8(src)) { |
621 | sv_utf8_downgrade(src, FALSE); |
622 | } |
8e180e82 |
623 | if (SvROK(check_sv)){ |
370462a2 |
624 | fallback_cb = check_sv; |
625 | check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ |
8e180e82 |
626 | }else{ |
370462a2 |
627 | check = SvIV(check_sv); |
8e180e82 |
628 | } |
220e2d4e |
629 | ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, |
370462a2 |
630 | NULL, Nullsv, NULL, fallback_cb); |
85982a32 |
631 | SvUTF8_on(ST(0)); |
aae85ceb |
632 | XSRETURN(1); |
633 | } |
634 | |
635 | void |
8e180e82 |
636 | Method_encode(obj,src,check_sv = &PL_sv_no) |
aae85ceb |
637 | SV * obj |
85982a32 |
638 | SV * src |
8e180e82 |
639 | SV * check_sv |
aae85ceb |
640 | CODE: |
641 | { |
8e180e82 |
642 | int check; |
370462a2 |
643 | SV *fallback_cb = &PL_sv_undef; |
85982a32 |
644 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
645 | sv_utf8_upgrade(src); |
8e180e82 |
646 | if (SvROK(check_sv)){ |
370462a2 |
647 | fallback_cb = check_sv; |
648 | check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ |
8e180e82 |
649 | }else{ |
370462a2 |
650 | check = SvIV(check_sv); |
8e180e82 |
651 | } |
220e2d4e |
652 | ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, |
370462a2 |
653 | NULL, Nullsv, NULL, fallback_cb); |
aae85ceb |
654 | XSRETURN(1); |
655 | } |
656 | |
0ab8f81e |
657 | void |
658 | Method_needs_lines(obj) |
659 | SV * obj |
660 | CODE: |
661 | { |
b32afa7c |
662 | /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ |
0ab8f81e |
663 | ST(0) = &PL_sv_no; |
664 | XSRETURN(1); |
665 | } |
666 | |
667 | void |
668 | Method_perlio_ok(obj) |
669 | SV * obj |
670 | CODE: |
671 | { |
b32afa7c |
672 | /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ |
10c5ecbb |
673 | /* require_pv(PERLIO_FILENAME); */ |
674 | |
675 | eval_pv("require PerlIO::encoding", 0); |
676 | |
677 | if (SvTRUE(get_sv("@", 0))) { |
d1256cb1 |
678 | ST(0) = &PL_sv_no; |
10c5ecbb |
679 | }else{ |
d1256cb1 |
680 | ST(0) = &PL_sv_yes; |
0ab8f81e |
681 | } |
682 | XSRETURN(1); |
683 | } |
684 | |
742555bd |
685 | void |
686 | Method_mime_name(obj) |
687 | SV * obj |
688 | CODE: |
689 | { |
690 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
691 | SV *retval; |
692 | eval_pv("require Encode::MIME::Name", 0); |
693 | |
694 | if (SvTRUE(get_sv("@", 0))) { |
695 | ST(0) = &PL_sv_undef; |
696 | }else{ |
697 | ENTER; |
698 | SAVETMPS; |
699 | PUSHMARK(sp); |
700 | XPUSHs(sv_2mortal(newSVpvn(enc->name[0], strlen(enc->name[0])))); |
701 | PUTBACK; |
702 | call_pv("Encode::MIME::Name::get_mime_name", G_SCALAR); |
703 | SPAGAIN; |
704 | retval = newSVsv(POPs); |
705 | PUTBACK; |
706 | FREETMPS; |
707 | LEAVE; |
708 | /* enc->name[0] */ |
709 | ST(0) = retval; |
710 | } |
711 | XSRETURN(1); |
712 | } |
713 | |
67e989fb |
714 | MODULE = Encode PACKAGE = Encode |
2c674647 |
715 | |
716 | PROTOTYPES: ENABLE |
717 | |
67e989fb |
718 | I32 |
2c674647 |
719 | _bytes_to_utf8(sv, ...) |
85982a32 |
720 | SV * sv |
721 | CODE: |
722 | { |
723 | SV * encoding = items == 2 ? ST(1) : Nullsv; |
c6a7db43 |
724 | |
85982a32 |
725 | if (encoding) |
726 | RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); |
727 | else { |
d1256cb1 |
728 | STRLEN len; |
729 | U8* s = (U8*)SvPV(sv, len); |
730 | U8* converted; |
731 | |
732 | converted = bytes_to_utf8(s, &len); /* This allocs */ |
733 | sv_setpvn(sv, (char *)converted, len); |
734 | SvUTF8_on(sv); /* XXX Should we? */ |
735 | Safefree(converted); /* ... so free it */ |
736 | RETVAL = len; |
85982a32 |
737 | } |
738 | } |
739 | OUTPUT: |
740 | RETVAL |
2c674647 |
741 | |
67e989fb |
742 | I32 |
2c674647 |
743 | _utf8_to_bytes(sv, ...) |
85982a32 |
744 | SV * sv |
745 | CODE: |
746 | { |
747 | SV * to = items > 1 ? ST(1) : Nullsv; |
748 | SV * check = items > 2 ? ST(2) : Nullsv; |
749 | |
750 | if (to) { |
d1256cb1 |
751 | RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); |
85982a32 |
752 | } else { |
d1256cb1 |
753 | STRLEN len; |
754 | U8 *s = (U8*)SvPV(sv, len); |
85982a32 |
755 | |
d1256cb1 |
756 | RETVAL = 0; |
757 | if (SvTRUE(check)) { |
758 | /* Must do things the slow way */ |
759 | U8 *dest; |
85982a32 |
760 | /* We need a copy to pass to check() */ |
2cdf8b6c |
761 | U8 *src = s; |
d1256cb1 |
762 | U8 *send = s + len; |
2cdf8b6c |
763 | U8 *d0; |
85982a32 |
764 | |
d1256cb1 |
765 | New(83, dest, len, U8); /* I think */ |
2cdf8b6c |
766 | d0 = dest; |
85982a32 |
767 | |
d1256cb1 |
768 | while (s < send) { |
85982a32 |
769 | if (*s < 0x80){ |
d1256cb1 |
770 | *dest++ = *s++; |
85982a32 |
771 | } else { |
d1256cb1 |
772 | STRLEN ulen; |
773 | UV uv = *s++; |
774 | |
775 | /* Have to do it all ourselves because of error routine, |
776 | aargh. */ |
777 | if (!(uv & 0x40)){ goto failure; } |
778 | if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } |
779 | else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } |
780 | else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } |
781 | else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } |
782 | else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } |
783 | else if (!(uv & 0x01)) { ulen = 7; uv = 0; } |
784 | else { ulen = 13; uv = 0; } |
785 | |
786 | /* Note change to utf8.c variable naming, for variety */ |
787 | while (ulen--) { |
788 | if ((*s & 0xc0) != 0x80){ |
789 | goto failure; |
790 | } else { |
791 | uv = (uv << 6) | (*s++ & 0x3f); |
792 | } |
793 | } |
794 | if (uv > 256) { |
795 | failure: |
796 | call_failure(check, s, dest, src); |
797 | /* Now what happens? */ |
798 | } |
799 | *dest++ = (U8)uv; |
800 | } |
801 | } |
2cdf8b6c |
802 | RETVAL = dest - d0; |
803 | sv_usepvn(sv, (char *)dest, RETVAL); |
804 | SvUTF8_off(sv); |
d1256cb1 |
805 | } else { |
806 | RETVAL = (utf8_to_bytes(s, &len) ? len : 0); |
807 | } |
85982a32 |
808 | } |
809 | } |
810 | OUTPUT: |
811 | RETVAL |
2c674647 |
812 | |
2c674647 |
813 | bool |
b2704119 |
814 | is_utf8(sv, check = 0) |
4411f3b6 |
815 | SV * sv |
b2704119 |
816 | int check |
85982a32 |
817 | CODE: |
818 | { |
819 | if (SvGMAGICAL(sv)) /* it could be $1, for example */ |
d1256cb1 |
820 | sv = newSVsv(sv); /* GMAGIG will be done */ |
d1256cb1 |
821 | RETVAL = SvUTF8(sv) ? TRUE : FALSE; |
822 | if (RETVAL && |
823 | check && |
824 | !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) |
825 | RETVAL = FALSE; |
85982a32 |
826 | if (sv != ST(0)) |
d1256cb1 |
827 | SvREFCNT_dec(sv); /* it was a temp copy */ |
85982a32 |
828 | } |
829 | OUTPUT: |
830 | RETVAL |
2c674647 |
831 | |
832 | SV * |
4411f3b6 |
833 | _utf8_on(sv) |
85982a32 |
834 | SV * sv |
835 | CODE: |
836 | { |
837 | if (SvPOK(sv)) { |
d1256cb1 |
838 | SV *rsv = newSViv(SvUTF8(sv)); |
839 | RETVAL = rsv; |
840 | SvUTF8_on(sv); |
85982a32 |
841 | } else { |
d1256cb1 |
842 | RETVAL = &PL_sv_undef; |
85982a32 |
843 | } |
844 | } |
845 | OUTPUT: |
846 | RETVAL |
2c674647 |
847 | |
848 | SV * |
4411f3b6 |
849 | _utf8_off(sv) |
85982a32 |
850 | SV * sv |
851 | CODE: |
852 | { |
853 | if (SvPOK(sv)) { |
d1256cb1 |
854 | SV *rsv = newSViv(SvUTF8(sv)); |
855 | RETVAL = rsv; |
856 | SvUTF8_off(sv); |
85982a32 |
857 | } else { |
d1256cb1 |
858 | RETVAL = &PL_sv_undef; |
85982a32 |
859 | } |
860 | } |
861 | OUTPUT: |
862 | RETVAL |
863 | |
85982a32 |
864 | int |
865 | DIE_ON_ERR() |
866 | CODE: |
867 | RETVAL = ENCODE_DIE_ON_ERR; |
868 | OUTPUT: |
869 | RETVAL |
870 | |
c6a7db43 |
871 | int |
85982a32 |
872 | WARN_ON_ERR() |
873 | CODE: |
874 | RETVAL = ENCODE_WARN_ON_ERR; |
875 | OUTPUT: |
876 | RETVAL |
877 | |
878 | int |
879 | LEAVE_SRC() |
880 | CODE: |
881 | RETVAL = ENCODE_LEAVE_SRC; |
882 | OUTPUT: |
883 | RETVAL |
884 | |
885 | int |
886 | RETURN_ON_ERR() |
887 | CODE: |
888 | RETVAL = ENCODE_RETURN_ON_ERR; |
889 | OUTPUT: |
890 | RETVAL |
891 | |
892 | int |
893 | PERLQQ() |
894 | CODE: |
895 | RETVAL = ENCODE_PERLQQ; |
896 | OUTPUT: |
897 | RETVAL |
898 | |
899 | int |
af1f55d9 |
900 | HTMLCREF() |
901 | CODE: |
902 | RETVAL = ENCODE_HTMLCREF; |
903 | OUTPUT: |
904 | RETVAL |
905 | |
906 | int |
907 | XMLCREF() |
908 | CODE: |
909 | RETVAL = ENCODE_XMLCREF; |
910 | OUTPUT: |
911 | RETVAL |
912 | |
913 | int |
56ff7374 |
914 | STOP_AT_PARTIAL() |
915 | CODE: |
916 | RETVAL = ENCODE_STOP_AT_PARTIAL; |
917 | OUTPUT: |
918 | RETVAL |
919 | |
920 | int |
85982a32 |
921 | FB_DEFAULT() |
922 | CODE: |
923 | RETVAL = ENCODE_FB_DEFAULT; |
924 | OUTPUT: |
925 | RETVAL |
926 | |
927 | int |
928 | FB_CROAK() |
929 | CODE: |
930 | RETVAL = ENCODE_FB_CROAK; |
931 | OUTPUT: |
932 | RETVAL |
933 | |
934 | int |
935 | FB_QUIET() |
936 | CODE: |
937 | RETVAL = ENCODE_FB_QUIET; |
938 | OUTPUT: |
939 | RETVAL |
940 | |
941 | int |
942 | FB_WARN() |
943 | CODE: |
944 | RETVAL = ENCODE_FB_WARN; |
945 | OUTPUT: |
946 | RETVAL |
947 | |
948 | int |
949 | FB_PERLQQ() |
950 | CODE: |
951 | RETVAL = ENCODE_FB_PERLQQ; |
952 | OUTPUT: |
953 | RETVAL |
2c674647 |
954 | |
af1f55d9 |
955 | int |
956 | FB_HTMLCREF() |
957 | CODE: |
958 | RETVAL = ENCODE_FB_HTMLCREF; |
959 | OUTPUT: |
960 | RETVAL |
961 | |
962 | int |
963 | FB_XMLCREF() |
964 | CODE: |
965 | RETVAL = ENCODE_FB_XMLCREF; |
966 | OUTPUT: |
967 | RETVAL |
968 | |
33af2bc7 |
969 | BOOT: |
970 | { |
85982a32 |
971 | #include "def_t.h" |
e7cbefb8 |
972 | #include "def_t.exh" |
33af2bc7 |
973 | } |