Commit | Line | Data |
fcf2db38 |
1 | #define PERL_NO_GET_CONTEXT |
2c674647 |
2 | #include "EXTERN.h" |
3 | #include "perl.h" |
4 | #include "XSUB.h" |
2f2b4ff2 |
5 | #define U8 U8 |
6 | #include "encode.h" |
071db25d |
7 | #include "def_t.h" |
fcf2db38 |
8 | |
fcb875d4 |
9 | #define ENCODE_XS_PROFILE 0 /* set 1 or more to profile. |
a999c27c |
10 | t/encoding.t dumps core because of |
11 | Perl_warner and PerlIO don't work well */ |
39cf9a5e |
12 | |
13 | #define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate |
a999c27c |
14 | buffer size for encode_method(). |
15 | 1 is recommended. 2 restores NI-S original */ |
39cf9a5e |
16 | |
fcf2db38 |
17 | #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ |
2f5768b8 |
18 | Perl_croak(aTHX_ "panic_unimplemented"); \ |
4a83738a |
19 | return (y)0; /* fool picky compilers */ \ |
87714904 |
20 | } |
67e989fb |
21 | UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) |
aa0053b7 |
22 | UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) |
9df9a5cd |
23 | #if defined(USE_PERLIO) && !defined(USE_SFIO) |
72e44f29 |
24 | /* Define an encoding "layer" in the perliol.h sense. |
25 | The layer defined here "inherits" in an object-oriented sense from the |
26 | "perlio" layer with its PerlIOBuf_* "methods". |
27 | The implementation is particularly efficient as until Encode settles down |
28 | there is no point in tryint to tune it. |
29 | |
30 | The layer works by overloading the "fill" and "flush" methods. |
31 | |
32 | "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API |
33 | to convert the encoded data to UTF-8 form, then copies it back to the |
34 | buffer. The "base class's" read methods then see the UTF-8 data. |
35 | |
36 | "flush" transforms the UTF-8 data deposited by the "base class's write |
37 | method in the buffer back into the encoded form using the encode OO perl API, |
38 | then copies data back into the buffer and calls "SUPER::flush. |
39 | |
40 | Note that "flush" is _also_ called for read mode - we still do the (back)-translate |
41 | so that the the base class's "flush" sees the correct number of encoded chars |
42 | for positioning the seek pointer. (This double translation is the worst performance |
43 | issue - particularly with all-perl encode engine.) |
44 | |
45 | */ |
33af2bc7 |
46 | #include "perliol.h" |
aa0053b7 |
47 | typedef struct { |
48 | PerlIOBuf base; /* PerlIOBuf stuff */ |
0b3236bb |
49 | SV *bufsv; /* buffer seen by layers above */ |
50 | SV *dataSV; /* data we have read from layer below */ |
51 | SV *enc; /* the encoding object */ |
33af2bc7 |
52 | } PerlIOEncode; |
53 | |
e3f3bf95 |
54 | SV * |
aa0053b7 |
55 | PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags) |
e3f3bf95 |
56 | { |
aa0053b7 |
57 | PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); |
58 | SV *sv = &PL_sv_undef; |
59 | if (e->enc) { |
60 | dSP; |
61 | ENTER; |
62 | SAVETMPS; |
63 | PUSHMARK(sp); |
64 | XPUSHs(e->enc); |
65 | PUTBACK; |
66 | if (perl_call_method("name", G_SCALAR) == 1) { |
67 | SPAGAIN; |
68 | sv = newSVsv(POPs); |
69 | PUTBACK; |
70 | } |
e3f3bf95 |
71 | } |
aa0053b7 |
72 | return sv; |
e3f3bf95 |
73 | } |
33af2bc7 |
74 | |
75 | IV |
aa0053b7 |
76 | PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg) |
33af2bc7 |
77 | { |
aa0053b7 |
78 | PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); |
79 | dSP; |
80 | IV code; |
81 | code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv); |
82 | ENTER; |
83 | SAVETMPS; |
84 | PUSHMARK(sp); |
85 | XPUSHs(arg); |
86 | PUTBACK; |
87 | if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) { |
88 | /* should never happen */ |
89 | Perl_die(aTHX_ "Encode::find_encoding did not return a value"); |
90 | return -1; |
91 | } |
92 | SPAGAIN; |
93 | e->enc = POPs; |
94 | PUTBACK; |
95 | if (!SvROK(e->enc)) { |
96 | e->enc = Nullsv; |
97 | errno = EINVAL; |
f98bc0c6 |
98 | Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"", |
aa0053b7 |
99 | arg); |
100 | code = -1; |
101 | } |
102 | else { |
103 | SvREFCNT_inc(e->enc); |
104 | PerlIOBase(f)->flags |= PERLIO_F_UTF8; |
105 | } |
106 | FREETMPS; |
107 | LEAVE; |
108 | return code; |
33af2bc7 |
109 | } |
110 | |
111 | IV |
aa0053b7 |
112 | PerlIOEncode_popped(pTHX_ PerlIO * f) |
33af2bc7 |
113 | { |
aa0053b7 |
114 | PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); |
115 | if (e->enc) { |
116 | SvREFCNT_dec(e->enc); |
117 | e->enc = Nullsv; |
118 | } |
119 | if (e->bufsv) { |
120 | SvREFCNT_dec(e->bufsv); |
121 | e->bufsv = Nullsv; |
122 | } |
0b3236bb |
123 | if (e->dataSV) { |
124 | SvREFCNT_dec(e->dataSV); |
284ee456 |
125 | e->dataSV = Nullsv; |
0b3236bb |
126 | } |
aa0053b7 |
127 | return 0; |
33af2bc7 |
128 | } |
129 | |
130 | STDCHAR * |
aa0053b7 |
131 | PerlIOEncode_get_base(pTHX_ PerlIO * f) |
33af2bc7 |
132 | { |
aa0053b7 |
133 | PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); |
134 | if (!e->base.bufsiz) |
135 | e->base.bufsiz = 1024; |
136 | if (!e->bufsv) { |
137 | e->bufsv = newSV(e->base.bufsiz); |
138 | sv_setpvn(e->bufsv, "", 0); |
139 | } |
140 | e->base.buf = (STDCHAR *) SvPVX(e->bufsv); |
141 | if (!e->base.ptr) |
142 | e->base.ptr = e->base.buf; |
143 | if (!e->base.end) |
144 | e->base.end = e->base.buf; |
145 | if (e->base.ptr < e->base.buf |
146 | || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) { |
147 | Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr, |
148 | e->base.buf + SvLEN(e->bufsv)); |
149 | abort(); |
150 | } |
151 | if (SvLEN(e->bufsv) < e->base.bufsiz) { |
152 | SSize_t poff = e->base.ptr - e->base.buf; |
153 | SSize_t eoff = e->base.end - e->base.buf; |
154 | e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz); |
155 | e->base.ptr = e->base.buf + poff; |
156 | e->base.end = e->base.buf + eoff; |
157 | } |
158 | if (e->base.ptr < e->base.buf |
159 | || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) { |
160 | Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr, |
161 | e->base.buf + SvLEN(e->bufsv)); |
162 | abort(); |
163 | } |
164 | return e->base.buf; |
33af2bc7 |
165 | } |
166 | |
33af2bc7 |
167 | IV |
aa0053b7 |
168 | PerlIOEncode_fill(pTHX_ PerlIO * f) |
33af2bc7 |
169 | { |
aa0053b7 |
170 | PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); |
171 | dSP; |
0b3236bb |
172 | IV code = 0; |
173 | PerlIO *n; |
174 | SSize_t avail; |
175 | if (PerlIO_flush(f) != 0) |
176 | return -1; |
177 | n = PerlIONext(f); |
178 | if (!PerlIO_fast_gets(n)) { |
179 | /* Things get too messy if we don't have a buffer layer |
180 | push a :perlio to do the job */ |
181 | char mode[8]; |
182 | n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv); |
183 | if (!n) { |
184 | Perl_die(aTHX_ "panic: cannot push :perlio for %p",f); |
185 | } |
186 | } |
187 | ENTER; |
188 | SAVETMPS; |
189 | retry: |
190 | avail = PerlIO_get_cnt(n); |
191 | if (avail <= 0) { |
192 | avail = PerlIO_fill(n); |
193 | if (avail == 0) { |
194 | avail = PerlIO_get_cnt(n); |
195 | } |
196 | else { |
197 | if (!PerlIO_error(n) && PerlIO_eof(n)) |
198 | avail = 0; |
199 | } |
200 | } |
201 | if (avail > 0) { |
202 | STDCHAR *ptr = PerlIO_get_ptr(n); |
203 | SSize_t use = avail; |
aa0053b7 |
204 | SV *uni; |
aa0053b7 |
205 | char *s; |
0b3236bb |
206 | STRLEN len = 0; |
207 | e->base.ptr = e->base.end = (STDCHAR *) Nullch; |
208 | (void) PerlIOEncode_get_base(aTHX_ f); |
209 | if (!e->dataSV) |
210 | e->dataSV = newSV(0); |
211 | if (SvTYPE(e->dataSV) < SVt_PV) { |
212 | sv_upgrade(e->dataSV,SVt_PV); |
213 | } |
214 | if (SvCUR(e->dataSV)) { |
215 | /* something left over from last time - create a normal |
216 | SV with new data appended |
217 | */ |
218 | if (use + SvCUR(e->dataSV) > e->base.bufsiz) { |
219 | use = e->base.bufsiz - SvCUR(e->dataSV); |
220 | } |
9dbbe389 |
221 | sv_catpvn(e->dataSV,(char*)ptr,use); |
0b3236bb |
222 | } |
223 | else { |
224 | /* Create a "dummy" SV to represent the available data from layer below */ |
225 | if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) { |
226 | Safefree(SvPVX(e->dataSV)); |
227 | } |
228 | if (use > e->base.bufsiz) { |
229 | use = e->base.bufsiz; |
230 | } |
231 | SvPVX(e->dataSV) = (char *) ptr; |
232 | SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */ |
233 | SvCUR_set(e->dataSV,use); |
284ee456 |
234 | SvPOK_only(e->dataSV); |
0b3236bb |
235 | } |
236 | SvUTF8_off(e->dataSV); |
aa0053b7 |
237 | PUSHMARK(sp); |
238 | XPUSHs(e->enc); |
0b3236bb |
239 | XPUSHs(e->dataSV); |
aa0053b7 |
240 | XPUSHs(&PL_sv_yes); |
241 | PUTBACK; |
0b3236bb |
242 | if (perl_call_method("decode", G_SCALAR) != 1) { |
243 | Perl_die(aTHX_ "panic: decode did not return a value"); |
244 | } |
aa0053b7 |
245 | SPAGAIN; |
246 | uni = POPs; |
247 | PUTBACK; |
0b3236bb |
248 | /* Now get translated string (forced to UTF-8) and use as buffer */ |
249 | if (SvPOK(uni)) { |
250 | s = SvPVutf8(uni, len); |
3c49ab08 |
251 | #ifdef PARANOID_ENCODE_CHECKS |
85f7ebdf |
252 | if (len && !is_utf8_string((U8*)s,len)) { |
0b3236bb |
253 | Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s); |
254 | } |
3c49ab08 |
255 | #endif |
0b3236bb |
256 | } |
257 | if (len > 0) { |
258 | /* Got _something */ |
259 | /* if decode gave us back dataSV then data may vanish when |
260 | we do ptrcnt adjust - so take our copy now. |
261 | (The copy is a pain - need a put-it-here option for decode.) |
262 | */ |
263 | sv_setpvn(e->bufsv,s,len); |
284ee456 |
264 | e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv); |
0b3236bb |
265 | e->base.end = e->base.ptr + SvCUR(e->bufsv); |
266 | PerlIOBase(f)->flags |= PERLIO_F_RDBUF; |
267 | SvUTF8_on(e->bufsv); |
268 | |
269 | /* Adjust ptr/cnt not taking anything which |
270 | did not translate - not clear this is a win */ |
271 | /* compute amount we took */ |
272 | use -= SvCUR(e->dataSV); |
273 | PerlIO_set_ptrcnt(n, ptr+use, (avail-use)); |
274 | /* and as we did not take it it isn't pending */ |
275 | SvCUR_set(e->dataSV,0); |
276 | } else { |
277 | /* Got nothing - assume partial character so we need some more */ |
278 | /* Make sure e->dataSV is a normal SV before re-filling as |
279 | buffer alias will change under us |
280 | */ |
281 | s = SvPV(e->dataSV,len); |
282 | sv_setpvn(e->dataSV,s,len); |
283 | PerlIO_set_ptrcnt(n, ptr+use, (avail-use)); |
284 | goto retry; |
aa0053b7 |
285 | } |
aa0053b7 |
286 | FREETMPS; |
287 | LEAVE; |
0b3236bb |
288 | return code; |
289 | } |
290 | else { |
291 | if (avail == 0) |
292 | PerlIOBase(f)->flags |= PERLIO_F_EOF; |
293 | else |
294 | PerlIOBase(f)->flags |= PERLIO_F_ERROR; |
295 | return -1; |
72e44f29 |
296 | } |
33af2bc7 |
297 | } |
298 | |
299 | IV |
aa0053b7 |
300 | PerlIOEncode_flush(pTHX_ PerlIO * f) |
33af2bc7 |
301 | { |
aa0053b7 |
302 | PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); |
303 | IV code = 0; |
0b3236bb |
304 | if (e->bufsv && (e->base.ptr > e->base.buf)) { |
aa0053b7 |
305 | dSP; |
306 | SV *str; |
307 | char *s; |
308 | STRLEN len; |
0b3236bb |
309 | SSize_t count = 0; |
310 | if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) { |
311 | /* Write case encode the buffer and write() to layer below */ |
312 | ENTER; |
313 | SAVETMPS; |
314 | PUSHMARK(sp); |
315 | XPUSHs(e->enc); |
316 | SvCUR_set(e->bufsv, e->base.ptr - e->base.buf); |
317 | SvUTF8_on(e->bufsv); |
0b3236bb |
318 | XPUSHs(e->bufsv); |
319 | XPUSHs(&PL_sv_yes); |
320 | PUTBACK; |
8bbd9480 |
321 | if (perl_call_method("encode", G_SCALAR) != 1) { |
322 | Perl_die(aTHX_ "panic: encode did not return a value"); |
323 | } |
0b3236bb |
324 | SPAGAIN; |
325 | str = POPs; |
326 | PUTBACK; |
327 | s = SvPV(str, len); |
328 | count = PerlIO_write(PerlIONext(f),s,len); |
329 | if (count != len) { |
330 | code = -1; |
331 | } |
332 | FREETMPS; |
333 | LEAVE; |
334 | if (PerlIO_flush(PerlIONext(f)) != 0) { |
335 | code = -1; |
336 | } |
25f7d9d3 |
337 | if (SvCUR(e->bufsv)) { |
338 | /* Did not all translate */ |
339 | e->base.ptr = e->base.buf+SvCUR(e->bufsv); |
340 | return code; |
341 | } |
aa0053b7 |
342 | } |
0b3236bb |
343 | else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) { |
344 | /* read case */ |
345 | /* if we have any untranslated stuff then unread that first */ |
346 | if (e->dataSV && SvCUR(e->dataSV)) { |
347 | s = SvPV(e->dataSV, len); |
348 | count = PerlIO_unread(PerlIONext(f),s,len); |
349 | if (count != len) { |
350 | code = -1; |
351 | } |
352 | } |
353 | /* See if there is anything left in the buffer */ |
354 | if (e->base.ptr < e->base.end) { |
355 | /* Bother - have unread data. |
356 | re-encode and unread() to layer below |
357 | */ |
358 | ENTER; |
359 | SAVETMPS; |
360 | str = sv_newmortal(); |
361 | sv_upgrade(str, SVt_PV); |
9dbbe389 |
362 | SvPVX(str) = (char*)e->base.ptr; |
0b3236bb |
363 | SvLEN(str) = 0; |
364 | SvCUR_set(str, e->base.end - e->base.ptr); |
284ee456 |
365 | SvPOK_only(str); |
0b3236bb |
366 | SvUTF8_on(str); |
367 | PUSHMARK(sp); |
368 | XPUSHs(e->enc); |
369 | XPUSHs(str); |
370 | XPUSHs(&PL_sv_yes); |
371 | PUTBACK; |
8bbd9480 |
372 | if (perl_call_method("encode", G_SCALAR) != 1) { |
373 | Perl_die(aTHX_ "panic: encode did not return a value"); |
374 | } |
0b3236bb |
375 | SPAGAIN; |
376 | str = POPs; |
377 | PUTBACK; |
378 | s = SvPV(str, len); |
379 | count = PerlIO_unread(PerlIONext(f),s,len); |
380 | if (count != len) { |
381 | code = -1; |
382 | } |
383 | FREETMPS; |
384 | LEAVE; |
385 | } |
aa0053b7 |
386 | } |
0b3236bb |
387 | e->base.ptr = e->base.end = e->base.buf; |
388 | PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF); |
72e44f29 |
389 | } |
aa0053b7 |
390 | return code; |
33af2bc7 |
391 | } |
392 | |
393 | IV |
aa0053b7 |
394 | PerlIOEncode_close(pTHX_ PerlIO * f) |
33af2bc7 |
395 | { |
aa0053b7 |
396 | PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode); |
397 | IV code = PerlIOBase_close(aTHX_ f); |
398 | if (e->bufsv) { |
25f7d9d3 |
399 | if (e->base.buf && e->base.ptr > e->base.buf) { |
400 | Perl_croak(aTHX_ "Close with partial character"); |
401 | } |
aa0053b7 |
402 | SvREFCNT_dec(e->bufsv); |
403 | e->bufsv = Nullsv; |
404 | } |
405 | e->base.buf = NULL; |
406 | e->base.ptr = NULL; |
407 | e->base.end = NULL; |
408 | PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF); |
409 | return code; |
33af2bc7 |
410 | } |
411 | |
72e44f29 |
412 | Off_t |
aa0053b7 |
413 | PerlIOEncode_tell(pTHX_ PerlIO * f) |
72e44f29 |
414 | { |
aa0053b7 |
415 | PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf); |
0b3236bb |
416 | /* Unfortunately the only way to get a postion is to (re-)translate, |
417 | the UTF8 we have in bufefr and then ask layer below |
aa0053b7 |
418 | */ |
0b3236bb |
419 | PerlIO_flush(f); |
25f7d9d3 |
420 | if (b->buf && b->ptr > b->buf) { |
421 | Perl_croak(aTHX_ "Cannot tell at partial character"); |
422 | } |
0b3236bb |
423 | return PerlIO_tell(PerlIONext(f)); |
72e44f29 |
424 | } |
425 | |
8cf8f3d1 |
426 | PerlIO * |
aa0053b7 |
427 | PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o, |
428 | CLONE_PARAMS * params, int flags) |
8cf8f3d1 |
429 | { |
aa0053b7 |
430 | if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) { |
431 | PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode); |
432 | PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode); |
433 | if (oe->enc) { |
434 | fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params); |
435 | } |
9f16d962 |
436 | } |
aa0053b7 |
437 | return f; |
8cf8f3d1 |
438 | } |
439 | |
33af2bc7 |
440 | PerlIO_funcs PerlIO_encode = { |
aa0053b7 |
441 | "encoding", |
442 | sizeof(PerlIOEncode), |
284ee456 |
443 | PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT, |
aa0053b7 |
444 | PerlIOEncode_pushed, |
445 | PerlIOEncode_popped, |
446 | PerlIOBuf_open, |
447 | PerlIOEncode_getarg, |
448 | PerlIOBase_fileno, |
449 | PerlIOEncode_dup, |
450 | PerlIOBuf_read, |
451 | PerlIOBuf_unread, |
452 | PerlIOBuf_write, |
453 | PerlIOBuf_seek, |
454 | PerlIOEncode_tell, |
455 | PerlIOEncode_close, |
456 | PerlIOEncode_flush, |
457 | PerlIOEncode_fill, |
458 | PerlIOBase_eof, |
459 | PerlIOBase_error, |
460 | PerlIOBase_clearerr, |
461 | PerlIOBase_setlinebuf, |
462 | PerlIOEncode_get_base, |
463 | PerlIOBuf_bufsiz, |
464 | PerlIOBuf_get_ptr, |
465 | PerlIOBuf_get_cnt, |
466 | PerlIOBuf_set_ptrcnt, |
33af2bc7 |
467 | }; |
aa0053b7 |
468 | #endif /* encode layer */ |
33af2bc7 |
469 | |
2f2b4ff2 |
470 | void |
aa0053b7 |
471 | Encode_XSEncoding(pTHX_ encode_t * enc) |
2f2b4ff2 |
472 | { |
aa0053b7 |
473 | dSP; |
474 | HV *stash = gv_stashpv("Encode::XS", TRUE); |
475 | SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash); |
476 | int i = 0; |
477 | PUSHMARK(sp); |
478 | XPUSHs(sv); |
479 | while (enc->name[i]) { |
480 | const char *name = enc->name[i++]; |
481 | XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); |
482 | } |
483 | PUTBACK; |
484 | call_pv("Encode::define_encoding", G_DISCARD); |
485 | SvREFCNT_dec(sv); |
2f2b4ff2 |
486 | } |
487 | |
aa0053b7 |
488 | void |
489 | call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) |
490 | { |
25f7d9d3 |
491 | /* Exists for breakpointing */ |
aa0053b7 |
492 | } |
67e989fb |
493 | |
2f2b4ff2 |
494 | static SV * |
aa0053b7 |
495 | encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, |
496 | int check) |
2f2b4ff2 |
497 | { |
aa0053b7 |
498 | STRLEN slen; |
499 | U8 *s = (U8 *) SvPV(src, slen); |
3aececda |
500 | STRLEN tlen = slen; |
501 | STRLEN ddone = 0; |
502 | STRLEN sdone = 0; |
39cf9a5e |
503 | |
3c49ab08 |
504 | /* We allocate slen+1. |
a999c27c |
505 | PerlIO dumps core if this value is smaller than this. */ |
3c49ab08 |
506 | SV *dst = sv_2mortal(newSV(slen+1)); |
aa0053b7 |
507 | if (slen) { |
0b3236bb |
508 | U8 *d = (U8 *) SvPVX(dst); |
509 | STRLEN dlen = SvLEN(dst)-1; |
aa0053b7 |
510 | int code; |
511 | while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) { |
3aececda |
512 | SvCUR_set(dst, dlen+ddone); |
284ee456 |
513 | SvPOK_only(dst); |
9b37254d |
514 | |
39cf9a5e |
515 | #if ENCODE_XS_PROFILE >= 3 |
516 | Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1); |
0b3236bb |
517 | #endif |
518 | |
519 | if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL) |
aa0053b7 |
520 | break; |
9b37254d |
521 | |
aa0053b7 |
522 | switch (code) { |
523 | case ENCODE_NOSPACE: |
a999c27c |
524 | { |
fcb875d4 |
525 | STRLEN more = 0; /* make sure you initialize! */ |
526 | STRLEN sleft; |
3aececda |
527 | sdone += slen; |
528 | ddone += dlen; |
39cf9a5e |
529 | sleft = tlen - sdone; |
fcb875d4 |
530 | #if ENCODE_XS_PROFILE >= 2 |
3c49ab08 |
531 | Perl_warn(aTHX_ |
fcb875d4 |
532 | "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", |
533 | more, sdone, sleft, SvLEN(dst)); |
534 | #endif |
535 | if (sdone != 0) { /* has src ever been processed ? */ |
39cf9a5e |
536 | #if ENCODE_XS_USEFP == 2 |
a999c27c |
537 | more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone |
538 | - SvLEN(dst); |
39cf9a5e |
539 | #elif ENCODE_XS_USEFP |
a999c27c |
540 | more = (1.0*SvLEN(dst)+1)/sdone * sleft; |
39cf9a5e |
541 | #else |
a999c27c |
542 | /* safe until SvLEN(dst) == MAX_INT/16 */ |
543 | more = (16*SvLEN(dst)+1)/sdone/16 * sleft; |
c98ca92f |
544 | #endif |
0b3236bb |
545 | } |
39cf9a5e |
546 | more += UTF8_MAXLEN; /* insurance policy */ |
547 | #if ENCODE_XS_PROFILE >= 2 |
3c49ab08 |
548 | Perl_warn(aTHX_ |
a999c27c |
549 | "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", |
550 | more, sdone, sleft, SvLEN(dst)); |
39cf9a5e |
551 | #endif |
552 | d = (U8 *) SvGROW(dst, SvLEN(dst) + more); |
553 | /* dst need to grow need MORE bytes! */ |
3aececda |
554 | if (ddone >= SvLEN(dst)) { |
555 | Perl_croak(aTHX_ "Destination couldn't be grown."); |
aa0053b7 |
556 | } |
3aececda |
557 | dlen = SvLEN(dst)-ddone-1; |
558 | d += ddone; |
559 | s += slen; |
560 | slen = tlen-sdone; |
561 | continue; |
39cf9a5e |
562 | } |
2f2b4ff2 |
563 | |
aa0053b7 |
564 | case ENCODE_NOREP: |
565 | if (dir == enc->f_utf8) { |
566 | if (!check && ckWARN_d(WARN_UTF8)) { |
567 | STRLEN clen; |
568 | UV ch = |
569 | utf8n_to_uvuni(s + slen, (SvCUR(src) - slen), |
570 | &clen, 0); |
f98bc0c6 |
571 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
aa0053b7 |
572 | "\"\\N{U+%" UVxf |
573 | "}\" does not map to %s", ch, |
574 | enc->name[0]); |
575 | /* FIXME: Skip over the character, copy in replacement and continue |
576 | * but that is messy so for now just fail. |
577 | */ |
578 | return &PL_sv_undef; |
579 | } |
580 | else { |
581 | return &PL_sv_undef; |
582 | } |
583 | } |
584 | else { |
d0ee0454 |
585 | /* UTF-8 is supposed to be "Universal" so should not happen |
586 | for real characters, but some encodings have non-assigned |
587 | codes which may occur. |
588 | */ |
ee981de6 |
589 | Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)", |
590 | enc->name[0], (U8) s[slen], code); |
aa0053b7 |
591 | } |
592 | break; |
2f2b4ff2 |
593 | |
aa0053b7 |
594 | default: |
595 | Perl_croak(aTHX_ "Unexpected code %d converting %s %s", |
596 | code, (dir == enc->f_utf8) ? "to" : "from", |
597 | enc->name[0]); |
598 | return &PL_sv_undef; |
599 | } |
600 | } |
3aececda |
601 | SvCUR_set(dst, dlen+ddone); |
284ee456 |
602 | SvPOK_only(dst); |
aa0053b7 |
603 | if (check) { |
3aececda |
604 | sdone = SvCUR(src) - (slen+sdone); |
605 | if (sdone) { |
f54fca96 |
606 | #if 1 |
607 | /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly |
608 | SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0 |
609 | type SVs and sv_clear() calls it ... |
610 | */ |
64ffdd5e |
611 | sv_setpvn(src, (char*)s+slen, sdone); |
f54fca96 |
612 | #else |
3aececda |
613 | Move(s + slen, SvPVX(src), sdone , U8); |
f54fca96 |
614 | #endif |
aa0053b7 |
615 | } |
3aececda |
616 | SvCUR_set(src, sdone); |
aa0053b7 |
617 | } |
2f2b4ff2 |
618 | } |
aa0053b7 |
619 | else { |
0b3236bb |
620 | SvCUR_set(dst, 0); |
284ee456 |
621 | SvPOK_only(dst); |
2f2b4ff2 |
622 | } |
39cf9a5e |
623 | #if ENCODE_XS_PROFILE |
624 | if (SvCUR(dst) > SvCUR(src)){ |
3c49ab08 |
625 | Perl_warn(aTHX_ |
a999c27c |
626 | "SvLEN(dst)=%d, SvCUR(dst)=%d. " |
627 | "%d bytes unused(%f %%)\n", |
3c49ab08 |
628 | SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), |
a999c27c |
629 | (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); |
3c49ab08 |
630 | |
39cf9a5e |
631 | } |
3c49ab08 |
632 | #endif |
0b3236bb |
633 | *SvEND(dst) = '\0'; |
aa0053b7 |
634 | return dst; |
2f2b4ff2 |
635 | } |
636 | |
50d26985 |
637 | MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ |
2f2b4ff2 |
638 | |
639 | PROTOTYPES: ENABLE |
640 | |
641 | void |
0a95303c |
642 | Method_name(obj) |
643 | SV * obj |
644 | CODE: |
645 | { |
646 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
647 | ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); |
648 | XSRETURN(1); |
649 | } |
650 | |
651 | void |
691638dd |
652 | Method_decode(obj,src,check = FALSE) |
2f2b4ff2 |
653 | SV * obj |
654 | SV * src |
691638dd |
655 | bool check |
2f2b4ff2 |
656 | CODE: |
657 | { |
658 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
659 | ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); |
660 | SvUTF8_on(ST(0)); |
661 | XSRETURN(1); |
662 | } |
663 | |
664 | void |
691638dd |
665 | Method_encode(obj,src,check = FALSE) |
2f2b4ff2 |
666 | SV * obj |
667 | SV * src |
691638dd |
668 | bool check |
2f2b4ff2 |
669 | CODE: |
670 | { |
671 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
672 | sv_utf8_upgrade(src); |
673 | ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); |
674 | XSRETURN(1); |
675 | } |
676 | |
67e989fb |
677 | MODULE = Encode PACKAGE = Encode |
2c674647 |
678 | |
679 | PROTOTYPES: ENABLE |
680 | |
67e989fb |
681 | I32 |
2c674647 |
682 | _bytes_to_utf8(sv, ...) |
67e989fb |
683 | SV * sv |
2c674647 |
684 | CODE: |
67e989fb |
685 | { |
686 | SV * encoding = items == 2 ? ST(1) : Nullsv; |
687 | |
688 | if (encoding) |
689 | RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); |
690 | else { |
691 | STRLEN len; |
183a2d84 |
692 | U8* s = (U8*)SvPV(sv, len); |
67e989fb |
693 | U8* converted; |
694 | |
695 | converted = bytes_to_utf8(s, &len); /* This allocs */ |
183a2d84 |
696 | sv_setpvn(sv, (char *)converted, len); |
67e989fb |
697 | SvUTF8_on(sv); /* XXX Should we? */ |
698 | Safefree(converted); /* ... so free it */ |
699 | RETVAL = len; |
700 | } |
701 | } |
2c674647 |
702 | OUTPUT: |
67e989fb |
703 | RETVAL |
2c674647 |
704 | |
67e989fb |
705 | I32 |
2c674647 |
706 | _utf8_to_bytes(sv, ...) |
67e989fb |
707 | SV * sv |
2c674647 |
708 | CODE: |
67e989fb |
709 | { |
710 | SV * to = items > 1 ? ST(1) : Nullsv; |
711 | SV * check = items > 2 ? ST(2) : Nullsv; |
87714904 |
712 | |
67e989fb |
713 | if (to) |
714 | RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); |
715 | else { |
67e989fb |
716 | STRLEN len; |
b113ac0e |
717 | U8 *s = (U8*)SvPV(sv, len); |
67e989fb |
718 | |
9c5ffd7c |
719 | RETVAL = 0; |
67e989fb |
720 | if (SvTRUE(check)) { |
721 | /* Must do things the slow way */ |
722 | U8 *dest; |
87714904 |
723 | U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */ |
67e989fb |
724 | U8 *send = s + len; |
725 | |
726 | New(83, dest, len, U8); /* I think */ |
727 | |
728 | while (s < send) { |
729 | if (*s < 0x80) |
730 | *dest++ = *s++; |
731 | else { |
b113ac0e |
732 | STRLEN ulen; |
733 | UV uv = *s++; |
87714904 |
734 | |
67e989fb |
735 | /* Have to do it all ourselves because of error routine, |
736 | aargh. */ |
737 | if (!(uv & 0x40)) |
738 | goto failure; |
739 | if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } |
740 | else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } |
741 | else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } |
742 | else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } |
743 | else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } |
744 | else if (!(uv & 0x01)) { ulen = 7; uv = 0; } |
745 | else { ulen = 13; uv = 0; } |
87714904 |
746 | |
67e989fb |
747 | /* Note change to utf8.c variable naming, for variety */ |
748 | while (ulen--) { |
749 | if ((*s & 0xc0) != 0x80) |
750 | goto failure; |
87714904 |
751 | |
67e989fb |
752 | else |
753 | uv = (uv << 6) | (*s++ & 0x3f); |
87714904 |
754 | } |
67e989fb |
755 | if (uv > 256) { |
756 | failure: |
757 | call_failure(check, s, dest, src); |
758 | /* Now what happens? */ |
759 | } |
760 | *dest++ = (U8)uv; |
761 | } |
762 | } |
763 | } else |
764 | RETVAL = (utf8_to_bytes(s, &len) ? len : 0); |
765 | } |
2c674647 |
766 | } |
767 | OUTPUT: |
768 | RETVAL |
769 | |
2c674647 |
770 | bool |
4411f3b6 |
771 | is_utf8(sv, check = FALSE) |
772 | SV * sv |
773 | bool check |
2c674647 |
774 | CODE: |
775 | { |
2eebba1d |
776 | if (SvGMAGICAL(sv)) /* it could be $1, for example */ |
777 | sv = newSVsv(sv); /* GMAGIG will be done */ |
2c674647 |
778 | if (SvPOK(sv)) { |
4411f3b6 |
779 | RETVAL = SvUTF8(sv) ? TRUE : FALSE; |
2c674647 |
780 | if (RETVAL && |
4411f3b6 |
781 | check && |
2c674647 |
782 | !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) |
783 | RETVAL = FALSE; |
784 | } else { |
785 | RETVAL = FALSE; |
786 | } |
2eebba1d |
787 | if (sv != ST(0)) |
788 | SvREFCNT_dec(sv); /* it was a temp copy */ |
2c674647 |
789 | } |
790 | OUTPUT: |
791 | RETVAL |
792 | |
793 | SV * |
4411f3b6 |
794 | _utf8_on(sv) |
2c674647 |
795 | SV * sv |
796 | CODE: |
797 | { |
798 | if (SvPOK(sv)) { |
87714904 |
799 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 |
800 | RETVAL = rsv; |
801 | SvUTF8_on(sv); |
802 | } else { |
803 | RETVAL = &PL_sv_undef; |
804 | } |
805 | } |
806 | OUTPUT: |
807 | RETVAL |
808 | |
809 | SV * |
4411f3b6 |
810 | _utf8_off(sv) |
2c674647 |
811 | SV * sv |
812 | CODE: |
813 | { |
814 | if (SvPOK(sv)) { |
87714904 |
815 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 |
816 | RETVAL = rsv; |
817 | SvUTF8_off(sv); |
818 | } else { |
819 | RETVAL = &PL_sv_undef; |
820 | } |
821 | } |
822 | OUTPUT: |
823 | RETVAL |
824 | |
33af2bc7 |
825 | BOOT: |
826 | { |
6a59c517 |
827 | #if defined(USE_PERLIO) && !defined(USE_SFIO) |
a999f61b |
828 | PerlIO_define_layer(aTHX_ &PerlIO_encode); |
33af2bc7 |
829 | #endif |
e7cbefb8 |
830 | #include "def_t.exh" |
33af2bc7 |
831 | } |