Commit | Line | Data |
2c674647 |
1 | #include "EXTERN.h" |
2 | #include "perl.h" |
3 | #include "XSUB.h" |
4 | |
67e989fb |
5 | #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \ |
2f5768b8 |
6 | Perl_croak(aTHX_ "panic_unimplemented"); \ |
4a83738a |
7 | return (y)0; /* fool picky compilers */ \ |
87714904 |
8 | } |
67e989fb |
9 | UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) |
10 | UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) |
11 | |
33af2bc7 |
12 | #ifdef USE_PERLIO |
72e44f29 |
13 | /* Define an encoding "layer" in the perliol.h sense. |
14 | The layer defined here "inherits" in an object-oriented sense from the |
15 | "perlio" layer with its PerlIOBuf_* "methods". |
16 | The implementation is particularly efficient as until Encode settles down |
17 | there is no point in tryint to tune it. |
18 | |
19 | The layer works by overloading the "fill" and "flush" methods. |
20 | |
21 | "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API |
22 | to convert the encoded data to UTF-8 form, then copies it back to the |
23 | buffer. The "base class's" read methods then see the UTF-8 data. |
24 | |
25 | "flush" transforms the UTF-8 data deposited by the "base class's write |
26 | method in the buffer back into the encoded form using the encode OO perl API, |
27 | then copies data back into the buffer and calls "SUPER::flush. |
28 | |
29 | Note that "flush" is _also_ called for read mode - we still do the (back)-translate |
30 | so that the the base class's "flush" sees the correct number of encoded chars |
31 | for positioning the seek pointer. (This double translation is the worst performance |
32 | issue - particularly with all-perl encode engine.) |
33 | |
34 | */ |
35 | |
36 | |
33af2bc7 |
37 | #include "perliol.h" |
38 | |
39 | typedef struct |
40 | { |
41 | PerlIOBuf base; /* PerlIOBuf stuff */ |
42 | SV * bufsv; |
43 | SV * enc; |
44 | } PerlIOEncode; |
45 | |
46 | |
47 | IV |
48 | PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len) |
49 | { |
50 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
51 | dTHX; |
52 | dSP; |
53 | IV code; |
54 | code = PerlIOBuf_pushed(f,mode,Nullch,0); |
55 | ENTER; |
56 | SAVETMPS; |
57 | PUSHMARK(sp); |
58 | XPUSHs(sv_2mortal(newSVpv("Encode",0))); |
59 | XPUSHs(sv_2mortal(newSVpvn(arg,len))); |
60 | PUTBACK; |
61 | if (perl_call_method("getEncoding",G_SCALAR) != 1) |
62 | return -1; |
63 | SPAGAIN; |
64 | e->enc = POPs; |
65 | PUTBACK; |
66 | if (!SvROK(e->enc)) |
67 | return -1; |
68 | SvREFCNT_inc(e->enc); |
69 | FREETMPS; |
70 | LEAVE; |
71 | PerlIOBase(f)->flags |= PERLIO_F_UTF8; |
72 | return code; |
73 | } |
74 | |
75 | IV |
76 | PerlIOEncode_popped(PerlIO *f) |
77 | { |
78 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
79 | dTHX; |
80 | if (e->enc) |
81 | { |
82 | SvREFCNT_dec(e->enc); |
83 | e->enc = Nullsv; |
84 | } |
85 | if (e->bufsv) |
86 | { |
87 | SvREFCNT_dec(e->bufsv); |
88 | e->bufsv = Nullsv; |
89 | } |
90 | return 0; |
91 | } |
92 | |
93 | STDCHAR * |
94 | PerlIOEncode_get_base(PerlIO *f) |
95 | { |
96 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
97 | dTHX; |
98 | if (!e->base.bufsiz) |
99 | e->base.bufsiz = 1024; |
100 | if (!e->bufsv) |
101 | { |
102 | e->bufsv = newSV(e->base.bufsiz); |
103 | sv_setpvn(e->bufsv,"",0); |
104 | } |
105 | e->base.buf = SvPVX(e->bufsv); |
106 | if (!e->base.ptr) |
107 | e->base.ptr = e->base.buf; |
108 | if (!e->base.end) |
109 | e->base.end = e->base.buf; |
110 | if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv)) |
111 | { |
112 | Perl_warn(aTHX_ " ptr %p(%p)%p", |
113 | e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv)); |
114 | abort(); |
115 | } |
116 | if (SvLEN(e->bufsv) < e->base.bufsiz) |
117 | { |
118 | SSize_t poff = e->base.ptr - e->base.buf; |
119 | SSize_t eoff = e->base.end - e->base.buf; |
120 | e->base.buf = SvGROW(e->bufsv,e->base.bufsiz); |
121 | e->base.ptr = e->base.buf + poff; |
122 | e->base.end = e->base.buf + eoff; |
123 | } |
124 | if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv)) |
125 | { |
126 | Perl_warn(aTHX_ " ptr %p(%p)%p", |
127 | e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv)); |
128 | abort(); |
129 | } |
130 | return e->base.buf; |
131 | } |
132 | |
33af2bc7 |
133 | IV |
134 | PerlIOEncode_fill(PerlIO *f) |
135 | { |
136 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
137 | dTHX; |
138 | dSP; |
139 | IV code; |
33af2bc7 |
140 | code = PerlIOBuf_fill(f); |
141 | if (code == 0) |
142 | { |
143 | SV *uni; |
72e44f29 |
144 | STRLEN len; |
145 | char *s; |
146 | /* Set SV that is the buffer to be buf..ptr */ |
33af2bc7 |
147 | SvCUR_set(e->bufsv, e->base.end - e->base.buf); |
148 | SvUTF8_off(e->bufsv); |
149 | ENTER; |
150 | SAVETMPS; |
151 | PUSHMARK(sp); |
152 | XPUSHs(e->enc); |
153 | XPUSHs(e->bufsv); |
154 | XPUSHs(&PL_sv_yes); |
155 | PUTBACK; |
156 | if (perl_call_method("toUnicode",G_SCALAR) != 1) |
157 | code = -1; |
158 | SPAGAIN; |
159 | uni = POPs; |
160 | PUTBACK; |
72e44f29 |
161 | /* Now get translated string (forced to UTF-8) and copy back to buffer |
162 | don't use sv_setsv as that may "steal" PV from returned temp |
163 | and so free() our known-large-enough buffer. |
164 | sv_setpvn() should do but let us do it long hand. |
165 | */ |
166 | s = SvPVutf8(uni,len); |
167 | if (s != SvPVX(e->bufsv)) |
168 | { |
169 | e->base.buf = SvGROW(e->bufsv,len); |
170 | Move(s,e->base.buf,len,char); |
171 | SvCUR_set(e->bufsv,len); |
172 | } |
173 | SvUTF8_on(e->bufsv); |
174 | e->base.end = e->base.buf+len; |
33af2bc7 |
175 | e->base.ptr = e->base.buf; |
176 | FREETMPS; |
177 | LEAVE; |
178 | } |
179 | return code; |
180 | } |
181 | |
182 | IV |
183 | PerlIOEncode_flush(PerlIO *f) |
184 | { |
185 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
186 | IV code = 0; |
187 | dTHX; |
188 | if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF))) |
189 | { |
190 | dSP; |
191 | SV *str; |
192 | char *s; |
193 | STRLEN len; |
72e44f29 |
194 | SSize_t left = 0; |
195 | if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) |
196 | { |
197 | /* This is really just a flag to see if we took all the data, if |
198 | we did PerlIOBase_flush avoids a seek to lower layer. |
199 | Need to revisit if we start getting clever with unreads or seeks-in-buffer |
200 | */ |
201 | left = e->base.end - e->base.ptr; |
202 | } |
33af2bc7 |
203 | ENTER; |
204 | SAVETMPS; |
205 | PUSHMARK(sp); |
206 | XPUSHs(e->enc); |
72e44f29 |
207 | SvCUR_set(e->bufsv, e->base.ptr - e->base.buf); |
33af2bc7 |
208 | SvUTF8_on(e->bufsv); |
209 | XPUSHs(e->bufsv); |
210 | XPUSHs(&PL_sv_yes); |
211 | PUTBACK; |
212 | if (perl_call_method("fromUnicode",G_SCALAR) != 1) |
213 | code = -1; |
214 | SPAGAIN; |
215 | str = POPs; |
216 | PUTBACK; |
72e44f29 |
217 | s = SvPV(str,len); |
218 | if (s != SvPVX(e->bufsv)) |
219 | { |
220 | e->base.buf = SvGROW(e->bufsv,len); |
221 | Move(s,e->base.buf,len,char); |
222 | SvCUR_set(e->bufsv,len); |
223 | } |
33af2bc7 |
224 | SvUTF8_off(e->bufsv); |
72e44f29 |
225 | e->base.ptr = e->base.buf+len; |
226 | /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */ |
227 | e->base.end = e->base.ptr + left; |
33af2bc7 |
228 | FREETMPS; |
229 | LEAVE; |
230 | if (PerlIOBuf_flush(f) != 0) |
231 | code = -1; |
232 | } |
233 | return code; |
234 | } |
235 | |
236 | IV |
237 | PerlIOEncode_close(PerlIO *f) |
238 | { |
239 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
240 | IV code = PerlIOBase_close(f); |
241 | dTHX; |
242 | if (e->bufsv) |
243 | { |
244 | SvREFCNT_dec(e->bufsv); |
245 | e->bufsv = Nullsv; |
246 | } |
247 | e->base.buf = NULL; |
248 | e->base.ptr = NULL; |
249 | e->base.end = NULL; |
250 | PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF); |
251 | return code; |
252 | } |
253 | |
72e44f29 |
254 | Off_t |
255 | PerlIOEncode_tell(PerlIO *f) |
256 | { |
257 | PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf); |
258 | /* Unfortunately the only way to get a postion is to back-translate, |
259 | the UTF8-bytes we have buf..ptr and adjust accordingly. |
260 | But we will try and save any unread data in case stream |
261 | is un-seekable. |
262 | */ |
263 | if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end) |
264 | { |
265 | Size_t count = b->end - b->ptr; |
266 | PerlIO_push(f,&PerlIO_pending,"r",Nullch,0); |
267 | /* Save what we have left to read */ |
268 | PerlIOSelf(f,PerlIOBuf)->bufsiz = count; |
269 | PerlIO_unread(f,b->ptr,count); |
270 | /* There isn't any unread data - we just saved it - so avoid the lower seek */ |
271 | b->end = b->ptr; |
272 | /* Flush ourselves - now one layer down, |
273 | this does the back translate and adjusts position |
274 | */ |
275 | PerlIO_flush(PerlIONext(f)); |
276 | /* Set position of the saved data */ |
277 | PerlIOSelf(f,PerlIOBuf)->posn = b->posn; |
278 | } |
279 | else |
280 | { |
281 | PerlIO_flush(f); |
282 | } |
283 | return b->posn; |
284 | } |
285 | |
33af2bc7 |
286 | PerlIO_funcs PerlIO_encode = { |
72e44f29 |
287 | "encoding", |
33af2bc7 |
288 | sizeof(PerlIOEncode), |
289 | PERLIO_K_BUFFERED, |
290 | PerlIOBase_fileno, |
291 | PerlIOBuf_fdopen, |
292 | PerlIOBuf_open, |
293 | PerlIOBuf_reopen, |
294 | PerlIOEncode_pushed, |
295 | PerlIOEncode_popped, |
296 | PerlIOBuf_read, |
297 | PerlIOBuf_unread, |
298 | PerlIOBuf_write, |
299 | PerlIOBuf_seek, |
72e44f29 |
300 | PerlIOEncode_tell, |
33af2bc7 |
301 | PerlIOEncode_close, |
302 | PerlIOEncode_flush, |
303 | PerlIOEncode_fill, |
304 | PerlIOBase_eof, |
305 | PerlIOBase_error, |
306 | PerlIOBase_clearerr, |
307 | PerlIOBuf_setlinebuf, |
308 | PerlIOEncode_get_base, |
309 | PerlIOBuf_bufsiz, |
310 | PerlIOBuf_get_ptr, |
311 | PerlIOBuf_get_cnt, |
312 | PerlIOBuf_set_ptrcnt, |
313 | }; |
314 | #endif |
315 | |
183a2d84 |
316 | void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {} |
67e989fb |
317 | |
318 | MODULE = Encode PACKAGE = Encode |
2c674647 |
319 | |
320 | PROTOTYPES: ENABLE |
321 | |
67e989fb |
322 | I32 |
2c674647 |
323 | _bytes_to_utf8(sv, ...) |
67e989fb |
324 | SV * sv |
2c674647 |
325 | CODE: |
67e989fb |
326 | { |
327 | SV * encoding = items == 2 ? ST(1) : Nullsv; |
328 | |
329 | if (encoding) |
330 | RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); |
331 | else { |
332 | STRLEN len; |
183a2d84 |
333 | U8* s = (U8*)SvPV(sv, len); |
67e989fb |
334 | U8* converted; |
335 | |
336 | converted = bytes_to_utf8(s, &len); /* This allocs */ |
183a2d84 |
337 | sv_setpvn(sv, (char *)converted, len); |
67e989fb |
338 | SvUTF8_on(sv); /* XXX Should we? */ |
339 | Safefree(converted); /* ... so free it */ |
340 | RETVAL = len; |
341 | } |
342 | } |
2c674647 |
343 | OUTPUT: |
67e989fb |
344 | RETVAL |
2c674647 |
345 | |
67e989fb |
346 | I32 |
2c674647 |
347 | _utf8_to_bytes(sv, ...) |
67e989fb |
348 | SV * sv |
2c674647 |
349 | CODE: |
67e989fb |
350 | { |
351 | SV * to = items > 1 ? ST(1) : Nullsv; |
352 | SV * check = items > 2 ? ST(2) : Nullsv; |
87714904 |
353 | |
67e989fb |
354 | if (to) |
355 | RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); |
356 | else { |
67e989fb |
357 | STRLEN len; |
b113ac0e |
358 | U8 *s = (U8*)SvPV(sv, len); |
67e989fb |
359 | |
360 | if (SvTRUE(check)) { |
361 | /* Must do things the slow way */ |
362 | U8 *dest; |
87714904 |
363 | U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */ |
67e989fb |
364 | U8 *send = s + len; |
365 | |
366 | New(83, dest, len, U8); /* I think */ |
367 | |
368 | while (s < send) { |
369 | if (*s < 0x80) |
370 | *dest++ = *s++; |
371 | else { |
b113ac0e |
372 | STRLEN ulen; |
373 | UV uv = *s++; |
87714904 |
374 | |
67e989fb |
375 | /* Have to do it all ourselves because of error routine, |
376 | aargh. */ |
377 | if (!(uv & 0x40)) |
378 | goto failure; |
379 | if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } |
380 | else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } |
381 | else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } |
382 | else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } |
383 | else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } |
384 | else if (!(uv & 0x01)) { ulen = 7; uv = 0; } |
385 | else { ulen = 13; uv = 0; } |
87714904 |
386 | |
67e989fb |
387 | /* Note change to utf8.c variable naming, for variety */ |
388 | while (ulen--) { |
389 | if ((*s & 0xc0) != 0x80) |
390 | goto failure; |
87714904 |
391 | |
67e989fb |
392 | else |
393 | uv = (uv << 6) | (*s++ & 0x3f); |
87714904 |
394 | } |
67e989fb |
395 | if (uv > 256) { |
396 | failure: |
397 | call_failure(check, s, dest, src); |
398 | /* Now what happens? */ |
399 | } |
400 | *dest++ = (U8)uv; |
401 | } |
402 | } |
403 | } else |
404 | RETVAL = (utf8_to_bytes(s, &len) ? len : 0); |
405 | } |
2c674647 |
406 | } |
407 | OUTPUT: |
408 | RETVAL |
409 | |
410 | SV * |
411 | _chars_to_utf8(sv, from, ...) |
412 | SV * sv |
413 | SV * from |
414 | CODE: |
415 | { |
416 | SV * check = items == 3 ? ST(2) : Nullsv; |
417 | RETVAL = &PL_sv_undef; |
418 | } |
419 | OUTPUT: |
420 | RETVAL |
421 | |
422 | SV * |
423 | _utf8_to_chars(sv, to, ...) |
424 | SV * sv |
425 | SV * to |
426 | CODE: |
427 | { |
428 | SV * check = items == 3 ? ST(2) : Nullsv; |
429 | RETVAL = &PL_sv_undef; |
430 | } |
431 | OUTPUT: |
432 | RETVAL |
433 | |
434 | SV * |
435 | _utf8_to_chars_check(sv, ...) |
436 | SV * sv |
437 | CODE: |
438 | { |
439 | SV * check = items == 2 ? ST(1) : Nullsv; |
440 | RETVAL = &PL_sv_undef; |
441 | } |
442 | OUTPUT: |
443 | RETVAL |
444 | |
445 | SV * |
446 | _bytes_to_chars(sv, from, ...) |
447 | SV * sv |
448 | SV * from |
449 | CODE: |
450 | { |
451 | SV * check = items == 3 ? ST(2) : Nullsv; |
452 | RETVAL = &PL_sv_undef; |
453 | } |
454 | OUTPUT: |
455 | RETVAL |
456 | |
457 | SV * |
458 | _chars_to_bytes(sv, to, ...) |
459 | SV * sv |
460 | SV * to |
461 | CODE: |
462 | { |
463 | SV * check = items == 3 ? ST(2) : Nullsv; |
464 | RETVAL = &PL_sv_undef; |
465 | } |
466 | OUTPUT: |
467 | RETVAL |
468 | |
469 | SV * |
470 | _from_to(sv, from, to, ...) |
471 | SV * sv |
472 | SV * from |
473 | SV * to |
474 | CODE: |
475 | { |
476 | SV * check = items == 4 ? ST(3) : Nullsv; |
477 | RETVAL = &PL_sv_undef; |
478 | } |
479 | OUTPUT: |
480 | RETVAL |
481 | |
482 | bool |
483 | _is_utf8(sv, ...) |
484 | SV * sv |
485 | CODE: |
486 | { |
487 | SV * check = items == 2 ? ST(1) : Nullsv; |
488 | if (SvPOK(sv)) { |
067a85ef |
489 | RETVAL = SvUTF8(sv) ? 1 : 0; |
2c674647 |
490 | if (RETVAL && |
491 | SvTRUE(check) && |
492 | !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) |
493 | RETVAL = FALSE; |
494 | } else { |
495 | RETVAL = FALSE; |
496 | } |
497 | } |
498 | OUTPUT: |
499 | RETVAL |
500 | |
501 | SV * |
502 | _on_utf8(sv) |
503 | SV * sv |
504 | CODE: |
505 | { |
506 | if (SvPOK(sv)) { |
87714904 |
507 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 |
508 | RETVAL = rsv; |
509 | SvUTF8_on(sv); |
510 | } else { |
511 | RETVAL = &PL_sv_undef; |
512 | } |
513 | } |
514 | OUTPUT: |
515 | RETVAL |
516 | |
517 | SV * |
518 | _off_utf8(sv) |
519 | SV * sv |
520 | CODE: |
521 | { |
522 | if (SvPOK(sv)) { |
87714904 |
523 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 |
524 | RETVAL = rsv; |
525 | SvUTF8_off(sv); |
526 | } else { |
527 | RETVAL = &PL_sv_undef; |
528 | } |
529 | } |
530 | OUTPUT: |
531 | RETVAL |
532 | |
533 | SV * |
534 | _utf_to_utf(sv, from, to, ...) |
535 | SV * sv |
536 | SV * from |
537 | SV * to |
538 | CODE: |
539 | { |
540 | SV * check = items == 4 ? ST(3) : Nullsv; |
541 | RETVAL = &PL_sv_undef; |
542 | } |
543 | OUTPUT: |
544 | RETVAL |
545 | |
33af2bc7 |
546 | BOOT: |
547 | { |
548 | #ifdef USE_PERLIO |
549 | PerlIO_define_layer(&PerlIO_encode); |
550 | #endif |
551 | } |