Integrate mainline
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
2c674647 1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
4
67e989fb 5#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
2f5768b8 6 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 7 return (y)0; /* fool picky compilers */ \
87714904 8 }
67e989fb 9UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
10UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
11
33af2bc7 12#ifdef USE_PERLIO
72e44f29 13/* Define an encoding "layer" in the perliol.h sense.
14 The layer defined here "inherits" in an object-oriented sense from the
15 "perlio" layer with its PerlIOBuf_* "methods".
16 The implementation is particularly efficient as until Encode settles down
17 there is no point in tryint to tune it.
18
19 The layer works by overloading the "fill" and "flush" methods.
20
21 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
22 to convert the encoded data to UTF-8 form, then copies it back to the
23 buffer. The "base class's" read methods then see the UTF-8 data.
24
25 "flush" transforms the UTF-8 data deposited by the "base class's write
26 method in the buffer back into the encoded form using the encode OO perl API,
27 then copies data back into the buffer and calls "SUPER::flush.
28
29 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
30 so that the the base class's "flush" sees the correct number of encoded chars
31 for positioning the seek pointer. (This double translation is the worst performance
32 issue - particularly with all-perl encode engine.)
33
34*/
35
36
33af2bc7 37#include "perliol.h"
38
39typedef struct
40{
41 PerlIOBuf base; /* PerlIOBuf stuff */
42 SV * bufsv;
43 SV * enc;
44} PerlIOEncode;
45
46
47IV
48PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len)
49{
50 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
51 dTHX;
52 dSP;
53 IV code;
54 code = PerlIOBuf_pushed(f,mode,Nullch,0);
55 ENTER;
56 SAVETMPS;
57 PUSHMARK(sp);
58 XPUSHs(sv_2mortal(newSVpv("Encode",0)));
59 XPUSHs(sv_2mortal(newSVpvn(arg,len)));
60 PUTBACK;
61 if (perl_call_method("getEncoding",G_SCALAR) != 1)
62 return -1;
63 SPAGAIN;
64 e->enc = POPs;
65 PUTBACK;
66 if (!SvROK(e->enc))
67 return -1;
68 SvREFCNT_inc(e->enc);
69 FREETMPS;
70 LEAVE;
71 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
72 return code;
73}
74
75IV
76PerlIOEncode_popped(PerlIO *f)
77{
78 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
79 dTHX;
80 if (e->enc)
81 {
82 SvREFCNT_dec(e->enc);
83 e->enc = Nullsv;
84 }
85 if (e->bufsv)
86 {
87 SvREFCNT_dec(e->bufsv);
88 e->bufsv = Nullsv;
89 }
90 return 0;
91}
92
93STDCHAR *
94PerlIOEncode_get_base(PerlIO *f)
95{
96 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
97 dTHX;
98 if (!e->base.bufsiz)
99 e->base.bufsiz = 1024;
100 if (!e->bufsv)
101 {
102 e->bufsv = newSV(e->base.bufsiz);
103 sv_setpvn(e->bufsv,"",0);
104 }
105 e->base.buf = SvPVX(e->bufsv);
106 if (!e->base.ptr)
107 e->base.ptr = e->base.buf;
108 if (!e->base.end)
109 e->base.end = e->base.buf;
110 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
111 {
112 Perl_warn(aTHX_ " ptr %p(%p)%p",
113 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
114 abort();
115 }
116 if (SvLEN(e->bufsv) < e->base.bufsiz)
117 {
118 SSize_t poff = e->base.ptr - e->base.buf;
119 SSize_t eoff = e->base.end - e->base.buf;
120 e->base.buf = SvGROW(e->bufsv,e->base.bufsiz);
121 e->base.ptr = e->base.buf + poff;
122 e->base.end = e->base.buf + eoff;
123 }
124 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
125 {
126 Perl_warn(aTHX_ " ptr %p(%p)%p",
127 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
128 abort();
129 }
130 return e->base.buf;
131}
132
33af2bc7 133IV
134PerlIOEncode_fill(PerlIO *f)
135{
136 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
137 dTHX;
138 dSP;
139 IV code;
33af2bc7 140 code = PerlIOBuf_fill(f);
141 if (code == 0)
142 {
143 SV *uni;
72e44f29 144 STRLEN len;
145 char *s;
146 /* Set SV that is the buffer to be buf..ptr */
33af2bc7 147 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
148 SvUTF8_off(e->bufsv);
149 ENTER;
150 SAVETMPS;
151 PUSHMARK(sp);
152 XPUSHs(e->enc);
153 XPUSHs(e->bufsv);
154 XPUSHs(&PL_sv_yes);
155 PUTBACK;
156 if (perl_call_method("toUnicode",G_SCALAR) != 1)
157 code = -1;
158 SPAGAIN;
159 uni = POPs;
160 PUTBACK;
72e44f29 161 /* Now get translated string (forced to UTF-8) and copy back to buffer
162 don't use sv_setsv as that may "steal" PV from returned temp
163 and so free() our known-large-enough buffer.
164 sv_setpvn() should do but let us do it long hand.
165 */
166 s = SvPVutf8(uni,len);
167 if (s != SvPVX(e->bufsv))
168 {
169 e->base.buf = SvGROW(e->bufsv,len);
170 Move(s,e->base.buf,len,char);
171 SvCUR_set(e->bufsv,len);
172 }
173 SvUTF8_on(e->bufsv);
174 e->base.end = e->base.buf+len;
33af2bc7 175 e->base.ptr = e->base.buf;
176 FREETMPS;
177 LEAVE;
178 }
179 return code;
180}
181
182IV
183PerlIOEncode_flush(PerlIO *f)
184{
185 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
186 IV code = 0;
187 dTHX;
188 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
189 {
190 dSP;
191 SV *str;
192 char *s;
193 STRLEN len;
72e44f29 194 SSize_t left = 0;
195 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
196 {
197 /* This is really just a flag to see if we took all the data, if
198 we did PerlIOBase_flush avoids a seek to lower layer.
199 Need to revisit if we start getting clever with unreads or seeks-in-buffer
200 */
201 left = e->base.end - e->base.ptr;
202 }
33af2bc7 203 ENTER;
204 SAVETMPS;
205 PUSHMARK(sp);
206 XPUSHs(e->enc);
72e44f29 207 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
33af2bc7 208 SvUTF8_on(e->bufsv);
209 XPUSHs(e->bufsv);
210 XPUSHs(&PL_sv_yes);
211 PUTBACK;
212 if (perl_call_method("fromUnicode",G_SCALAR) != 1)
213 code = -1;
214 SPAGAIN;
215 str = POPs;
216 PUTBACK;
72e44f29 217 s = SvPV(str,len);
218 if (s != SvPVX(e->bufsv))
219 {
220 e->base.buf = SvGROW(e->bufsv,len);
221 Move(s,e->base.buf,len,char);
222 SvCUR_set(e->bufsv,len);
223 }
33af2bc7 224 SvUTF8_off(e->bufsv);
72e44f29 225 e->base.ptr = e->base.buf+len;
226 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
227 e->base.end = e->base.ptr + left;
33af2bc7 228 FREETMPS;
229 LEAVE;
230 if (PerlIOBuf_flush(f) != 0)
231 code = -1;
232 }
233 return code;
234}
235
236IV
237PerlIOEncode_close(PerlIO *f)
238{
239 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
240 IV code = PerlIOBase_close(f);
241 dTHX;
242 if (e->bufsv)
243 {
244 SvREFCNT_dec(e->bufsv);
245 e->bufsv = Nullsv;
246 }
247 e->base.buf = NULL;
248 e->base.ptr = NULL;
249 e->base.end = NULL;
250 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
251 return code;
252}
253
72e44f29 254Off_t
255PerlIOEncode_tell(PerlIO *f)
256{
257 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
258 /* Unfortunately the only way to get a postion is to back-translate,
259 the UTF8-bytes we have buf..ptr and adjust accordingly.
260 But we will try and save any unread data in case stream
261 is un-seekable.
262 */
263 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
264 {
265 Size_t count = b->end - b->ptr;
266 PerlIO_push(f,&PerlIO_pending,"r",Nullch,0);
267 /* Save what we have left to read */
268 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
269 PerlIO_unread(f,b->ptr,count);
270 /* There isn't any unread data - we just saved it - so avoid the lower seek */
271 b->end = b->ptr;
272 /* Flush ourselves - now one layer down,
273 this does the back translate and adjusts position
274 */
275 PerlIO_flush(PerlIONext(f));
276 /* Set position of the saved data */
277 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
278 }
279 else
280 {
281 PerlIO_flush(f);
282 }
283 return b->posn;
284}
285
33af2bc7 286PerlIO_funcs PerlIO_encode = {
72e44f29 287 "encoding",
33af2bc7 288 sizeof(PerlIOEncode),
289 PERLIO_K_BUFFERED,
290 PerlIOBase_fileno,
291 PerlIOBuf_fdopen,
292 PerlIOBuf_open,
293 PerlIOBuf_reopen,
294 PerlIOEncode_pushed,
295 PerlIOEncode_popped,
296 PerlIOBuf_read,
297 PerlIOBuf_unread,
298 PerlIOBuf_write,
299 PerlIOBuf_seek,
72e44f29 300 PerlIOEncode_tell,
33af2bc7 301 PerlIOEncode_close,
302 PerlIOEncode_flush,
303 PerlIOEncode_fill,
304 PerlIOBase_eof,
305 PerlIOBase_error,
306 PerlIOBase_clearerr,
307 PerlIOBuf_setlinebuf,
308 PerlIOEncode_get_base,
309 PerlIOBuf_bufsiz,
310 PerlIOBuf_get_ptr,
311 PerlIOBuf_get_cnt,
312 PerlIOBuf_set_ptrcnt,
313};
314#endif
315
183a2d84 316void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
67e989fb 317
318MODULE = Encode PACKAGE = Encode
2c674647 319
320PROTOTYPES: ENABLE
321
67e989fb 322I32
2c674647 323_bytes_to_utf8(sv, ...)
67e989fb 324 SV * sv
2c674647 325 CODE:
67e989fb 326 {
327 SV * encoding = items == 2 ? ST(1) : Nullsv;
328
329 if (encoding)
330 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
331 else {
332 STRLEN len;
183a2d84 333 U8* s = (U8*)SvPV(sv, len);
67e989fb 334 U8* converted;
335
336 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 337 sv_setpvn(sv, (char *)converted, len);
67e989fb 338 SvUTF8_on(sv); /* XXX Should we? */
339 Safefree(converted); /* ... so free it */
340 RETVAL = len;
341 }
342 }
2c674647 343 OUTPUT:
67e989fb 344 RETVAL
2c674647 345
67e989fb 346I32
2c674647 347_utf8_to_bytes(sv, ...)
67e989fb 348 SV * sv
2c674647 349 CODE:
67e989fb 350 {
351 SV * to = items > 1 ? ST(1) : Nullsv;
352 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 353
67e989fb 354 if (to)
355 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
356 else {
67e989fb 357 STRLEN len;
b113ac0e 358 U8 *s = (U8*)SvPV(sv, len);
67e989fb 359
360 if (SvTRUE(check)) {
361 /* Must do things the slow way */
362 U8 *dest;
87714904 363 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 364 U8 *send = s + len;
365
366 New(83, dest, len, U8); /* I think */
367
368 while (s < send) {
369 if (*s < 0x80)
370 *dest++ = *s++;
371 else {
b113ac0e 372 STRLEN ulen;
373 UV uv = *s++;
87714904 374
67e989fb 375 /* Have to do it all ourselves because of error routine,
376 aargh. */
377 if (!(uv & 0x40))
378 goto failure;
379 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
380 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
381 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
382 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
383 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
384 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
385 else { ulen = 13; uv = 0; }
87714904 386
67e989fb 387 /* Note change to utf8.c variable naming, for variety */
388 while (ulen--) {
389 if ((*s & 0xc0) != 0x80)
390 goto failure;
87714904 391
67e989fb 392 else
393 uv = (uv << 6) | (*s++ & 0x3f);
87714904 394 }
67e989fb 395 if (uv > 256) {
396 failure:
397 call_failure(check, s, dest, src);
398 /* Now what happens? */
399 }
400 *dest++ = (U8)uv;
401 }
402 }
403 } else
404 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
405 }
2c674647 406 }
407 OUTPUT:
408 RETVAL
409
410SV *
411_chars_to_utf8(sv, from, ...)
412 SV * sv
413 SV * from
414 CODE:
415 {
416 SV * check = items == 3 ? ST(2) : Nullsv;
417 RETVAL = &PL_sv_undef;
418 }
419 OUTPUT:
420 RETVAL
421
422SV *
423_utf8_to_chars(sv, to, ...)
424 SV * sv
425 SV * to
426 CODE:
427 {
428 SV * check = items == 3 ? ST(2) : Nullsv;
429 RETVAL = &PL_sv_undef;
430 }
431 OUTPUT:
432 RETVAL
433
434SV *
435_utf8_to_chars_check(sv, ...)
436 SV * sv
437 CODE:
438 {
439 SV * check = items == 2 ? ST(1) : Nullsv;
440 RETVAL = &PL_sv_undef;
441 }
442 OUTPUT:
443 RETVAL
444
445SV *
446_bytes_to_chars(sv, from, ...)
447 SV * sv
448 SV * from
449 CODE:
450 {
451 SV * check = items == 3 ? ST(2) : Nullsv;
452 RETVAL = &PL_sv_undef;
453 }
454 OUTPUT:
455 RETVAL
456
457SV *
458_chars_to_bytes(sv, to, ...)
459 SV * sv
460 SV * to
461 CODE:
462 {
463 SV * check = items == 3 ? ST(2) : Nullsv;
464 RETVAL = &PL_sv_undef;
465 }
466 OUTPUT:
467 RETVAL
468
469SV *
470_from_to(sv, from, to, ...)
471 SV * sv
472 SV * from
473 SV * to
474 CODE:
475 {
476 SV * check = items == 4 ? ST(3) : Nullsv;
477 RETVAL = &PL_sv_undef;
478 }
479 OUTPUT:
480 RETVAL
481
482bool
483_is_utf8(sv, ...)
484 SV * sv
485 CODE:
486 {
487 SV * check = items == 2 ? ST(1) : Nullsv;
488 if (SvPOK(sv)) {
067a85ef 489 RETVAL = SvUTF8(sv) ? 1 : 0;
2c674647 490 if (RETVAL &&
491 SvTRUE(check) &&
492 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
493 RETVAL = FALSE;
494 } else {
495 RETVAL = FALSE;
496 }
497 }
498 OUTPUT:
499 RETVAL
500
501SV *
502_on_utf8(sv)
503 SV * sv
504 CODE:
505 {
506 if (SvPOK(sv)) {
87714904 507 SV *rsv = newSViv(SvUTF8(sv));
2c674647 508 RETVAL = rsv;
509 SvUTF8_on(sv);
510 } else {
511 RETVAL = &PL_sv_undef;
512 }
513 }
514 OUTPUT:
515 RETVAL
516
517SV *
518_off_utf8(sv)
519 SV * sv
520 CODE:
521 {
522 if (SvPOK(sv)) {
87714904 523 SV *rsv = newSViv(SvUTF8(sv));
2c674647 524 RETVAL = rsv;
525 SvUTF8_off(sv);
526 } else {
527 RETVAL = &PL_sv_undef;
528 }
529 }
530 OUTPUT:
531 RETVAL
532
533SV *
534_utf_to_utf(sv, from, to, ...)
535 SV * sv
536 SV * from
537 SV * to
538 CODE:
539 {
540 SV * check = items == 4 ? ST(3) : Nullsv;
541 RETVAL = &PL_sv_undef;
542 }
543 OUTPUT:
544 RETVAL
545
33af2bc7 546BOOT:
547{
548#ifdef USE_PERLIO
549 PerlIO_define_layer(&PerlIO_encode);
550#endif
551}