Fix -Wall on XS::Typemap
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
2c674647 1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
2f2b4ff2 4#define U8 U8
5#include "encode.h"
6#include "iso8859.h"
7#include "EBCDIC.h"
8#include "Symbols.h"
2c674647 9
67e989fb 10#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
2f5768b8 11 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 12 return (y)0; /* fool picky compilers */ \
87714904 13 }
67e989fb 14UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
16
9df9a5cd 17#if defined(USE_PERLIO) && !defined(USE_SFIO)
72e44f29 18/* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
23
24 The layer works by overloading the "fill" and "flush" methods.
25
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
29
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
33
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
38
39*/
40
41
33af2bc7 42#include "perliol.h"
43
44typedef struct
45{
46 PerlIOBuf base; /* PerlIOBuf stuff */
47 SV * bufsv;
48 SV * enc;
49} PerlIOEncode;
50
e3f3bf95 51SV *
52PerlIOEncode_getarg(PerlIO *f)
53{
54 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
55 SV *sv = &PL_sv_undef;
56 if (e->enc)
57 {
58 dSP;
59 ENTER;
60 SAVETMPS;
61 PUSHMARK(sp);
62 XPUSHs(e->enc);
63 PUTBACK;
64 if (perl_call_method("name",G_SCALAR) == 1)
65 {
66 SPAGAIN;
67 sv = newSVsv(POPs);
68 PUTBACK;
69 }
70 }
71 return sv;
72}
33af2bc7 73
74IV
e3f3bf95 75PerlIOEncode_pushed(PerlIO *f, const char *mode, SV *arg)
33af2bc7 76{
77 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
78 dTHX;
79 dSP;
80 IV code;
e3f3bf95 81 code = PerlIOBuf_pushed(f,mode,Nullsv);
33af2bc7 82 ENTER;
83 SAVETMPS;
84 PUSHMARK(sp);
e3f3bf95 85 XPUSHs(arg);
33af2bc7 86 PUTBACK;
51ef4e11 87 if (perl_call_pv("Encode::find_encoding",G_SCALAR) != 1)
52744f63 88 {
89 /* should never happen */
51ef4e11 90 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
52744f63 91 return -1;
92 }
33af2bc7 93 SPAGAIN;
94 e->enc = POPs;
95 PUTBACK;
96 if (!SvROK(e->enc))
52744f63 97 {
98 e->enc = Nullsv;
99 errno = EINVAL;
29b291f7 100 Perl_warner(aTHX_ WARN_IO, "Cannot find encoding \"%"SVf"\"", arg);
52744f63 101 return -1;
102 }
33af2bc7 103 SvREFCNT_inc(e->enc);
104 FREETMPS;
105 LEAVE;
106 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
107 return code;
108}
109
110IV
111PerlIOEncode_popped(PerlIO *f)
112{
113 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
114 dTHX;
115 if (e->enc)
116 {
117 SvREFCNT_dec(e->enc);
118 e->enc = Nullsv;
119 }
120 if (e->bufsv)
121 {
122 SvREFCNT_dec(e->bufsv);
123 e->bufsv = Nullsv;
124 }
125 return 0;
126}
127
128STDCHAR *
129PerlIOEncode_get_base(PerlIO *f)
130{
131 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
132 dTHX;
133 if (!e->base.bufsiz)
134 e->base.bufsiz = 1024;
135 if (!e->bufsv)
136 {
137 e->bufsv = newSV(e->base.bufsiz);
138 sv_setpvn(e->bufsv,"",0);
139 }
62e8870c 140 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
33af2bc7 141 if (!e->base.ptr)
142 e->base.ptr = e->base.buf;
143 if (!e->base.end)
144 e->base.end = e->base.buf;
145 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
146 {
147 Perl_warn(aTHX_ " ptr %p(%p)%p",
148 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
149 abort();
150 }
151 if (SvLEN(e->bufsv) < e->base.bufsiz)
152 {
153 SSize_t poff = e->base.ptr - e->base.buf;
154 SSize_t eoff = e->base.end - e->base.buf;
62e8870c 155 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
33af2bc7 156 e->base.ptr = e->base.buf + poff;
157 e->base.end = e->base.buf + eoff;
158 }
159 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
160 {
161 Perl_warn(aTHX_ " ptr %p(%p)%p",
162 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
163 abort();
164 }
165 return e->base.buf;
166}
167
33af2bc7 168IV
169PerlIOEncode_fill(PerlIO *f)
170{
171 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
172 dTHX;
173 dSP;
174 IV code;
33af2bc7 175 code = PerlIOBuf_fill(f);
176 if (code == 0)
177 {
178 SV *uni;
72e44f29 179 STRLEN len;
180 char *s;
181 /* Set SV that is the buffer to be buf..ptr */
33af2bc7 182 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
183 SvUTF8_off(e->bufsv);
184 ENTER;
185 SAVETMPS;
186 PUSHMARK(sp);
187 XPUSHs(e->enc);
188 XPUSHs(e->bufsv);
189 XPUSHs(&PL_sv_yes);
190 PUTBACK;
50d26985 191 if (perl_call_method("decode",G_SCALAR) != 1)
33af2bc7 192 code = -1;
193 SPAGAIN;
194 uni = POPs;
195 PUTBACK;
72e44f29 196 /* Now get translated string (forced to UTF-8) and copy back to buffer
197 don't use sv_setsv as that may "steal" PV from returned temp
198 and so free() our known-large-enough buffer.
199 sv_setpvn() should do but let us do it long hand.
200 */
201 s = SvPVutf8(uni,len);
202 if (s != SvPVX(e->bufsv))
203 {
62e8870c 204 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 205 Move(s,e->base.buf,len,char);
206 SvCUR_set(e->bufsv,len);
207 }
208 SvUTF8_on(e->bufsv);
209 e->base.end = e->base.buf+len;
33af2bc7 210 e->base.ptr = e->base.buf;
211 FREETMPS;
212 LEAVE;
213 }
214 return code;
215}
216
217IV
218PerlIOEncode_flush(PerlIO *f)
219{
220 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
221 IV code = 0;
8040349a 222 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF))
223 &&(e->base.ptr > e->base.buf)
224 )
33af2bc7 225 {
8040349a 226 dTHX;
33af2bc7 227 dSP;
228 SV *str;
229 char *s;
230 STRLEN len;
72e44f29 231 SSize_t left = 0;
232 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
233 {
234 /* This is really just a flag to see if we took all the data, if
235 we did PerlIOBase_flush avoids a seek to lower layer.
236 Need to revisit if we start getting clever with unreads or seeks-in-buffer
237 */
238 left = e->base.end - e->base.ptr;
239 }
33af2bc7 240 ENTER;
241 SAVETMPS;
242 PUSHMARK(sp);
243 XPUSHs(e->enc);
72e44f29 244 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
33af2bc7 245 SvUTF8_on(e->bufsv);
246 XPUSHs(e->bufsv);
247 XPUSHs(&PL_sv_yes);
248 PUTBACK;
50d26985 249 if (perl_call_method("encode",G_SCALAR) != 1)
33af2bc7 250 code = -1;
251 SPAGAIN;
252 str = POPs;
253 PUTBACK;
72e44f29 254 s = SvPV(str,len);
255 if (s != SvPVX(e->bufsv))
256 {
62e8870c 257 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 258 Move(s,e->base.buf,len,char);
259 SvCUR_set(e->bufsv,len);
260 }
33af2bc7 261 SvUTF8_off(e->bufsv);
72e44f29 262 e->base.ptr = e->base.buf+len;
263 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
264 e->base.end = e->base.ptr + left;
33af2bc7 265 FREETMPS;
266 LEAVE;
267 if (PerlIOBuf_flush(f) != 0)
268 code = -1;
269 }
270 return code;
271}
272
273IV
274PerlIOEncode_close(PerlIO *f)
275{
276 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
277 IV code = PerlIOBase_close(f);
278 dTHX;
279 if (e->bufsv)
280 {
281 SvREFCNT_dec(e->bufsv);
282 e->bufsv = Nullsv;
283 }
284 e->base.buf = NULL;
285 e->base.ptr = NULL;
286 e->base.end = NULL;
287 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
288 return code;
289}
290
72e44f29 291Off_t
292PerlIOEncode_tell(PerlIO *f)
293{
a999f61b 294 dTHX;
72e44f29 295 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
296 /* Unfortunately the only way to get a postion is to back-translate,
297 the UTF8-bytes we have buf..ptr and adjust accordingly.
298 But we will try and save any unread data in case stream
299 is un-seekable.
300 */
301 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
302 {
303 Size_t count = b->end - b->ptr;
e3f3bf95 304 PerlIO_push(aTHX_ f,&PerlIO_pending,"r",Nullsv);
72e44f29 305 /* Save what we have left to read */
306 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
307 PerlIO_unread(f,b->ptr,count);
308 /* There isn't any unread data - we just saved it - so avoid the lower seek */
309 b->end = b->ptr;
310 /* Flush ourselves - now one layer down,
311 this does the back translate and adjusts position
312 */
313 PerlIO_flush(PerlIONext(f));
314 /* Set position of the saved data */
315 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
316 }
317 else
318 {
319 PerlIO_flush(f);
320 }
321 return b->posn;
322}
323
33af2bc7 324PerlIO_funcs PerlIO_encode = {
72e44f29 325 "encoding",
33af2bc7 326 sizeof(PerlIOEncode),
327 PERLIO_K_BUFFERED,
33af2bc7 328 PerlIOEncode_pushed,
329 PerlIOEncode_popped,
e3f3bf95 330 PerlIOBuf_open,
331 PerlIOEncode_getarg,
332 PerlIOBase_fileno,
33af2bc7 333 PerlIOBuf_read,
334 PerlIOBuf_unread,
335 PerlIOBuf_write,
336 PerlIOBuf_seek,
72e44f29 337 PerlIOEncode_tell,
33af2bc7 338 PerlIOEncode_close,
339 PerlIOEncode_flush,
340 PerlIOEncode_fill,
341 PerlIOBase_eof,
342 PerlIOBase_error,
343 PerlIOBase_clearerr,
f6c77cf1 344 PerlIOBase_setlinebuf,
33af2bc7 345 PerlIOEncode_get_base,
346 PerlIOBuf_bufsiz,
347 PerlIOBuf_get_ptr,
348 PerlIOBuf_get_cnt,
349 PerlIOBuf_set_ptrcnt,
350};
9df9a5cd 351#endif /* encode layer */
33af2bc7 352
2f2b4ff2 353void
354Encode_Define(pTHX_ encode_t *enc)
355{
51ef4e11 356 dSP;
2f2b4ff2 357 HV *stash = gv_stashpv("Encode::XS", TRUE);
358 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
dcda1f94 359 int i = 0;
51ef4e11 360 PUSHMARK(sp);
361 XPUSHs(sv);
dcda1f94 362 while (enc->name[i])
363 {
364 const char *name = enc->name[i++];
51ef4e11 365 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
dcda1f94 366 }
51ef4e11 367 PUTBACK;
368 call_pv("Encode::define_encoding",G_DISCARD);
dcda1f94 369 SvREFCNT_dec(sv);
2f2b4ff2 370}
371
183a2d84 372void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
67e989fb 373
2f2b4ff2 374static SV *
375encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
376{
377 STRLEN slen;
378 U8 *s = (U8 *) SvPV(src,slen);
379 SV *dst = sv_2mortal(newSV(2*slen+1));
380 if (slen)
381 {
382 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
383 STRLEN dlen = SvLEN(dst);
384 int code;
9b37254d 385 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
2f2b4ff2 386 {
387 SvCUR_set(dst,dlen);
388 SvPOK_on(dst);
9b37254d 389
390 if (code == ENCODE_FALLBACK)
391 break;
392
2f2b4ff2 393 switch(code)
394 {
395 case ENCODE_NOSPACE:
396 {
397 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
398 if (need <= SvLEN(dst))
399 need += UTF8_MAXLEN;
400 d = (U8 *) SvGROW(dst, need);
401 dlen = SvLEN(dst);
402 slen = SvCUR(src);
403 break;
404 }
405
406 case ENCODE_NOREP:
407 if (dir == enc->f_utf8)
408 {
409 if (!check && ckWARN_d(WARN_UTF8))
410 {
411 STRLEN clen;
9041c2e3 412 UV ch = utf8n_to_uvuni(s+slen,(SvCUR(src)-slen),&clen,0);
413 Perl_warner(aTHX_ WARN_UTF8, "\"\\N{U+%"UVxf"}\" does not map to %s", ch, enc->name[0]);
2f2b4ff2 414 /* FIXME: Skip over the character, copy in replacement and continue
415 * but that is messy so for now just fail.
416 */
417 return &PL_sv_undef;
418 }
419 else
420 {
421 return &PL_sv_undef;
422 }
423 }
424 else
425 {
426 /* UTF-8 is supposed to be "Universal" so should not happen */
427 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
dcda1f94 428 enc->name[0], (int)(SvCUR(src)-slen),s+slen);
2f2b4ff2 429 }
430 break;
431
432 case ENCODE_PARTIAL:
433 if (!check && ckWARN_d(WARN_UTF8))
434 {
435 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
dcda1f94 436 (dir == enc->f_utf8) ? "UTF-8" : enc->name[0]);
2f2b4ff2 437 }
438 return &PL_sv_undef;
439
440 default:
441 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
dcda1f94 442 code, (dir == enc->f_utf8) ? "to" : "from",enc->name[0]);
2f2b4ff2 443 return &PL_sv_undef;
444 }
445 }
446 SvCUR_set(dst,dlen);
447 SvPOK_on(dst);
448 if (check)
449 {
450 if (slen < SvCUR(src))
451 {
452 Move(s+slen,s,SvCUR(src)-slen,U8);
453 }
454 SvCUR_set(src,SvCUR(src)-slen);
455 }
456 }
8040349a 457 else
458 {
459 SvCUR_set(dst,slen);
460 SvPOK_on(dst);
461 }
2f2b4ff2 462 return dst;
463}
464
50d26985 465MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
2f2b4ff2 466
467PROTOTYPES: ENABLE
468
469void
691638dd 470Method_decode(obj,src,check = FALSE)
2f2b4ff2 471SV * obj
472SV * src
691638dd 473bool check
2f2b4ff2 474CODE:
475 {
476 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
477 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
478 SvUTF8_on(ST(0));
479 XSRETURN(1);
480 }
481
482void
691638dd 483Method_encode(obj,src,check = FALSE)
2f2b4ff2 484SV * obj
485SV * src
691638dd 486bool check
2f2b4ff2 487CODE:
488 {
489 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
490 sv_utf8_upgrade(src);
491 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
492 XSRETURN(1);
493 }
494
67e989fb 495MODULE = Encode PACKAGE = Encode
2c674647 496
497PROTOTYPES: ENABLE
498
67e989fb 499I32
2c674647 500_bytes_to_utf8(sv, ...)
67e989fb 501 SV * sv
2c674647 502 CODE:
67e989fb 503 {
504 SV * encoding = items == 2 ? ST(1) : Nullsv;
505
506 if (encoding)
507 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
508 else {
509 STRLEN len;
183a2d84 510 U8* s = (U8*)SvPV(sv, len);
67e989fb 511 U8* converted;
512
513 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 514 sv_setpvn(sv, (char *)converted, len);
67e989fb 515 SvUTF8_on(sv); /* XXX Should we? */
516 Safefree(converted); /* ... so free it */
517 RETVAL = len;
518 }
519 }
2c674647 520 OUTPUT:
67e989fb 521 RETVAL
2c674647 522
67e989fb 523I32
2c674647 524_utf8_to_bytes(sv, ...)
67e989fb 525 SV * sv
2c674647 526 CODE:
67e989fb 527 {
528 SV * to = items > 1 ? ST(1) : Nullsv;
529 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 530
67e989fb 531 if (to)
532 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
533 else {
67e989fb 534 STRLEN len;
b113ac0e 535 U8 *s = (U8*)SvPV(sv, len);
67e989fb 536
9c5ffd7c 537 RETVAL = 0;
67e989fb 538 if (SvTRUE(check)) {
539 /* Must do things the slow way */
540 U8 *dest;
87714904 541 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 542 U8 *send = s + len;
543
544 New(83, dest, len, U8); /* I think */
545
546 while (s < send) {
547 if (*s < 0x80)
548 *dest++ = *s++;
549 else {
b113ac0e 550 STRLEN ulen;
551 UV uv = *s++;
87714904 552
67e989fb 553 /* Have to do it all ourselves because of error routine,
554 aargh. */
555 if (!(uv & 0x40))
556 goto failure;
557 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
558 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
559 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
560 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
561 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
562 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
563 else { ulen = 13; uv = 0; }
87714904 564
67e989fb 565 /* Note change to utf8.c variable naming, for variety */
566 while (ulen--) {
567 if ((*s & 0xc0) != 0x80)
568 goto failure;
87714904 569
67e989fb 570 else
571 uv = (uv << 6) | (*s++ & 0x3f);
87714904 572 }
67e989fb 573 if (uv > 256) {
574 failure:
575 call_failure(check, s, dest, src);
576 /* Now what happens? */
577 }
578 *dest++ = (U8)uv;
579 }
580 }
581 } else
582 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
583 }
2c674647 584 }
585 OUTPUT:
586 RETVAL
587
2c674647 588bool
4411f3b6 589is_utf8(sv, check = FALSE)
590SV * sv
591bool check
2c674647 592 CODE:
593 {
2c674647 594 if (SvPOK(sv)) {
4411f3b6 595 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
2c674647 596 if (RETVAL &&
4411f3b6 597 check &&
2c674647 598 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
599 RETVAL = FALSE;
600 } else {
601 RETVAL = FALSE;
602 }
603 }
604 OUTPUT:
605 RETVAL
606
607SV *
4411f3b6 608_utf8_on(sv)
2c674647 609 SV * sv
610 CODE:
611 {
612 if (SvPOK(sv)) {
87714904 613 SV *rsv = newSViv(SvUTF8(sv));
2c674647 614 RETVAL = rsv;
615 SvUTF8_on(sv);
616 } else {
617 RETVAL = &PL_sv_undef;
618 }
619 }
620 OUTPUT:
621 RETVAL
622
623SV *
4411f3b6 624_utf8_off(sv)
2c674647 625 SV * sv
626 CODE:
627 {
628 if (SvPOK(sv)) {
87714904 629 SV *rsv = newSViv(SvUTF8(sv));
2c674647 630 RETVAL = rsv;
631 SvUTF8_off(sv);
632 } else {
633 RETVAL = &PL_sv_undef;
634 }
635 }
636 OUTPUT:
637 RETVAL
638
33af2bc7 639BOOT:
640{
6a59c517 641#if defined(USE_PERLIO) && !defined(USE_SFIO)
a999f61b 642 PerlIO_define_layer(aTHX_ &PerlIO_encode);
33af2bc7 643#endif
2f2b4ff2 644#include "iso8859.def"
645#include "EBCDIC.def"
646#include "Symbols.def"
33af2bc7 647}