Implement:
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
2c674647 1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
2f2b4ff2 4#define U8 U8
5#include "encode.h"
6#include "iso8859.h"
7#include "EBCDIC.h"
8#include "Symbols.h"
2c674647 9
67e989fb 10#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
2f5768b8 11 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 12 return (y)0; /* fool picky compilers */ \
87714904 13 }
67e989fb 14UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
16
9df9a5cd 17#if defined(USE_PERLIO) && !defined(USE_SFIO)
72e44f29 18/* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
23
24 The layer works by overloading the "fill" and "flush" methods.
25
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
29
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
33
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
38
39*/
40
41
33af2bc7 42#include "perliol.h"
43
44typedef struct
45{
46 PerlIOBuf base; /* PerlIOBuf stuff */
47 SV * bufsv;
48 SV * enc;
49} PerlIOEncode;
50
e3f3bf95 51SV *
52PerlIOEncode_getarg(PerlIO *f)
53{
54 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
55 SV *sv = &PL_sv_undef;
56 if (e->enc)
57 {
58 dSP;
59 ENTER;
60 SAVETMPS;
61 PUSHMARK(sp);
62 XPUSHs(e->enc);
63 PUTBACK;
64 if (perl_call_method("name",G_SCALAR) == 1)
65 {
66 SPAGAIN;
67 sv = newSVsv(POPs);
68 PUTBACK;
69 }
70 }
71 return sv;
72}
33af2bc7 73
74IV
e3f3bf95 75PerlIOEncode_pushed(PerlIO *f, const char *mode, SV *arg)
33af2bc7 76{
77 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
78 dTHX;
79 dSP;
80 IV code;
e3f3bf95 81 code = PerlIOBuf_pushed(f,mode,Nullsv);
33af2bc7 82 ENTER;
83 SAVETMPS;
84 PUSHMARK(sp);
e3f3bf95 85 XPUSHs(arg);
33af2bc7 86 PUTBACK;
51ef4e11 87 if (perl_call_pv("Encode::find_encoding",G_SCALAR) != 1)
52744f63 88 {
89 /* should never happen */
51ef4e11 90 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
52744f63 91 return -1;
92 }
33af2bc7 93 SPAGAIN;
94 e->enc = POPs;
95 PUTBACK;
96 if (!SvROK(e->enc))
52744f63 97 {
98 e->enc = Nullsv;
99 errno = EINVAL;
e3f3bf95 100 Perl_warner(aTHX_ WARN_IO, "Cannot find encoding \"%_\"", arg);
52744f63 101 return -1;
102 }
33af2bc7 103 SvREFCNT_inc(e->enc);
104 FREETMPS;
105 LEAVE;
106 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
107 return code;
108}
109
110IV
111PerlIOEncode_popped(PerlIO *f)
112{
113 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
114 dTHX;
115 if (e->enc)
116 {
117 SvREFCNT_dec(e->enc);
118 e->enc = Nullsv;
119 }
120 if (e->bufsv)
121 {
122 SvREFCNT_dec(e->bufsv);
123 e->bufsv = Nullsv;
124 }
125 return 0;
126}
127
128STDCHAR *
129PerlIOEncode_get_base(PerlIO *f)
130{
131 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
132 dTHX;
133 if (!e->base.bufsiz)
134 e->base.bufsiz = 1024;
135 if (!e->bufsv)
136 {
137 e->bufsv = newSV(e->base.bufsiz);
138 sv_setpvn(e->bufsv,"",0);
139 }
62e8870c 140 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
33af2bc7 141 if (!e->base.ptr)
142 e->base.ptr = e->base.buf;
143 if (!e->base.end)
144 e->base.end = e->base.buf;
145 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
146 {
147 Perl_warn(aTHX_ " ptr %p(%p)%p",
148 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
149 abort();
150 }
151 if (SvLEN(e->bufsv) < e->base.bufsiz)
152 {
153 SSize_t poff = e->base.ptr - e->base.buf;
154 SSize_t eoff = e->base.end - e->base.buf;
62e8870c 155 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
33af2bc7 156 e->base.ptr = e->base.buf + poff;
157 e->base.end = e->base.buf + eoff;
158 }
159 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
160 {
161 Perl_warn(aTHX_ " ptr %p(%p)%p",
162 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
163 abort();
164 }
165 return e->base.buf;
166}
167
33af2bc7 168IV
169PerlIOEncode_fill(PerlIO *f)
170{
171 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
172 dTHX;
173 dSP;
174 IV code;
33af2bc7 175 code = PerlIOBuf_fill(f);
176 if (code == 0)
177 {
178 SV *uni;
72e44f29 179 STRLEN len;
180 char *s;
181 /* Set SV that is the buffer to be buf..ptr */
33af2bc7 182 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
183 SvUTF8_off(e->bufsv);
184 ENTER;
185 SAVETMPS;
186 PUSHMARK(sp);
187 XPUSHs(e->enc);
188 XPUSHs(e->bufsv);
189 XPUSHs(&PL_sv_yes);
190 PUTBACK;
50d26985 191 if (perl_call_method("decode",G_SCALAR) != 1)
33af2bc7 192 code = -1;
193 SPAGAIN;
194 uni = POPs;
195 PUTBACK;
72e44f29 196 /* Now get translated string (forced to UTF-8) and copy back to buffer
197 don't use sv_setsv as that may "steal" PV from returned temp
198 and so free() our known-large-enough buffer.
199 sv_setpvn() should do but let us do it long hand.
200 */
201 s = SvPVutf8(uni,len);
202 if (s != SvPVX(e->bufsv))
203 {
62e8870c 204 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 205 Move(s,e->base.buf,len,char);
206 SvCUR_set(e->bufsv,len);
207 }
208 SvUTF8_on(e->bufsv);
209 e->base.end = e->base.buf+len;
33af2bc7 210 e->base.ptr = e->base.buf;
211 FREETMPS;
212 LEAVE;
213 }
214 return code;
215}
216
217IV
218PerlIOEncode_flush(PerlIO *f)
219{
220 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
221 IV code = 0;
222 dTHX;
223 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
224 {
225 dSP;
226 SV *str;
227 char *s;
228 STRLEN len;
72e44f29 229 SSize_t left = 0;
230 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
231 {
232 /* This is really just a flag to see if we took all the data, if
233 we did PerlIOBase_flush avoids a seek to lower layer.
234 Need to revisit if we start getting clever with unreads or seeks-in-buffer
235 */
236 left = e->base.end - e->base.ptr;
237 }
33af2bc7 238 ENTER;
239 SAVETMPS;
240 PUSHMARK(sp);
241 XPUSHs(e->enc);
72e44f29 242 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
33af2bc7 243 SvUTF8_on(e->bufsv);
244 XPUSHs(e->bufsv);
245 XPUSHs(&PL_sv_yes);
246 PUTBACK;
50d26985 247 if (perl_call_method("encode",G_SCALAR) != 1)
33af2bc7 248 code = -1;
249 SPAGAIN;
250 str = POPs;
251 PUTBACK;
72e44f29 252 s = SvPV(str,len);
253 if (s != SvPVX(e->bufsv))
254 {
62e8870c 255 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 256 Move(s,e->base.buf,len,char);
257 SvCUR_set(e->bufsv,len);
258 }
33af2bc7 259 SvUTF8_off(e->bufsv);
72e44f29 260 e->base.ptr = e->base.buf+len;
261 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
262 e->base.end = e->base.ptr + left;
33af2bc7 263 FREETMPS;
264 LEAVE;
265 if (PerlIOBuf_flush(f) != 0)
266 code = -1;
267 }
268 return code;
269}
270
271IV
272PerlIOEncode_close(PerlIO *f)
273{
274 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
275 IV code = PerlIOBase_close(f);
276 dTHX;
277 if (e->bufsv)
278 {
279 SvREFCNT_dec(e->bufsv);
280 e->bufsv = Nullsv;
281 }
282 e->base.buf = NULL;
283 e->base.ptr = NULL;
284 e->base.end = NULL;
285 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
286 return code;
287}
288
72e44f29 289Off_t
290PerlIOEncode_tell(PerlIO *f)
291{
a999f61b 292 dTHX;
72e44f29 293 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
294 /* Unfortunately the only way to get a postion is to back-translate,
295 the UTF8-bytes we have buf..ptr and adjust accordingly.
296 But we will try and save any unread data in case stream
297 is un-seekable.
298 */
299 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
300 {
301 Size_t count = b->end - b->ptr;
e3f3bf95 302 PerlIO_push(aTHX_ f,&PerlIO_pending,"r",Nullsv);
72e44f29 303 /* Save what we have left to read */
304 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
305 PerlIO_unread(f,b->ptr,count);
306 /* There isn't any unread data - we just saved it - so avoid the lower seek */
307 b->end = b->ptr;
308 /* Flush ourselves - now one layer down,
309 this does the back translate and adjusts position
310 */
311 PerlIO_flush(PerlIONext(f));
312 /* Set position of the saved data */
313 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
314 }
315 else
316 {
317 PerlIO_flush(f);
318 }
319 return b->posn;
320}
321
33af2bc7 322PerlIO_funcs PerlIO_encode = {
72e44f29 323 "encoding",
33af2bc7 324 sizeof(PerlIOEncode),
325 PERLIO_K_BUFFERED,
33af2bc7 326 PerlIOEncode_pushed,
327 PerlIOEncode_popped,
e3f3bf95 328 PerlIOBuf_open,
329 PerlIOEncode_getarg,
330 PerlIOBase_fileno,
33af2bc7 331 PerlIOBuf_read,
332 PerlIOBuf_unread,
333 PerlIOBuf_write,
334 PerlIOBuf_seek,
72e44f29 335 PerlIOEncode_tell,
33af2bc7 336 PerlIOEncode_close,
337 PerlIOEncode_flush,
338 PerlIOEncode_fill,
339 PerlIOBase_eof,
340 PerlIOBase_error,
341 PerlIOBase_clearerr,
f6c77cf1 342 PerlIOBase_setlinebuf,
33af2bc7 343 PerlIOEncode_get_base,
344 PerlIOBuf_bufsiz,
345 PerlIOBuf_get_ptr,
346 PerlIOBuf_get_cnt,
347 PerlIOBuf_set_ptrcnt,
348};
9df9a5cd 349#endif /* encode layer */
33af2bc7 350
2f2b4ff2 351void
352Encode_Define(pTHX_ encode_t *enc)
353{
51ef4e11 354 dSP;
2f2b4ff2 355 HV *stash = gv_stashpv("Encode::XS", TRUE);
356 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
dcda1f94 357 int i = 0;
51ef4e11 358 PUSHMARK(sp);
359 XPUSHs(sv);
dcda1f94 360 while (enc->name[i])
361 {
362 const char *name = enc->name[i++];
51ef4e11 363 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
dcda1f94 364 }
51ef4e11 365 PUTBACK;
366 call_pv("Encode::define_encoding",G_DISCARD);
dcda1f94 367 SvREFCNT_dec(sv);
2f2b4ff2 368}
369
183a2d84 370void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
67e989fb 371
2f2b4ff2 372static SV *
373encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
374{
375 STRLEN slen;
376 U8 *s = (U8 *) SvPV(src,slen);
377 SV *dst = sv_2mortal(newSV(2*slen+1));
378 if (slen)
379 {
380 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
381 STRLEN dlen = SvLEN(dst);
382 int code;
9b37254d 383 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
2f2b4ff2 384 {
385 SvCUR_set(dst,dlen);
386 SvPOK_on(dst);
9b37254d 387
388 if (code == ENCODE_FALLBACK)
389 break;
390
2f2b4ff2 391 switch(code)
392 {
393 case ENCODE_NOSPACE:
394 {
395 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
396 if (need <= SvLEN(dst))
397 need += UTF8_MAXLEN;
398 d = (U8 *) SvGROW(dst, need);
399 dlen = SvLEN(dst);
400 slen = SvCUR(src);
401 break;
402 }
403
404 case ENCODE_NOREP:
405 if (dir == enc->f_utf8)
406 {
407 if (!check && ckWARN_d(WARN_UTF8))
408 {
409 STRLEN clen;
9041c2e3 410 UV ch = utf8n_to_uvuni(s+slen,(SvCUR(src)-slen),&clen,0);
411 Perl_warner(aTHX_ WARN_UTF8, "\"\\N{U+%"UVxf"}\" does not map to %s", ch, enc->name[0]);
2f2b4ff2 412 /* FIXME: Skip over the character, copy in replacement and continue
413 * but that is messy so for now just fail.
414 */
415 return &PL_sv_undef;
416 }
417 else
418 {
419 return &PL_sv_undef;
420 }
421 }
422 else
423 {
424 /* UTF-8 is supposed to be "Universal" so should not happen */
425 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
dcda1f94 426 enc->name[0], (int)(SvCUR(src)-slen),s+slen);
2f2b4ff2 427 }
428 break;
429
430 case ENCODE_PARTIAL:
431 if (!check && ckWARN_d(WARN_UTF8))
432 {
433 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
dcda1f94 434 (dir == enc->f_utf8) ? "UTF-8" : enc->name[0]);
2f2b4ff2 435 }
436 return &PL_sv_undef;
437
438 default:
439 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
dcda1f94 440 code, (dir == enc->f_utf8) ? "to" : "from",enc->name[0]);
2f2b4ff2 441 return &PL_sv_undef;
442 }
443 }
444 SvCUR_set(dst,dlen);
445 SvPOK_on(dst);
446 if (check)
447 {
448 if (slen < SvCUR(src))
449 {
450 Move(s+slen,s,SvCUR(src)-slen,U8);
451 }
452 SvCUR_set(src,SvCUR(src)-slen);
453 }
454 }
455 return dst;
456}
457
50d26985 458MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
2f2b4ff2 459
460PROTOTYPES: ENABLE
461
462void
50d26985 463Method_decode(obj,src,check = 0)
2f2b4ff2 464SV * obj
465SV * src
466int check
467CODE:
468 {
469 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
470 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
471 SvUTF8_on(ST(0));
472 XSRETURN(1);
473 }
474
475void
50d26985 476Method_encode(obj,src,check = 0)
2f2b4ff2 477SV * obj
478SV * src
479int check
480CODE:
481 {
482 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
483 sv_utf8_upgrade(src);
484 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
485 XSRETURN(1);
486 }
487
67e989fb 488MODULE = Encode PACKAGE = Encode
2c674647 489
490PROTOTYPES: ENABLE
491
67e989fb 492I32
2c674647 493_bytes_to_utf8(sv, ...)
67e989fb 494 SV * sv
2c674647 495 CODE:
67e989fb 496 {
497 SV * encoding = items == 2 ? ST(1) : Nullsv;
498
499 if (encoding)
500 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
501 else {
502 STRLEN len;
183a2d84 503 U8* s = (U8*)SvPV(sv, len);
67e989fb 504 U8* converted;
505
506 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 507 sv_setpvn(sv, (char *)converted, len);
67e989fb 508 SvUTF8_on(sv); /* XXX Should we? */
509 Safefree(converted); /* ... so free it */
510 RETVAL = len;
511 }
512 }
2c674647 513 OUTPUT:
67e989fb 514 RETVAL
2c674647 515
67e989fb 516I32
2c674647 517_utf8_to_bytes(sv, ...)
67e989fb 518 SV * sv
2c674647 519 CODE:
67e989fb 520 {
521 SV * to = items > 1 ? ST(1) : Nullsv;
522 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 523
67e989fb 524 if (to)
525 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
526 else {
67e989fb 527 STRLEN len;
b113ac0e 528 U8 *s = (U8*)SvPV(sv, len);
67e989fb 529
530 if (SvTRUE(check)) {
531 /* Must do things the slow way */
532 U8 *dest;
87714904 533 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 534 U8 *send = s + len;
535
536 New(83, dest, len, U8); /* I think */
537
538 while (s < send) {
539 if (*s < 0x80)
540 *dest++ = *s++;
541 else {
b113ac0e 542 STRLEN ulen;
543 UV uv = *s++;
87714904 544
67e989fb 545 /* Have to do it all ourselves because of error routine,
546 aargh. */
547 if (!(uv & 0x40))
548 goto failure;
549 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
550 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
551 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
552 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
553 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
554 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
555 else { ulen = 13; uv = 0; }
87714904 556
67e989fb 557 /* Note change to utf8.c variable naming, for variety */
558 while (ulen--) {
559 if ((*s & 0xc0) != 0x80)
560 goto failure;
87714904 561
67e989fb 562 else
563 uv = (uv << 6) | (*s++ & 0x3f);
87714904 564 }
67e989fb 565 if (uv > 256) {
566 failure:
567 call_failure(check, s, dest, src);
568 /* Now what happens? */
569 }
570 *dest++ = (U8)uv;
571 }
572 }
573 } else
574 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
575 }
2c674647 576 }
577 OUTPUT:
578 RETVAL
579
2c674647 580bool
4411f3b6 581is_utf8(sv, check = FALSE)
582SV * sv
583bool check
2c674647 584 CODE:
585 {
2c674647 586 if (SvPOK(sv)) {
4411f3b6 587 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
2c674647 588 if (RETVAL &&
4411f3b6 589 check &&
2c674647 590 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
591 RETVAL = FALSE;
592 } else {
593 RETVAL = FALSE;
594 }
595 }
596 OUTPUT:
597 RETVAL
598
599SV *
4411f3b6 600_utf8_on(sv)
2c674647 601 SV * sv
602 CODE:
603 {
604 if (SvPOK(sv)) {
87714904 605 SV *rsv = newSViv(SvUTF8(sv));
2c674647 606 RETVAL = rsv;
607 SvUTF8_on(sv);
608 } else {
609 RETVAL = &PL_sv_undef;
610 }
611 }
612 OUTPUT:
613 RETVAL
614
615SV *
4411f3b6 616_utf8_off(sv)
2c674647 617 SV * sv
618 CODE:
619 {
620 if (SvPOK(sv)) {
87714904 621 SV *rsv = newSViv(SvUTF8(sv));
2c674647 622 RETVAL = rsv;
623 SvUTF8_off(sv);
624 } else {
625 RETVAL = &PL_sv_undef;
626 }
627 }
628 OUTPUT:
629 RETVAL
630
33af2bc7 631BOOT:
632{
6a59c517 633#if defined(USE_PERLIO) && !defined(USE_SFIO)
a999f61b 634 PerlIO_define_layer(aTHX_ &PerlIO_encode);
33af2bc7 635#endif
2f2b4ff2 636#include "iso8859.def"
637#include "EBCDIC.def"
638#include "Symbols.def"
33af2bc7 639}