Render -Duseperlio functional again.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
2c674647 1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
2f2b4ff2 4#define U8 U8
5#include "encode.h"
6#include "iso8859.h"
7#include "EBCDIC.h"
8#include "Symbols.h"
2c674647 9
67e989fb 10#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
2f5768b8 11 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 12 return (y)0; /* fool picky compilers */ \
87714904 13 }
67e989fb 14UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
16
9df9a5cd 17#if defined(USE_PERLIO) && !defined(USE_SFIO)
72e44f29 18/* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
23
24 The layer works by overloading the "fill" and "flush" methods.
25
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
29
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
33
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
38
39*/
40
41
33af2bc7 42#include "perliol.h"
43
44typedef struct
45{
46 PerlIOBuf base; /* PerlIOBuf stuff */
47 SV * bufsv;
48 SV * enc;
49} PerlIOEncode;
50
51
52IV
53PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len)
54{
55 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
56 dTHX;
57 dSP;
58 IV code;
59 code = PerlIOBuf_pushed(f,mode,Nullch,0);
60 ENTER;
61 SAVETMPS;
62 PUSHMARK(sp);
33af2bc7 63 XPUSHs(sv_2mortal(newSVpvn(arg,len)));
64 PUTBACK;
51ef4e11 65 if (perl_call_pv("Encode::find_encoding",G_SCALAR) != 1)
52744f63 66 {
67 /* should never happen */
51ef4e11 68 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
52744f63 69 return -1;
70 }
33af2bc7 71 SPAGAIN;
72 e->enc = POPs;
73 PUTBACK;
74 if (!SvROK(e->enc))
52744f63 75 {
76 e->enc = Nullsv;
77 errno = EINVAL;
78 Perl_warner(aTHX_ WARN_IO, "Cannot find encoding \"%.*s\"", (int) len, arg);
79 return -1;
80 }
33af2bc7 81 SvREFCNT_inc(e->enc);
82 FREETMPS;
83 LEAVE;
84 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
85 return code;
86}
87
88IV
89PerlIOEncode_popped(PerlIO *f)
90{
91 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
92 dTHX;
93 if (e->enc)
94 {
95 SvREFCNT_dec(e->enc);
96 e->enc = Nullsv;
97 }
98 if (e->bufsv)
99 {
100 SvREFCNT_dec(e->bufsv);
101 e->bufsv = Nullsv;
102 }
103 return 0;
104}
105
106STDCHAR *
107PerlIOEncode_get_base(PerlIO *f)
108{
109 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
110 dTHX;
111 if (!e->base.bufsiz)
112 e->base.bufsiz = 1024;
113 if (!e->bufsv)
114 {
115 e->bufsv = newSV(e->base.bufsiz);
116 sv_setpvn(e->bufsv,"",0);
117 }
62e8870c 118 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
33af2bc7 119 if (!e->base.ptr)
120 e->base.ptr = e->base.buf;
121 if (!e->base.end)
122 e->base.end = e->base.buf;
123 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
124 {
125 Perl_warn(aTHX_ " ptr %p(%p)%p",
126 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
127 abort();
128 }
129 if (SvLEN(e->bufsv) < e->base.bufsiz)
130 {
131 SSize_t poff = e->base.ptr - e->base.buf;
132 SSize_t eoff = e->base.end - e->base.buf;
62e8870c 133 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
33af2bc7 134 e->base.ptr = e->base.buf + poff;
135 e->base.end = e->base.buf + eoff;
136 }
137 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
138 {
139 Perl_warn(aTHX_ " ptr %p(%p)%p",
140 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
141 abort();
142 }
143 return e->base.buf;
144}
145
33af2bc7 146IV
147PerlIOEncode_fill(PerlIO *f)
148{
149 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
150 dTHX;
151 dSP;
152 IV code;
33af2bc7 153 code = PerlIOBuf_fill(f);
154 if (code == 0)
155 {
156 SV *uni;
72e44f29 157 STRLEN len;
158 char *s;
159 /* Set SV that is the buffer to be buf..ptr */
33af2bc7 160 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
161 SvUTF8_off(e->bufsv);
162 ENTER;
163 SAVETMPS;
164 PUSHMARK(sp);
165 XPUSHs(e->enc);
166 XPUSHs(e->bufsv);
167 XPUSHs(&PL_sv_yes);
168 PUTBACK;
50d26985 169 if (perl_call_method("decode",G_SCALAR) != 1)
33af2bc7 170 code = -1;
171 SPAGAIN;
172 uni = POPs;
173 PUTBACK;
72e44f29 174 /* Now get translated string (forced to UTF-8) and copy back to buffer
175 don't use sv_setsv as that may "steal" PV from returned temp
176 and so free() our known-large-enough buffer.
177 sv_setpvn() should do but let us do it long hand.
178 */
179 s = SvPVutf8(uni,len);
180 if (s != SvPVX(e->bufsv))
181 {
62e8870c 182 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 183 Move(s,e->base.buf,len,char);
184 SvCUR_set(e->bufsv,len);
185 }
186 SvUTF8_on(e->bufsv);
187 e->base.end = e->base.buf+len;
33af2bc7 188 e->base.ptr = e->base.buf;
189 FREETMPS;
190 LEAVE;
191 }
192 return code;
193}
194
195IV
196PerlIOEncode_flush(PerlIO *f)
197{
198 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
199 IV code = 0;
200 dTHX;
201 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
202 {
203 dSP;
204 SV *str;
205 char *s;
206 STRLEN len;
72e44f29 207 SSize_t left = 0;
208 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
209 {
210 /* This is really just a flag to see if we took all the data, if
211 we did PerlIOBase_flush avoids a seek to lower layer.
212 Need to revisit if we start getting clever with unreads or seeks-in-buffer
213 */
214 left = e->base.end - e->base.ptr;
215 }
33af2bc7 216 ENTER;
217 SAVETMPS;
218 PUSHMARK(sp);
219 XPUSHs(e->enc);
72e44f29 220 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
33af2bc7 221 SvUTF8_on(e->bufsv);
222 XPUSHs(e->bufsv);
223 XPUSHs(&PL_sv_yes);
224 PUTBACK;
50d26985 225 if (perl_call_method("encode",G_SCALAR) != 1)
33af2bc7 226 code = -1;
227 SPAGAIN;
228 str = POPs;
229 PUTBACK;
72e44f29 230 s = SvPV(str,len);
231 if (s != SvPVX(e->bufsv))
232 {
62e8870c 233 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 234 Move(s,e->base.buf,len,char);
235 SvCUR_set(e->bufsv,len);
236 }
33af2bc7 237 SvUTF8_off(e->bufsv);
72e44f29 238 e->base.ptr = e->base.buf+len;
239 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
240 e->base.end = e->base.ptr + left;
33af2bc7 241 FREETMPS;
242 LEAVE;
243 if (PerlIOBuf_flush(f) != 0)
244 code = -1;
245 }
246 return code;
247}
248
249IV
250PerlIOEncode_close(PerlIO *f)
251{
252 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
253 IV code = PerlIOBase_close(f);
254 dTHX;
255 if (e->bufsv)
256 {
257 SvREFCNT_dec(e->bufsv);
258 e->bufsv = Nullsv;
259 }
260 e->base.buf = NULL;
261 e->base.ptr = NULL;
262 e->base.end = NULL;
263 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
264 return code;
265}
266
72e44f29 267Off_t
268PerlIOEncode_tell(PerlIO *f)
269{
270 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
271 /* Unfortunately the only way to get a postion is to back-translate,
272 the UTF8-bytes we have buf..ptr and adjust accordingly.
273 But we will try and save any unread data in case stream
274 is un-seekable.
275 */
276 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
277 {
278 Size_t count = b->end - b->ptr;
279 PerlIO_push(f,&PerlIO_pending,"r",Nullch,0);
280 /* Save what we have left to read */
281 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
282 PerlIO_unread(f,b->ptr,count);
283 /* There isn't any unread data - we just saved it - so avoid the lower seek */
284 b->end = b->ptr;
285 /* Flush ourselves - now one layer down,
286 this does the back translate and adjusts position
287 */
288 PerlIO_flush(PerlIONext(f));
289 /* Set position of the saved data */
290 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
291 }
292 else
293 {
294 PerlIO_flush(f);
295 }
296 return b->posn;
297}
298
33af2bc7 299PerlIO_funcs PerlIO_encode = {
72e44f29 300 "encoding",
33af2bc7 301 sizeof(PerlIOEncode),
302 PERLIO_K_BUFFERED,
303 PerlIOBase_fileno,
33af2bc7 304 PerlIOBuf_open,
33af2bc7 305 PerlIOEncode_pushed,
306 PerlIOEncode_popped,
307 PerlIOBuf_read,
308 PerlIOBuf_unread,
309 PerlIOBuf_write,
310 PerlIOBuf_seek,
72e44f29 311 PerlIOEncode_tell,
33af2bc7 312 PerlIOEncode_close,
313 PerlIOEncode_flush,
314 PerlIOEncode_fill,
315 PerlIOBase_eof,
316 PerlIOBase_error,
317 PerlIOBase_clearerr,
318 PerlIOBuf_setlinebuf,
319 PerlIOEncode_get_base,
320 PerlIOBuf_bufsiz,
321 PerlIOBuf_get_ptr,
322 PerlIOBuf_get_cnt,
323 PerlIOBuf_set_ptrcnt,
324};
9df9a5cd 325#endif /* encode layer */
33af2bc7 326
2f2b4ff2 327void
328Encode_Define(pTHX_ encode_t *enc)
329{
51ef4e11 330 dSP;
2f2b4ff2 331 HV *stash = gv_stashpv("Encode::XS", TRUE);
332 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
dcda1f94 333 int i = 0;
51ef4e11 334 PUSHMARK(sp);
335 XPUSHs(sv);
dcda1f94 336 while (enc->name[i])
337 {
338 const char *name = enc->name[i++];
51ef4e11 339 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
dcda1f94 340 }
51ef4e11 341 PUTBACK;
342 call_pv("Encode::define_encoding",G_DISCARD);
dcda1f94 343 SvREFCNT_dec(sv);
2f2b4ff2 344}
345
183a2d84 346void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
67e989fb 347
2f2b4ff2 348static SV *
349encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
350{
351 STRLEN slen;
352 U8 *s = (U8 *) SvPV(src,slen);
353 SV *dst = sv_2mortal(newSV(2*slen+1));
354 if (slen)
355 {
356 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
357 STRLEN dlen = SvLEN(dst);
358 int code;
9b37254d 359 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
2f2b4ff2 360 {
361 SvCUR_set(dst,dlen);
362 SvPOK_on(dst);
9b37254d 363
364 if (code == ENCODE_FALLBACK)
365 break;
366
2f2b4ff2 367 switch(code)
368 {
369 case ENCODE_NOSPACE:
370 {
371 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
372 if (need <= SvLEN(dst))
373 need += UTF8_MAXLEN;
374 d = (U8 *) SvGROW(dst, need);
375 dlen = SvLEN(dst);
376 slen = SvCUR(src);
377 break;
378 }
379
380 case ENCODE_NOREP:
381 if (dir == enc->f_utf8)
382 {
383 if (!check && ckWARN_d(WARN_UTF8))
384 {
385 STRLEN clen;
9041c2e3 386 UV ch = utf8n_to_uvuni(s+slen,(SvCUR(src)-slen),&clen,0);
387 Perl_warner(aTHX_ WARN_UTF8, "\"\\N{U+%"UVxf"}\" does not map to %s", ch, enc->name[0]);
2f2b4ff2 388 /* FIXME: Skip over the character, copy in replacement and continue
389 * but that is messy so for now just fail.
390 */
391 return &PL_sv_undef;
392 }
393 else
394 {
395 return &PL_sv_undef;
396 }
397 }
398 else
399 {
400 /* UTF-8 is supposed to be "Universal" so should not happen */
401 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
dcda1f94 402 enc->name[0], (int)(SvCUR(src)-slen),s+slen);
2f2b4ff2 403 }
404 break;
405
406 case ENCODE_PARTIAL:
407 if (!check && ckWARN_d(WARN_UTF8))
408 {
409 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
dcda1f94 410 (dir == enc->f_utf8) ? "UTF-8" : enc->name[0]);
2f2b4ff2 411 }
412 return &PL_sv_undef;
413
414 default:
415 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
dcda1f94 416 code, (dir == enc->f_utf8) ? "to" : "from",enc->name[0]);
2f2b4ff2 417 return &PL_sv_undef;
418 }
419 }
420 SvCUR_set(dst,dlen);
421 SvPOK_on(dst);
422 if (check)
423 {
424 if (slen < SvCUR(src))
425 {
426 Move(s+slen,s,SvCUR(src)-slen,U8);
427 }
428 SvCUR_set(src,SvCUR(src)-slen);
429 }
430 }
431 return dst;
432}
433
50d26985 434MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
2f2b4ff2 435
436PROTOTYPES: ENABLE
437
438void
50d26985 439Method_decode(obj,src,check = 0)
2f2b4ff2 440SV * obj
441SV * src
442int check
443CODE:
444 {
445 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
446 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
447 SvUTF8_on(ST(0));
448 XSRETURN(1);
449 }
450
451void
50d26985 452Method_encode(obj,src,check = 0)
2f2b4ff2 453SV * obj
454SV * src
455int check
456CODE:
457 {
458 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
459 sv_utf8_upgrade(src);
460 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
461 XSRETURN(1);
462 }
463
67e989fb 464MODULE = Encode PACKAGE = Encode
2c674647 465
466PROTOTYPES: ENABLE
467
67e989fb 468I32
2c674647 469_bytes_to_utf8(sv, ...)
67e989fb 470 SV * sv
2c674647 471 CODE:
67e989fb 472 {
473 SV * encoding = items == 2 ? ST(1) : Nullsv;
474
475 if (encoding)
476 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
477 else {
478 STRLEN len;
183a2d84 479 U8* s = (U8*)SvPV(sv, len);
67e989fb 480 U8* converted;
481
482 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 483 sv_setpvn(sv, (char *)converted, len);
67e989fb 484 SvUTF8_on(sv); /* XXX Should we? */
485 Safefree(converted); /* ... so free it */
486 RETVAL = len;
487 }
488 }
2c674647 489 OUTPUT:
67e989fb 490 RETVAL
2c674647 491
67e989fb 492I32
2c674647 493_utf8_to_bytes(sv, ...)
67e989fb 494 SV * sv
2c674647 495 CODE:
67e989fb 496 {
497 SV * to = items > 1 ? ST(1) : Nullsv;
498 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 499
67e989fb 500 if (to)
501 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
502 else {
67e989fb 503 STRLEN len;
b113ac0e 504 U8 *s = (U8*)SvPV(sv, len);
67e989fb 505
506 if (SvTRUE(check)) {
507 /* Must do things the slow way */
508 U8 *dest;
87714904 509 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 510 U8 *send = s + len;
511
512 New(83, dest, len, U8); /* I think */
513
514 while (s < send) {
515 if (*s < 0x80)
516 *dest++ = *s++;
517 else {
b113ac0e 518 STRLEN ulen;
519 UV uv = *s++;
87714904 520
67e989fb 521 /* Have to do it all ourselves because of error routine,
522 aargh. */
523 if (!(uv & 0x40))
524 goto failure;
525 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
526 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
527 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
528 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
529 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
530 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
531 else { ulen = 13; uv = 0; }
87714904 532
67e989fb 533 /* Note change to utf8.c variable naming, for variety */
534 while (ulen--) {
535 if ((*s & 0xc0) != 0x80)
536 goto failure;
87714904 537
67e989fb 538 else
539 uv = (uv << 6) | (*s++ & 0x3f);
87714904 540 }
67e989fb 541 if (uv > 256) {
542 failure:
543 call_failure(check, s, dest, src);
544 /* Now what happens? */
545 }
546 *dest++ = (U8)uv;
547 }
548 }
549 } else
550 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
551 }
2c674647 552 }
553 OUTPUT:
554 RETVAL
555
2c674647 556bool
4411f3b6 557is_utf8(sv, check = FALSE)
558SV * sv
559bool check
2c674647 560 CODE:
561 {
2c674647 562 if (SvPOK(sv)) {
4411f3b6 563 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
2c674647 564 if (RETVAL &&
4411f3b6 565 check &&
2c674647 566 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
567 RETVAL = FALSE;
568 } else {
569 RETVAL = FALSE;
570 }
571 }
572 OUTPUT:
573 RETVAL
574
575SV *
4411f3b6 576_utf8_on(sv)
2c674647 577 SV * sv
578 CODE:
579 {
580 if (SvPOK(sv)) {
87714904 581 SV *rsv = newSViv(SvUTF8(sv));
2c674647 582 RETVAL = rsv;
583 SvUTF8_on(sv);
584 } else {
585 RETVAL = &PL_sv_undef;
586 }
587 }
588 OUTPUT:
589 RETVAL
590
591SV *
4411f3b6 592_utf8_off(sv)
2c674647 593 SV * sv
594 CODE:
595 {
596 if (SvPOK(sv)) {
87714904 597 SV *rsv = newSViv(SvUTF8(sv));
2c674647 598 RETVAL = rsv;
599 SvUTF8_off(sv);
600 } else {
601 RETVAL = &PL_sv_undef;
602 }
603 }
604 OUTPUT:
605 RETVAL
606
33af2bc7 607BOOT:
608{
6a59c517 609#if defined(USE_PERLIO) && !defined(USE_SFIO)
33af2bc7 610 PerlIO_define_layer(&PerlIO_encode);
611#endif
2f2b4ff2 612#include "iso8859.def"
613#include "EBCDIC.def"
614#include "Symbols.def"
33af2bc7 615}