"lose the it's", from Abhijit Menon-Sen.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
fcf2db38 1#define PERL_NO_GET_CONTEXT
2
2c674647 3#include "EXTERN.h"
4#include "perl.h"
5#include "XSUB.h"
2f2b4ff2 6#define U8 U8
7#include "encode.h"
8#include "iso8859.h"
9#include "EBCDIC.h"
10#include "Symbols.h"
2c674647 11
fcf2db38 12
13#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
2f5768b8 14 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 15 return (y)0; /* fool picky compilers */ \
87714904 16 }
67e989fb 17UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
18UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
19
9df9a5cd 20#if defined(USE_PERLIO) && !defined(USE_SFIO)
72e44f29 21/* Define an encoding "layer" in the perliol.h sense.
22 The layer defined here "inherits" in an object-oriented sense from the
23 "perlio" layer with its PerlIOBuf_* "methods".
24 The implementation is particularly efficient as until Encode settles down
25 there is no point in tryint to tune it.
26
27 The layer works by overloading the "fill" and "flush" methods.
28
29 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
30 to convert the encoded data to UTF-8 form, then copies it back to the
31 buffer. The "base class's" read methods then see the UTF-8 data.
32
33 "flush" transforms the UTF-8 data deposited by the "base class's write
34 method in the buffer back into the encoded form using the encode OO perl API,
35 then copies data back into the buffer and calls "SUPER::flush.
36
37 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
38 so that the the base class's "flush" sees the correct number of encoded chars
39 for positioning the seek pointer. (This double translation is the worst performance
40 issue - particularly with all-perl encode engine.)
41
42*/
43
44
33af2bc7 45#include "perliol.h"
46
47typedef struct
48{
49 PerlIOBuf base; /* PerlIOBuf stuff */
50 SV * bufsv;
51 SV * enc;
52} PerlIOEncode;
53
e3f3bf95 54SV *
55PerlIOEncode_getarg(PerlIO *f)
56{
fcf2db38 57 dTHX;
e3f3bf95 58 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
59 SV *sv = &PL_sv_undef;
60 if (e->enc)
61 {
62 dSP;
63 ENTER;
64 SAVETMPS;
65 PUSHMARK(sp);
66 XPUSHs(e->enc);
67 PUTBACK;
68 if (perl_call_method("name",G_SCALAR) == 1)
69 {
70 SPAGAIN;
71 sv = newSVsv(POPs);
72 PUTBACK;
73 }
74 }
75 return sv;
76}
33af2bc7 77
78IV
e3f3bf95 79PerlIOEncode_pushed(PerlIO *f, const char *mode, SV *arg)
33af2bc7 80{
81 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
82 dTHX;
83 dSP;
84 IV code;
e3f3bf95 85 code = PerlIOBuf_pushed(f,mode,Nullsv);
33af2bc7 86 ENTER;
87 SAVETMPS;
88 PUSHMARK(sp);
e3f3bf95 89 XPUSHs(arg);
33af2bc7 90 PUTBACK;
51ef4e11 91 if (perl_call_pv("Encode::find_encoding",G_SCALAR) != 1)
52744f63 92 {
93 /* should never happen */
51ef4e11 94 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
52744f63 95 return -1;
96 }
33af2bc7 97 SPAGAIN;
98 e->enc = POPs;
99 PUTBACK;
100 if (!SvROK(e->enc))
52744f63 101 {
102 e->enc = Nullsv;
103 errno = EINVAL;
29b291f7 104 Perl_warner(aTHX_ WARN_IO, "Cannot find encoding \"%"SVf"\"", arg);
52744f63 105 return -1;
106 }
33af2bc7 107 SvREFCNT_inc(e->enc);
108 FREETMPS;
109 LEAVE;
110 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
111 return code;
112}
113
114IV
115PerlIOEncode_popped(PerlIO *f)
116{
117 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
118 dTHX;
119 if (e->enc)
120 {
121 SvREFCNT_dec(e->enc);
122 e->enc = Nullsv;
123 }
124 if (e->bufsv)
125 {
126 SvREFCNT_dec(e->bufsv);
127 e->bufsv = Nullsv;
128 }
129 return 0;
130}
131
132STDCHAR *
133PerlIOEncode_get_base(PerlIO *f)
134{
135 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
136 dTHX;
137 if (!e->base.bufsiz)
138 e->base.bufsiz = 1024;
139 if (!e->bufsv)
140 {
141 e->bufsv = newSV(e->base.bufsiz);
142 sv_setpvn(e->bufsv,"",0);
143 }
62e8870c 144 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
33af2bc7 145 if (!e->base.ptr)
146 e->base.ptr = e->base.buf;
147 if (!e->base.end)
148 e->base.end = e->base.buf;
149 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
150 {
151 Perl_warn(aTHX_ " ptr %p(%p)%p",
152 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
153 abort();
154 }
155 if (SvLEN(e->bufsv) < e->base.bufsiz)
156 {
157 SSize_t poff = e->base.ptr - e->base.buf;
158 SSize_t eoff = e->base.end - e->base.buf;
62e8870c 159 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
33af2bc7 160 e->base.ptr = e->base.buf + poff;
161 e->base.end = e->base.buf + eoff;
162 }
163 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
164 {
165 Perl_warn(aTHX_ " ptr %p(%p)%p",
166 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
167 abort();
168 }
169 return e->base.buf;
170}
171
33af2bc7 172IV
173PerlIOEncode_fill(PerlIO *f)
174{
175 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
176 dTHX;
177 dSP;
178 IV code;
33af2bc7 179 code = PerlIOBuf_fill(f);
180 if (code == 0)
181 {
182 SV *uni;
72e44f29 183 STRLEN len;
184 char *s;
185 /* Set SV that is the buffer to be buf..ptr */
33af2bc7 186 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
187 SvUTF8_off(e->bufsv);
188 ENTER;
189 SAVETMPS;
190 PUSHMARK(sp);
191 XPUSHs(e->enc);
192 XPUSHs(e->bufsv);
193 XPUSHs(&PL_sv_yes);
194 PUTBACK;
50d26985 195 if (perl_call_method("decode",G_SCALAR) != 1)
33af2bc7 196 code = -1;
197 SPAGAIN;
198 uni = POPs;
199 PUTBACK;
72e44f29 200 /* Now get translated string (forced to UTF-8) and copy back to buffer
201 don't use sv_setsv as that may "steal" PV from returned temp
202 and so free() our known-large-enough buffer.
203 sv_setpvn() should do but let us do it long hand.
204 */
205 s = SvPVutf8(uni,len);
206 if (s != SvPVX(e->bufsv))
207 {
62e8870c 208 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 209 Move(s,e->base.buf,len,char);
210 SvCUR_set(e->bufsv,len);
211 }
212 SvUTF8_on(e->bufsv);
213 e->base.end = e->base.buf+len;
33af2bc7 214 e->base.ptr = e->base.buf;
215 FREETMPS;
216 LEAVE;
217 }
218 return code;
219}
220
221IV
222PerlIOEncode_flush(PerlIO *f)
223{
224 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
225 IV code = 0;
8040349a 226 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF))
227 &&(e->base.ptr > e->base.buf)
228 )
33af2bc7 229 {
8040349a 230 dTHX;
33af2bc7 231 dSP;
232 SV *str;
233 char *s;
234 STRLEN len;
72e44f29 235 SSize_t left = 0;
236 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
237 {
238 /* This is really just a flag to see if we took all the data, if
239 we did PerlIOBase_flush avoids a seek to lower layer.
240 Need to revisit if we start getting clever with unreads or seeks-in-buffer
241 */
242 left = e->base.end - e->base.ptr;
243 }
33af2bc7 244 ENTER;
245 SAVETMPS;
246 PUSHMARK(sp);
247 XPUSHs(e->enc);
72e44f29 248 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
33af2bc7 249 SvUTF8_on(e->bufsv);
250 XPUSHs(e->bufsv);
251 XPUSHs(&PL_sv_yes);
252 PUTBACK;
50d26985 253 if (perl_call_method("encode",G_SCALAR) != 1)
33af2bc7 254 code = -1;
255 SPAGAIN;
256 str = POPs;
257 PUTBACK;
72e44f29 258 s = SvPV(str,len);
259 if (s != SvPVX(e->bufsv))
260 {
62e8870c 261 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 262 Move(s,e->base.buf,len,char);
263 SvCUR_set(e->bufsv,len);
264 }
33af2bc7 265 SvUTF8_off(e->bufsv);
72e44f29 266 e->base.ptr = e->base.buf+len;
267 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
268 e->base.end = e->base.ptr + left;
33af2bc7 269 FREETMPS;
270 LEAVE;
271 if (PerlIOBuf_flush(f) != 0)
272 code = -1;
273 }
274 return code;
275}
276
277IV
278PerlIOEncode_close(PerlIO *f)
279{
280 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
281 IV code = PerlIOBase_close(f);
282 dTHX;
283 if (e->bufsv)
284 {
285 SvREFCNT_dec(e->bufsv);
286 e->bufsv = Nullsv;
287 }
288 e->base.buf = NULL;
289 e->base.ptr = NULL;
290 e->base.end = NULL;
291 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
292 return code;
293}
294
72e44f29 295Off_t
296PerlIOEncode_tell(PerlIO *f)
297{
a999f61b 298 dTHX;
72e44f29 299 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
300 /* Unfortunately the only way to get a postion is to back-translate,
301 the UTF8-bytes we have buf..ptr and adjust accordingly.
302 But we will try and save any unread data in case stream
303 is un-seekable.
304 */
305 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
306 {
307 Size_t count = b->end - b->ptr;
e3f3bf95 308 PerlIO_push(aTHX_ f,&PerlIO_pending,"r",Nullsv);
72e44f29 309 /* Save what we have left to read */
310 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
311 PerlIO_unread(f,b->ptr,count);
312 /* There isn't any unread data - we just saved it - so avoid the lower seek */
313 b->end = b->ptr;
314 /* Flush ourselves - now one layer down,
315 this does the back translate and adjusts position
316 */
317 PerlIO_flush(PerlIONext(f));
318 /* Set position of the saved data */
319 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
320 }
321 else
322 {
323 PerlIO_flush(f);
324 }
325 return b->posn;
326}
327
33af2bc7 328PerlIO_funcs PerlIO_encode = {
72e44f29 329 "encoding",
33af2bc7 330 sizeof(PerlIOEncode),
331 PERLIO_K_BUFFERED,
33af2bc7 332 PerlIOEncode_pushed,
333 PerlIOEncode_popped,
e3f3bf95 334 PerlIOBuf_open,
335 PerlIOEncode_getarg,
336 PerlIOBase_fileno,
33af2bc7 337 PerlIOBuf_read,
338 PerlIOBuf_unread,
339 PerlIOBuf_write,
340 PerlIOBuf_seek,
72e44f29 341 PerlIOEncode_tell,
33af2bc7 342 PerlIOEncode_close,
343 PerlIOEncode_flush,
344 PerlIOEncode_fill,
345 PerlIOBase_eof,
346 PerlIOBase_error,
347 PerlIOBase_clearerr,
f6c77cf1 348 PerlIOBase_setlinebuf,
33af2bc7 349 PerlIOEncode_get_base,
350 PerlIOBuf_bufsiz,
351 PerlIOBuf_get_ptr,
352 PerlIOBuf_get_cnt,
353 PerlIOBuf_set_ptrcnt,
354};
9df9a5cd 355#endif /* encode layer */
33af2bc7 356
2f2b4ff2 357void
358Encode_Define(pTHX_ encode_t *enc)
359{
51ef4e11 360 dSP;
2f2b4ff2 361 HV *stash = gv_stashpv("Encode::XS", TRUE);
362 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
dcda1f94 363 int i = 0;
51ef4e11 364 PUSHMARK(sp);
365 XPUSHs(sv);
dcda1f94 366 while (enc->name[i])
367 {
368 const char *name = enc->name[i++];
51ef4e11 369 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
dcda1f94 370 }
51ef4e11 371 PUTBACK;
372 call_pv("Encode::define_encoding",G_DISCARD);
dcda1f94 373 SvREFCNT_dec(sv);
2f2b4ff2 374}
375
183a2d84 376void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
67e989fb 377
2f2b4ff2 378static SV *
379encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
380{
381 STRLEN slen;
382 U8 *s = (U8 *) SvPV(src,slen);
383 SV *dst = sv_2mortal(newSV(2*slen+1));
384 if (slen)
385 {
386 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
387 STRLEN dlen = SvLEN(dst);
388 int code;
9b37254d 389 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
2f2b4ff2 390 {
391 SvCUR_set(dst,dlen);
392 SvPOK_on(dst);
9b37254d 393
394 if (code == ENCODE_FALLBACK)
395 break;
396
2f2b4ff2 397 switch(code)
398 {
399 case ENCODE_NOSPACE:
400 {
401 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
402 if (need <= SvLEN(dst))
403 need += UTF8_MAXLEN;
404 d = (U8 *) SvGROW(dst, need);
405 dlen = SvLEN(dst);
406 slen = SvCUR(src);
407 break;
408 }
409
410 case ENCODE_NOREP:
411 if (dir == enc->f_utf8)
412 {
413 if (!check && ckWARN_d(WARN_UTF8))
414 {
415 STRLEN clen;
9041c2e3 416 UV ch = utf8n_to_uvuni(s+slen,(SvCUR(src)-slen),&clen,0);
417 Perl_warner(aTHX_ WARN_UTF8, "\"\\N{U+%"UVxf"}\" does not map to %s", ch, enc->name[0]);
2f2b4ff2 418 /* FIXME: Skip over the character, copy in replacement and continue
419 * but that is messy so for now just fail.
420 */
421 return &PL_sv_undef;
422 }
423 else
424 {
425 return &PL_sv_undef;
426 }
427 }
428 else
429 {
430 /* UTF-8 is supposed to be "Universal" so should not happen */
431 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
dcda1f94 432 enc->name[0], (int)(SvCUR(src)-slen),s+slen);
2f2b4ff2 433 }
434 break;
435
436 case ENCODE_PARTIAL:
437 if (!check && ckWARN_d(WARN_UTF8))
438 {
439 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
dcda1f94 440 (dir == enc->f_utf8) ? "UTF-8" : enc->name[0]);
2f2b4ff2 441 }
442 return &PL_sv_undef;
443
444 default:
445 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
dcda1f94 446 code, (dir == enc->f_utf8) ? "to" : "from",enc->name[0]);
2f2b4ff2 447 return &PL_sv_undef;
448 }
449 }
450 SvCUR_set(dst,dlen);
451 SvPOK_on(dst);
452 if (check)
453 {
454 if (slen < SvCUR(src))
455 {
456 Move(s+slen,s,SvCUR(src)-slen,U8);
457 }
458 SvCUR_set(src,SvCUR(src)-slen);
459 }
460 }
8040349a 461 else
462 {
463 SvCUR_set(dst,slen);
464 SvPOK_on(dst);
465 }
2f2b4ff2 466 return dst;
467}
468
50d26985 469MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
2f2b4ff2 470
471PROTOTYPES: ENABLE
472
473void
691638dd 474Method_decode(obj,src,check = FALSE)
2f2b4ff2 475SV * obj
476SV * src
691638dd 477bool check
2f2b4ff2 478CODE:
479 {
480 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
481 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
482 SvUTF8_on(ST(0));
483 XSRETURN(1);
484 }
485
486void
691638dd 487Method_encode(obj,src,check = FALSE)
2f2b4ff2 488SV * obj
489SV * src
691638dd 490bool check
2f2b4ff2 491CODE:
492 {
493 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
494 sv_utf8_upgrade(src);
495 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
496 XSRETURN(1);
497 }
498
67e989fb 499MODULE = Encode PACKAGE = Encode
2c674647 500
501PROTOTYPES: ENABLE
502
67e989fb 503I32
2c674647 504_bytes_to_utf8(sv, ...)
67e989fb 505 SV * sv
2c674647 506 CODE:
67e989fb 507 {
508 SV * encoding = items == 2 ? ST(1) : Nullsv;
509
510 if (encoding)
511 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
512 else {
513 STRLEN len;
183a2d84 514 U8* s = (U8*)SvPV(sv, len);
67e989fb 515 U8* converted;
516
517 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 518 sv_setpvn(sv, (char *)converted, len);
67e989fb 519 SvUTF8_on(sv); /* XXX Should we? */
520 Safefree(converted); /* ... so free it */
521 RETVAL = len;
522 }
523 }
2c674647 524 OUTPUT:
67e989fb 525 RETVAL
2c674647 526
67e989fb 527I32
2c674647 528_utf8_to_bytes(sv, ...)
67e989fb 529 SV * sv
2c674647 530 CODE:
67e989fb 531 {
532 SV * to = items > 1 ? ST(1) : Nullsv;
533 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 534
67e989fb 535 if (to)
536 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
537 else {
67e989fb 538 STRLEN len;
b113ac0e 539 U8 *s = (U8*)SvPV(sv, len);
67e989fb 540
9c5ffd7c 541 RETVAL = 0;
67e989fb 542 if (SvTRUE(check)) {
543 /* Must do things the slow way */
544 U8 *dest;
87714904 545 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 546 U8 *send = s + len;
547
548 New(83, dest, len, U8); /* I think */
549
550 while (s < send) {
551 if (*s < 0x80)
552 *dest++ = *s++;
553 else {
b113ac0e 554 STRLEN ulen;
555 UV uv = *s++;
87714904 556
67e989fb 557 /* Have to do it all ourselves because of error routine,
558 aargh. */
559 if (!(uv & 0x40))
560 goto failure;
561 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
562 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
563 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
564 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
565 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
566 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
567 else { ulen = 13; uv = 0; }
87714904 568
67e989fb 569 /* Note change to utf8.c variable naming, for variety */
570 while (ulen--) {
571 if ((*s & 0xc0) != 0x80)
572 goto failure;
87714904 573
67e989fb 574 else
575 uv = (uv << 6) | (*s++ & 0x3f);
87714904 576 }
67e989fb 577 if (uv > 256) {
578 failure:
579 call_failure(check, s, dest, src);
580 /* Now what happens? */
581 }
582 *dest++ = (U8)uv;
583 }
584 }
585 } else
586 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
587 }
2c674647 588 }
589 OUTPUT:
590 RETVAL
591
2c674647 592bool
4411f3b6 593is_utf8(sv, check = FALSE)
594SV * sv
595bool check
2c674647 596 CODE:
597 {
2c674647 598 if (SvPOK(sv)) {
4411f3b6 599 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
2c674647 600 if (RETVAL &&
4411f3b6 601 check &&
2c674647 602 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
603 RETVAL = FALSE;
604 } else {
605 RETVAL = FALSE;
606 }
607 }
608 OUTPUT:
609 RETVAL
610
611SV *
4411f3b6 612_utf8_on(sv)
2c674647 613 SV * sv
614 CODE:
615 {
616 if (SvPOK(sv)) {
87714904 617 SV *rsv = newSViv(SvUTF8(sv));
2c674647 618 RETVAL = rsv;
619 SvUTF8_on(sv);
620 } else {
621 RETVAL = &PL_sv_undef;
622 }
623 }
624 OUTPUT:
625 RETVAL
626
627SV *
4411f3b6 628_utf8_off(sv)
2c674647 629 SV * sv
630 CODE:
631 {
632 if (SvPOK(sv)) {
87714904 633 SV *rsv = newSViv(SvUTF8(sv));
2c674647 634 RETVAL = rsv;
635 SvUTF8_off(sv);
636 } else {
637 RETVAL = &PL_sv_undef;
638 }
639 }
640 OUTPUT:
641 RETVAL
642
33af2bc7 643BOOT:
644{
6a59c517 645#if defined(USE_PERLIO) && !defined(USE_SFIO)
a999f61b 646 PerlIO_define_layer(aTHX_ &PerlIO_encode);
33af2bc7 647#endif
2f2b4ff2 648#include "iso8859.def"
649#include "EBCDIC.def"
650#include "Symbols.def"
33af2bc7 651}