GCC __atribute__ / printf fix
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
2c674647 1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
2f2b4ff2 4#define U8 U8
5#include "encode.h"
6#include "iso8859.h"
7#include "EBCDIC.h"
8#include "Symbols.h"
2c674647 9
67e989fb 10#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
2f5768b8 11 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 12 return (y)0; /* fool picky compilers */ \
87714904 13 }
67e989fb 14UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
16
33af2bc7 17#ifdef USE_PERLIO
72e44f29 18/* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
23
24 The layer works by overloading the "fill" and "flush" methods.
25
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
29
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
33
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
38
39*/
40
41
33af2bc7 42#include "perliol.h"
43
44typedef struct
45{
46 PerlIOBuf base; /* PerlIOBuf stuff */
47 SV * bufsv;
48 SV * enc;
49} PerlIOEncode;
50
51
52IV
53PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len)
54{
55 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
56 dTHX;
57 dSP;
58 IV code;
59 code = PerlIOBuf_pushed(f,mode,Nullch,0);
60 ENTER;
61 SAVETMPS;
62 PUSHMARK(sp);
63 XPUSHs(sv_2mortal(newSVpv("Encode",0)));
64 XPUSHs(sv_2mortal(newSVpvn(arg,len)));
65 PUTBACK;
66 if (perl_call_method("getEncoding",G_SCALAR) != 1)
52744f63 67 {
68 /* should never happen */
69 Perl_die(aTHX_ "Encode::getEncoding did not return a value");
70 return -1;
71 }
33af2bc7 72 SPAGAIN;
73 e->enc = POPs;
74 PUTBACK;
75 if (!SvROK(e->enc))
52744f63 76 {
77 e->enc = Nullsv;
78 errno = EINVAL;
79 Perl_warner(aTHX_ WARN_IO, "Cannot find encoding \"%.*s\"", (int) len, arg);
80 return -1;
81 }
33af2bc7 82 SvREFCNT_inc(e->enc);
83 FREETMPS;
84 LEAVE;
85 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
86 return code;
87}
88
89IV
90PerlIOEncode_popped(PerlIO *f)
91{
92 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
93 dTHX;
94 if (e->enc)
95 {
96 SvREFCNT_dec(e->enc);
97 e->enc = Nullsv;
98 }
99 if (e->bufsv)
100 {
101 SvREFCNT_dec(e->bufsv);
102 e->bufsv = Nullsv;
103 }
104 return 0;
105}
106
107STDCHAR *
108PerlIOEncode_get_base(PerlIO *f)
109{
110 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
111 dTHX;
112 if (!e->base.bufsiz)
113 e->base.bufsiz = 1024;
114 if (!e->bufsv)
115 {
116 e->bufsv = newSV(e->base.bufsiz);
117 sv_setpvn(e->bufsv,"",0);
118 }
62e8870c 119 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
33af2bc7 120 if (!e->base.ptr)
121 e->base.ptr = e->base.buf;
122 if (!e->base.end)
123 e->base.end = e->base.buf;
124 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
125 {
126 Perl_warn(aTHX_ " ptr %p(%p)%p",
127 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
128 abort();
129 }
130 if (SvLEN(e->bufsv) < e->base.bufsiz)
131 {
132 SSize_t poff = e->base.ptr - e->base.buf;
133 SSize_t eoff = e->base.end - e->base.buf;
62e8870c 134 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
33af2bc7 135 e->base.ptr = e->base.buf + poff;
136 e->base.end = e->base.buf + eoff;
137 }
138 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
139 {
140 Perl_warn(aTHX_ " ptr %p(%p)%p",
141 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
142 abort();
143 }
144 return e->base.buf;
145}
146
33af2bc7 147IV
148PerlIOEncode_fill(PerlIO *f)
149{
150 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
151 dTHX;
152 dSP;
153 IV code;
33af2bc7 154 code = PerlIOBuf_fill(f);
155 if (code == 0)
156 {
157 SV *uni;
72e44f29 158 STRLEN len;
159 char *s;
160 /* Set SV that is the buffer to be buf..ptr */
33af2bc7 161 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
162 SvUTF8_off(e->bufsv);
163 ENTER;
164 SAVETMPS;
165 PUSHMARK(sp);
166 XPUSHs(e->enc);
167 XPUSHs(e->bufsv);
168 XPUSHs(&PL_sv_yes);
169 PUTBACK;
170 if (perl_call_method("toUnicode",G_SCALAR) != 1)
171 code = -1;
172 SPAGAIN;
173 uni = POPs;
174 PUTBACK;
72e44f29 175 /* Now get translated string (forced to UTF-8) and copy back to buffer
176 don't use sv_setsv as that may "steal" PV from returned temp
177 and so free() our known-large-enough buffer.
178 sv_setpvn() should do but let us do it long hand.
179 */
180 s = SvPVutf8(uni,len);
181 if (s != SvPVX(e->bufsv))
182 {
62e8870c 183 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 184 Move(s,e->base.buf,len,char);
185 SvCUR_set(e->bufsv,len);
186 }
187 SvUTF8_on(e->bufsv);
188 e->base.end = e->base.buf+len;
33af2bc7 189 e->base.ptr = e->base.buf;
190 FREETMPS;
191 LEAVE;
192 }
193 return code;
194}
195
196IV
197PerlIOEncode_flush(PerlIO *f)
198{
199 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
200 IV code = 0;
201 dTHX;
202 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
203 {
204 dSP;
205 SV *str;
206 char *s;
207 STRLEN len;
72e44f29 208 SSize_t left = 0;
209 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
210 {
211 /* This is really just a flag to see if we took all the data, if
212 we did PerlIOBase_flush avoids a seek to lower layer.
213 Need to revisit if we start getting clever with unreads or seeks-in-buffer
214 */
215 left = e->base.end - e->base.ptr;
216 }
33af2bc7 217 ENTER;
218 SAVETMPS;
219 PUSHMARK(sp);
220 XPUSHs(e->enc);
72e44f29 221 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
33af2bc7 222 SvUTF8_on(e->bufsv);
223 XPUSHs(e->bufsv);
224 XPUSHs(&PL_sv_yes);
225 PUTBACK;
226 if (perl_call_method("fromUnicode",G_SCALAR) != 1)
227 code = -1;
228 SPAGAIN;
229 str = POPs;
230 PUTBACK;
72e44f29 231 s = SvPV(str,len);
232 if (s != SvPVX(e->bufsv))
233 {
62e8870c 234 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 235 Move(s,e->base.buf,len,char);
236 SvCUR_set(e->bufsv,len);
237 }
33af2bc7 238 SvUTF8_off(e->bufsv);
72e44f29 239 e->base.ptr = e->base.buf+len;
240 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
241 e->base.end = e->base.ptr + left;
33af2bc7 242 FREETMPS;
243 LEAVE;
244 if (PerlIOBuf_flush(f) != 0)
245 code = -1;
246 }
247 return code;
248}
249
250IV
251PerlIOEncode_close(PerlIO *f)
252{
253 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
254 IV code = PerlIOBase_close(f);
255 dTHX;
256 if (e->bufsv)
257 {
258 SvREFCNT_dec(e->bufsv);
259 e->bufsv = Nullsv;
260 }
261 e->base.buf = NULL;
262 e->base.ptr = NULL;
263 e->base.end = NULL;
264 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
265 return code;
266}
267
72e44f29 268Off_t
269PerlIOEncode_tell(PerlIO *f)
270{
271 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
272 /* Unfortunately the only way to get a postion is to back-translate,
273 the UTF8-bytes we have buf..ptr and adjust accordingly.
274 But we will try and save any unread data in case stream
275 is un-seekable.
276 */
277 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
278 {
279 Size_t count = b->end - b->ptr;
280 PerlIO_push(f,&PerlIO_pending,"r",Nullch,0);
281 /* Save what we have left to read */
282 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
283 PerlIO_unread(f,b->ptr,count);
284 /* There isn't any unread data - we just saved it - so avoid the lower seek */
285 b->end = b->ptr;
286 /* Flush ourselves - now one layer down,
287 this does the back translate and adjusts position
288 */
289 PerlIO_flush(PerlIONext(f));
290 /* Set position of the saved data */
291 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
292 }
293 else
294 {
295 PerlIO_flush(f);
296 }
297 return b->posn;
298}
299
33af2bc7 300PerlIO_funcs PerlIO_encode = {
72e44f29 301 "encoding",
33af2bc7 302 sizeof(PerlIOEncode),
303 PERLIO_K_BUFFERED,
304 PerlIOBase_fileno,
305 PerlIOBuf_fdopen,
306 PerlIOBuf_open,
307 PerlIOBuf_reopen,
308 PerlIOEncode_pushed,
309 PerlIOEncode_popped,
310 PerlIOBuf_read,
311 PerlIOBuf_unread,
312 PerlIOBuf_write,
313 PerlIOBuf_seek,
72e44f29 314 PerlIOEncode_tell,
33af2bc7 315 PerlIOEncode_close,
316 PerlIOEncode_flush,
317 PerlIOEncode_fill,
318 PerlIOBase_eof,
319 PerlIOBase_error,
320 PerlIOBase_clearerr,
321 PerlIOBuf_setlinebuf,
322 PerlIOEncode_get_base,
323 PerlIOBuf_bufsiz,
324 PerlIOBuf_get_ptr,
325 PerlIOBuf_get_cnt,
326 PerlIOBuf_set_ptrcnt,
327};
328#endif
329
2f2b4ff2 330void
331Encode_Define(pTHX_ encode_t *enc)
332{
333 HV *hash = get_hv("Encode::encoding",GV_ADD|GV_ADDMULTI);
334 HV *stash = gv_stashpv("Encode::XS", TRUE);
335 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
dcda1f94 336 int i = 0;
337 while (enc->name[i])
338 {
339 const char *name = enc->name[i++];
340 hv_store(hash,name,strlen(name),SvREFCNT_inc(sv),0);
341 }
342 SvREFCNT_dec(sv);
2f2b4ff2 343}
344
183a2d84 345void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
67e989fb 346
2f2b4ff2 347static SV *
348encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
349{
350 STRLEN slen;
351 U8 *s = (U8 *) SvPV(src,slen);
352 SV *dst = sv_2mortal(newSV(2*slen+1));
353 if (slen)
354 {
355 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
356 STRLEN dlen = SvLEN(dst);
357 int code;
9b37254d 358 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
2f2b4ff2 359 {
360 SvCUR_set(dst,dlen);
361 SvPOK_on(dst);
9b37254d 362
363 if (code == ENCODE_FALLBACK)
364 break;
365
2f2b4ff2 366 switch(code)
367 {
368 case ENCODE_NOSPACE:
369 {
370 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
371 if (need <= SvLEN(dst))
372 need += UTF8_MAXLEN;
373 d = (U8 *) SvGROW(dst, need);
374 dlen = SvLEN(dst);
375 slen = SvCUR(src);
376 break;
377 }
378
379 case ENCODE_NOREP:
380 if (dir == enc->f_utf8)
381 {
382 if (!check && ckWARN_d(WARN_UTF8))
383 {
384 STRLEN clen;
385 UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0);
dcda1f94 386 Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%"UVxf"}\" does not map to %s", ch, enc->name[0]);
2f2b4ff2 387 /* FIXME: Skip over the character, copy in replacement and continue
388 * but that is messy so for now just fail.
389 */
390 return &PL_sv_undef;
391 }
392 else
393 {
394 return &PL_sv_undef;
395 }
396 }
397 else
398 {
399 /* UTF-8 is supposed to be "Universal" so should not happen */
400 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
dcda1f94 401 enc->name[0], (int)(SvCUR(src)-slen),s+slen);
2f2b4ff2 402 }
403 break;
404
405 case ENCODE_PARTIAL:
406 if (!check && ckWARN_d(WARN_UTF8))
407 {
408 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
dcda1f94 409 (dir == enc->f_utf8) ? "UTF-8" : enc->name[0]);
2f2b4ff2 410 }
411 return &PL_sv_undef;
412
413 default:
414 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
dcda1f94 415 code, (dir == enc->f_utf8) ? "to" : "from",enc->name[0]);
2f2b4ff2 416 return &PL_sv_undef;
417 }
418 }
419 SvCUR_set(dst,dlen);
420 SvPOK_on(dst);
421 if (check)
422 {
423 if (slen < SvCUR(src))
424 {
425 Move(s+slen,s,SvCUR(src)-slen,U8);
426 }
427 SvCUR_set(src,SvCUR(src)-slen);
428 }
429 }
430 return dst;
431}
432
7525822f 433MODULE = Encode PACKAGE = Encode PREFIX = sv_
434
435void
436valid_utf8(sv)
437SV * sv
438CODE:
439 {
440 STRLEN len;
441 char *s = SvPV(sv,len);
51516b5a 442 if (!SvUTF8(sv) || is_utf8_string((U8*)s,len))
7525822f 443 XSRETURN_YES;
444 else
445 XSRETURN_NO;
446 }
447
448void
449sv_utf8_encode(sv)
450SV * sv
451
452bool
453sv_utf8_decode(sv)
454SV * sv
455
4411f3b6 456STRLEN
7525822f 457sv_utf8_upgrade(sv)
458SV * sv
459
460bool
461sv_utf8_downgrade(sv,failok=0)
462SV * sv
463bool failok
464
2f2b4ff2 465MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_
466
467PROTOTYPES: ENABLE
468
469void
470Encode_toUnicode(obj,src,check = 0)
471SV * obj
472SV * src
473int check
474CODE:
475 {
476 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
477 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
478 SvUTF8_on(ST(0));
479 XSRETURN(1);
480 }
481
482void
483Encode_fromUnicode(obj,src,check = 0)
484SV * obj
485SV * src
486int check
487CODE:
488 {
489 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
490 sv_utf8_upgrade(src);
491 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
492 XSRETURN(1);
493 }
494
67e989fb 495MODULE = Encode PACKAGE = Encode
2c674647 496
497PROTOTYPES: ENABLE
498
67e989fb 499I32
2c674647 500_bytes_to_utf8(sv, ...)
67e989fb 501 SV * sv
2c674647 502 CODE:
67e989fb 503 {
504 SV * encoding = items == 2 ? ST(1) : Nullsv;
505
506 if (encoding)
507 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
508 else {
509 STRLEN len;
183a2d84 510 U8* s = (U8*)SvPV(sv, len);
67e989fb 511 U8* converted;
512
513 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 514 sv_setpvn(sv, (char *)converted, len);
67e989fb 515 SvUTF8_on(sv); /* XXX Should we? */
516 Safefree(converted); /* ... so free it */
517 RETVAL = len;
518 }
519 }
2c674647 520 OUTPUT:
67e989fb 521 RETVAL
2c674647 522
67e989fb 523I32
2c674647 524_utf8_to_bytes(sv, ...)
67e989fb 525 SV * sv
2c674647 526 CODE:
67e989fb 527 {
528 SV * to = items > 1 ? ST(1) : Nullsv;
529 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 530
67e989fb 531 if (to)
532 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
533 else {
67e989fb 534 STRLEN len;
b113ac0e 535 U8 *s = (U8*)SvPV(sv, len);
67e989fb 536
537 if (SvTRUE(check)) {
538 /* Must do things the slow way */
539 U8 *dest;
87714904 540 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 541 U8 *send = s + len;
542
543 New(83, dest, len, U8); /* I think */
544
545 while (s < send) {
546 if (*s < 0x80)
547 *dest++ = *s++;
548 else {
b113ac0e 549 STRLEN ulen;
550 UV uv = *s++;
87714904 551
67e989fb 552 /* Have to do it all ourselves because of error routine,
553 aargh. */
554 if (!(uv & 0x40))
555 goto failure;
556 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
557 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
558 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
559 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
560 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
561 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
562 else { ulen = 13; uv = 0; }
87714904 563
67e989fb 564 /* Note change to utf8.c variable naming, for variety */
565 while (ulen--) {
566 if ((*s & 0xc0) != 0x80)
567 goto failure;
87714904 568
67e989fb 569 else
570 uv = (uv << 6) | (*s++ & 0x3f);
87714904 571 }
67e989fb 572 if (uv > 256) {
573 failure:
574 call_failure(check, s, dest, src);
575 /* Now what happens? */
576 }
577 *dest++ = (U8)uv;
578 }
579 }
580 } else
581 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
582 }
2c674647 583 }
584 OUTPUT:
585 RETVAL
586
2c674647 587bool
4411f3b6 588is_utf8(sv, check = FALSE)
589SV * sv
590bool check
2c674647 591 CODE:
592 {
2c674647 593 if (SvPOK(sv)) {
4411f3b6 594 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
2c674647 595 if (RETVAL &&
4411f3b6 596 check &&
2c674647 597 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
598 RETVAL = FALSE;
599 } else {
600 RETVAL = FALSE;
601 }
602 }
603 OUTPUT:
604 RETVAL
605
606SV *
4411f3b6 607_utf8_on(sv)
2c674647 608 SV * sv
609 CODE:
610 {
611 if (SvPOK(sv)) {
87714904 612 SV *rsv = newSViv(SvUTF8(sv));
2c674647 613 RETVAL = rsv;
614 SvUTF8_on(sv);
615 } else {
616 RETVAL = &PL_sv_undef;
617 }
618 }
619 OUTPUT:
620 RETVAL
621
622SV *
4411f3b6 623_utf8_off(sv)
2c674647 624 SV * sv
625 CODE:
626 {
627 if (SvPOK(sv)) {
87714904 628 SV *rsv = newSViv(SvUTF8(sv));
2c674647 629 RETVAL = rsv;
630 SvUTF8_off(sv);
631 } else {
632 RETVAL = &PL_sv_undef;
633 }
634 }
635 OUTPUT:
636 RETVAL
637
33af2bc7 638BOOT:
639{
640#ifdef USE_PERLIO
641 PerlIO_define_layer(&PerlIO_encode);
642#endif
2f2b4ff2 643#include "iso8859.def"
644#include "EBCDIC.def"
645#include "Symbols.def"
33af2bc7 646}