Once again syncing after too long an absence
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
22d4bb9c 1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
0e06870b 4#define U8 U8
5#include "encode.h"
6#include "iso8859.h"
7#include "EBCDIC.h"
8#include "Symbols.h"
22d4bb9c 9
10#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
11 Perl_croak(aTHX_ "panic_unimplemented"); \
12 return (y)0; /* fool picky compilers */ \
13 }
14UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
16
0e06870b 17#ifdef USE_PERLIO
18/* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
23
24 The layer works by overloading the "fill" and "flush" methods.
25
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
29
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
33
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
38
39*/
40
41
42#include "perliol.h"
43
44typedef struct
45{
46 PerlIOBuf base; /* PerlIOBuf stuff */
47 SV * bufsv;
48 SV * enc;
49} PerlIOEncode;
50
51
52IV
53PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len)
54{
55 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
56 dTHX;
57 dSP;
58 IV code;
59 code = PerlIOBuf_pushed(f,mode,Nullch,0);
60 ENTER;
61 SAVETMPS;
62 PUSHMARK(sp);
63 XPUSHs(sv_2mortal(newSVpv("Encode",0)));
64 XPUSHs(sv_2mortal(newSVpvn(arg,len)));
65 PUTBACK;
66 if (perl_call_method("getEncoding",G_SCALAR) != 1)
67 return -1;
68 SPAGAIN;
69 e->enc = POPs;
70 PUTBACK;
71 if (!SvROK(e->enc))
72 return -1;
73 SvREFCNT_inc(e->enc);
74 FREETMPS;
75 LEAVE;
76 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
77 return code;
78}
79
80IV
81PerlIOEncode_popped(PerlIO *f)
82{
83 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
84 dTHX;
85 if (e->enc)
86 {
87 SvREFCNT_dec(e->enc);
88 e->enc = Nullsv;
89 }
90 if (e->bufsv)
91 {
92 SvREFCNT_dec(e->bufsv);
93 e->bufsv = Nullsv;
94 }
95 return 0;
96}
97
98STDCHAR *
99PerlIOEncode_get_base(PerlIO *f)
100{
101 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
102 dTHX;
103 if (!e->base.bufsiz)
104 e->base.bufsiz = 1024;
105 if (!e->bufsv)
106 {
107 e->bufsv = newSV(e->base.bufsiz);
108 sv_setpvn(e->bufsv,"",0);
109 }
110 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
111 if (!e->base.ptr)
112 e->base.ptr = e->base.buf;
113 if (!e->base.end)
114 e->base.end = e->base.buf;
115 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
116 {
117 Perl_warn(aTHX_ " ptr %p(%p)%p",
118 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
119 abort();
120 }
121 if (SvLEN(e->bufsv) < e->base.bufsiz)
122 {
123 SSize_t poff = e->base.ptr - e->base.buf;
124 SSize_t eoff = e->base.end - e->base.buf;
125 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
126 e->base.ptr = e->base.buf + poff;
127 e->base.end = e->base.buf + eoff;
128 }
129 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
130 {
131 Perl_warn(aTHX_ " ptr %p(%p)%p",
132 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
133 abort();
134 }
135 return e->base.buf;
136}
137
138IV
139PerlIOEncode_fill(PerlIO *f)
140{
141 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
142 dTHX;
143 dSP;
144 IV code;
145 code = PerlIOBuf_fill(f);
146 if (code == 0)
147 {
148 SV *uni;
149 STRLEN len;
150 char *s;
151 /* Set SV that is the buffer to be buf..ptr */
152 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
153 SvUTF8_off(e->bufsv);
154 ENTER;
155 SAVETMPS;
156 PUSHMARK(sp);
157 XPUSHs(e->enc);
158 XPUSHs(e->bufsv);
159 XPUSHs(&PL_sv_yes);
160 PUTBACK;
161 if (perl_call_method("toUnicode",G_SCALAR) != 1)
162 code = -1;
163 SPAGAIN;
164 uni = POPs;
165 PUTBACK;
166 /* Now get translated string (forced to UTF-8) and copy back to buffer
167 don't use sv_setsv as that may "steal" PV from returned temp
168 and so free() our known-large-enough buffer.
169 sv_setpvn() should do but let us do it long hand.
170 */
171 s = SvPVutf8(uni,len);
172 if (s != SvPVX(e->bufsv))
173 {
174 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
175 Move(s,e->base.buf,len,char);
176 SvCUR_set(e->bufsv,len);
177 }
178 SvUTF8_on(e->bufsv);
179 e->base.end = e->base.buf+len;
180 e->base.ptr = e->base.buf;
181 FREETMPS;
182 LEAVE;
183 }
184 return code;
185}
186
187IV
188PerlIOEncode_flush(PerlIO *f)
189{
190 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
191 IV code = 0;
192 dTHX;
193 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
194 {
195 dSP;
196 SV *str;
197 char *s;
198 STRLEN len;
199 SSize_t left = 0;
200 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
201 {
202 /* This is really just a flag to see if we took all the data, if
203 we did PerlIOBase_flush avoids a seek to lower layer.
204 Need to revisit if we start getting clever with unreads or seeks-in-buffer
205 */
206 left = e->base.end - e->base.ptr;
207 }
208 ENTER;
209 SAVETMPS;
210 PUSHMARK(sp);
211 XPUSHs(e->enc);
212 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
213 SvUTF8_on(e->bufsv);
214 XPUSHs(e->bufsv);
215 XPUSHs(&PL_sv_yes);
216 PUTBACK;
217 if (perl_call_method("fromUnicode",G_SCALAR) != 1)
218 code = -1;
219 SPAGAIN;
220 str = POPs;
221 PUTBACK;
222 s = SvPV(str,len);
223 if (s != SvPVX(e->bufsv))
224 {
225 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
226 Move(s,e->base.buf,len,char);
227 SvCUR_set(e->bufsv,len);
228 }
229 SvUTF8_off(e->bufsv);
230 e->base.ptr = e->base.buf+len;
231 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
232 e->base.end = e->base.ptr + left;
233 FREETMPS;
234 LEAVE;
235 if (PerlIOBuf_flush(f) != 0)
236 code = -1;
237 }
238 return code;
239}
240
241IV
242PerlIOEncode_close(PerlIO *f)
243{
244 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
245 IV code = PerlIOBase_close(f);
246 dTHX;
247 if (e->bufsv)
248 {
249 SvREFCNT_dec(e->bufsv);
250 e->bufsv = Nullsv;
251 }
252 e->base.buf = NULL;
253 e->base.ptr = NULL;
254 e->base.end = NULL;
255 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
256 return code;
257}
258
259Off_t
260PerlIOEncode_tell(PerlIO *f)
261{
262 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
263 /* Unfortunately the only way to get a postion is to back-translate,
264 the UTF8-bytes we have buf..ptr and adjust accordingly.
265 But we will try and save any unread data in case stream
266 is un-seekable.
267 */
268 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
269 {
270 Size_t count = b->end - b->ptr;
271 PerlIO_push(f,&PerlIO_pending,"r",Nullch,0);
272 /* Save what we have left to read */
273 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
274 PerlIO_unread(f,b->ptr,count);
275 /* There isn't any unread data - we just saved it - so avoid the lower seek */
276 b->end = b->ptr;
277 /* Flush ourselves - now one layer down,
278 this does the back translate and adjusts position
279 */
280 PerlIO_flush(PerlIONext(f));
281 /* Set position of the saved data */
282 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
283 }
284 else
285 {
286 PerlIO_flush(f);
287 }
288 return b->posn;
289}
290
291PerlIO_funcs PerlIO_encode = {
292 "encoding",
293 sizeof(PerlIOEncode),
294 PERLIO_K_BUFFERED,
295 PerlIOBase_fileno,
296 PerlIOBuf_fdopen,
297 PerlIOBuf_open,
298 PerlIOBuf_reopen,
299 PerlIOEncode_pushed,
300 PerlIOEncode_popped,
301 PerlIOBuf_read,
302 PerlIOBuf_unread,
303 PerlIOBuf_write,
304 PerlIOBuf_seek,
305 PerlIOEncode_tell,
306 PerlIOEncode_close,
307 PerlIOEncode_flush,
308 PerlIOEncode_fill,
309 PerlIOBase_eof,
310 PerlIOBase_error,
311 PerlIOBase_clearerr,
312 PerlIOBuf_setlinebuf,
313 PerlIOEncode_get_base,
314 PerlIOBuf_bufsiz,
315 PerlIOBuf_get_ptr,
316 PerlIOBuf_get_cnt,
317 PerlIOBuf_set_ptrcnt,
318};
319#endif
320
321void
322Encode_Define(pTHX_ encode_t *enc)
323{
324 HV *hash = get_hv("Encode::encoding",GV_ADD|GV_ADDMULTI);
325 HV *stash = gv_stashpv("Encode::XS", TRUE);
326 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
327 hv_store(hash,enc->name,strlen(enc->name),sv,0);
328}
329
22d4bb9c 330void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
331
0e06870b 332static SV *
333encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
334{
335 STRLEN slen;
336 U8 *s = (U8 *) SvPV(src,slen);
337 SV *dst = sv_2mortal(newSV(2*slen+1));
338 if (slen)
339 {
340 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
341 STRLEN dlen = SvLEN(dst);
342 int code;
343 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
344 {
345 SvCUR_set(dst,dlen);
346 SvPOK_on(dst);
347
348 if (code == ENCODE_FALLBACK)
349 break;
350
351 switch(code)
352 {
353 case ENCODE_NOSPACE:
354 {
355 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
356 if (need <= SvLEN(dst))
357 need += UTF8_MAXLEN;
358 d = (U8 *) SvGROW(dst, need);
359 dlen = SvLEN(dst);
360 slen = SvCUR(src);
361 break;
362 }
363
364 case ENCODE_NOREP:
365 if (dir == enc->f_utf8)
366 {
367 if (!check && ckWARN_d(WARN_UTF8))
368 {
369 STRLEN clen;
370 UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0);
371 Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%x}\" does not map to %s", ch, enc->name);
372 /* FIXME: Skip over the character, copy in replacement and continue
373 * but that is messy so for now just fail.
374 */
375 return &PL_sv_undef;
376 }
377 else
378 {
379 return &PL_sv_undef;
380 }
381 }
382 else
383 {
384 /* UTF-8 is supposed to be "Universal" so should not happen */
385 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
386 enc->name, (SvCUR(src)-slen),s+slen);
387 }
388 break;
389
390 case ENCODE_PARTIAL:
391 if (!check && ckWARN_d(WARN_UTF8))
392 {
393 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
394 (dir == enc->f_utf8) ? "UTF-8" : enc->name);
395 }
396 return &PL_sv_undef;
397
398 default:
399 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
400 code, (dir == enc->f_utf8) ? "to" : "from",enc->name);
401 return &PL_sv_undef;
402 }
403 }
404 SvCUR_set(dst,dlen);
405 SvPOK_on(dst);
406 if (check)
407 {
408 if (slen < SvCUR(src))
409 {
410 Move(s+slen,s,SvCUR(src)-slen,U8);
411 }
412 SvCUR_set(src,SvCUR(src)-slen);
413 }
414 }
415 return dst;
416}
417
418MODULE = Encode PACKAGE = Encode PREFIX = sv_
419
420void
421valid_utf8(sv)
422SV * sv
423CODE:
424 {
425 STRLEN len;
426 char *s = SvPV(sv,len);
427 if (!SvUTF8(sv) || is_utf8_string((U8*)s,len))
428 XSRETURN_YES;
429 else
430 XSRETURN_NO;
431 }
432
433void
434sv_utf8_encode(sv)
435SV * sv
436
437bool
438sv_utf8_decode(sv)
439SV * sv
440
441void
442sv_utf8_upgrade(sv)
443SV * sv
444
445bool
446sv_utf8_downgrade(sv,failok=0)
447SV * sv
448bool failok
449
450MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_
451
452PROTOTYPES: ENABLE
453
454void
455Encode_toUnicode(obj,src,check = 0)
456SV * obj
457SV * src
458int check
459CODE:
460 {
461 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
462 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
463 SvUTF8_on(ST(0));
464 XSRETURN(1);
465 }
466
467void
468Encode_fromUnicode(obj,src,check = 0)
469SV * obj
470SV * src
471int check
472CODE:
473 {
474 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
475 sv_utf8_upgrade(src);
476 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
477 XSRETURN(1);
478 }
479
22d4bb9c 480MODULE = Encode PACKAGE = Encode
481
482PROTOTYPES: ENABLE
483
484I32
485_bytes_to_utf8(sv, ...)
486 SV * sv
487 CODE:
488 {
489 SV * encoding = items == 2 ? ST(1) : Nullsv;
490
491 if (encoding)
492 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
493 else {
494 STRLEN len;
495 U8* s = (U8*)SvPV(sv, len);
496 U8* converted;
497
498 converted = bytes_to_utf8(s, &len); /* This allocs */
499 sv_setpvn(sv, (char *)converted, len);
500 SvUTF8_on(sv); /* XXX Should we? */
501 Safefree(converted); /* ... so free it */
502 RETVAL = len;
503 }
504 }
505 OUTPUT:
506 RETVAL
507
508I32
509_utf8_to_bytes(sv, ...)
510 SV * sv
511 CODE:
512 {
513 SV * to = items > 1 ? ST(1) : Nullsv;
514 SV * check = items > 2 ? ST(2) : Nullsv;
515
516 if (to)
517 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
518 else {
519 STRLEN len;
520 U8 *s = (U8*)SvPV(sv, len);
521
522 if (SvTRUE(check)) {
523 /* Must do things the slow way */
524 U8 *dest;
525 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
526 U8 *send = s + len;
527
528 New(83, dest, len, U8); /* I think */
529
530 while (s < send) {
531 if (*s < 0x80)
532 *dest++ = *s++;
533 else {
534 STRLEN ulen;
535 UV uv = *s++;
536
537 /* Have to do it all ourselves because of error routine,
538 aargh. */
539 if (!(uv & 0x40))
540 goto failure;
541 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
542 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
543 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
544 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
545 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
546 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
547 else { ulen = 13; uv = 0; }
548
549 /* Note change to utf8.c variable naming, for variety */
550 while (ulen--) {
551 if ((*s & 0xc0) != 0x80)
552 goto failure;
553
554 else
555 uv = (uv << 6) | (*s++ & 0x3f);
556 }
557 if (uv > 256) {
558 failure:
559 call_failure(check, s, dest, src);
560 /* Now what happens? */
561 }
562 *dest++ = (U8)uv;
563 }
564 }
565 } else
566 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
567 }
568 }
569 OUTPUT:
570 RETVAL
571
572SV *
573_chars_to_utf8(sv, from, ...)
574 SV * sv
575 SV * from
576 CODE:
577 {
578 SV * check = items == 3 ? ST(2) : Nullsv;
579 RETVAL = &PL_sv_undef;
580 }
581 OUTPUT:
582 RETVAL
583
584SV *
585_utf8_to_chars(sv, to, ...)
586 SV * sv
587 SV * to
588 CODE:
589 {
590 SV * check = items == 3 ? ST(2) : Nullsv;
591 RETVAL = &PL_sv_undef;
592 }
593 OUTPUT:
594 RETVAL
595
596SV *
597_utf8_to_chars_check(sv, ...)
598 SV * sv
599 CODE:
600 {
601 SV * check = items == 2 ? ST(1) : Nullsv;
602 RETVAL = &PL_sv_undef;
603 }
604 OUTPUT:
605 RETVAL
606
607SV *
608_bytes_to_chars(sv, from, ...)
609 SV * sv
610 SV * from
611 CODE:
612 {
613 SV * check = items == 3 ? ST(2) : Nullsv;
614 RETVAL = &PL_sv_undef;
615 }
616 OUTPUT:
617 RETVAL
618
619SV *
620_chars_to_bytes(sv, to, ...)
621 SV * sv
622 SV * to
623 CODE:
624 {
625 SV * check = items == 3 ? ST(2) : Nullsv;
626 RETVAL = &PL_sv_undef;
627 }
628 OUTPUT:
629 RETVAL
630
631SV *
632_from_to(sv, from, to, ...)
633 SV * sv
634 SV * from
635 SV * to
636 CODE:
637 {
638 SV * check = items == 4 ? ST(3) : Nullsv;
639 RETVAL = &PL_sv_undef;
640 }
641 OUTPUT:
642 RETVAL
643
644bool
645_is_utf8(sv, ...)
646 SV * sv
647 CODE:
648 {
649 SV * check = items == 2 ? ST(1) : Nullsv;
650 if (SvPOK(sv)) {
0e06870b 651 RETVAL = SvUTF8(sv) ? 1 : 0;
22d4bb9c 652 if (RETVAL &&
653 SvTRUE(check) &&
654 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
655 RETVAL = FALSE;
656 } else {
657 RETVAL = FALSE;
658 }
659 }
660 OUTPUT:
661 RETVAL
662
663SV *
664_on_utf8(sv)
665 SV * sv
666 CODE:
667 {
668 if (SvPOK(sv)) {
669 SV *rsv = newSViv(SvUTF8(sv));
670 RETVAL = rsv;
671 SvUTF8_on(sv);
672 } else {
673 RETVAL = &PL_sv_undef;
674 }
675 }
676 OUTPUT:
677 RETVAL
678
679SV *
680_off_utf8(sv)
681 SV * sv
682 CODE:
683 {
684 if (SvPOK(sv)) {
685 SV *rsv = newSViv(SvUTF8(sv));
686 RETVAL = rsv;
687 SvUTF8_off(sv);
688 } else {
689 RETVAL = &PL_sv_undef;
690 }
691 }
692 OUTPUT:
693 RETVAL
694
695SV *
696_utf_to_utf(sv, from, to, ...)
697 SV * sv
698 SV * from
699 SV * to
700 CODE:
701 {
702 SV * check = items == 4 ? ST(3) : Nullsv;
703 RETVAL = &PL_sv_undef;
704 }
705 OUTPUT:
706 RETVAL
707
0e06870b 708BOOT:
709{
710#ifdef USE_PERLIO
711 PerlIO_define_layer(&PerlIO_encode);
712#endif
713#include "iso8859.def"
714#include "EBCDIC.def"
715#include "Symbols.def"
716}