Start of support of ICU-style .ucm files:
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
2c674647 1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
2f2b4ff2 4#define U8 U8
5#include "encode.h"
6#include "iso8859.h"
7#include "EBCDIC.h"
8#include "Symbols.h"
2c674647 9
67e989fb 10#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
2f5768b8 11 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 12 return (y)0; /* fool picky compilers */ \
87714904 13 }
67e989fb 14UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
16
33af2bc7 17#ifdef USE_PERLIO
72e44f29 18/* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
23
24 The layer works by overloading the "fill" and "flush" methods.
25
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
29
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
33
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
38
39*/
40
41
33af2bc7 42#include "perliol.h"
43
44typedef struct
45{
46 PerlIOBuf base; /* PerlIOBuf stuff */
47 SV * bufsv;
48 SV * enc;
49} PerlIOEncode;
50
51
52IV
53PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len)
54{
55 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
56 dTHX;
57 dSP;
58 IV code;
59 code = PerlIOBuf_pushed(f,mode,Nullch,0);
60 ENTER;
61 SAVETMPS;
62 PUSHMARK(sp);
63 XPUSHs(sv_2mortal(newSVpv("Encode",0)));
64 XPUSHs(sv_2mortal(newSVpvn(arg,len)));
65 PUTBACK;
66 if (perl_call_method("getEncoding",G_SCALAR) != 1)
67 return -1;
68 SPAGAIN;
69 e->enc = POPs;
70 PUTBACK;
71 if (!SvROK(e->enc))
72 return -1;
73 SvREFCNT_inc(e->enc);
74 FREETMPS;
75 LEAVE;
76 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
77 return code;
78}
79
80IV
81PerlIOEncode_popped(PerlIO *f)
82{
83 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
84 dTHX;
85 if (e->enc)
86 {
87 SvREFCNT_dec(e->enc);
88 e->enc = Nullsv;
89 }
90 if (e->bufsv)
91 {
92 SvREFCNT_dec(e->bufsv);
93 e->bufsv = Nullsv;
94 }
95 return 0;
96}
97
98STDCHAR *
99PerlIOEncode_get_base(PerlIO *f)
100{
101 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
102 dTHX;
103 if (!e->base.bufsiz)
104 e->base.bufsiz = 1024;
105 if (!e->bufsv)
106 {
107 e->bufsv = newSV(e->base.bufsiz);
108 sv_setpvn(e->bufsv,"",0);
109 }
62e8870c 110 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
33af2bc7 111 if (!e->base.ptr)
112 e->base.ptr = e->base.buf;
113 if (!e->base.end)
114 e->base.end = e->base.buf;
115 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
116 {
117 Perl_warn(aTHX_ " ptr %p(%p)%p",
118 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
119 abort();
120 }
121 if (SvLEN(e->bufsv) < e->base.bufsiz)
122 {
123 SSize_t poff = e->base.ptr - e->base.buf;
124 SSize_t eoff = e->base.end - e->base.buf;
62e8870c 125 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
33af2bc7 126 e->base.ptr = e->base.buf + poff;
127 e->base.end = e->base.buf + eoff;
128 }
129 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
130 {
131 Perl_warn(aTHX_ " ptr %p(%p)%p",
132 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
133 abort();
134 }
135 return e->base.buf;
136}
137
33af2bc7 138IV
139PerlIOEncode_fill(PerlIO *f)
140{
141 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
142 dTHX;
143 dSP;
144 IV code;
33af2bc7 145 code = PerlIOBuf_fill(f);
146 if (code == 0)
147 {
148 SV *uni;
72e44f29 149 STRLEN len;
150 char *s;
151 /* Set SV that is the buffer to be buf..ptr */
33af2bc7 152 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
153 SvUTF8_off(e->bufsv);
154 ENTER;
155 SAVETMPS;
156 PUSHMARK(sp);
157 XPUSHs(e->enc);
158 XPUSHs(e->bufsv);
159 XPUSHs(&PL_sv_yes);
160 PUTBACK;
161 if (perl_call_method("toUnicode",G_SCALAR) != 1)
162 code = -1;
163 SPAGAIN;
164 uni = POPs;
165 PUTBACK;
72e44f29 166 /* Now get translated string (forced to UTF-8) and copy back to buffer
167 don't use sv_setsv as that may "steal" PV from returned temp
168 and so free() our known-large-enough buffer.
169 sv_setpvn() should do but let us do it long hand.
170 */
171 s = SvPVutf8(uni,len);
172 if (s != SvPVX(e->bufsv))
173 {
62e8870c 174 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 175 Move(s,e->base.buf,len,char);
176 SvCUR_set(e->bufsv,len);
177 }
178 SvUTF8_on(e->bufsv);
179 e->base.end = e->base.buf+len;
33af2bc7 180 e->base.ptr = e->base.buf;
181 FREETMPS;
182 LEAVE;
183 }
184 return code;
185}
186
187IV
188PerlIOEncode_flush(PerlIO *f)
189{
190 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
191 IV code = 0;
192 dTHX;
193 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
194 {
195 dSP;
196 SV *str;
197 char *s;
198 STRLEN len;
72e44f29 199 SSize_t left = 0;
200 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
201 {
202 /* This is really just a flag to see if we took all the data, if
203 we did PerlIOBase_flush avoids a seek to lower layer.
204 Need to revisit if we start getting clever with unreads or seeks-in-buffer
205 */
206 left = e->base.end - e->base.ptr;
207 }
33af2bc7 208 ENTER;
209 SAVETMPS;
210 PUSHMARK(sp);
211 XPUSHs(e->enc);
72e44f29 212 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
33af2bc7 213 SvUTF8_on(e->bufsv);
214 XPUSHs(e->bufsv);
215 XPUSHs(&PL_sv_yes);
216 PUTBACK;
217 if (perl_call_method("fromUnicode",G_SCALAR) != 1)
218 code = -1;
219 SPAGAIN;
220 str = POPs;
221 PUTBACK;
72e44f29 222 s = SvPV(str,len);
223 if (s != SvPVX(e->bufsv))
224 {
62e8870c 225 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 226 Move(s,e->base.buf,len,char);
227 SvCUR_set(e->bufsv,len);
228 }
33af2bc7 229 SvUTF8_off(e->bufsv);
72e44f29 230 e->base.ptr = e->base.buf+len;
231 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
232 e->base.end = e->base.ptr + left;
33af2bc7 233 FREETMPS;
234 LEAVE;
235 if (PerlIOBuf_flush(f) != 0)
236 code = -1;
237 }
238 return code;
239}
240
241IV
242PerlIOEncode_close(PerlIO *f)
243{
244 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
245 IV code = PerlIOBase_close(f);
246 dTHX;
247 if (e->bufsv)
248 {
249 SvREFCNT_dec(e->bufsv);
250 e->bufsv = Nullsv;
251 }
252 e->base.buf = NULL;
253 e->base.ptr = NULL;
254 e->base.end = NULL;
255 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
256 return code;
257}
258
72e44f29 259Off_t
260PerlIOEncode_tell(PerlIO *f)
261{
262 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
263 /* Unfortunately the only way to get a postion is to back-translate,
264 the UTF8-bytes we have buf..ptr and adjust accordingly.
265 But we will try and save any unread data in case stream
266 is un-seekable.
267 */
268 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
269 {
270 Size_t count = b->end - b->ptr;
271 PerlIO_push(f,&PerlIO_pending,"r",Nullch,0);
272 /* Save what we have left to read */
273 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
274 PerlIO_unread(f,b->ptr,count);
275 /* There isn't any unread data - we just saved it - so avoid the lower seek */
276 b->end = b->ptr;
277 /* Flush ourselves - now one layer down,
278 this does the back translate and adjusts position
279 */
280 PerlIO_flush(PerlIONext(f));
281 /* Set position of the saved data */
282 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
283 }
284 else
285 {
286 PerlIO_flush(f);
287 }
288 return b->posn;
289}
290
33af2bc7 291PerlIO_funcs PerlIO_encode = {
72e44f29 292 "encoding",
33af2bc7 293 sizeof(PerlIOEncode),
294 PERLIO_K_BUFFERED,
295 PerlIOBase_fileno,
296 PerlIOBuf_fdopen,
297 PerlIOBuf_open,
298 PerlIOBuf_reopen,
299 PerlIOEncode_pushed,
300 PerlIOEncode_popped,
301 PerlIOBuf_read,
302 PerlIOBuf_unread,
303 PerlIOBuf_write,
304 PerlIOBuf_seek,
72e44f29 305 PerlIOEncode_tell,
33af2bc7 306 PerlIOEncode_close,
307 PerlIOEncode_flush,
308 PerlIOEncode_fill,
309 PerlIOBase_eof,
310 PerlIOBase_error,
311 PerlIOBase_clearerr,
312 PerlIOBuf_setlinebuf,
313 PerlIOEncode_get_base,
314 PerlIOBuf_bufsiz,
315 PerlIOBuf_get_ptr,
316 PerlIOBuf_get_cnt,
317 PerlIOBuf_set_ptrcnt,
318};
319#endif
320
2f2b4ff2 321void
322Encode_Define(pTHX_ encode_t *enc)
323{
324 HV *hash = get_hv("Encode::encoding",GV_ADD|GV_ADDMULTI);
325 HV *stash = gv_stashpv("Encode::XS", TRUE);
326 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
327 hv_store(hash,enc->name,strlen(enc->name),sv,0);
328}
329
183a2d84 330void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
67e989fb 331
2f2b4ff2 332static SV *
333encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
334{
335 STRLEN slen;
336 U8 *s = (U8 *) SvPV(src,slen);
337 SV *dst = sv_2mortal(newSV(2*slen+1));
338 if (slen)
339 {
340 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
341 STRLEN dlen = SvLEN(dst);
342 int code;
9b37254d 343 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
2f2b4ff2 344 {
345 SvCUR_set(dst,dlen);
346 SvPOK_on(dst);
9b37254d 347
348 if (code == ENCODE_FALLBACK)
349 break;
350
2f2b4ff2 351 switch(code)
352 {
353 case ENCODE_NOSPACE:
354 {
355 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
356 if (need <= SvLEN(dst))
357 need += UTF8_MAXLEN;
358 d = (U8 *) SvGROW(dst, need);
359 dlen = SvLEN(dst);
360 slen = SvCUR(src);
361 break;
362 }
363
364 case ENCODE_NOREP:
365 if (dir == enc->f_utf8)
366 {
367 if (!check && ckWARN_d(WARN_UTF8))
368 {
369 STRLEN clen;
370 UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0);
371 Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%x}\" does not map to %s", ch, enc->name);
372 /* FIXME: Skip over the character, copy in replacement and continue
373 * but that is messy so for now just fail.
374 */
375 return &PL_sv_undef;
376 }
377 else
378 {
379 return &PL_sv_undef;
380 }
381 }
382 else
383 {
384 /* UTF-8 is supposed to be "Universal" so should not happen */
385 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
386 enc->name, (SvCUR(src)-slen),s+slen);
387 }
388 break;
389
390 case ENCODE_PARTIAL:
391 if (!check && ckWARN_d(WARN_UTF8))
392 {
393 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
394 (dir == enc->f_utf8) ? "UTF-8" : enc->name);
395 }
396 return &PL_sv_undef;
397
398 default:
399 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
400 code, (dir == enc->f_utf8) ? "to" : "from",enc->name);
401 return &PL_sv_undef;
402 }
403 }
404 SvCUR_set(dst,dlen);
405 SvPOK_on(dst);
406 if (check)
407 {
408 if (slen < SvCUR(src))
409 {
410 Move(s+slen,s,SvCUR(src)-slen,U8);
411 }
412 SvCUR_set(src,SvCUR(src)-slen);
413 }
414 }
415 return dst;
416}
417
418MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_
419
420PROTOTYPES: ENABLE
421
422void
423Encode_toUnicode(obj,src,check = 0)
424SV * obj
425SV * src
426int check
427CODE:
428 {
429 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
430 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
431 SvUTF8_on(ST(0));
432 XSRETURN(1);
433 }
434
435void
436Encode_fromUnicode(obj,src,check = 0)
437SV * obj
438SV * src
439int check
440CODE:
441 {
442 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
443 sv_utf8_upgrade(src);
444 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
445 XSRETURN(1);
446 }
447
67e989fb 448MODULE = Encode PACKAGE = Encode
2c674647 449
450PROTOTYPES: ENABLE
451
67e989fb 452I32
2c674647 453_bytes_to_utf8(sv, ...)
67e989fb 454 SV * sv
2c674647 455 CODE:
67e989fb 456 {
457 SV * encoding = items == 2 ? ST(1) : Nullsv;
458
459 if (encoding)
460 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
461 else {
462 STRLEN len;
183a2d84 463 U8* s = (U8*)SvPV(sv, len);
67e989fb 464 U8* converted;
465
466 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 467 sv_setpvn(sv, (char *)converted, len);
67e989fb 468 SvUTF8_on(sv); /* XXX Should we? */
469 Safefree(converted); /* ... so free it */
470 RETVAL = len;
471 }
472 }
2c674647 473 OUTPUT:
67e989fb 474 RETVAL
2c674647 475
67e989fb 476I32
2c674647 477_utf8_to_bytes(sv, ...)
67e989fb 478 SV * sv
2c674647 479 CODE:
67e989fb 480 {
481 SV * to = items > 1 ? ST(1) : Nullsv;
482 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 483
67e989fb 484 if (to)
485 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
486 else {
67e989fb 487 STRLEN len;
b113ac0e 488 U8 *s = (U8*)SvPV(sv, len);
67e989fb 489
490 if (SvTRUE(check)) {
491 /* Must do things the slow way */
492 U8 *dest;
87714904 493 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 494 U8 *send = s + len;
495
496 New(83, dest, len, U8); /* I think */
497
498 while (s < send) {
499 if (*s < 0x80)
500 *dest++ = *s++;
501 else {
b113ac0e 502 STRLEN ulen;
503 UV uv = *s++;
87714904 504
67e989fb 505 /* Have to do it all ourselves because of error routine,
506 aargh. */
507 if (!(uv & 0x40))
508 goto failure;
509 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
510 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
511 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
512 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
513 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
514 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
515 else { ulen = 13; uv = 0; }
87714904 516
67e989fb 517 /* Note change to utf8.c variable naming, for variety */
518 while (ulen--) {
519 if ((*s & 0xc0) != 0x80)
520 goto failure;
87714904 521
67e989fb 522 else
523 uv = (uv << 6) | (*s++ & 0x3f);
87714904 524 }
67e989fb 525 if (uv > 256) {
526 failure:
527 call_failure(check, s, dest, src);
528 /* Now what happens? */
529 }
530 *dest++ = (U8)uv;
531 }
532 }
533 } else
534 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
535 }
2c674647 536 }
537 OUTPUT:
538 RETVAL
539
540SV *
541_chars_to_utf8(sv, from, ...)
542 SV * sv
543 SV * from
544 CODE:
545 {
546 SV * check = items == 3 ? ST(2) : Nullsv;
547 RETVAL = &PL_sv_undef;
548 }
549 OUTPUT:
550 RETVAL
551
552SV *
553_utf8_to_chars(sv, to, ...)
554 SV * sv
555 SV * to
556 CODE:
557 {
558 SV * check = items == 3 ? ST(2) : Nullsv;
559 RETVAL = &PL_sv_undef;
560 }
561 OUTPUT:
562 RETVAL
563
564SV *
565_utf8_to_chars_check(sv, ...)
566 SV * sv
567 CODE:
568 {
569 SV * check = items == 2 ? ST(1) : Nullsv;
570 RETVAL = &PL_sv_undef;
571 }
572 OUTPUT:
573 RETVAL
574
575SV *
576_bytes_to_chars(sv, from, ...)
577 SV * sv
578 SV * from
579 CODE:
580 {
581 SV * check = items == 3 ? ST(2) : Nullsv;
582 RETVAL = &PL_sv_undef;
583 }
584 OUTPUT:
585 RETVAL
586
587SV *
588_chars_to_bytes(sv, to, ...)
589 SV * sv
590 SV * to
591 CODE:
592 {
593 SV * check = items == 3 ? ST(2) : Nullsv;
594 RETVAL = &PL_sv_undef;
595 }
596 OUTPUT:
597 RETVAL
598
599SV *
600_from_to(sv, from, to, ...)
601 SV * sv
602 SV * from
603 SV * to
604 CODE:
605 {
606 SV * check = items == 4 ? ST(3) : Nullsv;
607 RETVAL = &PL_sv_undef;
608 }
609 OUTPUT:
610 RETVAL
611
612bool
613_is_utf8(sv, ...)
614 SV * sv
615 CODE:
616 {
617 SV * check = items == 2 ? ST(1) : Nullsv;
618 if (SvPOK(sv)) {
067a85ef 619 RETVAL = SvUTF8(sv) ? 1 : 0;
2c674647 620 if (RETVAL &&
621 SvTRUE(check) &&
622 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
623 RETVAL = FALSE;
624 } else {
625 RETVAL = FALSE;
626 }
627 }
628 OUTPUT:
629 RETVAL
630
631SV *
632_on_utf8(sv)
633 SV * sv
634 CODE:
635 {
636 if (SvPOK(sv)) {
87714904 637 SV *rsv = newSViv(SvUTF8(sv));
2c674647 638 RETVAL = rsv;
639 SvUTF8_on(sv);
640 } else {
641 RETVAL = &PL_sv_undef;
642 }
643 }
644 OUTPUT:
645 RETVAL
646
647SV *
648_off_utf8(sv)
649 SV * sv
650 CODE:
651 {
652 if (SvPOK(sv)) {
87714904 653 SV *rsv = newSViv(SvUTF8(sv));
2c674647 654 RETVAL = rsv;
655 SvUTF8_off(sv);
656 } else {
657 RETVAL = &PL_sv_undef;
658 }
659 }
660 OUTPUT:
661 RETVAL
662
663SV *
664_utf_to_utf(sv, from, to, ...)
665 SV * sv
666 SV * from
667 SV * to
668 CODE:
669 {
670 SV * check = items == 4 ? ST(3) : Nullsv;
671 RETVAL = &PL_sv_undef;
672 }
673 OUTPUT:
674 RETVAL
675
33af2bc7 676BOOT:
677{
678#ifdef USE_PERLIO
679 PerlIO_define_layer(&PerlIO_encode);
680#endif
2f2b4ff2 681#include "iso8859.def"
682#include "EBCDIC.def"
683#include "Symbols.def"
33af2bc7 684}