UTF-X encoding invariance for Encode:
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
2c674647 1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
2f2b4ff2 4#define U8 U8
5#include "encode.h"
6#include "iso8859.h"
7#include "EBCDIC.h"
8#include "Symbols.h"
2c674647 9
67e989fb 10#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
2f5768b8 11 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 12 return (y)0; /* fool picky compilers */ \
87714904 13 }
67e989fb 14UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
16
9df9a5cd 17#if defined(USE_PERLIO) && !defined(USE_SFIO)
72e44f29 18/* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
23
24 The layer works by overloading the "fill" and "flush" methods.
25
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
29
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
33
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
38
39*/
40
41
33af2bc7 42#include "perliol.h"
43
44typedef struct
45{
46 PerlIOBuf base; /* PerlIOBuf stuff */
47 SV * bufsv;
48 SV * enc;
49} PerlIOEncode;
50
51
52IV
53PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len)
54{
55 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
56 dTHX;
57 dSP;
58 IV code;
59 code = PerlIOBuf_pushed(f,mode,Nullch,0);
60 ENTER;
61 SAVETMPS;
62 PUSHMARK(sp);
33af2bc7 63 XPUSHs(sv_2mortal(newSVpvn(arg,len)));
64 PUTBACK;
51ef4e11 65 if (perl_call_pv("Encode::find_encoding",G_SCALAR) != 1)
52744f63 66 {
67 /* should never happen */
51ef4e11 68 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
52744f63 69 return -1;
70 }
33af2bc7 71 SPAGAIN;
72 e->enc = POPs;
73 PUTBACK;
74 if (!SvROK(e->enc))
52744f63 75 {
76 e->enc = Nullsv;
77 errno = EINVAL;
78 Perl_warner(aTHX_ WARN_IO, "Cannot find encoding \"%.*s\"", (int) len, arg);
79 return -1;
80 }
33af2bc7 81 SvREFCNT_inc(e->enc);
82 FREETMPS;
83 LEAVE;
84 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
85 return code;
86}
87
88IV
89PerlIOEncode_popped(PerlIO *f)
90{
91 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
92 dTHX;
93 if (e->enc)
94 {
95 SvREFCNT_dec(e->enc);
96 e->enc = Nullsv;
97 }
98 if (e->bufsv)
99 {
100 SvREFCNT_dec(e->bufsv);
101 e->bufsv = Nullsv;
102 }
103 return 0;
104}
105
106STDCHAR *
107PerlIOEncode_get_base(PerlIO *f)
108{
109 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
110 dTHX;
111 if (!e->base.bufsiz)
112 e->base.bufsiz = 1024;
113 if (!e->bufsv)
114 {
115 e->bufsv = newSV(e->base.bufsiz);
116 sv_setpvn(e->bufsv,"",0);
117 }
62e8870c 118 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
33af2bc7 119 if (!e->base.ptr)
120 e->base.ptr = e->base.buf;
121 if (!e->base.end)
122 e->base.end = e->base.buf;
123 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
124 {
125 Perl_warn(aTHX_ " ptr %p(%p)%p",
126 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
127 abort();
128 }
129 if (SvLEN(e->bufsv) < e->base.bufsiz)
130 {
131 SSize_t poff = e->base.ptr - e->base.buf;
132 SSize_t eoff = e->base.end - e->base.buf;
62e8870c 133 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
33af2bc7 134 e->base.ptr = e->base.buf + poff;
135 e->base.end = e->base.buf + eoff;
136 }
137 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
138 {
139 Perl_warn(aTHX_ " ptr %p(%p)%p",
140 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
141 abort();
142 }
143 return e->base.buf;
144}
145
33af2bc7 146IV
147PerlIOEncode_fill(PerlIO *f)
148{
149 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
150 dTHX;
151 dSP;
152 IV code;
33af2bc7 153 code = PerlIOBuf_fill(f);
154 if (code == 0)
155 {
156 SV *uni;
72e44f29 157 STRLEN len;
158 char *s;
159 /* Set SV that is the buffer to be buf..ptr */
33af2bc7 160 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
161 SvUTF8_off(e->bufsv);
162 ENTER;
163 SAVETMPS;
164 PUSHMARK(sp);
165 XPUSHs(e->enc);
166 XPUSHs(e->bufsv);
167 XPUSHs(&PL_sv_yes);
168 PUTBACK;
50d26985 169 if (perl_call_method("decode",G_SCALAR) != 1)
33af2bc7 170 code = -1;
171 SPAGAIN;
172 uni = POPs;
173 PUTBACK;
72e44f29 174 /* Now get translated string (forced to UTF-8) and copy back to buffer
175 don't use sv_setsv as that may "steal" PV from returned temp
176 and so free() our known-large-enough buffer.
177 sv_setpvn() should do but let us do it long hand.
178 */
179 s = SvPVutf8(uni,len);
180 if (s != SvPVX(e->bufsv))
181 {
62e8870c 182 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 183 Move(s,e->base.buf,len,char);
184 SvCUR_set(e->bufsv,len);
185 }
186 SvUTF8_on(e->bufsv);
187 e->base.end = e->base.buf+len;
33af2bc7 188 e->base.ptr = e->base.buf;
189 FREETMPS;
190 LEAVE;
191 }
192 return code;
193}
194
195IV
196PerlIOEncode_flush(PerlIO *f)
197{
198 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
199 IV code = 0;
200 dTHX;
201 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
202 {
203 dSP;
204 SV *str;
205 char *s;
206 STRLEN len;
72e44f29 207 SSize_t left = 0;
208 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
209 {
210 /* This is really just a flag to see if we took all the data, if
211 we did PerlIOBase_flush avoids a seek to lower layer.
212 Need to revisit if we start getting clever with unreads or seeks-in-buffer
213 */
214 left = e->base.end - e->base.ptr;
215 }
33af2bc7 216 ENTER;
217 SAVETMPS;
218 PUSHMARK(sp);
219 XPUSHs(e->enc);
72e44f29 220 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
33af2bc7 221 SvUTF8_on(e->bufsv);
222 XPUSHs(e->bufsv);
223 XPUSHs(&PL_sv_yes);
224 PUTBACK;
50d26985 225 if (perl_call_method("encode",G_SCALAR) != 1)
33af2bc7 226 code = -1;
227 SPAGAIN;
228 str = POPs;
229 PUTBACK;
72e44f29 230 s = SvPV(str,len);
231 if (s != SvPVX(e->bufsv))
232 {
62e8870c 233 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29 234 Move(s,e->base.buf,len,char);
235 SvCUR_set(e->bufsv,len);
236 }
33af2bc7 237 SvUTF8_off(e->bufsv);
72e44f29 238 e->base.ptr = e->base.buf+len;
239 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
240 e->base.end = e->base.ptr + left;
33af2bc7 241 FREETMPS;
242 LEAVE;
243 if (PerlIOBuf_flush(f) != 0)
244 code = -1;
245 }
246 return code;
247}
248
249IV
250PerlIOEncode_close(PerlIO *f)
251{
252 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
253 IV code = PerlIOBase_close(f);
254 dTHX;
255 if (e->bufsv)
256 {
257 SvREFCNT_dec(e->bufsv);
258 e->bufsv = Nullsv;
259 }
260 e->base.buf = NULL;
261 e->base.ptr = NULL;
262 e->base.end = NULL;
263 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
264 return code;
265}
266
72e44f29 267Off_t
268PerlIOEncode_tell(PerlIO *f)
269{
270 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
271 /* Unfortunately the only way to get a postion is to back-translate,
272 the UTF8-bytes we have buf..ptr and adjust accordingly.
273 But we will try and save any unread data in case stream
274 is un-seekable.
275 */
276 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
277 {
278 Size_t count = b->end - b->ptr;
279 PerlIO_push(f,&PerlIO_pending,"r",Nullch,0);
280 /* Save what we have left to read */
281 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
282 PerlIO_unread(f,b->ptr,count);
283 /* There isn't any unread data - we just saved it - so avoid the lower seek */
284 b->end = b->ptr;
285 /* Flush ourselves - now one layer down,
286 this does the back translate and adjusts position
287 */
288 PerlIO_flush(PerlIONext(f));
289 /* Set position of the saved data */
290 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
291 }
292 else
293 {
294 PerlIO_flush(f);
295 }
296 return b->posn;
297}
298
33af2bc7 299PerlIO_funcs PerlIO_encode = {
72e44f29 300 "encoding",
33af2bc7 301 sizeof(PerlIOEncode),
302 PERLIO_K_BUFFERED,
303 PerlIOBase_fileno,
304 PerlIOBuf_fdopen,
305 PerlIOBuf_open,
306 PerlIOBuf_reopen,
307 PerlIOEncode_pushed,
308 PerlIOEncode_popped,
309 PerlIOBuf_read,
310 PerlIOBuf_unread,
311 PerlIOBuf_write,
312 PerlIOBuf_seek,
72e44f29 313 PerlIOEncode_tell,
33af2bc7 314 PerlIOEncode_close,
315 PerlIOEncode_flush,
316 PerlIOEncode_fill,
317 PerlIOBase_eof,
318 PerlIOBase_error,
319 PerlIOBase_clearerr,
320 PerlIOBuf_setlinebuf,
321 PerlIOEncode_get_base,
322 PerlIOBuf_bufsiz,
323 PerlIOBuf_get_ptr,
324 PerlIOBuf_get_cnt,
325 PerlIOBuf_set_ptrcnt,
326};
9df9a5cd 327#endif /* encode layer */
33af2bc7 328
2f2b4ff2 329void
330Encode_Define(pTHX_ encode_t *enc)
331{
51ef4e11 332 dSP;
2f2b4ff2 333 HV *stash = gv_stashpv("Encode::XS", TRUE);
334 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
dcda1f94 335 int i = 0;
51ef4e11 336 PUSHMARK(sp);
337 XPUSHs(sv);
dcda1f94 338 while (enc->name[i])
339 {
340 const char *name = enc->name[i++];
51ef4e11 341 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
dcda1f94 342 }
51ef4e11 343 PUTBACK;
344 call_pv("Encode::define_encoding",G_DISCARD);
dcda1f94 345 SvREFCNT_dec(sv);
2f2b4ff2 346}
347
183a2d84 348void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
67e989fb 349
2f2b4ff2 350static SV *
351encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
352{
353 STRLEN slen;
354 U8 *s = (U8 *) SvPV(src,slen);
355 SV *dst = sv_2mortal(newSV(2*slen+1));
356 if (slen)
357 {
358 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
359 STRLEN dlen = SvLEN(dst);
360 int code;
9b37254d 361 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
2f2b4ff2 362 {
363 SvCUR_set(dst,dlen);
364 SvPOK_on(dst);
9b37254d 365
366 if (code == ENCODE_FALLBACK)
367 break;
368
2f2b4ff2 369 switch(code)
370 {
371 case ENCODE_NOSPACE:
372 {
373 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
374 if (need <= SvLEN(dst))
375 need += UTF8_MAXLEN;
376 d = (U8 *) SvGROW(dst, need);
377 dlen = SvLEN(dst);
378 slen = SvCUR(src);
379 break;
380 }
381
382 case ENCODE_NOREP:
383 if (dir == enc->f_utf8)
384 {
385 if (!check && ckWARN_d(WARN_UTF8))
386 {
387 STRLEN clen;
9041c2e3 388 UV ch = utf8n_to_uvuni(s+slen,(SvCUR(src)-slen),&clen,0);
389 Perl_warner(aTHX_ WARN_UTF8, "\"\\N{U+%"UVxf"}\" does not map to %s", ch, enc->name[0]);
2f2b4ff2 390 /* FIXME: Skip over the character, copy in replacement and continue
391 * but that is messy so for now just fail.
392 */
393 return &PL_sv_undef;
394 }
395 else
396 {
397 return &PL_sv_undef;
398 }
399 }
400 else
401 {
402 /* UTF-8 is supposed to be "Universal" so should not happen */
403 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
dcda1f94 404 enc->name[0], (int)(SvCUR(src)-slen),s+slen);
2f2b4ff2 405 }
406 break;
407
408 case ENCODE_PARTIAL:
409 if (!check && ckWARN_d(WARN_UTF8))
410 {
411 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
dcda1f94 412 (dir == enc->f_utf8) ? "UTF-8" : enc->name[0]);
2f2b4ff2 413 }
414 return &PL_sv_undef;
415
416 default:
417 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
dcda1f94 418 code, (dir == enc->f_utf8) ? "to" : "from",enc->name[0]);
2f2b4ff2 419 return &PL_sv_undef;
420 }
421 }
422 SvCUR_set(dst,dlen);
423 SvPOK_on(dst);
424 if (check)
425 {
426 if (slen < SvCUR(src))
427 {
428 Move(s+slen,s,SvCUR(src)-slen,U8);
429 }
430 SvCUR_set(src,SvCUR(src)-slen);
431 }
432 }
433 return dst;
434}
435
50d26985 436MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
2f2b4ff2 437
438PROTOTYPES: ENABLE
439
440void
50d26985 441Method_decode(obj,src,check = 0)
2f2b4ff2 442SV * obj
443SV * src
444int check
445CODE:
446 {
447 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
448 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
449 SvUTF8_on(ST(0));
450 XSRETURN(1);
451 }
452
453void
50d26985 454Method_encode(obj,src,check = 0)
2f2b4ff2 455SV * obj
456SV * src
457int check
458CODE:
459 {
460 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
461 sv_utf8_upgrade(src);
462 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
463 XSRETURN(1);
464 }
465
67e989fb 466MODULE = Encode PACKAGE = Encode
2c674647 467
468PROTOTYPES: ENABLE
469
67e989fb 470I32
2c674647 471_bytes_to_utf8(sv, ...)
67e989fb 472 SV * sv
2c674647 473 CODE:
67e989fb 474 {
475 SV * encoding = items == 2 ? ST(1) : Nullsv;
476
477 if (encoding)
478 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
479 else {
480 STRLEN len;
183a2d84 481 U8* s = (U8*)SvPV(sv, len);
67e989fb 482 U8* converted;
483
484 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 485 sv_setpvn(sv, (char *)converted, len);
67e989fb 486 SvUTF8_on(sv); /* XXX Should we? */
487 Safefree(converted); /* ... so free it */
488 RETVAL = len;
489 }
490 }
2c674647 491 OUTPUT:
67e989fb 492 RETVAL
2c674647 493
67e989fb 494I32
2c674647 495_utf8_to_bytes(sv, ...)
67e989fb 496 SV * sv
2c674647 497 CODE:
67e989fb 498 {
499 SV * to = items > 1 ? ST(1) : Nullsv;
500 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 501
67e989fb 502 if (to)
503 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
504 else {
67e989fb 505 STRLEN len;
b113ac0e 506 U8 *s = (U8*)SvPV(sv, len);
67e989fb 507
508 if (SvTRUE(check)) {
509 /* Must do things the slow way */
510 U8 *dest;
87714904 511 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 512 U8 *send = s + len;
513
514 New(83, dest, len, U8); /* I think */
515
516 while (s < send) {
517 if (*s < 0x80)
518 *dest++ = *s++;
519 else {
b113ac0e 520 STRLEN ulen;
521 UV uv = *s++;
87714904 522
67e989fb 523 /* Have to do it all ourselves because of error routine,
524 aargh. */
525 if (!(uv & 0x40))
526 goto failure;
527 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
528 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
529 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
530 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
531 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
532 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
533 else { ulen = 13; uv = 0; }
87714904 534
67e989fb 535 /* Note change to utf8.c variable naming, for variety */
536 while (ulen--) {
537 if ((*s & 0xc0) != 0x80)
538 goto failure;
87714904 539
67e989fb 540 else
541 uv = (uv << 6) | (*s++ & 0x3f);
87714904 542 }
67e989fb 543 if (uv > 256) {
544 failure:
545 call_failure(check, s, dest, src);
546 /* Now what happens? */
547 }
548 *dest++ = (U8)uv;
549 }
550 }
551 } else
552 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
553 }
2c674647 554 }
555 OUTPUT:
556 RETVAL
557
2c674647 558bool
4411f3b6 559is_utf8(sv, check = FALSE)
560SV * sv
561bool check
2c674647 562 CODE:
563 {
2c674647 564 if (SvPOK(sv)) {
4411f3b6 565 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
2c674647 566 if (RETVAL &&
4411f3b6 567 check &&
2c674647 568 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
569 RETVAL = FALSE;
570 } else {
571 RETVAL = FALSE;
572 }
573 }
574 OUTPUT:
575 RETVAL
576
577SV *
4411f3b6 578_utf8_on(sv)
2c674647 579 SV * sv
580 CODE:
581 {
582 if (SvPOK(sv)) {
87714904 583 SV *rsv = newSViv(SvUTF8(sv));
2c674647 584 RETVAL = rsv;
585 SvUTF8_on(sv);
586 } else {
587 RETVAL = &PL_sv_undef;
588 }
589 }
590 OUTPUT:
591 RETVAL
592
593SV *
4411f3b6 594_utf8_off(sv)
2c674647 595 SV * sv
596 CODE:
597 {
598 if (SvPOK(sv)) {
87714904 599 SV *rsv = newSViv(SvUTF8(sv));
2c674647 600 RETVAL = rsv;
601 SvUTF8_off(sv);
602 } else {
603 RETVAL = &PL_sv_undef;
604 }
605 }
606 OUTPUT:
607 RETVAL
608
33af2bc7 609BOOT:
610{
6a59c517 611#if defined(USE_PERLIO) && !defined(USE_SFIO)
33af2bc7 612 PerlIO_define_layer(&PerlIO_encode);
613#endif
2f2b4ff2 614#include "iso8859.def"
615#include "EBCDIC.def"
616#include "Symbols.def"
33af2bc7 617}