Avoid expensive paranoid check that decoder does return legal UTF-8
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
fcf2db38 1#define PERL_NO_GET_CONTEXT
2c674647 2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
2f2b4ff2 5#define U8 U8
6#include "encode.h"
071db25d 7#include "def_t.h"
fcf2db38 8
fcb875d4 9#define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
a999c27c 10 t/encoding.t dumps core because of
11 Perl_warner and PerlIO don't work well */
39cf9a5e 12
13#define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate
a999c27c 14 buffer size for encode_method().
15 1 is recommended. 2 restores NI-S original */
39cf9a5e 16
fcf2db38 17#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
2f5768b8 18 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 19 return (y)0; /* fool picky compilers */ \
87714904 20 }
67e989fb 21UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
aa0053b7 22 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
9df9a5cd 23#if defined(USE_PERLIO) && !defined(USE_SFIO)
72e44f29 24/* Define an encoding "layer" in the perliol.h sense.
25 The layer defined here "inherits" in an object-oriented sense from the
26 "perlio" layer with its PerlIOBuf_* "methods".
27 The implementation is particularly efficient as until Encode settles down
28 there is no point in tryint to tune it.
29
30 The layer works by overloading the "fill" and "flush" methods.
31
32 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
33 to convert the encoded data to UTF-8 form, then copies it back to the
34 buffer. The "base class's" read methods then see the UTF-8 data.
35
36 "flush" transforms the UTF-8 data deposited by the "base class's write
37 method in the buffer back into the encoded form using the encode OO perl API,
38 then copies data back into the buffer and calls "SUPER::flush.
39
40 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
41 so that the the base class's "flush" sees the correct number of encoded chars
42 for positioning the seek pointer. (This double translation is the worst performance
43 issue - particularly with all-perl encode engine.)
44
45*/
33af2bc7 46#include "perliol.h"
aa0053b7 47typedef struct {
48 PerlIOBuf base; /* PerlIOBuf stuff */
0b3236bb 49 SV *bufsv; /* buffer seen by layers above */
50 SV *dataSV; /* data we have read from layer below */
51 SV *enc; /* the encoding object */
33af2bc7 52} PerlIOEncode;
53
e3f3bf95 54SV *
aa0053b7 55PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags)
e3f3bf95 56{
aa0053b7 57 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
58 SV *sv = &PL_sv_undef;
59 if (e->enc) {
60 dSP;
61 ENTER;
62 SAVETMPS;
63 PUSHMARK(sp);
64 XPUSHs(e->enc);
65 PUTBACK;
66 if (perl_call_method("name", G_SCALAR) == 1) {
67 SPAGAIN;
68 sv = newSVsv(POPs);
69 PUTBACK;
70 }
e3f3bf95 71 }
aa0053b7 72 return sv;
e3f3bf95 73}
33af2bc7 74
75IV
aa0053b7 76PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg)
33af2bc7 77{
aa0053b7 78 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
79 dSP;
80 IV code;
81 code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv);
82 ENTER;
83 SAVETMPS;
84 PUSHMARK(sp);
85 XPUSHs(arg);
86 PUTBACK;
87 if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) {
88 /* should never happen */
89 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
90 return -1;
91 }
92 SPAGAIN;
93 e->enc = POPs;
94 PUTBACK;
95 if (!SvROK(e->enc)) {
96 e->enc = Nullsv;
97 errno = EINVAL;
f98bc0c6 98 Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"",
aa0053b7 99 arg);
100 code = -1;
101 }
102 else {
103 SvREFCNT_inc(e->enc);
104 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
105 }
106 FREETMPS;
107 LEAVE;
108 return code;
33af2bc7 109}
110
111IV
aa0053b7 112PerlIOEncode_popped(pTHX_ PerlIO * f)
33af2bc7 113{
aa0053b7 114 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
115 if (e->enc) {
116 SvREFCNT_dec(e->enc);
117 e->enc = Nullsv;
118 }
119 if (e->bufsv) {
120 SvREFCNT_dec(e->bufsv);
121 e->bufsv = Nullsv;
122 }
0b3236bb 123 if (e->dataSV) {
124 SvREFCNT_dec(e->dataSV);
284ee456 125 e->dataSV = Nullsv;
0b3236bb 126 }
aa0053b7 127 return 0;
33af2bc7 128}
129
130STDCHAR *
aa0053b7 131PerlIOEncode_get_base(pTHX_ PerlIO * f)
33af2bc7 132{
aa0053b7 133 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
134 if (!e->base.bufsiz)
135 e->base.bufsiz = 1024;
136 if (!e->bufsv) {
137 e->bufsv = newSV(e->base.bufsiz);
138 sv_setpvn(e->bufsv, "", 0);
139 }
140 e->base.buf = (STDCHAR *) SvPVX(e->bufsv);
141 if (!e->base.ptr)
142 e->base.ptr = e->base.buf;
143 if (!e->base.end)
144 e->base.end = e->base.buf;
145 if (e->base.ptr < e->base.buf
146 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
147 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
148 e->base.buf + SvLEN(e->bufsv));
149 abort();
150 }
151 if (SvLEN(e->bufsv) < e->base.bufsiz) {
152 SSize_t poff = e->base.ptr - e->base.buf;
153 SSize_t eoff = e->base.end - e->base.buf;
154 e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz);
155 e->base.ptr = e->base.buf + poff;
156 e->base.end = e->base.buf + eoff;
157 }
158 if (e->base.ptr < e->base.buf
159 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
160 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
161 e->base.buf + SvLEN(e->bufsv));
162 abort();
163 }
164 return e->base.buf;
33af2bc7 165}
166
33af2bc7 167IV
aa0053b7 168PerlIOEncode_fill(pTHX_ PerlIO * f)
33af2bc7 169{
aa0053b7 170 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
171 dSP;
0b3236bb 172 IV code = 0;
173 PerlIO *n;
174 SSize_t avail;
175 if (PerlIO_flush(f) != 0)
176 return -1;
177 n = PerlIONext(f);
178 if (!PerlIO_fast_gets(n)) {
179 /* Things get too messy if we don't have a buffer layer
180 push a :perlio to do the job */
181 char mode[8];
182 n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv);
183 if (!n) {
184 Perl_die(aTHX_ "panic: cannot push :perlio for %p",f);
185 }
186 }
187 ENTER;
188 SAVETMPS;
189 retry:
190 avail = PerlIO_get_cnt(n);
191 if (avail <= 0) {
192 avail = PerlIO_fill(n);
193 if (avail == 0) {
194 avail = PerlIO_get_cnt(n);
195 }
196 else {
197 if (!PerlIO_error(n) && PerlIO_eof(n))
198 avail = 0;
199 }
200 }
201 if (avail > 0) {
202 STDCHAR *ptr = PerlIO_get_ptr(n);
203 SSize_t use = avail;
aa0053b7 204 SV *uni;
aa0053b7 205 char *s;
0b3236bb 206 STRLEN len = 0;
207 e->base.ptr = e->base.end = (STDCHAR *) Nullch;
208 (void) PerlIOEncode_get_base(aTHX_ f);
209 if (!e->dataSV)
210 e->dataSV = newSV(0);
211 if (SvTYPE(e->dataSV) < SVt_PV) {
212 sv_upgrade(e->dataSV,SVt_PV);
213 }
214 if (SvCUR(e->dataSV)) {
215 /* something left over from last time - create a normal
216 SV with new data appended
217 */
218 if (use + SvCUR(e->dataSV) > e->base.bufsiz) {
219 use = e->base.bufsiz - SvCUR(e->dataSV);
220 }
9dbbe389 221 sv_catpvn(e->dataSV,(char*)ptr,use);
0b3236bb 222 }
223 else {
224 /* Create a "dummy" SV to represent the available data from layer below */
225 if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) {
226 Safefree(SvPVX(e->dataSV));
227 }
228 if (use > e->base.bufsiz) {
229 use = e->base.bufsiz;
230 }
231 SvPVX(e->dataSV) = (char *) ptr;
232 SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */
233 SvCUR_set(e->dataSV,use);
284ee456 234 SvPOK_only(e->dataSV);
0b3236bb 235 }
236 SvUTF8_off(e->dataSV);
aa0053b7 237 PUSHMARK(sp);
238 XPUSHs(e->enc);
0b3236bb 239 XPUSHs(e->dataSV);
aa0053b7 240 XPUSHs(&PL_sv_yes);
241 PUTBACK;
0b3236bb 242 if (perl_call_method("decode", G_SCALAR) != 1) {
243 Perl_die(aTHX_ "panic: decode did not return a value");
244 }
aa0053b7 245 SPAGAIN;
246 uni = POPs;
247 PUTBACK;
0b3236bb 248 /* Now get translated string (forced to UTF-8) and use as buffer */
249 if (SvPOK(uni)) {
250 s = SvPVutf8(uni, len);
3c49ab08 251#ifdef PARANOID_ENCODE_CHECKS
85f7ebdf 252 if (len && !is_utf8_string((U8*)s,len)) {
0b3236bb 253 Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s);
254 }
3c49ab08 255#endif
0b3236bb 256 }
257 if (len > 0) {
258 /* Got _something */
259 /* if decode gave us back dataSV then data may vanish when
260 we do ptrcnt adjust - so take our copy now.
261 (The copy is a pain - need a put-it-here option for decode.)
262 */
263 sv_setpvn(e->bufsv,s,len);
284ee456 264 e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv);
0b3236bb 265 e->base.end = e->base.ptr + SvCUR(e->bufsv);
266 PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
267 SvUTF8_on(e->bufsv);
268
269 /* Adjust ptr/cnt not taking anything which
270 did not translate - not clear this is a win */
271 /* compute amount we took */
272 use -= SvCUR(e->dataSV);
273 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
274 /* and as we did not take it it isn't pending */
275 SvCUR_set(e->dataSV,0);
276 } else {
277 /* Got nothing - assume partial character so we need some more */
278 /* Make sure e->dataSV is a normal SV before re-filling as
279 buffer alias will change under us
280 */
281 s = SvPV(e->dataSV,len);
282 sv_setpvn(e->dataSV,s,len);
283 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
284 goto retry;
aa0053b7 285 }
aa0053b7 286 FREETMPS;
287 LEAVE;
0b3236bb 288 return code;
289 }
290 else {
291 if (avail == 0)
292 PerlIOBase(f)->flags |= PERLIO_F_EOF;
293 else
294 PerlIOBase(f)->flags |= PERLIO_F_ERROR;
295 return -1;
72e44f29 296 }
33af2bc7 297}
298
299IV
aa0053b7 300PerlIOEncode_flush(pTHX_ PerlIO * f)
33af2bc7 301{
aa0053b7 302 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
303 IV code = 0;
0b3236bb 304 if (e->bufsv && (e->base.ptr > e->base.buf)) {
aa0053b7 305 dSP;
306 SV *str;
307 char *s;
308 STRLEN len;
0b3236bb 309 SSize_t count = 0;
310 if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) {
311 /* Write case encode the buffer and write() to layer below */
312 ENTER;
313 SAVETMPS;
314 PUSHMARK(sp);
315 XPUSHs(e->enc);
316 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
317 SvUTF8_on(e->bufsv);
0b3236bb 318 XPUSHs(e->bufsv);
319 XPUSHs(&PL_sv_yes);
320 PUTBACK;
8bbd9480 321 if (perl_call_method("encode", G_SCALAR) != 1) {
322 Perl_die(aTHX_ "panic: encode did not return a value");
323 }
0b3236bb 324 SPAGAIN;
325 str = POPs;
326 PUTBACK;
327 s = SvPV(str, len);
328 count = PerlIO_write(PerlIONext(f),s,len);
329 if (count != len) {
330 code = -1;
331 }
332 FREETMPS;
333 LEAVE;
334 if (PerlIO_flush(PerlIONext(f)) != 0) {
335 code = -1;
336 }
25f7d9d3 337 if (SvCUR(e->bufsv)) {
338 /* Did not all translate */
339 e->base.ptr = e->base.buf+SvCUR(e->bufsv);
340 return code;
341 }
aa0053b7 342 }
0b3236bb 343 else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
344 /* read case */
345 /* if we have any untranslated stuff then unread that first */
346 if (e->dataSV && SvCUR(e->dataSV)) {
347 s = SvPV(e->dataSV, len);
348 count = PerlIO_unread(PerlIONext(f),s,len);
349 if (count != len) {
350 code = -1;
351 }
352 }
353 /* See if there is anything left in the buffer */
354 if (e->base.ptr < e->base.end) {
355 /* Bother - have unread data.
356 re-encode and unread() to layer below
357 */
358 ENTER;
359 SAVETMPS;
360 str = sv_newmortal();
361 sv_upgrade(str, SVt_PV);
9dbbe389 362 SvPVX(str) = (char*)e->base.ptr;
0b3236bb 363 SvLEN(str) = 0;
364 SvCUR_set(str, e->base.end - e->base.ptr);
284ee456 365 SvPOK_only(str);
0b3236bb 366 SvUTF8_on(str);
367 PUSHMARK(sp);
368 XPUSHs(e->enc);
369 XPUSHs(str);
370 XPUSHs(&PL_sv_yes);
371 PUTBACK;
8bbd9480 372 if (perl_call_method("encode", G_SCALAR) != 1) {
373 Perl_die(aTHX_ "panic: encode did not return a value");
374 }
0b3236bb 375 SPAGAIN;
376 str = POPs;
377 PUTBACK;
378 s = SvPV(str, len);
379 count = PerlIO_unread(PerlIONext(f),s,len);
380 if (count != len) {
381 code = -1;
382 }
383 FREETMPS;
384 LEAVE;
385 }
aa0053b7 386 }
0b3236bb 387 e->base.ptr = e->base.end = e->base.buf;
388 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
72e44f29 389 }
aa0053b7 390 return code;
33af2bc7 391}
392
393IV
aa0053b7 394PerlIOEncode_close(pTHX_ PerlIO * f)
33af2bc7 395{
aa0053b7 396 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
397 IV code = PerlIOBase_close(aTHX_ f);
398 if (e->bufsv) {
25f7d9d3 399 if (e->base.buf && e->base.ptr > e->base.buf) {
400 Perl_croak(aTHX_ "Close with partial character");
401 }
aa0053b7 402 SvREFCNT_dec(e->bufsv);
403 e->bufsv = Nullsv;
404 }
405 e->base.buf = NULL;
406 e->base.ptr = NULL;
407 e->base.end = NULL;
408 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
409 return code;
33af2bc7 410}
411
72e44f29 412Off_t
aa0053b7 413PerlIOEncode_tell(pTHX_ PerlIO * f)
72e44f29 414{
aa0053b7 415 PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
0b3236bb 416 /* Unfortunately the only way to get a postion is to (re-)translate,
417 the UTF8 we have in bufefr and then ask layer below
aa0053b7 418 */
0b3236bb 419 PerlIO_flush(f);
25f7d9d3 420 if (b->buf && b->ptr > b->buf) {
421 Perl_croak(aTHX_ "Cannot tell at partial character");
422 }
0b3236bb 423 return PerlIO_tell(PerlIONext(f));
72e44f29 424}
425
8cf8f3d1 426PerlIO *
aa0053b7 427PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o,
428 CLONE_PARAMS * params, int flags)
8cf8f3d1 429{
aa0053b7 430 if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) {
431 PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode);
432 PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode);
433 if (oe->enc) {
434 fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params);
435 }
9f16d962 436 }
aa0053b7 437 return f;
8cf8f3d1 438}
439
33af2bc7 440PerlIO_funcs PerlIO_encode = {
aa0053b7 441 "encoding",
442 sizeof(PerlIOEncode),
284ee456 443 PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT,
aa0053b7 444 PerlIOEncode_pushed,
445 PerlIOEncode_popped,
446 PerlIOBuf_open,
447 PerlIOEncode_getarg,
448 PerlIOBase_fileno,
449 PerlIOEncode_dup,
450 PerlIOBuf_read,
451 PerlIOBuf_unread,
452 PerlIOBuf_write,
453 PerlIOBuf_seek,
454 PerlIOEncode_tell,
455 PerlIOEncode_close,
456 PerlIOEncode_flush,
457 PerlIOEncode_fill,
458 PerlIOBase_eof,
459 PerlIOBase_error,
460 PerlIOBase_clearerr,
461 PerlIOBase_setlinebuf,
462 PerlIOEncode_get_base,
463 PerlIOBuf_bufsiz,
464 PerlIOBuf_get_ptr,
465 PerlIOBuf_get_cnt,
466 PerlIOBuf_set_ptrcnt,
33af2bc7 467};
aa0053b7 468#endif /* encode layer */
33af2bc7 469
2f2b4ff2 470void
aa0053b7 471Encode_XSEncoding(pTHX_ encode_t * enc)
2f2b4ff2 472{
aa0053b7 473 dSP;
474 HV *stash = gv_stashpv("Encode::XS", TRUE);
475 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
476 int i = 0;
477 PUSHMARK(sp);
478 XPUSHs(sv);
479 while (enc->name[i]) {
480 const char *name = enc->name[i++];
481 XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
482 }
483 PUTBACK;
484 call_pv("Encode::define_encoding", G_DISCARD);
485 SvREFCNT_dec(sv);
2f2b4ff2 486}
487
aa0053b7 488void
489call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
490{
25f7d9d3 491 /* Exists for breakpointing */
aa0053b7 492}
67e989fb 493
2f2b4ff2 494static SV *
aa0053b7 495encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
496 int check)
2f2b4ff2 497{
aa0053b7 498 STRLEN slen;
499 U8 *s = (U8 *) SvPV(src, slen);
3aececda 500 STRLEN tlen = slen;
501 STRLEN ddone = 0;
502 STRLEN sdone = 0;
39cf9a5e 503
3c49ab08 504 /* We allocate slen+1.
a999c27c 505 PerlIO dumps core if this value is smaller than this. */
3c49ab08 506 SV *dst = sv_2mortal(newSV(slen+1));
aa0053b7 507 if (slen) {
0b3236bb 508 U8 *d = (U8 *) SvPVX(dst);
509 STRLEN dlen = SvLEN(dst)-1;
aa0053b7 510 int code;
511 while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
3aececda 512 SvCUR_set(dst, dlen+ddone);
284ee456 513 SvPOK_only(dst);
9b37254d 514
39cf9a5e 515#if ENCODE_XS_PROFILE >= 3
516 Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1);
0b3236bb 517#endif
518
519 if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
aa0053b7 520 break;
9b37254d 521
aa0053b7 522 switch (code) {
523 case ENCODE_NOSPACE:
a999c27c 524 {
fcb875d4 525 STRLEN more = 0; /* make sure you initialize! */
526 STRLEN sleft;
3aececda 527 sdone += slen;
528 ddone += dlen;
39cf9a5e 529 sleft = tlen - sdone;
fcb875d4 530#if ENCODE_XS_PROFILE >= 2
3c49ab08 531 Perl_warn(aTHX_
fcb875d4 532 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
533 more, sdone, sleft, SvLEN(dst));
534#endif
535 if (sdone != 0) { /* has src ever been processed ? */
39cf9a5e 536#if ENCODE_XS_USEFP == 2
a999c27c 537 more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
538 - SvLEN(dst);
39cf9a5e 539#elif ENCODE_XS_USEFP
a999c27c 540 more = (1.0*SvLEN(dst)+1)/sdone * sleft;
39cf9a5e 541#else
a999c27c 542 /* safe until SvLEN(dst) == MAX_INT/16 */
543 more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
c98ca92f 544#endif
0b3236bb 545 }
39cf9a5e 546 more += UTF8_MAXLEN; /* insurance policy */
547#if ENCODE_XS_PROFILE >= 2
3c49ab08 548 Perl_warn(aTHX_
a999c27c 549 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
550 more, sdone, sleft, SvLEN(dst));
39cf9a5e 551#endif
552 d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
553 /* dst need to grow need MORE bytes! */
3aececda 554 if (ddone >= SvLEN(dst)) {
555 Perl_croak(aTHX_ "Destination couldn't be grown.");
aa0053b7 556 }
3aececda 557 dlen = SvLEN(dst)-ddone-1;
558 d += ddone;
559 s += slen;
560 slen = tlen-sdone;
561 continue;
39cf9a5e 562 }
2f2b4ff2 563
aa0053b7 564 case ENCODE_NOREP:
565 if (dir == enc->f_utf8) {
566 if (!check && ckWARN_d(WARN_UTF8)) {
567 STRLEN clen;
568 UV ch =
569 utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
570 &clen, 0);
f98bc0c6 571 Perl_warner(aTHX_ packWARN(WARN_UTF8),
aa0053b7 572 "\"\\N{U+%" UVxf
573 "}\" does not map to %s", ch,
574 enc->name[0]);
575 /* FIXME: Skip over the character, copy in replacement and continue
576 * but that is messy so for now just fail.
577 */
578 return &PL_sv_undef;
579 }
580 else {
581 return &PL_sv_undef;
582 }
583 }
584 else {
d0ee0454 585 /* UTF-8 is supposed to be "Universal" so should not happen
586 for real characters, but some encodings have non-assigned
587 codes which may occur.
588 */
ee981de6 589 Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)",
590 enc->name[0], (U8) s[slen], code);
aa0053b7 591 }
592 break;
2f2b4ff2 593
aa0053b7 594 default:
595 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
596 code, (dir == enc->f_utf8) ? "to" : "from",
597 enc->name[0]);
598 return &PL_sv_undef;
599 }
600 }
3aececda 601 SvCUR_set(dst, dlen+ddone);
284ee456 602 SvPOK_only(dst);
aa0053b7 603 if (check) {
3aececda 604 sdone = SvCUR(src) - (slen+sdone);
605 if (sdone) {
f54fca96 606#if 1
607 /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly
608 SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0
609 type SVs and sv_clear() calls it ...
610 */
64ffdd5e 611 sv_setpvn(src, (char*)s+slen, sdone);
f54fca96 612#else
3aececda 613 Move(s + slen, SvPVX(src), sdone , U8);
f54fca96 614#endif
aa0053b7 615 }
3aececda 616 SvCUR_set(src, sdone);
aa0053b7 617 }
2f2b4ff2 618 }
aa0053b7 619 else {
0b3236bb 620 SvCUR_set(dst, 0);
284ee456 621 SvPOK_only(dst);
2f2b4ff2 622 }
39cf9a5e 623#if ENCODE_XS_PROFILE
624 if (SvCUR(dst) > SvCUR(src)){
3c49ab08 625 Perl_warn(aTHX_
a999c27c 626 "SvLEN(dst)=%d, SvCUR(dst)=%d. "
627 "%d bytes unused(%f %%)\n",
3c49ab08 628 SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
a999c27c 629 (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
3c49ab08 630
39cf9a5e 631 }
3c49ab08 632#endif
0b3236bb 633 *SvEND(dst) = '\0';
aa0053b7 634 return dst;
2f2b4ff2 635}
636
50d26985 637MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
2f2b4ff2 638
639PROTOTYPES: ENABLE
640
641void
0a95303c 642Method_name(obj)
643SV * obj
644CODE:
645 {
646 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
647 ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
648 XSRETURN(1);
649 }
650
651void
691638dd 652Method_decode(obj,src,check = FALSE)
2f2b4ff2 653SV * obj
654SV * src
691638dd 655bool check
2f2b4ff2 656CODE:
657 {
658 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
659 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
660 SvUTF8_on(ST(0));
661 XSRETURN(1);
662 }
663
664void
691638dd 665Method_encode(obj,src,check = FALSE)
2f2b4ff2 666SV * obj
667SV * src
691638dd 668bool check
2f2b4ff2 669CODE:
670 {
671 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
672 sv_utf8_upgrade(src);
673 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
674 XSRETURN(1);
675 }
676
67e989fb 677MODULE = Encode PACKAGE = Encode
2c674647 678
679PROTOTYPES: ENABLE
680
67e989fb 681I32
2c674647 682_bytes_to_utf8(sv, ...)
67e989fb 683 SV * sv
2c674647 684 CODE:
67e989fb 685 {
686 SV * encoding = items == 2 ? ST(1) : Nullsv;
687
688 if (encoding)
689 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
690 else {
691 STRLEN len;
183a2d84 692 U8* s = (U8*)SvPV(sv, len);
67e989fb 693 U8* converted;
694
695 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 696 sv_setpvn(sv, (char *)converted, len);
67e989fb 697 SvUTF8_on(sv); /* XXX Should we? */
698 Safefree(converted); /* ... so free it */
699 RETVAL = len;
700 }
701 }
2c674647 702 OUTPUT:
67e989fb 703 RETVAL
2c674647 704
67e989fb 705I32
2c674647 706_utf8_to_bytes(sv, ...)
67e989fb 707 SV * sv
2c674647 708 CODE:
67e989fb 709 {
710 SV * to = items > 1 ? ST(1) : Nullsv;
711 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 712
67e989fb 713 if (to)
714 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
715 else {
67e989fb 716 STRLEN len;
b113ac0e 717 U8 *s = (U8*)SvPV(sv, len);
67e989fb 718
9c5ffd7c 719 RETVAL = 0;
67e989fb 720 if (SvTRUE(check)) {
721 /* Must do things the slow way */
722 U8 *dest;
87714904 723 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 724 U8 *send = s + len;
725
726 New(83, dest, len, U8); /* I think */
727
728 while (s < send) {
729 if (*s < 0x80)
730 *dest++ = *s++;
731 else {
b113ac0e 732 STRLEN ulen;
733 UV uv = *s++;
87714904 734
67e989fb 735 /* Have to do it all ourselves because of error routine,
736 aargh. */
737 if (!(uv & 0x40))
738 goto failure;
739 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
740 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
741 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
742 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
743 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
744 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
745 else { ulen = 13; uv = 0; }
87714904 746
67e989fb 747 /* Note change to utf8.c variable naming, for variety */
748 while (ulen--) {
749 if ((*s & 0xc0) != 0x80)
750 goto failure;
87714904 751
67e989fb 752 else
753 uv = (uv << 6) | (*s++ & 0x3f);
87714904 754 }
67e989fb 755 if (uv > 256) {
756 failure:
757 call_failure(check, s, dest, src);
758 /* Now what happens? */
759 }
760 *dest++ = (U8)uv;
761 }
762 }
763 } else
764 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
765 }
2c674647 766 }
767 OUTPUT:
768 RETVAL
769
2c674647 770bool
4411f3b6 771is_utf8(sv, check = FALSE)
772SV * sv
773bool check
2c674647 774 CODE:
775 {
2eebba1d 776 if (SvGMAGICAL(sv)) /* it could be $1, for example */
777 sv = newSVsv(sv); /* GMAGIG will be done */
2c674647 778 if (SvPOK(sv)) {
4411f3b6 779 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
2c674647 780 if (RETVAL &&
4411f3b6 781 check &&
2c674647 782 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
783 RETVAL = FALSE;
784 } else {
785 RETVAL = FALSE;
786 }
2eebba1d 787 if (sv != ST(0))
788 SvREFCNT_dec(sv); /* it was a temp copy */
2c674647 789 }
790 OUTPUT:
791 RETVAL
792
793SV *
4411f3b6 794_utf8_on(sv)
2c674647 795 SV * sv
796 CODE:
797 {
798 if (SvPOK(sv)) {
87714904 799 SV *rsv = newSViv(SvUTF8(sv));
2c674647 800 RETVAL = rsv;
801 SvUTF8_on(sv);
802 } else {
803 RETVAL = &PL_sv_undef;
804 }
805 }
806 OUTPUT:
807 RETVAL
808
809SV *
4411f3b6 810_utf8_off(sv)
2c674647 811 SV * sv
812 CODE:
813 {
814 if (SvPOK(sv)) {
87714904 815 SV *rsv = newSViv(SvUTF8(sv));
2c674647 816 RETVAL = rsv;
817 SvUTF8_off(sv);
818 } else {
819 RETVAL = &PL_sv_undef;
820 }
821 }
822 OUTPUT:
823 RETVAL
824
33af2bc7 825BOOT:
826{
6a59c517 827#if defined(USE_PERLIO) && !defined(USE_SFIO)
a999f61b 828 PerlIO_define_layer(aTHX_ &PerlIO_encode);
33af2bc7 829#endif
e7cbefb8 830#include "def_t.exh"
33af2bc7 831}