Upgrade to Encode 1.32, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / Encode.xs
CommitLineData
fcf2db38 1#define PERL_NO_GET_CONTEXT
2c674647 2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
2f2b4ff2 5#define U8 U8
6#include "encode.h"
071db25d 7#include "def_t.h"
fcf2db38 8
fcb875d4 9#define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
a999c27c 10 t/encoding.t dumps core because of
11 Perl_warner and PerlIO don't work well */
39cf9a5e 12
13#define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate
a999c27c 14 buffer size for encode_method().
15 1 is recommended. 2 restores NI-S original */
39cf9a5e 16
fcf2db38 17#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
2f5768b8 18 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 19 return (y)0; /* fool picky compilers */ \
87714904 20 }
67e989fb 21UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
aa0053b7 22 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
9df9a5cd 23#if defined(USE_PERLIO) && !defined(USE_SFIO)
72e44f29 24/* Define an encoding "layer" in the perliol.h sense.
25 The layer defined here "inherits" in an object-oriented sense from the
26 "perlio" layer with its PerlIOBuf_* "methods".
27 The implementation is particularly efficient as until Encode settles down
28 there is no point in tryint to tune it.
29
30 The layer works by overloading the "fill" and "flush" methods.
31
32 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
33 to convert the encoded data to UTF-8 form, then copies it back to the
34 buffer. The "base class's" read methods then see the UTF-8 data.
35
36 "flush" transforms the UTF-8 data deposited by the "base class's write
37 method in the buffer back into the encoded form using the encode OO perl API,
38 then copies data back into the buffer and calls "SUPER::flush.
39
40 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
41 so that the the base class's "flush" sees the correct number of encoded chars
42 for positioning the seek pointer. (This double translation is the worst performance
43 issue - particularly with all-perl encode engine.)
44
45*/
33af2bc7 46#include "perliol.h"
aa0053b7 47typedef struct {
48 PerlIOBuf base; /* PerlIOBuf stuff */
0b3236bb 49 SV *bufsv; /* buffer seen by layers above */
50 SV *dataSV; /* data we have read from layer below */
51 SV *enc; /* the encoding object */
33af2bc7 52} PerlIOEncode;
53
e3f3bf95 54SV *
aa0053b7 55PerlIOEncode_getarg(pTHX_ PerlIO * f, CLONE_PARAMS * param, int flags)
e3f3bf95 56{
aa0053b7 57 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
58 SV *sv = &PL_sv_undef;
59 if (e->enc) {
60 dSP;
61 ENTER;
62 SAVETMPS;
63 PUSHMARK(sp);
64 XPUSHs(e->enc);
65 PUTBACK;
66 if (perl_call_method("name", G_SCALAR) == 1) {
67 SPAGAIN;
68 sv = newSVsv(POPs);
69 PUTBACK;
70 }
e3f3bf95 71 }
aa0053b7 72 return sv;
e3f3bf95 73}
33af2bc7 74
75IV
aa0053b7 76PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg)
33af2bc7 77{
aa0053b7 78 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
79 dSP;
80 IV code;
81 code = PerlIOBuf_pushed(aTHX_ f, mode, Nullsv);
82 ENTER;
83 SAVETMPS;
84 PUSHMARK(sp);
85 XPUSHs(arg);
86 PUTBACK;
87 if (perl_call_pv("Encode::find_encoding", G_SCALAR) != 1) {
88 /* should never happen */
89 Perl_die(aTHX_ "Encode::find_encoding did not return a value");
90 return -1;
91 }
92 SPAGAIN;
93 e->enc = POPs;
94 PUTBACK;
95 if (!SvROK(e->enc)) {
96 e->enc = Nullsv;
97 errno = EINVAL;
f98bc0c6 98 Perl_warner(aTHX_ packWARN(WARN_IO), "Cannot find encoding \"%" SVf "\"",
aa0053b7 99 arg);
100 code = -1;
101 }
102 else {
103 SvREFCNT_inc(e->enc);
104 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
105 }
106 FREETMPS;
107 LEAVE;
108 return code;
33af2bc7 109}
110
111IV
aa0053b7 112PerlIOEncode_popped(pTHX_ PerlIO * f)
33af2bc7 113{
aa0053b7 114 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
115 if (e->enc) {
116 SvREFCNT_dec(e->enc);
117 e->enc = Nullsv;
118 }
119 if (e->bufsv) {
120 SvREFCNT_dec(e->bufsv);
121 e->bufsv = Nullsv;
122 }
0b3236bb 123 if (e->dataSV) {
124 SvREFCNT_dec(e->dataSV);
284ee456 125 e->dataSV = Nullsv;
0b3236bb 126 }
aa0053b7 127 return 0;
33af2bc7 128}
129
130STDCHAR *
aa0053b7 131PerlIOEncode_get_base(pTHX_ PerlIO * f)
33af2bc7 132{
aa0053b7 133 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
134 if (!e->base.bufsiz)
135 e->base.bufsiz = 1024;
136 if (!e->bufsv) {
137 e->bufsv = newSV(e->base.bufsiz);
138 sv_setpvn(e->bufsv, "", 0);
139 }
140 e->base.buf = (STDCHAR *) SvPVX(e->bufsv);
141 if (!e->base.ptr)
142 e->base.ptr = e->base.buf;
143 if (!e->base.end)
144 e->base.end = e->base.buf;
145 if (e->base.ptr < e->base.buf
146 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
147 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
148 e->base.buf + SvLEN(e->bufsv));
149 abort();
150 }
151 if (SvLEN(e->bufsv) < e->base.bufsiz) {
152 SSize_t poff = e->base.ptr - e->base.buf;
153 SSize_t eoff = e->base.end - e->base.buf;
154 e->base.buf = (STDCHAR *) SvGROW(e->bufsv, e->base.bufsiz);
155 e->base.ptr = e->base.buf + poff;
156 e->base.end = e->base.buf + eoff;
157 }
158 if (e->base.ptr < e->base.buf
159 || e->base.ptr > e->base.buf + SvLEN(e->bufsv)) {
160 Perl_warn(aTHX_ " ptr %p(%p)%p", e->base.buf, e->base.ptr,
161 e->base.buf + SvLEN(e->bufsv));
162 abort();
163 }
164 return e->base.buf;
33af2bc7 165}
166
33af2bc7 167IV
aa0053b7 168PerlIOEncode_fill(pTHX_ PerlIO * f)
33af2bc7 169{
aa0053b7 170 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
171 dSP;
0b3236bb 172 IV code = 0;
173 PerlIO *n;
174 SSize_t avail;
175 if (PerlIO_flush(f) != 0)
176 return -1;
177 n = PerlIONext(f);
178 if (!PerlIO_fast_gets(n)) {
179 /* Things get too messy if we don't have a buffer layer
180 push a :perlio to do the job */
181 char mode[8];
182 n = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv);
183 if (!n) {
184 Perl_die(aTHX_ "panic: cannot push :perlio for %p",f);
185 }
186 }
187 ENTER;
188 SAVETMPS;
189 retry:
190 avail = PerlIO_get_cnt(n);
191 if (avail <= 0) {
192 avail = PerlIO_fill(n);
193 if (avail == 0) {
194 avail = PerlIO_get_cnt(n);
195 }
196 else {
197 if (!PerlIO_error(n) && PerlIO_eof(n))
198 avail = 0;
199 }
200 }
201 if (avail > 0) {
202 STDCHAR *ptr = PerlIO_get_ptr(n);
203 SSize_t use = avail;
aa0053b7 204 SV *uni;
aa0053b7 205 char *s;
0b3236bb 206 STRLEN len = 0;
207 e->base.ptr = e->base.end = (STDCHAR *) Nullch;
208 (void) PerlIOEncode_get_base(aTHX_ f);
209 if (!e->dataSV)
210 e->dataSV = newSV(0);
211 if (SvTYPE(e->dataSV) < SVt_PV) {
212 sv_upgrade(e->dataSV,SVt_PV);
213 }
214 if (SvCUR(e->dataSV)) {
215 /* something left over from last time - create a normal
216 SV with new data appended
217 */
218 if (use + SvCUR(e->dataSV) > e->base.bufsiz) {
219 use = e->base.bufsiz - SvCUR(e->dataSV);
220 }
9dbbe389 221 sv_catpvn(e->dataSV,(char*)ptr,use);
0b3236bb 222 }
223 else {
224 /* Create a "dummy" SV to represent the available data from layer below */
225 if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) {
226 Safefree(SvPVX(e->dataSV));
227 }
228 if (use > e->base.bufsiz) {
229 use = e->base.bufsiz;
230 }
231 SvPVX(e->dataSV) = (char *) ptr;
232 SvLEN(e->dataSV) = 0; /* Hands off sv.c - it isn't yours */
233 SvCUR_set(e->dataSV,use);
284ee456 234 SvPOK_only(e->dataSV);
0b3236bb 235 }
236 SvUTF8_off(e->dataSV);
aa0053b7 237 PUSHMARK(sp);
238 XPUSHs(e->enc);
0b3236bb 239 XPUSHs(e->dataSV);
aa0053b7 240 XPUSHs(&PL_sv_yes);
241 PUTBACK;
0b3236bb 242 if (perl_call_method("decode", G_SCALAR) != 1) {
243 Perl_die(aTHX_ "panic: decode did not return a value");
244 }
aa0053b7 245 SPAGAIN;
246 uni = POPs;
247 PUTBACK;
0b3236bb 248 /* Now get translated string (forced to UTF-8) and use as buffer */
249 if (SvPOK(uni)) {
250 s = SvPVutf8(uni, len);
85f7ebdf 251 if (len && !is_utf8_string((U8*)s,len)) {
0b3236bb 252 Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s);
253 }
254 }
255 if (len > 0) {
256 /* Got _something */
257 /* if decode gave us back dataSV then data may vanish when
258 we do ptrcnt adjust - so take our copy now.
259 (The copy is a pain - need a put-it-here option for decode.)
260 */
261 sv_setpvn(e->bufsv,s,len);
284ee456 262 e->base.ptr = e->base.buf = (STDCHAR*)SvPVX(e->bufsv);
0b3236bb 263 e->base.end = e->base.ptr + SvCUR(e->bufsv);
264 PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
265 SvUTF8_on(e->bufsv);
266
267 /* Adjust ptr/cnt not taking anything which
268 did not translate - not clear this is a win */
269 /* compute amount we took */
270 use -= SvCUR(e->dataSV);
271 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
272 /* and as we did not take it it isn't pending */
273 SvCUR_set(e->dataSV,0);
274 } else {
275 /* Got nothing - assume partial character so we need some more */
276 /* Make sure e->dataSV is a normal SV before re-filling as
277 buffer alias will change under us
278 */
279 s = SvPV(e->dataSV,len);
280 sv_setpvn(e->dataSV,s,len);
281 PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
282 goto retry;
aa0053b7 283 }
aa0053b7 284 FREETMPS;
285 LEAVE;
0b3236bb 286 return code;
287 }
288 else {
289 if (avail == 0)
290 PerlIOBase(f)->flags |= PERLIO_F_EOF;
291 else
292 PerlIOBase(f)->flags |= PERLIO_F_ERROR;
293 return -1;
72e44f29 294 }
33af2bc7 295}
296
297IV
aa0053b7 298PerlIOEncode_flush(pTHX_ PerlIO * f)
33af2bc7 299{
aa0053b7 300 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
301 IV code = 0;
0b3236bb 302 if (e->bufsv && (e->base.ptr > e->base.buf)) {
aa0053b7 303 dSP;
304 SV *str;
305 char *s;
306 STRLEN len;
0b3236bb 307 SSize_t count = 0;
308 if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) {
309 /* Write case encode the buffer and write() to layer below */
310 ENTER;
311 SAVETMPS;
312 PUSHMARK(sp);
313 XPUSHs(e->enc);
314 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
315 SvUTF8_on(e->bufsv);
0b3236bb 316 XPUSHs(e->bufsv);
317 XPUSHs(&PL_sv_yes);
318 PUTBACK;
8bbd9480 319 if (perl_call_method("encode", G_SCALAR) != 1) {
320 Perl_die(aTHX_ "panic: encode did not return a value");
321 }
0b3236bb 322 SPAGAIN;
323 str = POPs;
324 PUTBACK;
325 s = SvPV(str, len);
326 count = PerlIO_write(PerlIONext(f),s,len);
327 if (count != len) {
328 code = -1;
329 }
330 FREETMPS;
331 LEAVE;
332 if (PerlIO_flush(PerlIONext(f)) != 0) {
333 code = -1;
334 }
25f7d9d3 335 if (SvCUR(e->bufsv)) {
336 /* Did not all translate */
337 e->base.ptr = e->base.buf+SvCUR(e->bufsv);
338 return code;
339 }
aa0053b7 340 }
0b3236bb 341 else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
342 /* read case */
343 /* if we have any untranslated stuff then unread that first */
344 if (e->dataSV && SvCUR(e->dataSV)) {
345 s = SvPV(e->dataSV, len);
346 count = PerlIO_unread(PerlIONext(f),s,len);
347 if (count != len) {
348 code = -1;
349 }
350 }
351 /* See if there is anything left in the buffer */
352 if (e->base.ptr < e->base.end) {
353 /* Bother - have unread data.
354 re-encode and unread() to layer below
355 */
356 ENTER;
357 SAVETMPS;
358 str = sv_newmortal();
359 sv_upgrade(str, SVt_PV);
9dbbe389 360 SvPVX(str) = (char*)e->base.ptr;
0b3236bb 361 SvLEN(str) = 0;
362 SvCUR_set(str, e->base.end - e->base.ptr);
284ee456 363 SvPOK_only(str);
0b3236bb 364 SvUTF8_on(str);
365 PUSHMARK(sp);
366 XPUSHs(e->enc);
367 XPUSHs(str);
368 XPUSHs(&PL_sv_yes);
369 PUTBACK;
8bbd9480 370 if (perl_call_method("encode", G_SCALAR) != 1) {
371 Perl_die(aTHX_ "panic: encode did not return a value");
372 }
0b3236bb 373 SPAGAIN;
374 str = POPs;
375 PUTBACK;
376 s = SvPV(str, len);
377 count = PerlIO_unread(PerlIONext(f),s,len);
378 if (count != len) {
379 code = -1;
380 }
381 FREETMPS;
382 LEAVE;
383 }
aa0053b7 384 }
0b3236bb 385 e->base.ptr = e->base.end = e->base.buf;
386 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
72e44f29 387 }
aa0053b7 388 return code;
33af2bc7 389}
390
391IV
aa0053b7 392PerlIOEncode_close(pTHX_ PerlIO * f)
33af2bc7 393{
aa0053b7 394 PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
395 IV code = PerlIOBase_close(aTHX_ f);
396 if (e->bufsv) {
25f7d9d3 397 if (e->base.buf && e->base.ptr > e->base.buf) {
398 Perl_croak(aTHX_ "Close with partial character");
399 }
aa0053b7 400 SvREFCNT_dec(e->bufsv);
401 e->bufsv = Nullsv;
402 }
403 e->base.buf = NULL;
404 e->base.ptr = NULL;
405 e->base.end = NULL;
406 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
407 return code;
33af2bc7 408}
409
72e44f29 410Off_t
aa0053b7 411PerlIOEncode_tell(pTHX_ PerlIO * f)
72e44f29 412{
aa0053b7 413 PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
0b3236bb 414 /* Unfortunately the only way to get a postion is to (re-)translate,
415 the UTF8 we have in bufefr and then ask layer below
aa0053b7 416 */
0b3236bb 417 PerlIO_flush(f);
25f7d9d3 418 if (b->buf && b->ptr > b->buf) {
419 Perl_croak(aTHX_ "Cannot tell at partial character");
420 }
0b3236bb 421 return PerlIO_tell(PerlIONext(f));
72e44f29 422}
423
8cf8f3d1 424PerlIO *
aa0053b7 425PerlIOEncode_dup(pTHX_ PerlIO * f, PerlIO * o,
426 CLONE_PARAMS * params, int flags)
8cf8f3d1 427{
aa0053b7 428 if ((f = PerlIOBase_dup(aTHX_ f, o, params, flags))) {
429 PerlIOEncode *fe = PerlIOSelf(f, PerlIOEncode);
430 PerlIOEncode *oe = PerlIOSelf(o, PerlIOEncode);
431 if (oe->enc) {
432 fe->enc = PerlIO_sv_dup(aTHX_ oe->enc, params);
433 }
9f16d962 434 }
aa0053b7 435 return f;
8cf8f3d1 436}
437
33af2bc7 438PerlIO_funcs PerlIO_encode = {
aa0053b7 439 "encoding",
440 sizeof(PerlIOEncode),
284ee456 441 PERLIO_K_BUFFERED|PERLIO_K_DESTRUCT,
aa0053b7 442 PerlIOEncode_pushed,
443 PerlIOEncode_popped,
444 PerlIOBuf_open,
445 PerlIOEncode_getarg,
446 PerlIOBase_fileno,
447 PerlIOEncode_dup,
448 PerlIOBuf_read,
449 PerlIOBuf_unread,
450 PerlIOBuf_write,
451 PerlIOBuf_seek,
452 PerlIOEncode_tell,
453 PerlIOEncode_close,
454 PerlIOEncode_flush,
455 PerlIOEncode_fill,
456 PerlIOBase_eof,
457 PerlIOBase_error,
458 PerlIOBase_clearerr,
459 PerlIOBase_setlinebuf,
460 PerlIOEncode_get_base,
461 PerlIOBuf_bufsiz,
462 PerlIOBuf_get_ptr,
463 PerlIOBuf_get_cnt,
464 PerlIOBuf_set_ptrcnt,
33af2bc7 465};
aa0053b7 466#endif /* encode layer */
33af2bc7 467
2f2b4ff2 468void
aa0053b7 469Encode_XSEncoding(pTHX_ encode_t * enc)
2f2b4ff2 470{
aa0053b7 471 dSP;
472 HV *stash = gv_stashpv("Encode::XS", TRUE);
473 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
474 int i = 0;
475 PUSHMARK(sp);
476 XPUSHs(sv);
477 while (enc->name[i]) {
478 const char *name = enc->name[i++];
479 XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
480 }
481 PUTBACK;
482 call_pv("Encode::define_encoding", G_DISCARD);
483 SvREFCNT_dec(sv);
2f2b4ff2 484}
485
aa0053b7 486void
487call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
488{
25f7d9d3 489 /* Exists for breakpointing */
aa0053b7 490}
67e989fb 491
2f2b4ff2 492static SV *
aa0053b7 493encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
494 int check)
2f2b4ff2 495{
aa0053b7 496 STRLEN slen;
497 U8 *s = (U8 *) SvPV(src, slen);
3aececda 498 STRLEN tlen = slen;
499 STRLEN ddone = 0;
500 STRLEN sdone = 0;
39cf9a5e 501
a999c27c 502 /* We allocate slen+1.
503 PerlIO dumps core if this value is smaller than this. */
504 SV *dst = sv_2mortal(newSV(slen+1));
aa0053b7 505 if (slen) {
0b3236bb 506 U8 *d = (U8 *) SvPVX(dst);
507 STRLEN dlen = SvLEN(dst)-1;
aa0053b7 508 int code;
509 while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
3aececda 510 SvCUR_set(dst, dlen+ddone);
284ee456 511 SvPOK_only(dst);
9b37254d 512
39cf9a5e 513#if ENCODE_XS_PROFILE >= 3
514 Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1);
0b3236bb 515#endif
516
517 if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
aa0053b7 518 break;
9b37254d 519
aa0053b7 520 switch (code) {
521 case ENCODE_NOSPACE:
a999c27c 522 {
fcb875d4 523 STRLEN more = 0; /* make sure you initialize! */
524 STRLEN sleft;
3aececda 525 sdone += slen;
526 ddone += dlen;
39cf9a5e 527 sleft = tlen - sdone;
fcb875d4 528#if ENCODE_XS_PROFILE >= 2
529 Perl_warn(aTHX_
530 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
531 more, sdone, sleft, SvLEN(dst));
532#endif
533 if (sdone != 0) { /* has src ever been processed ? */
39cf9a5e 534#if ENCODE_XS_USEFP == 2
a999c27c 535 more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
536 - SvLEN(dst);
39cf9a5e 537#elif ENCODE_XS_USEFP
a999c27c 538 more = (1.0*SvLEN(dst)+1)/sdone * sleft;
39cf9a5e 539#else
a999c27c 540 /* safe until SvLEN(dst) == MAX_INT/16 */
541 more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
c98ca92f 542#endif
0b3236bb 543 }
39cf9a5e 544 more += UTF8_MAXLEN; /* insurance policy */
545#if ENCODE_XS_PROFILE >= 2
a999c27c 546 Perl_warn(aTHX_
547 "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
548 more, sdone, sleft, SvLEN(dst));
39cf9a5e 549#endif
550 d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
551 /* dst need to grow need MORE bytes! */
3aececda 552 if (ddone >= SvLEN(dst)) {
553 Perl_croak(aTHX_ "Destination couldn't be grown.");
aa0053b7 554 }
3aececda 555 dlen = SvLEN(dst)-ddone-1;
556 d += ddone;
557 s += slen;
558 slen = tlen-sdone;
559 continue;
39cf9a5e 560 }
2f2b4ff2 561
aa0053b7 562 case ENCODE_NOREP:
563 if (dir == enc->f_utf8) {
564 if (!check && ckWARN_d(WARN_UTF8)) {
565 STRLEN clen;
566 UV ch =
567 utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
568 &clen, 0);
f98bc0c6 569 Perl_warner(aTHX_ packWARN(WARN_UTF8),
aa0053b7 570 "\"\\N{U+%" UVxf
571 "}\" does not map to %s", ch,
572 enc->name[0]);
573 /* FIXME: Skip over the character, copy in replacement and continue
574 * but that is messy so for now just fail.
575 */
576 return &PL_sv_undef;
577 }
578 else {
579 return &PL_sv_undef;
580 }
581 }
582 else {
d0ee0454 583 /* UTF-8 is supposed to be "Universal" so should not happen
584 for real characters, but some encodings have non-assigned
585 codes which may occur.
586 */
ee981de6 587 Perl_croak(aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)",
588 enc->name[0], (U8) s[slen], code);
aa0053b7 589 }
590 break;
2f2b4ff2 591
aa0053b7 592 default:
593 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
594 code, (dir == enc->f_utf8) ? "to" : "from",
595 enc->name[0]);
596 return &PL_sv_undef;
597 }
598 }
3aececda 599 SvCUR_set(dst, dlen+ddone);
284ee456 600 SvPOK_only(dst);
aa0053b7 601 if (check) {
3aececda 602 sdone = SvCUR(src) - (slen+sdone);
603 if (sdone) {
f54fca96 604#if 1
605 /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly
606 SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0
607 type SVs and sv_clear() calls it ...
608 */
64ffdd5e 609 sv_setpvn(src, (char*)s+slen, sdone);
f54fca96 610#else
3aececda 611 Move(s + slen, SvPVX(src), sdone , U8);
f54fca96 612#endif
aa0053b7 613 }
3aececda 614 SvCUR_set(src, sdone);
aa0053b7 615 }
2f2b4ff2 616 }
aa0053b7 617 else {
0b3236bb 618 SvCUR_set(dst, 0);
284ee456 619 SvPOK_only(dst);
2f2b4ff2 620 }
39cf9a5e 621#if ENCODE_XS_PROFILE
622 if (SvCUR(dst) > SvCUR(src)){
a999c27c 623 Perl_warn(aTHX_
624 "SvLEN(dst)=%d, SvCUR(dst)=%d. "
625 "%d bytes unused(%f %%)\n",
626 SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
627 (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
628
39cf9a5e 629 }
a999c27c 630#endif
0b3236bb 631 *SvEND(dst) = '\0';
aa0053b7 632 return dst;
2f2b4ff2 633}
634
50d26985 635MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
2f2b4ff2 636
637PROTOTYPES: ENABLE
638
639void
0a95303c 640Method_name(obj)
641SV * obj
642CODE:
643 {
644 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
645 ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
646 XSRETURN(1);
647 }
648
649void
691638dd 650Method_decode(obj,src,check = FALSE)
2f2b4ff2 651SV * obj
652SV * src
691638dd 653bool check
2f2b4ff2 654CODE:
655 {
656 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
657 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
658 SvUTF8_on(ST(0));
659 XSRETURN(1);
660 }
661
662void
691638dd 663Method_encode(obj,src,check = FALSE)
2f2b4ff2 664SV * obj
665SV * src
691638dd 666bool check
2f2b4ff2 667CODE:
668 {
669 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
670 sv_utf8_upgrade(src);
671 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
672 XSRETURN(1);
673 }
674
67e989fb 675MODULE = Encode PACKAGE = Encode
2c674647 676
677PROTOTYPES: ENABLE
678
67e989fb 679I32
2c674647 680_bytes_to_utf8(sv, ...)
67e989fb 681 SV * sv
2c674647 682 CODE:
67e989fb 683 {
684 SV * encoding = items == 2 ? ST(1) : Nullsv;
685
686 if (encoding)
687 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
688 else {
689 STRLEN len;
183a2d84 690 U8* s = (U8*)SvPV(sv, len);
67e989fb 691 U8* converted;
692
693 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 694 sv_setpvn(sv, (char *)converted, len);
67e989fb 695 SvUTF8_on(sv); /* XXX Should we? */
696 Safefree(converted); /* ... so free it */
697 RETVAL = len;
698 }
699 }
2c674647 700 OUTPUT:
67e989fb 701 RETVAL
2c674647 702
67e989fb 703I32
2c674647 704_utf8_to_bytes(sv, ...)
67e989fb 705 SV * sv
2c674647 706 CODE:
67e989fb 707 {
708 SV * to = items > 1 ? ST(1) : Nullsv;
709 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 710
67e989fb 711 if (to)
712 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
713 else {
67e989fb 714 STRLEN len;
b113ac0e 715 U8 *s = (U8*)SvPV(sv, len);
67e989fb 716
9c5ffd7c 717 RETVAL = 0;
67e989fb 718 if (SvTRUE(check)) {
719 /* Must do things the slow way */
720 U8 *dest;
87714904 721 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb 722 U8 *send = s + len;
723
724 New(83, dest, len, U8); /* I think */
725
726 while (s < send) {
727 if (*s < 0x80)
728 *dest++ = *s++;
729 else {
b113ac0e 730 STRLEN ulen;
731 UV uv = *s++;
87714904 732
67e989fb 733 /* Have to do it all ourselves because of error routine,
734 aargh. */
735 if (!(uv & 0x40))
736 goto failure;
737 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
738 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
739 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
740 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
741 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
742 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
743 else { ulen = 13; uv = 0; }
87714904 744
67e989fb 745 /* Note change to utf8.c variable naming, for variety */
746 while (ulen--) {
747 if ((*s & 0xc0) != 0x80)
748 goto failure;
87714904 749
67e989fb 750 else
751 uv = (uv << 6) | (*s++ & 0x3f);
87714904 752 }
67e989fb 753 if (uv > 256) {
754 failure:
755 call_failure(check, s, dest, src);
756 /* Now what happens? */
757 }
758 *dest++ = (U8)uv;
759 }
760 }
761 } else
762 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
763 }
2c674647 764 }
765 OUTPUT:
766 RETVAL
767
2c674647 768bool
4411f3b6 769is_utf8(sv, check = FALSE)
770SV * sv
771bool check
2c674647 772 CODE:
773 {
2eebba1d 774 if (SvGMAGICAL(sv)) /* it could be $1, for example */
775 sv = newSVsv(sv); /* GMAGIG will be done */
2c674647 776 if (SvPOK(sv)) {
4411f3b6 777 RETVAL = SvUTF8(sv) ? TRUE : FALSE;
2c674647 778 if (RETVAL &&
4411f3b6 779 check &&
2c674647 780 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
781 RETVAL = FALSE;
782 } else {
783 RETVAL = FALSE;
784 }
2eebba1d 785 if (sv != ST(0))
786 SvREFCNT_dec(sv); /* it was a temp copy */
2c674647 787 }
788 OUTPUT:
789 RETVAL
790
791SV *
4411f3b6 792_utf8_on(sv)
2c674647 793 SV * sv
794 CODE:
795 {
796 if (SvPOK(sv)) {
87714904 797 SV *rsv = newSViv(SvUTF8(sv));
2c674647 798 RETVAL = rsv;
799 SvUTF8_on(sv);
800 } else {
801 RETVAL = &PL_sv_undef;
802 }
803 }
804 OUTPUT:
805 RETVAL
806
807SV *
4411f3b6 808_utf8_off(sv)
2c674647 809 SV * sv
810 CODE:
811 {
812 if (SvPOK(sv)) {
87714904 813 SV *rsv = newSViv(SvUTF8(sv));
2c674647 814 RETVAL = rsv;
815 SvUTF8_off(sv);
816 } else {
817 RETVAL = &PL_sv_undef;
818 }
819 }
820 OUTPUT:
821 RETVAL
822
33af2bc7 823BOOT:
824{
6a59c517 825#if defined(USE_PERLIO) && !defined(USE_SFIO)
a999f61b 826 PerlIO_define_layer(aTHX_ &PerlIO_encode);
33af2bc7 827#endif
e7cbefb8 828#include "def_t.exh"
33af2bc7 829}