Commit | Line | Data |
85982a32 |
1 | /* |
011b2d2f |
2 | $Id: Encode.xs,v 1.34 2002/04/22 20:27:30 dankogai Exp dankogai $ |
85982a32 |
3 | */ |
4 | |
fcf2db38 |
5 | #define PERL_NO_GET_CONTEXT |
2c674647 |
6 | #include "EXTERN.h" |
7 | #include "perl.h" |
8 | #include "XSUB.h" |
2f2b4ff2 |
9 | #define U8 U8 |
10 | #include "encode.h" |
0ab8f81e |
11 | # define PERLIO_FILENAME "PerlIO/encoding.pm" |
aae85ceb |
12 | |
85982a32 |
13 | /* Set to 1 or more to profile. t/encoding.t dumps core because |
14 | Perl_warner and PerlIO don't work well together. */ |
0ab8f81e |
15 | #define ENCODE_XS_PROFILE 0 |
39cf9a5e |
16 | |
85982a32 |
17 | /* set 0 to disable floating point to calculate buffer size for |
18 | encode_method(). 1 is recommended. 2 restores NI-S original */ |
0ab8f81e |
19 | #define ENCODE_XS_USEFP 1 |
39cf9a5e |
20 | |
fcf2db38 |
/*
 * UNIMPLEMENTED(x, y): defines a function named x with return type y that
 * takes (SV *sv, char *encoding), fetches the interpreter context with dTHX
 * and croaks with "panic_unimplemented".  The trailing return is never
 * reached; it only keeps compilers that insist on a return value quiet.
 */
21 | #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ |
2f5768b8 |
22 | Perl_croak(aTHX_ "panic_unimplemented"); \ |
4a83738a |
23 | return (y)0; /* fool picky compilers */ \ |
87714904 |
24 | } |
85982a32 |
25 | /**/ |
011b2d2f |
26 | |
7e9a885a |
/*
 * Placeholder bodies for the encoding-aware conversions.  Both croak when
 * called; the XSUBs _utf8_to_bytes and _bytes_to_utf8 in this file call them
 * when an encoding argument is supplied.
 */
27 | UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) |
28 | UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) |
33af2bc7 |
29 | |
b0b300a3 |
30 | void |
aa0053b7 |
31 | Encode_XSEncoding(pTHX_ encode_t * enc) |
2f2b4ff2 |
32 | { |
aa0053b7 |
33 | dSP; |
34 | HV *stash = gv_stashpv("Encode::XS", TRUE); |
35 | SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash); |
36 | int i = 0; |
37 | PUSHMARK(sp); |
38 | XPUSHs(sv); |
39 | while (enc->name[i]) { |
40 | const char *name = enc->name[i++]; |
41 | XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); |
42 | } |
43 | PUTBACK; |
44 | call_pv("Encode::define_encoding", G_DISCARD); |
45 | SvREFCNT_dec(sv); |
2f2b4ff2 |
46 | } |
47 | |
aa0053b7 |
/*
 * Intentionally empty hook that ignores all four arguments.  The slow path
 * of _utf8_to_bytes calls it when it meets a malformed or out-of-range
 * sequence, so a debugger breakpoint can be placed here.
 */
48 | void |
49 | call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) |
50 | { |
85982a32 |
51 | /* Exists for breakpointing */ |
aa0053b7 |
52 | } |
67e989fb |
53 | |
85982a32 |
54 | |
2f2b4ff2 |
55 | static SV * |
aa0053b7 |
56 | encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, |
85982a32 |
57 | int check) |
2f2b4ff2 |
58 | { |
aa0053b7 |
59 | STRLEN slen; |
60 | U8 *s = (U8 *) SvPV(src, slen); |
3aececda |
61 | STRLEN tlen = slen; |
62 | STRLEN ddone = 0; |
63 | STRLEN sdone = 0; |
39cf9a5e |
64 | |
3c49ab08 |
65 | /* We allocate slen+1. |
85982a32 |
66 | PerlIO dumps core if this value is smaller than this. */ |
3c49ab08 |
67 | SV *dst = sv_2mortal(newSV(slen+1)); |
85982a32 |
68 | U8 *d = (U8 *)SvPVX(dst); |
69 | STRLEN dlen = SvLEN(dst)-1; |
70 | int code; |
71 | |
72 | if (!slen){ |
73 | SvCUR_set(dst, 0); |
74 | SvPOK_only(dst); |
75 | goto ENCODE_END; |
76 | } |
77 | |
78 | while (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check)) |
79 | { |
80 | SvCUR_set(dst, dlen+ddone); |
81 | SvPOK_only(dst); |
0b3236bb |
82 | |
85982a32 |
83 | if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL){ |
84 | break; |
85 | } |
86 | switch (code) { |
87 | case ENCODE_NOSPACE: |
88 | { |
89 | STRLEN more = 0; /* make sure you initialize! */ |
90 | STRLEN sleft; |
91 | sdone += slen; |
92 | ddone += dlen; |
93 | sleft = tlen - sdone; |
fcb875d4 |
94 | #if ENCODE_XS_PROFILE >= 2 |
85982a32 |
95 | Perl_warn(aTHX_ |
96 | "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", |
97 | more, sdone, sleft, SvLEN(dst)); |
fcb875d4 |
98 | #endif |
85982a32 |
99 | if (sdone != 0) { /* has src ever been processed ? */ |
39cf9a5e |
100 | #if ENCODE_XS_USEFP == 2 |
85982a32 |
101 | more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone |
102 | - SvLEN(dst); |
39cf9a5e |
103 | #elif ENCODE_XS_USEFP |
85982a32 |
104 | more = (1.0*SvLEN(dst)+1)/sdone * sleft; |
39cf9a5e |
105 | #else |
85982a32 |
106 | /* safe until SvLEN(dst) == MAX_INT/16 */ |
107 | more = (16*SvLEN(dst)+1)/sdone/16 * sleft; |
39cf9a5e |
108 | #endif |
39cf9a5e |
109 | } |
85982a32 |
110 | more += UTF8_MAXLEN; /* insurance policy */ |
111 | d = (U8 *) SvGROW(dst, SvLEN(dst) + more); |
112 | /* dst need to grow need MORE bytes! */ |
113 | if (ddone >= SvLEN(dst)) { |
114 | Perl_croak(aTHX_ "Destination couldn't be grown."); |
115 | } |
116 | dlen = SvLEN(dst)-ddone-1; |
117 | d += ddone; |
118 | s += slen; |
119 | slen = tlen-sdone; |
120 | continue; |
121 | } |
122 | case ENCODE_NOREP: |
123 | /* encoding */ |
0ab8f81e |
124 | if (dir == enc->f_utf8) { |
85982a32 |
125 | STRLEN clen; |
126 | UV ch = |
0ab8f81e |
127 | utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), |
b0b300a3 |
128 | &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); |
85982a32 |
129 | if (check & ENCODE_DIE_ON_ERR) { |
130 | Perl_croak( |
0ab8f81e |
131 | aTHX_ "\"\\N{U+%" UVxf "}\" does not map to %s, %d", |
85982a32 |
132 | ch, enc->name[0], __LINE__); |
133 | }else{ |
134 | if (check & ENCODE_RETURN_ON_ERR){ |
135 | if (check & ENCODE_WARN_ON_ERR){ |
136 | Perl_warner( |
137 | aTHX_ packWARN(WARN_UTF8), |
0ab8f81e |
138 | "\"\\N{U+%" UVxf "}\" does not map to %s", |
85982a32 |
139 | ch,enc->name[0]); |
140 | } |
141 | goto ENCODE_SET_SRC; |
142 | }else if (check & ENCODE_PERLQQ){ |
0ab8f81e |
143 | SV* perlqq = |
85982a32 |
144 | sv_2mortal(newSVpvf("\\x{%04x}", ch)); |
b2704119 |
145 | sdone += slen + clen; |
85982a32 |
146 | ddone += dlen + SvCUR(perlqq); |
147 | sv_catsv(dst, perlqq); |
0ab8f81e |
148 | } else { |
85982a32 |
149 | /* fallback char */ |
150 | sdone += slen + clen; |
0ab8f81e |
151 | ddone += dlen + enc->replen; |
152 | sv_catpvn(dst, (char*)enc->rep, enc->replen); |
b2704119 |
153 | } |
0ab8f81e |
154 | } |
b2704119 |
155 | } |
85982a32 |
156 | /* decoding */ |
0ab8f81e |
157 | else { |
85982a32 |
158 | if (check & ENCODE_DIE_ON_ERR){ |
159 | Perl_croak( |
160 | aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)", |
161 | enc->name[0], (U8) s[slen], code); |
162 | }else{ |
163 | if (check & ENCODE_RETURN_ON_ERR){ |
164 | if (check & ENCODE_WARN_ON_ERR){ |
165 | Perl_warner( |
166 | aTHX_ packWARN(WARN_UTF8), |
167 | "%s \"\\x%02X\" does not map to Unicode (%d)", |
168 | enc->name[0], (U8) s[slen], code); |
169 | } |
170 | goto ENCODE_SET_SRC; |
171 | }else if (check & ENCODE_PERLQQ){ |
0ab8f81e |
172 | SV* perlqq = |
b2704119 |
173 | sv_2mortal(newSVpvf("\\x%02X", s[slen])); |
85982a32 |
174 | sdone += slen + 1; |
175 | ddone += dlen + SvCUR(perlqq); |
176 | sv_catsv(dst, perlqq); |
177 | } else { |
178 | sdone += slen + 1; |
0ab8f81e |
179 | ddone += dlen + strlen(FBCHAR_UTF8); |
180 | sv_catpv(dst, FBCHAR_UTF8); |
85982a32 |
181 | } |
aa0053b7 |
182 | } |
b2704119 |
183 | } |
85982a32 |
184 | /* settle variables when fallback */ |
b0b300a3 |
185 | d = (U8 *)SvEND(dst); |
0ab8f81e |
186 | dlen = SvLEN(dst) - ddone - 1; |
187 | s = (U8*)SvPVX(src) + sdone; |
b2704119 |
188 | slen = tlen - sdone; |
189 | break; |
2f2b4ff2 |
190 | |
85982a32 |
191 | default: |
192 | Perl_croak(aTHX_ "Unexpected code %d converting %s %s", |
193 | code, (dir == enc->f_utf8) ? "to" : "from", |
194 | enc->name[0]); |
195 | return &PL_sv_undef; |
aa0053b7 |
196 | } |
85982a32 |
197 | } |
198 | ENCODE_SET_SRC: |
6d1c0808 |
199 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
200 | sdone = SvCUR(src) - (slen+sdone); |
85982a32 |
201 | if (sdone) { |
202 | sv_setpvn(src, (char*)s+slen, sdone); |
aa0053b7 |
203 | } |
85982a32 |
204 | SvCUR_set(src, sdone); |
2f2b4ff2 |
205 | } |
85982a32 |
206 | /* warn("check = 0x%X, code = 0x%d\n", check, code); */ |
207 | if (code && !(check & ENCODE_RETURN_ON_ERR)) { |
208 | return &PL_sv_undef; |
2f2b4ff2 |
209 | } |
0ab8f81e |
210 | |
85982a32 |
211 | SvCUR_set(dst, dlen+ddone); |
212 | SvPOK_only(dst); |
0ab8f81e |
213 | |
39cf9a5e |
214 | #if ENCODE_XS_PROFILE |
215 | if (SvCUR(dst) > SvCUR(src)){ |
85982a32 |
216 | Perl_warn(aTHX_ |
217 | "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", |
218 | SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), |
219 | (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); |
39cf9a5e |
220 | } |
3c49ab08 |
221 | #endif |
0ab8f81e |
222 | |
85982a32 |
223 | ENCODE_END: |
0b3236bb |
224 | *SvEND(dst) = '\0'; |
aa0053b7 |
225 | return dst; |
2f2b4ff2 |
226 | } |
227 | |
50d26985 |
228 | MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ |
2f2b4ff2 |
229 | |
230 | PROTOTYPES: ENABLE |
231 | |
void
Method_name(obj)
    SV * obj
CODE:
    {
        /* obj is dereferenced and its IV converted back into the
           encode_t pointer. */
        encode_t *self = INT2PTR(encode_t *, SvIV(SvRV(obj)));
        const char *first_name = self->name[0];

        /* Hand back the first entry of the name list as a mortal string. */
        ST(0) = sv_2mortal(newSVpvn(first_name, strlen(first_name)));
        XSRETURN(1);
    }
0a95303c |
241 | |
void
Method_decode(obj,src,check = 0)
    SV * obj
    SV * src
    int  check
CODE:
    {
        /* Convert src with the encoding's t_utf8 table and return the
           result flagged as UTF-8.  check defaults to 0. */
        encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
        SV *decoded = encode_method(aTHX_ enc, enc->t_utf8, src, check);

        /* encode_method() may return the shared &PL_sv_undef; flag only
           a real result so the interpreter-wide undef is never modified. */
        if (decoded != &PL_sv_undef) {
            SvUTF8_on(decoded);
        }
        ST(0) = decoded;
        XSRETURN(1);
    }
254 | |
void
Method_encode(obj,src,check = 0)
    SV * obj
    SV * src
    int  check
CODE:
    {
        encode_t *self = INT2PTR(encode_t *, SvIV(SvRV(obj)));
        SV *encoded;

        /* The f_utf8 table is fed UTF-8, so upgrade src in place first. */
        sv_utf8_upgrade(src);
        encoded = encode_method(aTHX_ self, self->f_utf8, src, check);

        ST(0) = encoded;
        XSRETURN(1);
    }
267 | |
0ab8f81e |
void
Method_needs_lines(obj)
    SV * obj
CODE:
    {
        /* The answer is always false and never depended on the encoding
           object, so obj is no longer dereferenced into an unused
           encode_t pointer. */
        (void)obj;
        ST(0) = &PL_sv_no;
        XSRETURN(1);
    }
277 | |
void
Method_perlio_ok(obj)
    SV * obj
CODE:
    {
        /* The result depends only on whether PERLIO_FILENAME ends up in
           %INC, so obj is no longer dereferenced into an unused
           encode_t pointer. */
        (void)obj;

        /* Try to load the PerlIO layer module, then report whether %INC
           has an entry for it. */
        require_pv(PERLIO_FILENAME);
        if (hv_exists(get_hv("INC", 0),
                      PERLIO_FILENAME, strlen(PERLIO_FILENAME)))
        {
            ST(0) = &PL_sv_yes;
        }else{
            ST(0) = &PL_sv_no;
        }
        XSRETURN(1);
    }
294 | |
67e989fb |
295 | MODULE = Encode PACKAGE = Encode |
2c674647 |
296 | |
297 | PROTOTYPES: ENABLE |
298 | |
67e989fb |
I32
_bytes_to_utf8(sv, ...)
    SV * sv
CODE:
    {
        /* An encoding is recognised only when exactly two arguments
           were passed. */
        SV *enc_name = (items == 2) ? ST(1) : Nullsv;

        if (!enc_name) {
            /* No encoding given: convert sv in place with bytes_to_utf8()
               and return the new byte length. */
            STRLEN nbytes;
            U8 *raw      = (U8*)SvPV(sv, nbytes);
            U8 *upgraded = bytes_to_utf8(raw, &nbytes); /* This allocs */

            sv_setpvn(sv, (char *)upgraded, nbytes);
            SvUTF8_on(sv);                              /* XXX Should we? */
            Safefree(upgraded);                         /* ... so free it */
            RETVAL = nbytes;
        }
        else {
            RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(enc_name));
        }
    }
OUTPUT:
    RETVAL
2c674647 |
322 | |
67e989fb |
I32
_utf8_to_bytes(sv, ...)
    SV * sv
CODE:
    {
        /* Optional arguments: ST(1) names an encoding, ST(2) is a check
           value. */
        SV * to    = items > 1 ? ST(1) : Nullsv;
        SV * check = items > 2 ? ST(2) : Nullsv;

        /* NOTE(review): check can only be non-NULL when to is non-NULL as
           well, so the slow path below looks unreachable as written --
           confirm the intended calling convention. */
        if (to) {
            RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
        } else {
            STRLEN len;
            U8 *s = (U8*)SvPV(sv, len);

            RETVAL = 0;
            if (SvTRUE(check)) {
                /* Must do things the slow way */
                U8 *dest;
                U8 *dest_start;
                /* We need a copy to pass to check().  savepvn() copies all
                   len bytes; savepv() would stop at the first NUL. */
                U8 *src  = (U8*)savepvn((char *)s, len);
                U8 *send = s + len;

                New(83, dest, len, U8); /* I think */
                dest_start = dest;      /* kept so the buffer can be freed */

                while (s < send) {
                    if (*s < 0x80){
                        *dest++ = *s++;
                    } else {
                        STRLEN ulen;
                        UV uv = *s++;
                        int bad = 0;

                        /* Have to do it all ourselves because of error
                           routine, aargh. */
                        if (!(uv & 0x40)) {
                            /* continuation byte where a lead byte was
                               expected */
                            bad = 1;
                        } else {
                            /* ulen is the total sequence length implied
                               by the lead byte. */
                            if      (!(uv & 0x20)) { ulen = 2;  uv &= 0x1f; }
                            else if (!(uv & 0x10)) { ulen = 3;  uv &= 0x0f; }
                            else if (!(uv & 0x08)) { ulen = 4;  uv &= 0x07; }
                            else if (!(uv & 0x04)) { ulen = 5;  uv &= 0x03; }
                            else if (!(uv & 0x02)) { ulen = 6;  uv &= 0x01; }
                            else if (!(uv & 0x01)) { ulen = 7;  uv = 0; }
                            else                   { ulen = 13; uv = 0; }

                            /* The lead byte is already consumed, so only
                               ulen-1 continuation bytes remain. */
                            ulen--;
                            while (ulen--) {
                                if ((*s & 0xc0) != 0x80){
                                    bad = 1;
                                    break;
                                }
                                uv = (uv << 6) | (*s++ & 0x3f);
                            }
                            /* Only 0..255 fits in one output byte. */
                            if (!bad && uv > 255) {
                                bad = 1;
                            }
                        }
                        if (bad) {
                            call_failure(check, s, dest, src);
                            /* Now what happens? */
                        }
                        *dest++ = (U8)uv;
                    }
                }
                /* NOTE(review): as before, the converted bytes are not
                   stored back into sv and RETVAL stays 0; the intended
                   result handling needs confirming.  Both buffers are
                   released so that the path no longer leaks. */
                Safefree(dest_start);
                Safefree(src);
            } else {
                RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
            }
        }
    }
OUTPUT:
    RETVAL
2c674647 |
388 | |
2c674647 |
bool
is_utf8(sv, check = 0)
    SV * sv
    int  check
CODE:
    {
        /* A get-magical argument (it could be $1, for example) is copied
           so the magic runs once and the copy is what gets inspected. */
        SV *probe = SvGMAGICAL(sv) ? newSVsv(sv) : sv;

        /* True only for a string scalar carrying the UTF-8 flag; with a
           true check the bytes must also pass is_utf8_string(). */
        RETVAL = FALSE;
        if (SvPOK(probe) && SvUTF8(probe)) {
            if (!check ||
                is_utf8_string((U8*)SvPVX(probe), SvCUR(probe)))
            {
                RETVAL = TRUE;
            }
        }

        if (probe != ST(0)) {
            SvREFCNT_dec(probe);    /* it was a temp copy */
        }
    }
OUTPUT:
    RETVAL
2c674647 |
411 | |
SV *
_utf8_on(sv)
    SV * sv
CODE:
    {
        if (!SvPOK(sv)) {
            /* Not a string scalar: leave it alone and return undef. */
            RETVAL = &PL_sv_undef;
        } else {
            /* Record the flag state as it was, then switch the flag on. */
            SV *previous = newSViv(SvUTF8(sv));
            SvUTF8_on(sv);
            RETVAL = previous;
        }
    }
OUTPUT:
    RETVAL
2c674647 |
427 | |
SV *
_utf8_off(sv)
    SV * sv
CODE:
    {
        if (!SvPOK(sv)) {
            /* Not a string scalar: leave it alone and return undef. */
            RETVAL = &PL_sv_undef;
        } else {
            /* Record the flag state as it was, then switch the flag off. */
            SV *previous = newSViv(SvUTF8(sv));
            SvUTF8_off(sv);
            RETVAL = previous;
        }
    }
OUTPUT:
    RETVAL
443 | |
444 | PROTOTYPES: DISABLE |
445 | |
446 | |
447 | int |
448 | DIE_ON_ERR() |
449 | CODE: |
/* Return the value of the C constant ENCODE_DIE_ON_ERR; encode_method()
   croaks on unmappable input when this bit is set in its check argument. */
450 | RETVAL = ENCODE_DIE_ON_ERR; |
451 | OUTPUT: |
452 | RETVAL |
453 | |
0ab8f81e |
454 | int |
85982a32 |
455 | WARN_ON_ERR() |
456 | CODE: |
/* Return the value of the C constant ENCODE_WARN_ON_ERR; encode_method()
   consults this bit (on its ENCODE_RETURN_ON_ERR path) to decide whether
   to emit a warning. */
457 | RETVAL = ENCODE_WARN_ON_ERR; |
458 | OUTPUT: |
459 | RETVAL |
460 | |
461 | int |
462 | LEAVE_SRC() |
463 | CODE: |
/* Return the value of the C constant ENCODE_LEAVE_SRC; encode_method()
   does not rewrite its source argument when this bit is set. */
464 | RETVAL = ENCODE_LEAVE_SRC; |
465 | OUTPUT: |
466 | RETVAL |
467 | |
468 | int |
469 | RETURN_ON_ERR() |
470 | CODE: |
/* Return the value of the C constant ENCODE_RETURN_ON_ERR; encode_method()
   stops at the first unmappable input and returns what it has converted so
   far when this bit is set. */
471 | RETVAL = ENCODE_RETURN_ON_ERR; |
472 | OUTPUT: |
473 | RETVAL |
474 | |
475 | int |
476 | PERLQQ() |
477 | CODE: |
/* Return the value of the C constant ENCODE_PERLQQ; encode_method() emits
   a \x escape for unmappable input when this bit is set. */
478 | RETVAL = ENCODE_PERLQQ; |
479 | OUTPUT: |
480 | RETVAL |
481 | |
482 | int |
483 | FB_DEFAULT() |
484 | CODE: |
/* Return the value of the C constant ENCODE_FB_DEFAULT (defined outside
   this file). */
485 | RETVAL = ENCODE_FB_DEFAULT; |
486 | OUTPUT: |
487 | RETVAL |
488 | |
489 | int |
490 | FB_CROAK() |
491 | CODE: |
/* Return the value of the C constant ENCODE_FB_CROAK (defined outside
   this file). */
492 | RETVAL = ENCODE_FB_CROAK; |
493 | OUTPUT: |
494 | RETVAL |
495 | |
496 | int |
497 | FB_QUIET() |
498 | CODE: |
/* Return the value of the C constant ENCODE_FB_QUIET (defined outside
   this file). */
499 | RETVAL = ENCODE_FB_QUIET; |
500 | OUTPUT: |
501 | RETVAL |
502 | |
503 | int |
504 | FB_WARN() |
505 | CODE: |
/* Return the value of the C constant ENCODE_FB_WARN (defined outside
   this file). */
506 | RETVAL = ENCODE_FB_WARN; |
507 | OUTPUT: |
508 | RETVAL |
509 | |
510 | int |
511 | FB_PERLQQ() |
512 | CODE: |
/* Return the value of the C constant ENCODE_FB_PERLQQ (defined outside
   this file). */
513 | RETVAL = ENCODE_FB_PERLQQ; |
514 | OUTPUT: |
515 | RETVAL |
2c674647 |
516 | |
33af2bc7 |
517 | BOOT: |
518 | { |
85982a32 |
/* Module-load hook.  The whole body comes from the two included files,
   which are not visible here; presumably they register the built-in
   encodings through Encode_XSEncoding() -- TODO confirm. */
519 | #include "def_t.h" |
e7cbefb8 |
520 | #include "def_t.exh" |
33af2bc7 |
521 | } |