Commit | Line | Data |
2c674647 |
1 | #include "EXTERN.h" |
2 | #include "perl.h" |
3 | #include "XSUB.h" |
4 | |
67e989fb |
5 | #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \ |
2f5768b8 |
6 | Perl_croak(aTHX_ "panic_unimplemented"); \ |
4a83738a |
7 | return (y)0; /* fool picky compilers */ \ |
87714904 |
8 | } |
67e989fb |
9 | UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) |
10 | UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) |
11 | |
33af2bc7 |
12 | #ifdef USE_PERLIO |
13 | #include "perliol.h" |
14 | |
15 | typedef struct |
16 | { |
17 | PerlIOBuf base; /* PerlIOBuf stuff */ |
18 | SV * bufsv; |
19 | SV * enc; |
20 | } PerlIOEncode; |
21 | |
22 | |
23 | IV |
24 | PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len) |
25 | { |
26 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
27 | dTHX; |
28 | dSP; |
29 | IV code; |
30 | code = PerlIOBuf_pushed(f,mode,Nullch,0); |
31 | ENTER; |
32 | SAVETMPS; |
33 | PUSHMARK(sp); |
34 | XPUSHs(sv_2mortal(newSVpv("Encode",0))); |
35 | XPUSHs(sv_2mortal(newSVpvn(arg,len))); |
36 | PUTBACK; |
37 | if (perl_call_method("getEncoding",G_SCALAR) != 1) |
38 | return -1; |
39 | SPAGAIN; |
40 | e->enc = POPs; |
41 | PUTBACK; |
42 | if (!SvROK(e->enc)) |
43 | return -1; |
44 | SvREFCNT_inc(e->enc); |
45 | FREETMPS; |
46 | LEAVE; |
47 | PerlIOBase(f)->flags |= PERLIO_F_UTF8; |
48 | return code; |
49 | } |
50 | |
51 | IV |
52 | PerlIOEncode_popped(PerlIO *f) |
53 | { |
54 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
55 | dTHX; |
56 | if (e->enc) |
57 | { |
58 | SvREFCNT_dec(e->enc); |
59 | e->enc = Nullsv; |
60 | } |
61 | if (e->bufsv) |
62 | { |
63 | SvREFCNT_dec(e->bufsv); |
64 | e->bufsv = Nullsv; |
65 | } |
66 | return 0; |
67 | } |
68 | |
69 | STDCHAR * |
70 | PerlIOEncode_get_base(PerlIO *f) |
71 | { |
72 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
73 | dTHX; |
74 | if (!e->base.bufsiz) |
75 | e->base.bufsiz = 1024; |
76 | if (!e->bufsv) |
77 | { |
78 | e->bufsv = newSV(e->base.bufsiz); |
79 | sv_setpvn(e->bufsv,"",0); |
80 | } |
81 | e->base.buf = SvPVX(e->bufsv); |
82 | if (!e->base.ptr) |
83 | e->base.ptr = e->base.buf; |
84 | if (!e->base.end) |
85 | e->base.end = e->base.buf; |
86 | if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv)) |
87 | { |
88 | Perl_warn(aTHX_ " ptr %p(%p)%p", |
89 | e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv)); |
90 | abort(); |
91 | } |
92 | if (SvLEN(e->bufsv) < e->base.bufsiz) |
93 | { |
94 | SSize_t poff = e->base.ptr - e->base.buf; |
95 | SSize_t eoff = e->base.end - e->base.buf; |
96 | e->base.buf = SvGROW(e->bufsv,e->base.bufsiz); |
97 | e->base.ptr = e->base.buf + poff; |
98 | e->base.end = e->base.buf + eoff; |
99 | } |
100 | if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv)) |
101 | { |
102 | Perl_warn(aTHX_ " ptr %p(%p)%p", |
103 | e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv)); |
104 | abort(); |
105 | } |
106 | return e->base.buf; |
107 | } |
108 | |
109 | static void |
110 | Break(void) |
111 | { |
112 | |
113 | } |
114 | |
115 | IV |
116 | PerlIOEncode_fill(PerlIO *f) |
117 | { |
118 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
119 | dTHX; |
120 | dSP; |
121 | IV code; |
122 | Break(); |
123 | code = PerlIOBuf_fill(f); |
124 | if (code == 0) |
125 | { |
126 | SV *uni; |
127 | SvCUR_set(e->bufsv, e->base.end - e->base.buf); |
128 | SvUTF8_off(e->bufsv); |
129 | ENTER; |
130 | SAVETMPS; |
131 | PUSHMARK(sp); |
132 | XPUSHs(e->enc); |
133 | XPUSHs(e->bufsv); |
134 | XPUSHs(&PL_sv_yes); |
135 | PUTBACK; |
136 | if (perl_call_method("toUnicode",G_SCALAR) != 1) |
137 | code = -1; |
138 | SPAGAIN; |
139 | uni = POPs; |
140 | PUTBACK; |
141 | sv_setsv(e->bufsv,uni); |
142 | sv_utf8_upgrade(e->bufsv); |
143 | e->base.buf = SvPVX(e->bufsv); |
144 | e->base.end = e->base.buf+SvCUR(e->bufsv); |
145 | e->base.ptr = e->base.buf; |
146 | FREETMPS; |
147 | LEAVE; |
148 | } |
149 | return code; |
150 | } |
151 | |
152 | IV |
153 | PerlIOEncode_flush(PerlIO *f) |
154 | { |
155 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
156 | IV code = 0; |
157 | dTHX; |
158 | if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF))) |
159 | { |
160 | dSP; |
161 | SV *str; |
162 | char *s; |
163 | STRLEN len; |
164 | ENTER; |
165 | SAVETMPS; |
166 | PUSHMARK(sp); |
167 | XPUSHs(e->enc); |
168 | SvCUR_set(e->bufsv, e->base.end - e->base.buf); |
169 | SvUTF8_on(e->bufsv); |
170 | XPUSHs(e->bufsv); |
171 | XPUSHs(&PL_sv_yes); |
172 | PUTBACK; |
173 | if (perl_call_method("fromUnicode",G_SCALAR) != 1) |
174 | code = -1; |
175 | SPAGAIN; |
176 | str = POPs; |
177 | PUTBACK; |
178 | sv_setsv(e->bufsv,str); |
179 | SvUTF8_off(e->bufsv); |
180 | e->base.buf = SvPVX(e->bufsv); |
181 | e->base.ptr = e->base.buf+SvCUR(e->bufsv); |
182 | FREETMPS; |
183 | LEAVE; |
184 | if (PerlIOBuf_flush(f) != 0) |
185 | code = -1; |
186 | } |
187 | return code; |
188 | } |
189 | |
190 | IV |
191 | PerlIOEncode_close(PerlIO *f) |
192 | { |
193 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
194 | IV code = PerlIOBase_close(f); |
195 | dTHX; |
196 | if (e->bufsv) |
197 | { |
198 | SvREFCNT_dec(e->bufsv); |
199 | e->bufsv = Nullsv; |
200 | } |
201 | e->base.buf = NULL; |
202 | e->base.ptr = NULL; |
203 | e->base.end = NULL; |
204 | PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF); |
205 | return code; |
206 | } |
207 | |
208 | PerlIO_funcs PerlIO_encode = { |
209 | "encode", |
210 | sizeof(PerlIOEncode), |
211 | PERLIO_K_BUFFERED, |
212 | PerlIOBase_fileno, |
213 | PerlIOBuf_fdopen, |
214 | PerlIOBuf_open, |
215 | PerlIOBuf_reopen, |
216 | PerlIOEncode_pushed, |
217 | PerlIOEncode_popped, |
218 | PerlIOBuf_read, |
219 | PerlIOBuf_unread, |
220 | PerlIOBuf_write, |
221 | PerlIOBuf_seek, |
222 | PerlIOBuf_tell, |
223 | PerlIOEncode_close, |
224 | PerlIOEncode_flush, |
225 | PerlIOEncode_fill, |
226 | PerlIOBase_eof, |
227 | PerlIOBase_error, |
228 | PerlIOBase_clearerr, |
229 | PerlIOBuf_setlinebuf, |
230 | PerlIOEncode_get_base, |
231 | PerlIOBuf_bufsiz, |
232 | PerlIOBuf_get_ptr, |
233 | PerlIOBuf_get_cnt, |
234 | PerlIOBuf_set_ptrcnt, |
235 | }; |
236 | #endif |
237 | |
183a2d84 |
238 | void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {} |
67e989fb |
239 | |
240 | MODULE = Encode PACKAGE = Encode |
2c674647 |
241 | |
242 | PROTOTYPES: ENABLE |
243 | |
67e989fb |
244 | I32 |
2c674647 |
245 | _bytes_to_utf8(sv, ...) |
67e989fb |
246 | SV * sv |
2c674647 |
247 | CODE: |
67e989fb |
248 | { |
249 | SV * encoding = items == 2 ? ST(1) : Nullsv; |
250 | |
251 | if (encoding) |
252 | RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); |
253 | else { |
254 | STRLEN len; |
183a2d84 |
255 | U8* s = (U8*)SvPV(sv, len); |
67e989fb |
256 | U8* converted; |
257 | |
258 | converted = bytes_to_utf8(s, &len); /* This allocs */ |
183a2d84 |
259 | sv_setpvn(sv, (char *)converted, len); |
67e989fb |
260 | SvUTF8_on(sv); /* XXX Should we? */ |
261 | Safefree(converted); /* ... so free it */ |
262 | RETVAL = len; |
263 | } |
264 | } |
2c674647 |
265 | OUTPUT: |
67e989fb |
266 | RETVAL |
2c674647 |
267 | |
67e989fb |
268 | I32 |
2c674647 |
269 | _utf8_to_bytes(sv, ...) |
67e989fb |
270 | SV * sv |
2c674647 |
271 | CODE: |
67e989fb |
272 | { |
273 | SV * to = items > 1 ? ST(1) : Nullsv; |
274 | SV * check = items > 2 ? ST(2) : Nullsv; |
87714904 |
275 | |
67e989fb |
276 | if (to) |
277 | RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); |
278 | else { |
67e989fb |
279 | STRLEN len; |
b113ac0e |
280 | U8 *s = (U8*)SvPV(sv, len); |
67e989fb |
281 | |
282 | if (SvTRUE(check)) { |
283 | /* Must do things the slow way */ |
284 | U8 *dest; |
87714904 |
285 | U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */ |
67e989fb |
286 | U8 *send = s + len; |
287 | |
288 | New(83, dest, len, U8); /* I think */ |
289 | |
290 | while (s < send) { |
291 | if (*s < 0x80) |
292 | *dest++ = *s++; |
293 | else { |
b113ac0e |
294 | STRLEN ulen; |
295 | UV uv = *s++; |
87714904 |
296 | |
67e989fb |
297 | /* Have to do it all ourselves because of error routine, |
298 | aargh. */ |
299 | if (!(uv & 0x40)) |
300 | goto failure; |
301 | if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } |
302 | else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } |
303 | else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } |
304 | else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } |
305 | else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } |
306 | else if (!(uv & 0x01)) { ulen = 7; uv = 0; } |
307 | else { ulen = 13; uv = 0; } |
87714904 |
308 | |
67e989fb |
309 | /* Note change to utf8.c variable naming, for variety */ |
310 | while (ulen--) { |
311 | if ((*s & 0xc0) != 0x80) |
312 | goto failure; |
87714904 |
313 | |
67e989fb |
314 | else |
315 | uv = (uv << 6) | (*s++ & 0x3f); |
87714904 |
316 | } |
67e989fb |
317 | if (uv > 256) { |
318 | failure: |
319 | call_failure(check, s, dest, src); |
320 | /* Now what happens? */ |
321 | } |
322 | *dest++ = (U8)uv; |
323 | } |
324 | } |
325 | } else |
326 | RETVAL = (utf8_to_bytes(s, &len) ? len : 0); |
327 | } |
2c674647 |
328 | } |
329 | OUTPUT: |
330 | RETVAL |
331 | |
332 | SV * |
333 | _chars_to_utf8(sv, from, ...) |
334 | SV * sv |
335 | SV * from |
336 | CODE: |
337 | { |
338 | SV * check = items == 3 ? ST(2) : Nullsv; |
339 | RETVAL = &PL_sv_undef; |
340 | } |
341 | OUTPUT: |
342 | RETVAL |
343 | |
344 | SV * |
345 | _utf8_to_chars(sv, to, ...) |
346 | SV * sv |
347 | SV * to |
348 | CODE: |
349 | { |
350 | SV * check = items == 3 ? ST(2) : Nullsv; |
351 | RETVAL = &PL_sv_undef; |
352 | } |
353 | OUTPUT: |
354 | RETVAL |
355 | |
356 | SV * |
357 | _utf8_to_chars_check(sv, ...) |
358 | SV * sv |
359 | CODE: |
360 | { |
361 | SV * check = items == 2 ? ST(1) : Nullsv; |
362 | RETVAL = &PL_sv_undef; |
363 | } |
364 | OUTPUT: |
365 | RETVAL |
366 | |
367 | SV * |
368 | _bytes_to_chars(sv, from, ...) |
369 | SV * sv |
370 | SV * from |
371 | CODE: |
372 | { |
373 | SV * check = items == 3 ? ST(2) : Nullsv; |
374 | RETVAL = &PL_sv_undef; |
375 | } |
376 | OUTPUT: |
377 | RETVAL |
378 | |
379 | SV * |
380 | _chars_to_bytes(sv, to, ...) |
381 | SV * sv |
382 | SV * to |
383 | CODE: |
384 | { |
385 | SV * check = items == 3 ? ST(2) : Nullsv; |
386 | RETVAL = &PL_sv_undef; |
387 | } |
388 | OUTPUT: |
389 | RETVAL |
390 | |
391 | SV * |
392 | _from_to(sv, from, to, ...) |
393 | SV * sv |
394 | SV * from |
395 | SV * to |
396 | CODE: |
397 | { |
398 | SV * check = items == 4 ? ST(3) : Nullsv; |
399 | RETVAL = &PL_sv_undef; |
400 | } |
401 | OUTPUT: |
402 | RETVAL |
403 | |
404 | bool |
405 | _is_utf8(sv, ...) |
406 | SV * sv |
407 | CODE: |
408 | { |
409 | SV * check = items == 2 ? ST(1) : Nullsv; |
410 | if (SvPOK(sv)) { |
067a85ef |
411 | RETVAL = SvUTF8(sv) ? 1 : 0; |
2c674647 |
412 | if (RETVAL && |
413 | SvTRUE(check) && |
414 | !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) |
415 | RETVAL = FALSE; |
416 | } else { |
417 | RETVAL = FALSE; |
418 | } |
419 | } |
420 | OUTPUT: |
421 | RETVAL |
422 | |
423 | SV * |
424 | _on_utf8(sv) |
425 | SV * sv |
426 | CODE: |
427 | { |
428 | if (SvPOK(sv)) { |
87714904 |
429 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 |
430 | RETVAL = rsv; |
431 | SvUTF8_on(sv); |
432 | } else { |
433 | RETVAL = &PL_sv_undef; |
434 | } |
435 | } |
436 | OUTPUT: |
437 | RETVAL |
438 | |
439 | SV * |
440 | _off_utf8(sv) |
441 | SV * sv |
442 | CODE: |
443 | { |
444 | if (SvPOK(sv)) { |
87714904 |
445 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 |
446 | RETVAL = rsv; |
447 | SvUTF8_off(sv); |
448 | } else { |
449 | RETVAL = &PL_sv_undef; |
450 | } |
451 | } |
452 | OUTPUT: |
453 | RETVAL |
454 | |
455 | SV * |
456 | _utf_to_utf(sv, from, to, ...) |
457 | SV * sv |
458 | SV * from |
459 | SV * to |
460 | CODE: |
461 | { |
462 | SV * check = items == 4 ? ST(3) : Nullsv; |
463 | RETVAL = &PL_sv_undef; |
464 | } |
465 | OUTPUT: |
466 | RETVAL |
467 | |
33af2bc7 |
468 | BOOT: |
469 | { |
470 | #ifdef USE_PERLIO |
471 | PerlIO_define_layer(&PerlIO_encode); |
472 | #endif |
473 | } |