Multiplicity & ithreads fix for sv.c/Encode.xs
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
3818b22b 3 * Copyright (c) 1998-2000, Larry Wall
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
27/* Unicode support */
28
dfe13c55 29U8 *
864dbfa3 30Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 31{
32 if (uv < 0x80) {
33 *d++ = uv;
34 return d;
35 }
36 if (uv < 0x800) {
37 *d++ = (( uv >> 6) | 0xc0);
38 *d++ = (( uv & 0x3f) | 0x80);
39 return d;
40 }
41 if (uv < 0x10000) {
42 *d++ = (( uv >> 12) | 0xe0);
43 *d++ = (((uv >> 6) & 0x3f) | 0x80);
44 *d++ = (( uv & 0x3f) | 0x80);
45 return d;
46 }
47 if (uv < 0x200000) {
48 *d++ = (( uv >> 18) | 0xf0);
49 *d++ = (((uv >> 12) & 0x3f) | 0x80);
50 *d++ = (((uv >> 6) & 0x3f) | 0x80);
51 *d++ = (( uv & 0x3f) | 0x80);
52 return d;
53 }
54 if (uv < 0x4000000) {
55 *d++ = (( uv >> 24) | 0xf8);
56 *d++ = (((uv >> 18) & 0x3f) | 0x80);
57 *d++ = (((uv >> 12) & 0x3f) | 0x80);
58 *d++ = (((uv >> 6) & 0x3f) | 0x80);
59 *d++ = (( uv & 0x3f) | 0x80);
60 return d;
61 }
62 if (uv < 0x80000000) {
63 *d++ = (( uv >> 30) | 0xfc);
64 *d++ = (((uv >> 24) & 0x3f) | 0x80);
65 *d++ = (((uv >> 18) & 0x3f) | 0x80);
66 *d++ = (((uv >> 12) & 0x3f) | 0x80);
67 *d++ = (((uv >> 6) & 0x3f) | 0x80);
68 *d++ = (( uv & 0x3f) | 0x80);
69 return d;
70 }
6b8eaf93 71#ifdef HAS_QUAD
628e1a40 72 if (uv < 0x1000000000LL)
a0ed51b3 73#endif
74 {
75 *d++ = 0xfe; /* Can't match U+FEFF! */
76 *d++ = (((uv >> 30) & 0x3f) | 0x80);
77 *d++ = (((uv >> 24) & 0x3f) | 0x80);
78 *d++ = (((uv >> 18) & 0x3f) | 0x80);
79 *d++ = (((uv >> 12) & 0x3f) | 0x80);
80 *d++ = (((uv >> 6) & 0x3f) | 0x80);
81 *d++ = (( uv & 0x3f) | 0x80);
82 return d;
83 }
6b8eaf93 84#ifdef HAS_QUAD
a0ed51b3 85 {
86 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b 87 *d++ = 0x80; /* 6 Reserved bits */
88 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
89 *d++ = (((uv >> 54) & 0x3f) | 0x80);
90 *d++ = (((uv >> 48) & 0x3f) | 0x80);
91 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3 92 *d++ = (((uv >> 36) & 0x3f) | 0x80);
93 *d++ = (((uv >> 30) & 0x3f) | 0x80);
94 *d++ = (((uv >> 24) & 0x3f) | 0x80);
95 *d++ = (((uv >> 18) & 0x3f) | 0x80);
96 *d++ = (((uv >> 12) & 0x3f) | 0x80);
97 *d++ = (((uv >> 6) & 0x3f) | 0x80);
98 *d++ = (( uv & 0x3f) | 0x80);
99 return d;
100 }
101#endif
102}
103
386d01d6 104/* Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
105 * The actual number of bytes in the UTF-8 character will be returned if it
106 * is valid, otherwise 0. */
107int
108Perl_is_utf8_char(pTHX_ U8 *s)
109{
110 U8 u = *s;
111 int slen, len;
112
113 if (!(u & 0x80))
114 return 1;
115
116 if (!(u & 0x40))
117 return 0;
118
119 if (!(u & 0x20)) { len = 2; }
120 else if (!(u & 0x10)) { len = 3; }
121 else if (!(u & 0x08)) { len = 4; }
122 else if (!(u & 0x04)) { len = 5; }
123 else if (!(u & 0x02)) { len = 6; }
124 else if (!(u & 0x01)) { len = 7; }
125 else { len = 13; } /* whoa! */
126
127 slen = len - 1;
128 s++;
129 while (slen--) {
130 if ((*s & 0xc0) != 0x80)
131 return 0;
132 s++;
133 }
134 return len;
135}
136
6662521e 137/*
b2a2e44b 138=for apidoc Am|is_utf8_string|U8 *s|STRLEN len
6662521e 139
140Returns true if first C<len> bytes of the given string form valid a UTF8
141string, false otherwise.
142
143=cut
144*/
145
146bool
147Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
148{
149 U8* x=s;
150 U8* send=s+len;
151 int c;
152 while (x < send) {
153 c = is_utf8_char(x);
154 x += c;
155 if (!c || x > send)
156 return 0;
157 }
158 return 1;
159}
160
67e989fb 161/*
b6b716fe 162=for apidoc Am|U8* s|utf8_to_uv|I32 *retlen|I32 checking
67e989fb 163
164Returns the character value of the first character in the string C<s>
165which is assumed to be in UTF8 encoding; C<retlen> will be set to the
166length, in bytes, of that character, and the pointer C<s> will be
167advanced to the end of the character.
168
169If C<s> does not point to a well-formed UTF8 character, the behaviour
170is dependent on the value of C<checking>: if this is true, it is
171assumed that the caller will raise a warning, and this function will
172set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
173warning is produced.
174
175=cut
176*/
177
a0ed51b3 178UV
67e989fb 179Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen, bool checking)
a0ed51b3 180{
181 UV uv = *s;
182 int len;
183 if (!(uv & 0x80)) {
184 if (retlen)
185 *retlen = 1;
186 return *s;
187 }
188 if (!(uv & 0x40)) {
0453d815 189 dTHR;
67e989fb 190 if (checking && retlen) {
191 *retlen = -1;
192 return 0;
193 }
194
0453d815 195 if (ckWARN_d(WARN_UTF8))
196 Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
a0ed51b3 197 if (retlen)
198 *retlen = 1;
199 return *s;
200 }
201
202 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
203 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
204 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
205 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
206 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
3c77ea2b 207 else if (!(uv & 0x01)) { len = 7; uv = 0; }
208 else { len = 13; uv = 0; } /* whoa! */
a0ed51b3 209
210 if (retlen)
211 *retlen = len;
212 --len;
213 s++;
214 while (len--) {
215 if ((*s & 0xc0) != 0x80) {
0453d815 216 dTHR;
67e989fb 217 if (checking && retlen) {
218 *retlen = -1;
219 return 0;
220 }
221
0453d815 222 if (ckWARN_d(WARN_UTF8))
223 Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
a0ed51b3 224 if (retlen)
225 *retlen -= len + 1;
226 return 0xfffd;
227 }
228 else
229 uv = (uv << 6) | (*s++ & 0x3f);
230 }
231 return uv;
232}
233
246fae53 234/* utf8_distance(a,b) returns the number of UTF8 characters between
235 the pointers a and b */
a0ed51b3 236
237I32
864dbfa3 238Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 239{
240 I32 off = 0;
241 if (a < b) {
242 while (a < b) {
243 a += UTF8SKIP(a);
244 off--;
245 }
246 }
247 else {
248 while (b < a) {
249 b += UTF8SKIP(b);
250 off++;
251 }
252 }
253 return off;
254}
255
256/* WARNING: do not use the following unless you *know* off is within bounds */
257
258U8 *
864dbfa3 259Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 260{
261 if (off >= 0) {
262 while (off--)
263 s += UTF8SKIP(s);
264 }
265 else {
266 while (off++) {
267 s--;
268 if (*s & 0x80) {
269 while ((*s & 0xc0) == 0x80)
270 s--;
271 }
272 }
273 }
274 return s;
275}
276
6940069f 277/*
246fae53 278=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 279
246fae53 280Converts a string C<s> of length C<len> from UTF8 into byte encoding.
281Unlike C<bytes_to_utf8>, this over-writes the original string, and
282updates len to contain the new length.
67e989fb 283Returns zero on failure, setting C<len> to -1.
6940069f 284
285=cut
286*/
287
288U8 *
246fae53 289Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
6940069f 290{
291 dTHR;
292 U8 *send;
293 U8 *d;
294 U8 *save;
295
246fae53 296 send = s + *len;
6940069f 297 d = save = s;
246fae53 298
299 /* ensure valid UTF8 and chars < 256 before updating string */
300 while (s < send) {
301 U8 c = *s++;
9f9ab905 302 if (c >= 0x80 &&
303 ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
67e989fb 304 *len = -1;
246fae53 305 return 0;
67e989fb 306 }
246fae53 307 }
308 s = save;
6940069f 309 while (s < send) {
310 if (*s < 0x80)
311 *d++ = *s++;
312 else {
313 I32 ulen;
67e989fb 314 *d++ = (U8)utf8_to_uv(s, &ulen, 0);
6940069f 315 s += ulen;
6940069f 316 }
317 }
318 *d = '\0';
246fae53 319 *len = d - save;
6940069f 320 return save;
321}
322
323/*
6662521e 324=for apidoc Am|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 325
326Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e 327Returns a pointer to the newly-created string, and sets C<len> to
328reflect the new length.
6940069f 329
497711e7 330=cut
6940069f 331*/
332
333U8*
6662521e 334Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
6940069f 335{
336 dTHR;
337 U8 *send;
338 U8 *d;
339 U8 *dst;
6662521e 340 send = s + (*len);
6940069f 341
6662521e 342 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 343 dst = d;
344
345 while (s < send) {
346 if (*s < 0x80)
347 *d++ = *s++;
348 else {
349 UV uv = *s++;
350 *d++ = (( uv >> 6) | 0xc0);
351 *d++ = (( uv & 0x3f) | 0x80);
352 }
353 }
354 *d = '\0';
6662521e 355 *len = d-dst;
6940069f 356 return dst;
357}
358
a0ed51b3 359/*
dea0fc0b 360 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 361 *
362 * Destination must be pre-extended to 3/2 source. Do not use in-place.
363 * We optimize for native, for obvious reasons. */
364
365U8*
dea0fc0b 366Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 367{
dea0fc0b 368 U8* pend;
369 U8* dstart = d;
370
371 if (bytelen & 1)
a7867d0a 372 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 373
374 pend = p + bytelen;
375
a0ed51b3 376 while (p < pend) {
dea0fc0b 377 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
378 p += 2;
a0ed51b3 379 if (uv < 0x80) {
380 *d++ = uv;
381 continue;
382 }
383 if (uv < 0x800) {
384 *d++ = (( uv >> 6) | 0xc0);
385 *d++ = (( uv & 0x3f) | 0x80);
386 continue;
387 }
388 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
0453d815 389 dTHR;
dea0fc0b 390 UV low = *p++;
391 if (low < 0xdc00 || low >= 0xdfff)
392 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 393 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
394 }
395 if (uv < 0x10000) {
396 *d++ = (( uv >> 12) | 0xe0);
397 *d++ = (((uv >> 6) & 0x3f) | 0x80);
398 *d++ = (( uv & 0x3f) | 0x80);
399 continue;
400 }
401 else {
402 *d++ = (( uv >> 18) | 0xf0);
403 *d++ = (((uv >> 12) & 0x3f) | 0x80);
404 *d++ = (((uv >> 6) & 0x3f) | 0x80);
405 *d++ = (( uv & 0x3f) | 0x80);
406 continue;
407 }
408 }
dea0fc0b 409 *newlen = d - dstart;
a0ed51b3 410 return d;
411}
412
413/* Note: this one is slightly destructive of the source. */
414
415U8*
dea0fc0b 416Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 417{
418 U8* s = (U8*)p;
419 U8* send = s + bytelen;
420 while (s < send) {
421 U8 tmp = s[0];
422 s[0] = s[1];
423 s[1] = tmp;
424 s += 2;
425 }
dea0fc0b 426 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 427}
428
429/* for now these are all defined (inefficiently) in terms of the utf8 versions */
430
431bool
864dbfa3 432Perl_is_uni_alnum(pTHX_ U32 c)
a0ed51b3 433{
aa6ffa16 434 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 435 uv_to_utf8(tmpbuf, (UV)c);
436 return is_utf8_alnum(tmpbuf);
437}
438
439bool
b8c5462f 440Perl_is_uni_alnumc(pTHX_ U32 c)
441{
aa6ffa16 442 U8 tmpbuf[UTF8_MAXLEN];
b8c5462f 443 uv_to_utf8(tmpbuf, (UV)c);
444 return is_utf8_alnumc(tmpbuf);
445}
446
447bool
864dbfa3 448Perl_is_uni_idfirst(pTHX_ U32 c)
a0ed51b3 449{
aa6ffa16 450 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 451 uv_to_utf8(tmpbuf, (UV)c);
452 return is_utf8_idfirst(tmpbuf);
453}
454
455bool
864dbfa3 456Perl_is_uni_alpha(pTHX_ U32 c)
a0ed51b3 457{
aa6ffa16 458 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 459 uv_to_utf8(tmpbuf, (UV)c);
460 return is_utf8_alpha(tmpbuf);
461}
462
463bool
4d61ec05 464Perl_is_uni_ascii(pTHX_ U32 c)
465{
aa6ffa16 466 U8 tmpbuf[UTF8_MAXLEN];
4d61ec05 467 uv_to_utf8(tmpbuf, (UV)c);
468 return is_utf8_ascii(tmpbuf);
469}
470
471bool
864dbfa3 472Perl_is_uni_space(pTHX_ U32 c)
a0ed51b3 473{
aa6ffa16 474 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 475 uv_to_utf8(tmpbuf, (UV)c);
476 return is_utf8_space(tmpbuf);
477}
478
479bool
864dbfa3 480Perl_is_uni_digit(pTHX_ U32 c)
a0ed51b3 481{
aa6ffa16 482 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 483 uv_to_utf8(tmpbuf, (UV)c);
484 return is_utf8_digit(tmpbuf);
485}
486
487bool
864dbfa3 488Perl_is_uni_upper(pTHX_ U32 c)
a0ed51b3 489{
aa6ffa16 490 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 491 uv_to_utf8(tmpbuf, (UV)c);
492 return is_utf8_upper(tmpbuf);
493}
494
495bool
864dbfa3 496Perl_is_uni_lower(pTHX_ U32 c)
a0ed51b3 497{
aa6ffa16 498 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 499 uv_to_utf8(tmpbuf, (UV)c);
500 return is_utf8_lower(tmpbuf);
501}
502
503bool
b8c5462f 504Perl_is_uni_cntrl(pTHX_ U32 c)
505{
aa6ffa16 506 U8 tmpbuf[UTF8_MAXLEN];
b8c5462f 507 uv_to_utf8(tmpbuf, (UV)c);
508 return is_utf8_cntrl(tmpbuf);
509}
510
511bool
512Perl_is_uni_graph(pTHX_ U32 c)
513{
aa6ffa16 514 U8 tmpbuf[UTF8_MAXLEN];
b8c5462f 515 uv_to_utf8(tmpbuf, (UV)c);
516 return is_utf8_graph(tmpbuf);
517}
518
519bool
864dbfa3 520Perl_is_uni_print(pTHX_ U32 c)
a0ed51b3 521{
aa6ffa16 522 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 523 uv_to_utf8(tmpbuf, (UV)c);
524 return is_utf8_print(tmpbuf);
525}
526
b8c5462f 527bool
f248d071 528Perl_is_uni_punct(pTHX_ U32 c)
b8c5462f 529{
aa6ffa16 530 U8 tmpbuf[UTF8_MAXLEN];
b8c5462f 531 uv_to_utf8(tmpbuf, (UV)c);
532 return is_utf8_punct(tmpbuf);
533}
534
4d61ec05 535bool
536Perl_is_uni_xdigit(pTHX_ U32 c)
537{
aa6ffa16 538 U8 tmpbuf[UTF8_MAXLEN];
4d61ec05 539 uv_to_utf8(tmpbuf, (UV)c);
540 return is_utf8_xdigit(tmpbuf);
541}
542
a0ed51b3 543U32
864dbfa3 544Perl_to_uni_upper(pTHX_ U32 c)
a0ed51b3 545{
aa6ffa16 546 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 547 uv_to_utf8(tmpbuf, (UV)c);
548 return to_utf8_upper(tmpbuf);
549}
550
551U32
864dbfa3 552Perl_to_uni_title(pTHX_ U32 c)
a0ed51b3 553{
aa6ffa16 554 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 555 uv_to_utf8(tmpbuf, (UV)c);
556 return to_utf8_title(tmpbuf);
557}
558
559U32
864dbfa3 560Perl_to_uni_lower(pTHX_ U32 c)
a0ed51b3 561{
aa6ffa16 562 U8 tmpbuf[UTF8_MAXLEN];
a0ed51b3 563 uv_to_utf8(tmpbuf, (UV)c);
564 return to_utf8_lower(tmpbuf);
565}
566
567/* for now these all assume no locale info available for Unicode > 255 */
568
569bool
864dbfa3 570Perl_is_uni_alnum_lc(pTHX_ U32 c)
a0ed51b3 571{
572 return is_uni_alnum(c); /* XXX no locale support yet */
573}
574
575bool
b8c5462f 576Perl_is_uni_alnumc_lc(pTHX_ U32 c)
577{
578 return is_uni_alnumc(c); /* XXX no locale support yet */
579}
580
581bool
864dbfa3 582Perl_is_uni_idfirst_lc(pTHX_ U32 c)
a0ed51b3 583{
584 return is_uni_idfirst(c); /* XXX no locale support yet */
585}
586
587bool
864dbfa3 588Perl_is_uni_alpha_lc(pTHX_ U32 c)
a0ed51b3 589{
590 return is_uni_alpha(c); /* XXX no locale support yet */
591}
592
593bool
4d61ec05 594Perl_is_uni_ascii_lc(pTHX_ U32 c)
595{
596 return is_uni_ascii(c); /* XXX no locale support yet */
597}
598
599bool
864dbfa3 600Perl_is_uni_space_lc(pTHX_ U32 c)
a0ed51b3 601{
602 return is_uni_space(c); /* XXX no locale support yet */
603}
604
605bool
864dbfa3 606Perl_is_uni_digit_lc(pTHX_ U32 c)
a0ed51b3 607{
608 return is_uni_digit(c); /* XXX no locale support yet */
609}
610
611bool
864dbfa3 612Perl_is_uni_upper_lc(pTHX_ U32 c)
a0ed51b3 613{
614 return is_uni_upper(c); /* XXX no locale support yet */
615}
616
617bool
864dbfa3 618Perl_is_uni_lower_lc(pTHX_ U32 c)
a0ed51b3 619{
620 return is_uni_lower(c); /* XXX no locale support yet */
621}
622
623bool
b8c5462f 624Perl_is_uni_cntrl_lc(pTHX_ U32 c)
625{
626 return is_uni_cntrl(c); /* XXX no locale support yet */
627}
628
629bool
630Perl_is_uni_graph_lc(pTHX_ U32 c)
631{
632 return is_uni_graph(c); /* XXX no locale support yet */
633}
634
635bool
864dbfa3 636Perl_is_uni_print_lc(pTHX_ U32 c)
a0ed51b3 637{
638 return is_uni_print(c); /* XXX no locale support yet */
639}
640
b8c5462f 641bool
642Perl_is_uni_punct_lc(pTHX_ U32 c)
643{
644 return is_uni_punct(c); /* XXX no locale support yet */
645}
646
4d61ec05 647bool
648Perl_is_uni_xdigit_lc(pTHX_ U32 c)
649{
650 return is_uni_xdigit(c); /* XXX no locale support yet */
651}
652
a0ed51b3 653U32
864dbfa3 654Perl_to_uni_upper_lc(pTHX_ U32 c)
a0ed51b3 655{
656 return to_uni_upper(c); /* XXX no locale support yet */
657}
658
659U32
864dbfa3 660Perl_to_uni_title_lc(pTHX_ U32 c)
a0ed51b3 661{
662 return to_uni_title(c); /* XXX no locale support yet */
663}
664
665U32
864dbfa3 666Perl_to_uni_lower_lc(pTHX_ U32 c)
a0ed51b3 667{
668 return to_uni_lower(c); /* XXX no locale support yet */
669}
670
a0ed51b3 671bool
864dbfa3 672Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 673{
386d01d6 674 if (!is_utf8_char(p))
675 return FALSE;
a0ed51b3 676 if (!PL_utf8_alnum)
289d4f09 677 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
678 * descendant of isalnum(3), in other words, it doesn't
679 * contain the '_'. --jhi */
680 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
a0ed51b3 681 return swash_fetch(PL_utf8_alnum, p);
682/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
683#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
684 if (!PL_utf8_alnum)
685 PL_utf8_alnum = swash_init("utf8", "",
686 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
687 return swash_fetch(PL_utf8_alnum, p);
688#endif
689}
690
691bool
b8c5462f 692Perl_is_utf8_alnumc(pTHX_ U8 *p)
693{
386d01d6 694 if (!is_utf8_char(p))
695 return FALSE;
b8c5462f 696 if (!PL_utf8_alnum)
697 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
698 return swash_fetch(PL_utf8_alnum, p);
699/* return is_utf8_alpha(p) || is_utf8_digit(p); */
700#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
701 if (!PL_utf8_alnum)
702 PL_utf8_alnum = swash_init("utf8", "",
703 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
704 return swash_fetch(PL_utf8_alnum, p);
705#endif
706}
707
708bool
864dbfa3 709Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3 710{
711 return *p == '_' || is_utf8_alpha(p);
712}
713
714bool
864dbfa3 715Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 716{
386d01d6 717 if (!is_utf8_char(p))
718 return FALSE;
a0ed51b3 719 if (!PL_utf8_alpha)
e24b16f9 720 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
a0ed51b3 721 return swash_fetch(PL_utf8_alpha, p);
722}
723
724bool
b8c5462f 725Perl_is_utf8_ascii(pTHX_ U8 *p)
726{
386d01d6 727 if (!is_utf8_char(p))
728 return FALSE;
b8c5462f 729 if (!PL_utf8_ascii)
730 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
731 return swash_fetch(PL_utf8_ascii, p);
732}
733
734bool
864dbfa3 735Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 736{
386d01d6 737 if (!is_utf8_char(p))
738 return FALSE;
a0ed51b3 739 if (!PL_utf8_space)
e24b16f9 740 PL_utf8_space = swash_init("utf8", "IsSpace", &PL_sv_undef, 0, 0);
a0ed51b3 741 return swash_fetch(PL_utf8_space, p);
742}
743
744bool
864dbfa3 745Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 746{
386d01d6 747 if (!is_utf8_char(p))
748 return FALSE;
a0ed51b3 749 if (!PL_utf8_digit)
e24b16f9 750 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
a0ed51b3 751 return swash_fetch(PL_utf8_digit, p);
752}
753
754bool
864dbfa3 755Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 756{
386d01d6 757 if (!is_utf8_char(p))
758 return FALSE;
a0ed51b3 759 if (!PL_utf8_upper)
e24b16f9 760 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
a0ed51b3 761 return swash_fetch(PL_utf8_upper, p);
762}
763
764bool
864dbfa3 765Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 766{
386d01d6 767 if (!is_utf8_char(p))
768 return FALSE;
a0ed51b3 769 if (!PL_utf8_lower)
e24b16f9 770 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
a0ed51b3 771 return swash_fetch(PL_utf8_lower, p);
772}
773
774bool
b8c5462f 775Perl_is_utf8_cntrl(pTHX_ U8 *p)
776{
386d01d6 777 if (!is_utf8_char(p))
778 return FALSE;
b8c5462f 779 if (!PL_utf8_cntrl)
780 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
781 return swash_fetch(PL_utf8_cntrl, p);
782}
783
784bool
785Perl_is_utf8_graph(pTHX_ U8 *p)
786{
386d01d6 787 if (!is_utf8_char(p))
788 return FALSE;
b8c5462f 789 if (!PL_utf8_graph)
790 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
791 return swash_fetch(PL_utf8_graph, p);
792}
793
794bool
864dbfa3 795Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 796{
386d01d6 797 if (!is_utf8_char(p))
798 return FALSE;
a0ed51b3 799 if (!PL_utf8_print)
e24b16f9 800 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
a0ed51b3 801 return swash_fetch(PL_utf8_print, p);
802}
803
804bool
b8c5462f 805Perl_is_utf8_punct(pTHX_ U8 *p)
806{
386d01d6 807 if (!is_utf8_char(p))
808 return FALSE;
b8c5462f 809 if (!PL_utf8_punct)
810 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
811 return swash_fetch(PL_utf8_punct, p);
812}
813
814bool
815Perl_is_utf8_xdigit(pTHX_ U8 *p)
816{
386d01d6 817 if (!is_utf8_char(p))
818 return FALSE;
b8c5462f 819 if (!PL_utf8_xdigit)
820 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
821 return swash_fetch(PL_utf8_xdigit, p);
822}
823
824bool
864dbfa3 825Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 826{
386d01d6 827 if (!is_utf8_char(p))
828 return FALSE;
a0ed51b3 829 if (!PL_utf8_mark)
e24b16f9 830 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
a0ed51b3 831 return swash_fetch(PL_utf8_mark, p);
832}
833
2104c8d9 834UV
864dbfa3 835Perl_to_utf8_upper(pTHX_ U8 *p)
a0ed51b3 836{
837 UV uv;
838
839 if (!PL_utf8_toupper)
e24b16f9 840 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
a0ed51b3 841 uv = swash_fetch(PL_utf8_toupper, p);
67e989fb 842 return uv ? uv : utf8_to_uv(p,0,0);
a0ed51b3 843}
844
2104c8d9 845UV
864dbfa3 846Perl_to_utf8_title(pTHX_ U8 *p)
a0ed51b3 847{
848 UV uv;
849
850 if (!PL_utf8_totitle)
e24b16f9 851 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
a0ed51b3 852 uv = swash_fetch(PL_utf8_totitle, p);
67e989fb 853 return uv ? uv : utf8_to_uv(p,0,0);
a0ed51b3 854}
855
2104c8d9 856UV
864dbfa3 857Perl_to_utf8_lower(pTHX_ U8 *p)
a0ed51b3 858{
859 UV uv;
860
861 if (!PL_utf8_tolower)
e24b16f9 862 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
a0ed51b3 863 uv = swash_fetch(PL_utf8_tolower, p);
67e989fb 864 return uv ? uv : utf8_to_uv(p,0,0);
a0ed51b3 865}
866
867/* a "swash" is a swatch hash */
868
869SV*
864dbfa3 870Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 871{
872 SV* retval;
873 char tmpbuf[256];
874 dSP;
ce3b816e 875
876 if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */
877 ENTER;
878 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
879 LEAVE;
880 }
881 SPAGAIN;
a0ed51b3 882 PUSHSTACKi(PERLSI_MAGIC);
883 PUSHMARK(SP);
884 EXTEND(SP,5);
885 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
886 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
887 PUSHs(listsv);
888 PUSHs(sv_2mortal(newSViv(minbits)));
889 PUSHs(sv_2mortal(newSViv(none)));
890 PUTBACK;
891 ENTER;
892 SAVEI32(PL_hints);
893 PL_hints = 0;
894 save_re_context();
e24b16f9 895 if (PL_curcop == &PL_compiling) /* XXX ought to be handled by lex_start */
a0ed51b3 896 strncpy(tmpbuf, PL_tokenbuf, sizeof tmpbuf);
864dbfa3 897 if (call_method("SWASHNEW", G_SCALAR))
e24b16f9 898 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 899 else
e24b16f9 900 retval = &PL_sv_undef;
a0ed51b3 901 LEAVE;
902 POPSTACK;
e24b16f9 903 if (PL_curcop == &PL_compiling) {
a0ed51b3 904 strncpy(PL_tokenbuf, tmpbuf, sizeof tmpbuf);
e24b16f9 905 PL_curcop->op_private = PL_hints;
a0ed51b3 906 }
907 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 908 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3 909 return retval;
910}
911
912UV
864dbfa3 913Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
a0ed51b3 914{
915 HV* hv = (HV*)SvRV(sv);
916 U32 klen = UTF8SKIP(ptr) - 1;
917 U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
918 STRLEN slen;
919 STRLEN needents = (klen ? 64 : 128);
dfe13c55 920 U8 *tmps;
a0ed51b3 921 U32 bit;
922 SV *retval;
923
924 /*
925 * This single-entry cache saves about 1/3 of the utf8 overhead in test
926 * suite. (That is, only 7-8% overall over just a hash cache. Still,
927 * it's nothing to sniff at.) Pity we usually come through at least
928 * two function calls to get here...
929 *
930 * NB: this code assumes that swatches are never modified, once generated!
931 */
932
933 if (hv == PL_last_swash_hv &&
934 klen == PL_last_swash_klen &&
12ae5dfc 935 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
a0ed51b3 936 {
937 tmps = PL_last_swash_tmps;
938 slen = PL_last_swash_slen;
939 }
940 else {
941 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 942 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 943
944 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 945 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 946 dSP;
947 ENTER;
948 SAVETMPS;
949 save_re_context();
950 PUSHSTACKi(PERLSI_MAGIC);
951 PUSHMARK(SP);
952 EXTEND(SP,3);
953 PUSHs((SV*)sv);
67e989fb 954 PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0, 0) & ~(needents - 1))));
a0ed51b3 955 PUSHs(sv_2mortal(newSViv(needents)));
956 PUTBACK;
864dbfa3 957 if (call_method("SWASHGET", G_SCALAR))
e24b16f9 958 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 959 else
e24b16f9 960 retval = &PL_sv_undef;
a0ed51b3 961 POPSTACK;
962 FREETMPS;
963 LEAVE;
e24b16f9 964 if (PL_curcop == &PL_compiling)
965 PL_curcop->op_private = PL_hints;
a0ed51b3 966
dfe13c55 967 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 968
dfe13c55 969 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
cea2e8a9 970 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 971 }
972
973 PL_last_swash_hv = hv;
974 PL_last_swash_klen = klen;
975 PL_last_swash_tmps = tmps;
976 PL_last_swash_slen = slen;
977 if (klen)
978 Copy(ptr, PL_last_swash_key, klen, U8);
979 }
980
981 switch ((slen << 3) / needents) {
982 case 1:
983 bit = 1 << (off & 7);
984 off >>= 3;
985 return (tmps[off] & bit) != 0;
986 case 8:
987 return tmps[off];
988 case 16:
989 off <<= 1;
990 return (tmps[off] << 8) + tmps[off + 1] ;
991 case 32:
992 off <<= 2;
993 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
994 }
cea2e8a9 995 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 996 return 0;
997}