Fix for ID 20010306.008, UTF-8 and \w without 'use utf8' coredump.
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
9041c2e3 30=for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
eebe1485 31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485 38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
49 if (uv < 0x80) {
50 *d++ = uv;
51 return d;
52 }
53 if (uv < 0x800) {
54 *d++ = (( uv >> 6) | 0xc0);
55 *d++ = (( uv & 0x3f) | 0x80);
56 return d;
57 }
58 if (uv < 0x10000) {
59 *d++ = (( uv >> 12) | 0xe0);
60 *d++ = (((uv >> 6) & 0x3f) | 0x80);
61 *d++ = (( uv & 0x3f) | 0x80);
62 return d;
63 }
64 if (uv < 0x200000) {
65 *d++ = (( uv >> 18) | 0xf0);
66 *d++ = (((uv >> 12) & 0x3f) | 0x80);
67 *d++ = (((uv >> 6) & 0x3f) | 0x80);
68 *d++ = (( uv & 0x3f) | 0x80);
69 return d;
70 }
71 if (uv < 0x4000000) {
72 *d++ = (( uv >> 24) | 0xf8);
73 *d++ = (((uv >> 18) & 0x3f) | 0x80);
74 *d++ = (((uv >> 12) & 0x3f) | 0x80);
75 *d++ = (((uv >> 6) & 0x3f) | 0x80);
76 *d++ = (( uv & 0x3f) | 0x80);
77 return d;
78 }
79 if (uv < 0x80000000) {
80 *d++ = (( uv >> 30) | 0xfc);
81 *d++ = (((uv >> 24) & 0x3f) | 0x80);
82 *d++ = (((uv >> 18) & 0x3f) | 0x80);
83 *d++ = (((uv >> 12) & 0x3f) | 0x80);
84 *d++ = (((uv >> 6) & 0x3f) | 0x80);
85 *d++ = (( uv & 0x3f) | 0x80);
86 return d;
87 }
6b8eaf93 88#ifdef HAS_QUAD
d7578b48 89 if (uv < UTF8_QUAD_MAX)
a0ed51b3 90#endif
91 {
92 *d++ = 0xfe; /* Can't match U+FEFF! */
93 *d++ = (((uv >> 30) & 0x3f) | 0x80);
94 *d++ = (((uv >> 24) & 0x3f) | 0x80);
95 *d++ = (((uv >> 18) & 0x3f) | 0x80);
96 *d++ = (((uv >> 12) & 0x3f) | 0x80);
97 *d++ = (((uv >> 6) & 0x3f) | 0x80);
98 *d++ = (( uv & 0x3f) | 0x80);
99 return d;
100 }
6b8eaf93 101#ifdef HAS_QUAD
a0ed51b3 102 {
103 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b 104 *d++ = 0x80; /* 6 Reserved bits */
105 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
106 *d++ = (((uv >> 54) & 0x3f) | 0x80);
107 *d++ = (((uv >> 48) & 0x3f) | 0x80);
108 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3 109 *d++ = (((uv >> 36) & 0x3f) | 0x80);
110 *d++ = (((uv >> 30) & 0x3f) | 0x80);
111 *d++ = (((uv >> 24) & 0x3f) | 0x80);
112 *d++ = (((uv >> 18) & 0x3f) | 0x80);
113 *d++ = (((uv >> 12) & 0x3f) | 0x80);
114 *d++ = (((uv >> 6) & 0x3f) | 0x80);
115 *d++ = (( uv & 0x3f) | 0x80);
116 return d;
117 }
118#endif
119}
120
eebe1485 121/*
9041c2e3 122=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
123
124Adds the UTF8 representation of the Native codepoint C<uv> to the end
125of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
126bytes available. The return value is the pointer to the byte after the
127end of the new character. In other words,
128
129 d = uvchr_to_utf8(d, uv);
130
131is the recommended wide native character-aware way of saying
132
133 *(d++) = uv;
134
135=cut
136*/
137
138U8 *
139Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
140{
141 if (uv < 0x100)
142 uv = NATIVE_TO_ASCII(uv);
143 return Perl_uvuni_to_utf8(aTHX_ d, uv);
144}
145
146
147/*
eebe1485 148=for apidoc A|STRLEN|is_utf8_char|U8 *s
149
5da9da9e 150Tests if some arbitrary number of bytes begins in a valid UTF-8
151character. Note that an ASCII character is a valid UTF-8 character.
152The actual number of bytes in the UTF-8 character will be returned if
153it is valid, otherwise 0.
9041c2e3 154
5da9da9e 155=cut */
067a85ef 156STRLEN
386d01d6 157Perl_is_utf8_char(pTHX_ U8 *s)
158{
159 U8 u = *s;
067a85ef 160 STRLEN slen, len;
161 UV uv, ouv;
386d01d6 162
60006e79 163 if (UTF8_IS_ASCII(u))
386d01d6 164 return 1;
165
60006e79 166 if (!UTF8_IS_START(u))
386d01d6 167 return 0;
168
9f07fdcd 169 len = UTF8SKIP(s);
386d01d6 170
60006e79 171 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 172 return 0;
173
386d01d6 174 slen = len - 1;
175 s++;
067a85ef 176 uv = u;
177 ouv = uv;
386d01d6 178 while (slen--) {
60006e79 179 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 180 return 0;
8850bf83 181 uv = UTF8_ACCUMULATE(uv, *s);
067a85ef 182 if (uv < ouv)
183 return 0;
184 ouv = uv;
386d01d6 185 s++;
186 }
067a85ef 187
5bbb0b5a 188 if (UNISKIP(uv) < len)
067a85ef 189 return 0;
190
386d01d6 191 return len;
192}
193
6662521e 194/*
eebe1485 195=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 196
5da9da9e 197Returns true if first C<len> bytes of the given string form a valid UTF8
198string, false otherwise. Note that 'a valid UTF8 string' does not mean
199'a string that contains UTF8' because a valid ASCII string is a valid
200UTF8 string.
6662521e 201
202=cut
203*/
204
8e84507e 205bool
6662521e 206Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
207{
067a85ef 208 U8* x = s;
1aa99e6b 209 U8* send;
067a85ef 210 STRLEN c;
211
1aa99e6b 212 if (!len)
6cd5fe39 213 len = strlen((char *)s);
1aa99e6b 214 send = s + len;
215
6662521e 216 while (x < send) {
217 c = is_utf8_char(x);
067a85ef 218 if (!c)
219 return FALSE;
6662521e 220 x += c;
6662521e 221 }
60006e79 222 if (x != send)
223 return FALSE;
067a85ef 224
225 return TRUE;
6662521e 226}
227
67e989fb 228/*
9041c2e3 229=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 230
9041c2e3 231Bottom level UTF-8 decode routine.
232Returns the unicode code point value of the first character in the string C<s>
ba210ebe 233which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 234C<retlen> will be set to the length, in bytes, of that character.
67e989fb 235
236If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880 237is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
238it is assumed that the caller will raise a warning, and this function
28d3d195 239will silently just set C<retlen> to C<-1> and return zero. If the
240C<flags> does not contain UTF8_CHECK_ONLY, warnings about
241malformations will be given, C<retlen> will be set to the expected
242length of the UTF-8 character in bytes, and zero will be returned.
243
244The C<flags> can also contain various flags to allow deviations from
245the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 246
9041c2e3 247Most code should use utf8_to_uvchr() rather than call this directly.
248
dcad2880 249=cut */
67e989fb 250
a0ed51b3 251UV
9041c2e3 252Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
a0ed51b3 253{
ba210ebe 254 UV uv = *s, ouv;
255 STRLEN len = 1;
7bf1b6bb 256#ifdef EBCDIC
257 bool dowarn = 0;
258#else
ba210ebe 259 bool dowarn = ckWARN_d(WARN_UTF8);
7bf1b6bb 260#endif
ba210ebe 261 STRLEN expectlen = 0;
a0dbb045 262 U32 warning = 0;
263
264/* This list is a superset of the UTF8_ALLOW_XXX. */
265
266#define UTF8_WARN_EMPTY 1
267#define UTF8_WARN_CONTINUATION 2
268#define UTF8_WARN_NON_CONTINUATION 3
269#define UTF8_WARN_FE_FF 4
270#define UTF8_WARN_SHORT 5
271#define UTF8_WARN_OVERFLOW 6
272#define UTF8_WARN_SURROGATE 7
273#define UTF8_WARN_BOM 8
274#define UTF8_WARN_LONG 9
275#define UTF8_WARN_FFFF 10
276
277 if (curlen == 0 &&
278 !(flags & UTF8_ALLOW_EMPTY)) {
279 warning = UTF8_WARN_EMPTY;
0c443dc2 280 goto malformed;
281 }
282
421a8bf2 283 if (UTF8_IS_ASCII(uv)) {
a0ed51b3 284 if (retlen)
285 *retlen = 1;
9041c2e3 286 return (UV) (*s);
a0ed51b3 287 }
67e989fb 288
421a8bf2 289 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 290 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 291 warning = UTF8_WARN_CONTINUATION;
ba210ebe 292 goto malformed;
293 }
294
421a8bf2 295 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 296 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 297 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 298 goto malformed;
299 }
9041c2e3 300
fcc8fcf6 301 if ((uv == 0xfe || uv == 0xff) &&
302 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 303 warning = UTF8_WARN_FE_FF;
ba210ebe 304 goto malformed;
a0ed51b3 305 }
fcc8fcf6 306
ba210ebe 307 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
308 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
309 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
310 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
311 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
312 else if (!(uv & 0x01)) { len = 7; uv = 0; }
3c77ea2b 313 else { len = 13; uv = 0; } /* whoa! */
fcc8fcf6 314
a0ed51b3 315 if (retlen)
316 *retlen = len;
9041c2e3 317
ba210ebe 318 expectlen = len;
319
fcc8fcf6 320 if ((curlen < expectlen) &&
321 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 322 warning = UTF8_WARN_SHORT;
ba210ebe 323 goto malformed;
324 }
325
326 len--;
a0ed51b3 327 s++;
ba210ebe 328 ouv = uv;
329
a0ed51b3 330 while (len--) {
421a8bf2 331 if (!UTF8_IS_CONTINUATION(*s) &&
332 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 333 s--;
334 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 335 goto malformed;
a0ed51b3 336 }
337 else
8850bf83 338 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 339 if (!(uv > ouv)) {
340 /* These cannot be allowed. */
341 if (uv == ouv) {
342 if (!(flags & UTF8_ALLOW_LONG)) {
343 warning = UTF8_WARN_LONG;
344 goto malformed;
345 }
346 }
347 else { /* uv < ouv */
348 /* This cannot be allowed. */
349 warning = UTF8_WARN_OVERFLOW;
350 goto malformed;
351 }
ba210ebe 352 }
353 s++;
354 ouv = uv;
355 }
356
421a8bf2 357 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 358 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 359 warning = UTF8_WARN_SURROGATE;
ba210ebe 360 goto malformed;
421a8bf2 361 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 362 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 363 warning = UTF8_WARN_BOM;
ba210ebe 364 goto malformed;
fcc8fcf6 365 } else if ((expectlen > UNISKIP(uv)) &&
366 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 367 warning = UTF8_WARN_LONG;
ba210ebe 368 goto malformed;
421a8bf2 369 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 370 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 371 warning = UTF8_WARN_FFFF;
a9917092 372 goto malformed;
a0ed51b3 373 }
ba210ebe 374
a0ed51b3 375 return uv;
ba210ebe 376
377malformed:
378
fcc8fcf6 379 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 380 if (retlen)
cc366d4b 381 *retlen = -1;
ba210ebe 382 return 0;
383 }
384
a0dbb045 385 if (dowarn) {
386 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
387
388 switch (warning) {
389 case 0: /* Intentionally empty. */ break;
390 case UTF8_WARN_EMPTY:
391 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
392 break;
393 case UTF8_WARN_CONTINUATION:
394 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
395 break;
396 case UTF8_WARN_NON_CONTINUATION:
397 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
398 (UV)s[1], uv);
399 break;
400 case UTF8_WARN_FE_FF:
401 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
402 break;
403 case UTF8_WARN_SHORT:
404 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
405 curlen, curlen == 1 ? "" : "s", expectlen);
406 break;
407 case UTF8_WARN_OVERFLOW:
408 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
409 ouv, *s);
410 break;
411 case UTF8_WARN_SURROGATE:
412 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
413 break;
414 case UTF8_WARN_BOM:
415 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
416 break;
417 case UTF8_WARN_LONG:
418 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
419 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
420 break;
421 case UTF8_WARN_FFFF:
422 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
423 break;
424 default:
425 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
426 break;
427 }
428
429 if (warning) {
430 char *s = SvPVX(sv);
431
432 if (PL_op)
433 Perl_warner(aTHX_ WARN_UTF8,
434 "%s in %s", s, PL_op_desc[PL_op->op_type]);
435 else
436 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
437 }
438 }
439
ba210ebe 440 if (retlen)
28d3d195 441 *retlen = expectlen ? expectlen : len;
ba210ebe 442
28d3d195 443 return 0;
a0ed51b3 444}
445
8e84507e 446/*
9041c2e3 447=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
8e84507e 448
9041c2e3 449Returns the native character value of the first character in the string C<s>
8e84507e 450which is assumed to be in UTF8 encoding; C<retlen> will be set to the
7df053ec 451length, in bytes, of that character.
8e84507e 452
9041c2e3 453Allows length and flags to be passed to low level routine.
454
455=cut
456*/
457
458UV
459Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
460{
461 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
462 if (uv < 0x100)
463 return (UV) ASCII_TO_NATIVE(uv);
464 return uv;
465}
466
467/*
468=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
469
470Returns the native character value of the first character in the string C<s>
471which is assumed to be in UTF8 encoding; C<retlen> will be set to the
472length, in bytes, of that character.
473
474If C<s> does not point to a well-formed UTF8 character, zero is
475returned and retlen is set, if possible, to -1.
476
477=cut
478*/
479
480UV
481Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
482{
483 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
484}
485
486/*
487=for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
488
489Returns the Unicode code point of the first character in the string C<s>
490which is assumed to be in UTF8 encoding; C<retlen> will be set to the
491length, in bytes, of that character.
492
493This function should only be used when returned UV is considered
494an index into the Unicode semantic tables (e.g. swashes).
495
ba210ebe 496If C<s> does not point to a well-formed UTF8 character, zero is
497returned and retlen is set, if possible, to -1.
8e84507e 498
499=cut
500*/
501
502UV
9041c2e3 503Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
8e84507e 504{
9041c2e3 505 /* Call the low level routine asking for checks */
506 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e 507}
508
b76347f2 509/*
eebe1485 510=for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
b76347f2 511
512Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 513Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
514up past C<e>, croaks.
b76347f2 515
516=cut
517*/
518
519STRLEN
520Perl_utf8_length(pTHX_ U8* s, U8* e)
521{
522 STRLEN len = 0;
523
8850bf83 524 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
525 * the bitops (especially ~) can create illegal UTF-8.
526 * In other words: in Perl UTF-8 is not just for Unicode. */
527
b76347f2 528 if (e < s)
02eb7b47 529 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 530 while (s < e) {
02eb7b47 531 U8 t = UTF8SKIP(s);
b76347f2 532
533 if (e - s < t)
02eb7b47 534 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2 535 s += t;
536 len++;
537 }
538
539 return len;
540}
541
b06226ff 542/*
eebe1485 543=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 544
545Returns the number of UTF8 characters between the UTF-8 pointers C<a>
546and C<b>.
547
548WARNING: use only if you *know* that the pointers point inside the
549same UTF-8 buffer.
550
551=cut */
a0ed51b3 552
02eb7b47 553IV
864dbfa3 554Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 555{
02eb7b47 556 IV off = 0;
557
8850bf83 558 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
559 * the bitops (especially ~) can create illegal UTF-8.
560 * In other words: in Perl UTF-8 is not just for Unicode. */
561
a0ed51b3 562 if (a < b) {
563 while (a < b) {
02eb7b47 564 U8 c = UTF8SKIP(a);
565
566 if (b - a < c)
567 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
568 a += c;
a0ed51b3 569 off--;
570 }
571 }
572 else {
573 while (b < a) {
02eb7b47 574 U8 c = UTF8SKIP(b);
575
576 if (a - b < c)
577 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
578 b += c;
a0ed51b3 579 off++;
580 }
581 }
02eb7b47 582
a0ed51b3 583 return off;
584}
585
b06226ff 586/*
eebe1485 587=for apidoc A|U8*|utf8_hop|U8 *s|I32 off
b06226ff 588
8850bf83 589Return the UTF-8 pointer C<s> displaced by C<off> characters, either
590forward or backward.
b06226ff 591
592WARNING: do not use the following unless you *know* C<off> is within
8850bf83 593the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
594on the first byte of character or just after the last byte of a character.
b06226ff 595
596=cut */
a0ed51b3 597
598U8 *
864dbfa3 599Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 600{
8850bf83 601 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
602 * the bitops (especially ~) can create illegal UTF-8.
603 * In other words: in Perl UTF-8 is not just for Unicode. */
604
a0ed51b3 605 if (off >= 0) {
606 while (off--)
607 s += UTF8SKIP(s);
608 }
609 else {
610 while (off++) {
611 s--;
8850bf83 612 while (UTF8_IS_CONTINUATION(*s))
613 s--;
a0ed51b3 614 }
615 }
616 return s;
617}
618
6940069f 619/*
eebe1485 620=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 621
246fae53 622Converts a string C<s> of length C<len> from UTF8 into byte encoding.
623Unlike C<bytes_to_utf8>, this over-writes the original string, and
624updates len to contain the new length.
67e989fb 625Returns zero on failure, setting C<len> to -1.
6940069f 626
627=cut
628*/
629
630U8 *
246fae53 631Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
6940069f 632{
6940069f 633 U8 *send;
634 U8 *d;
dcad2880 635 U8 *save = s;
246fae53 636
637 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880 638 for (send = s + *len; s < send; ) {
639 U8 c = *s++;
640
9f9ab905 641 if (c >= 0x80 &&
dcad2880 642 ((s >= send) ||
643 ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
644 *len = -1;
645 return 0;
646 }
246fae53 647 }
dcad2880 648
649 d = s = save;
6940069f 650 while (s < send) {
ed646e6e 651 STRLEN ulen;
9041c2e3 652 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 653 s += ulen;
6940069f 654 }
655 *d = '\0';
246fae53 656 *len = d - save;
6940069f 657 return save;
658}
659
660/*
f9a63242 661=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
662
663Converts a string C<s> of length C<len> from UTF8 into byte encoding.
664Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 665the newly-created string, and updates C<len> to contain the new
666length. Returns the original string if no conversion occurs, C<len>
667is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6680 if C<s> is converted or contains all 7bit characters.
f9a63242 669
670=cut */
671
672U8 *
673Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
674{
675 U8 *send;
676 U8 *d;
677 U8 *start = s;
678 I32 count = 0;
679
680 if (!*is_utf8)
681 return start;
682
ef9edfd0 683 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242 684 for (send = s + *len; s < send;) {
685 U8 c = *s++;
686 if (!UTF8_IS_ASCII(c)) {
687 if (UTF8_IS_CONTINUATION(c) || s >= send ||
df84a23b 688 !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c))
f9a63242 689 return start;
690 s++, count++;
691 }
692 }
693
694 *is_utf8 = 0;
695
696 if (!count)
697 return start;
698
699 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 700 s = start; start = d;
f9a63242 701 while (s < send) {
702 U8 c = *s++;
90f44359 703
f9a63242 704 if (UTF8_IS_ASCII(c))
705 *d++ = c;
706 else
90f44359 707 *d++ = UTF8_ACCUMULATE(c, *s++);
f9a63242 708 }
709 *d = '\0';
710 *len = d - start;
711 return start;
712}
713
714/*
eebe1485 715=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 716
717Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e 718Returns a pointer to the newly-created string, and sets C<len> to
719reflect the new length.
6940069f 720
497711e7 721=cut
6940069f 722*/
723
724U8*
6662521e 725Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
6940069f 726{
6940069f 727 U8 *send;
728 U8 *d;
729 U8 *dst;
6662521e 730 send = s + (*len);
6940069f 731
6662521e 732 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 733 dst = d;
734
735 while (s < send) {
90f44359 736 if (UTF8_IS_ASCII(*s))
6940069f 737 *d++ = *s++;
738 else {
739 UV uv = *s++;
90f44359 740
741 *d++ = UTF8_EIGHT_BIT_HI(uv);
742 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f 743 }
744 }
745 *d = '\0';
6662521e 746 *len = d-dst;
6940069f 747 return dst;
748}
749
a0ed51b3 750/*
dea0fc0b 751 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 752 *
753 * Destination must be pre-extended to 3/2 source. Do not use in-place.
754 * We optimize for native, for obvious reasons. */
755
756U8*
dea0fc0b 757Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 758{
dea0fc0b 759 U8* pend;
760 U8* dstart = d;
761
762 if (bytelen & 1)
a7867d0a 763 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 764
765 pend = p + bytelen;
766
a0ed51b3 767 while (p < pend) {
dea0fc0b 768 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
769 p += 2;
a0ed51b3 770 if (uv < 0x80) {
771 *d++ = uv;
772 continue;
773 }
774 if (uv < 0x800) {
775 *d++ = (( uv >> 6) | 0xc0);
776 *d++ = (( uv & 0x3f) | 0x80);
777 continue;
778 }
779 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b 780 UV low = *p++;
781 if (low < 0xdc00 || low >= 0xdfff)
782 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 783 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
784 }
785 if (uv < 0x10000) {
786 *d++ = (( uv >> 12) | 0xe0);
787 *d++ = (((uv >> 6) & 0x3f) | 0x80);
788 *d++ = (( uv & 0x3f) | 0x80);
789 continue;
790 }
791 else {
792 *d++ = (( uv >> 18) | 0xf0);
793 *d++ = (((uv >> 12) & 0x3f) | 0x80);
794 *d++ = (((uv >> 6) & 0x3f) | 0x80);
795 *d++ = (( uv & 0x3f) | 0x80);
796 continue;
797 }
798 }
dea0fc0b 799 *newlen = d - dstart;
a0ed51b3 800 return d;
801}
802
803/* Note: this one is slightly destructive of the source. */
804
805U8*
dea0fc0b 806Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 807{
808 U8* s = (U8*)p;
809 U8* send = s + bytelen;
810 while (s < send) {
811 U8 tmp = s[0];
812 s[0] = s[1];
813 s[1] = tmp;
814 s += 2;
815 }
dea0fc0b 816 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 817}
818
819/* for now these are all defined (inefficiently) in terms of the utf8 versions */
820
821bool
864dbfa3 822Perl_is_uni_alnum(pTHX_ U32 c)
a0ed51b3 823{
ad391ad9 824 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 825 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 826 return is_utf8_alnum(tmpbuf);
827}
828
829bool
b8c5462f 830Perl_is_uni_alnumc(pTHX_ U32 c)
831{
ad391ad9 832 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 833 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f 834 return is_utf8_alnumc(tmpbuf);
835}
836
837bool
864dbfa3 838Perl_is_uni_idfirst(pTHX_ U32 c)
a0ed51b3 839{
ad391ad9 840 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 841 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 842 return is_utf8_idfirst(tmpbuf);
843}
844
845bool
864dbfa3 846Perl_is_uni_alpha(pTHX_ U32 c)
a0ed51b3 847{
ad391ad9 848 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 849 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 850 return is_utf8_alpha(tmpbuf);
851}
852
853bool
4d61ec05 854Perl_is_uni_ascii(pTHX_ U32 c)
855{
ad391ad9 856 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 857 uvuni_to_utf8(tmpbuf, (UV)c);
4d61ec05 858 return is_utf8_ascii(tmpbuf);
859}
860
861bool
864dbfa3 862Perl_is_uni_space(pTHX_ U32 c)
a0ed51b3 863{
ad391ad9 864 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 865 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 866 return is_utf8_space(tmpbuf);
867}
868
869bool
864dbfa3 870Perl_is_uni_digit(pTHX_ U32 c)
a0ed51b3 871{
ad391ad9 872 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 873 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 874 return is_utf8_digit(tmpbuf);
875}
876
877bool
864dbfa3 878Perl_is_uni_upper(pTHX_ U32 c)
a0ed51b3 879{
ad391ad9 880 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 881 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 882 return is_utf8_upper(tmpbuf);
883}
884
885bool
864dbfa3 886Perl_is_uni_lower(pTHX_ U32 c)
a0ed51b3 887{
ad391ad9 888 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 889 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 890 return is_utf8_lower(tmpbuf);
891}
892
893bool
b8c5462f 894Perl_is_uni_cntrl(pTHX_ U32 c)
895{
ad391ad9 896 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 897 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f 898 return is_utf8_cntrl(tmpbuf);
899}
900
901bool
902Perl_is_uni_graph(pTHX_ U32 c)
903{
ad391ad9 904 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 905 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f 906 return is_utf8_graph(tmpbuf);
907}
908
909bool
864dbfa3 910Perl_is_uni_print(pTHX_ U32 c)
a0ed51b3 911{
ad391ad9 912 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 913 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 914 return is_utf8_print(tmpbuf);
915}
916
b8c5462f 917bool
f248d071 918Perl_is_uni_punct(pTHX_ U32 c)
b8c5462f 919{
ad391ad9 920 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 921 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f 922 return is_utf8_punct(tmpbuf);
923}
924
4d61ec05 925bool
926Perl_is_uni_xdigit(pTHX_ U32 c)
927{
ad391ad9 928 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 929 uvuni_to_utf8(tmpbuf, (UV)c);
4d61ec05 930 return is_utf8_xdigit(tmpbuf);
931}
932
a0ed51b3 933U32
864dbfa3 934Perl_to_uni_upper(pTHX_ U32 c)
a0ed51b3 935{
ad391ad9 936 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 937 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 938 return to_utf8_upper(tmpbuf);
939}
940
941U32
864dbfa3 942Perl_to_uni_title(pTHX_ U32 c)
a0ed51b3 943{
ad391ad9 944 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 945 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 946 return to_utf8_title(tmpbuf);
947}
948
949U32
864dbfa3 950Perl_to_uni_lower(pTHX_ U32 c)
a0ed51b3 951{
ad391ad9 952 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 953 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3 954 return to_utf8_lower(tmpbuf);
955}
956
957/* for now these all assume no locale info available for Unicode > 255 */
958
959bool
864dbfa3 960Perl_is_uni_alnum_lc(pTHX_ U32 c)
a0ed51b3 961{
962 return is_uni_alnum(c); /* XXX no locale support yet */
963}
964
965bool
b8c5462f 966Perl_is_uni_alnumc_lc(pTHX_ U32 c)
967{
968 return is_uni_alnumc(c); /* XXX no locale support yet */
969}
970
971bool
864dbfa3 972Perl_is_uni_idfirst_lc(pTHX_ U32 c)
a0ed51b3 973{
974 return is_uni_idfirst(c); /* XXX no locale support yet */
975}
976
977bool
864dbfa3 978Perl_is_uni_alpha_lc(pTHX_ U32 c)
a0ed51b3 979{
980 return is_uni_alpha(c); /* XXX no locale support yet */
981}
982
983bool
4d61ec05 984Perl_is_uni_ascii_lc(pTHX_ U32 c)
985{
986 return is_uni_ascii(c); /* XXX no locale support yet */
987}
988
989bool
864dbfa3 990Perl_is_uni_space_lc(pTHX_ U32 c)
a0ed51b3 991{
992 return is_uni_space(c); /* XXX no locale support yet */
993}
994
995bool
864dbfa3 996Perl_is_uni_digit_lc(pTHX_ U32 c)
a0ed51b3 997{
998 return is_uni_digit(c); /* XXX no locale support yet */
999}
1000
1001bool
864dbfa3 1002Perl_is_uni_upper_lc(pTHX_ U32 c)
a0ed51b3 1003{
1004 return is_uni_upper(c); /* XXX no locale support yet */
1005}
1006
1007bool
864dbfa3 1008Perl_is_uni_lower_lc(pTHX_ U32 c)
a0ed51b3 1009{
1010 return is_uni_lower(c); /* XXX no locale support yet */
1011}
1012
1013bool
b8c5462f 1014Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1015{
1016 return is_uni_cntrl(c); /* XXX no locale support yet */
1017}
1018
1019bool
1020Perl_is_uni_graph_lc(pTHX_ U32 c)
1021{
1022 return is_uni_graph(c); /* XXX no locale support yet */
1023}
1024
1025bool
864dbfa3 1026Perl_is_uni_print_lc(pTHX_ U32 c)
a0ed51b3 1027{
1028 return is_uni_print(c); /* XXX no locale support yet */
1029}
1030
b8c5462f 1031bool
1032Perl_is_uni_punct_lc(pTHX_ U32 c)
1033{
1034 return is_uni_punct(c); /* XXX no locale support yet */
1035}
1036
4d61ec05 1037bool
1038Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1039{
1040 return is_uni_xdigit(c); /* XXX no locale support yet */
1041}
1042
a0ed51b3 1043U32
864dbfa3 1044Perl_to_uni_upper_lc(pTHX_ U32 c)
a0ed51b3 1045{
1046 return to_uni_upper(c); /* XXX no locale support yet */
1047}
1048
1049U32
864dbfa3 1050Perl_to_uni_title_lc(pTHX_ U32 c)
a0ed51b3 1051{
1052 return to_uni_title(c); /* XXX no locale support yet */
1053}
1054
1055U32
864dbfa3 1056Perl_to_uni_lower_lc(pTHX_ U32 c)
a0ed51b3 1057{
1058 return to_uni_lower(c); /* XXX no locale support yet */
1059}
1060
a0ed51b3 1061bool
864dbfa3 1062Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1063{
386d01d6 1064 if (!is_utf8_char(p))
1065 return FALSE;
a0ed51b3 1066 if (!PL_utf8_alnum)
289d4f09 1067 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1068 * descendant of isalnum(3), in other words, it doesn't
1069 * contain the '_'. --jhi */
1070 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
a0ed51b3 1071 return swash_fetch(PL_utf8_alnum, p);
1072/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1073#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1074 if (!PL_utf8_alnum)
1075 PL_utf8_alnum = swash_init("utf8", "",
1076 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1077 return swash_fetch(PL_utf8_alnum, p);
1078#endif
1079}
1080
1081bool
b8c5462f 1082Perl_is_utf8_alnumc(pTHX_ U8 *p)
1083{
386d01d6 1084 if (!is_utf8_char(p))
1085 return FALSE;
b8c5462f 1086 if (!PL_utf8_alnum)
1087 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1088 return swash_fetch(PL_utf8_alnum, p);
1089/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1090#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1091 if (!PL_utf8_alnum)
1092 PL_utf8_alnum = swash_init("utf8", "",
1093 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1094 return swash_fetch(PL_utf8_alnum, p);
1095#endif
1096}
1097
1098bool
864dbfa3 1099Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3 1100{
1101 return *p == '_' || is_utf8_alpha(p);
1102}
1103
1104bool
864dbfa3 1105Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1106{
386d01d6 1107 if (!is_utf8_char(p))
1108 return FALSE;
a0ed51b3 1109 if (!PL_utf8_alpha)
e24b16f9 1110 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
a0ed51b3 1111 return swash_fetch(PL_utf8_alpha, p);
1112}
1113
1114bool
b8c5462f 1115Perl_is_utf8_ascii(pTHX_ U8 *p)
1116{
386d01d6 1117 if (!is_utf8_char(p))
1118 return FALSE;
b8c5462f 1119 if (!PL_utf8_ascii)
1120 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1121 return swash_fetch(PL_utf8_ascii, p);
1122}
1123
1124bool
864dbfa3 1125Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1126{
386d01d6 1127 if (!is_utf8_char(p))
1128 return FALSE;
a0ed51b3 1129 if (!PL_utf8_space)
3bec3564 1130 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
a0ed51b3 1131 return swash_fetch(PL_utf8_space, p);
1132}
1133
1134bool
864dbfa3 1135Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1136{
386d01d6 1137 if (!is_utf8_char(p))
1138 return FALSE;
a0ed51b3 1139 if (!PL_utf8_digit)
e24b16f9 1140 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
a0ed51b3 1141 return swash_fetch(PL_utf8_digit, p);
1142}
1143
1144bool
864dbfa3 1145Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1146{
386d01d6 1147 if (!is_utf8_char(p))
1148 return FALSE;
a0ed51b3 1149 if (!PL_utf8_upper)
e24b16f9 1150 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
a0ed51b3 1151 return swash_fetch(PL_utf8_upper, p);
1152}
1153
1154bool
864dbfa3 1155Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1156{
386d01d6 1157 if (!is_utf8_char(p))
1158 return FALSE;
a0ed51b3 1159 if (!PL_utf8_lower)
e24b16f9 1160 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
a0ed51b3 1161 return swash_fetch(PL_utf8_lower, p);
1162}
1163
1164bool
b8c5462f 1165Perl_is_utf8_cntrl(pTHX_ U8 *p)
1166{
386d01d6 1167 if (!is_utf8_char(p))
1168 return FALSE;
b8c5462f 1169 if (!PL_utf8_cntrl)
1170 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1171 return swash_fetch(PL_utf8_cntrl, p);
1172}
1173
1174bool
1175Perl_is_utf8_graph(pTHX_ U8 *p)
1176{
386d01d6 1177 if (!is_utf8_char(p))
1178 return FALSE;
b8c5462f 1179 if (!PL_utf8_graph)
1180 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1181 return swash_fetch(PL_utf8_graph, p);
1182}
1183
1184bool
864dbfa3 1185Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1186{
386d01d6 1187 if (!is_utf8_char(p))
1188 return FALSE;
a0ed51b3 1189 if (!PL_utf8_print)
e24b16f9 1190 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
a0ed51b3 1191 return swash_fetch(PL_utf8_print, p);
1192}
1193
1194bool
b8c5462f 1195Perl_is_utf8_punct(pTHX_ U8 *p)
1196{
386d01d6 1197 if (!is_utf8_char(p))
1198 return FALSE;
b8c5462f 1199 if (!PL_utf8_punct)
1200 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1201 return swash_fetch(PL_utf8_punct, p);
1202}
1203
1204bool
1205Perl_is_utf8_xdigit(pTHX_ U8 *p)
1206{
386d01d6 1207 if (!is_utf8_char(p))
1208 return FALSE;
b8c5462f 1209 if (!PL_utf8_xdigit)
1210 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1211 return swash_fetch(PL_utf8_xdigit, p);
1212}
1213
1214bool
864dbfa3 1215Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1216{
386d01d6 1217 if (!is_utf8_char(p))
1218 return FALSE;
a0ed51b3 1219 if (!PL_utf8_mark)
e24b16f9 1220 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
a0ed51b3 1221 return swash_fetch(PL_utf8_mark, p);
1222}
1223
2104c8d9 1224UV
864dbfa3 1225Perl_to_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1226{
1227 UV uv;
1228
1229 if (!PL_utf8_toupper)
e24b16f9 1230 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
a0ed51b3 1231 uv = swash_fetch(PL_utf8_toupper, p);
9041c2e3 1232 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3 1233}
1234
2104c8d9 1235UV
864dbfa3 1236Perl_to_utf8_title(pTHX_ U8 *p)
a0ed51b3 1237{
1238 UV uv;
1239
1240 if (!PL_utf8_totitle)
e24b16f9 1241 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
a0ed51b3 1242 uv = swash_fetch(PL_utf8_totitle, p);
9041c2e3 1243 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3 1244}
1245
2104c8d9 1246UV
864dbfa3 1247Perl_to_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1248{
1249 UV uv;
1250
1251 if (!PL_utf8_tolower)
e24b16f9 1252 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
a0ed51b3 1253 uv = swash_fetch(PL_utf8_tolower, p);
9041c2e3 1254 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3 1255}
1256
1257/* a "swash" is a swatch hash */
1258
1259SV*
864dbfa3 1260Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1261{
1262 SV* retval;
bf1fed83 1263 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1264 dSP;
ce3b816e 1265
1266 if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */
1267 ENTER;
1268 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1269 LEAVE;
1270 }
1271 SPAGAIN;
a0ed51b3 1272 PUSHSTACKi(PERLSI_MAGIC);
1273 PUSHMARK(SP);
1274 EXTEND(SP,5);
1275 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1276 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1277 PUSHs(listsv);
1278 PUSHs(sv_2mortal(newSViv(minbits)));
1279 PUSHs(sv_2mortal(newSViv(none)));
1280 PUTBACK;
1281 ENTER;
1282 SAVEI32(PL_hints);
1283 PL_hints = 0;
1284 save_re_context();
bf1fed83 1285 if (PL_curcop == &PL_compiling)
1286 /* XXX ought to be handled by lex_start */
1287 sv_setpv(tokenbufsv, PL_tokenbuf);
864dbfa3 1288 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1289 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1290 else
e24b16f9 1291 retval = &PL_sv_undef;
a0ed51b3 1292 LEAVE;
1293 POPSTACK;
e24b16f9 1294 if (PL_curcop == &PL_compiling) {
bf1fed83 1295 STRLEN len;
1296 char* pv = SvPV(tokenbufsv, len);
1297
1298 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1299 PL_curcop->op_private = PL_hints;
a0ed51b3 1300 }
1301 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1302 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3 1303 return retval;
1304}
1305
1306UV
864dbfa3 1307Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
a0ed51b3 1308{
1309 HV* hv = (HV*)SvRV(sv);
1310 U32 klen = UTF8SKIP(ptr) - 1;
1311 U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
1312 STRLEN slen;
1313 STRLEN needents = (klen ? 64 : 128);
dfe13c55 1314 U8 *tmps;
a0ed51b3 1315 U32 bit;
1316 SV *retval;
1317
1318 /*
1319 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1320 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1321 * it's nothing to sniff at.) Pity we usually come through at least
1322 * two function calls to get here...
1323 *
1324 * NB: this code assumes that swatches are never modified, once generated!
1325 */
1326
1327 if (hv == PL_last_swash_hv &&
1328 klen == PL_last_swash_klen &&
12ae5dfc 1329 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
a0ed51b3 1330 {
1331 tmps = PL_last_swash_tmps;
1332 slen = PL_last_swash_slen;
1333 }
1334 else {
1335 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1336 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1337
1338 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1339 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1340 dSP;
1341 ENTER;
1342 SAVETMPS;
1343 save_re_context();
1344 PUSHSTACKi(PERLSI_MAGIC);
1345 PUSHMARK(SP);
1346 EXTEND(SP,3);
1347 PUSHs((SV*)sv);
9041c2e3 1348 /* We call utf8_to_uni as we want and index into Unicode tables,
1349 not a native character number.
1350 */
1351 PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
a0ed51b3 1352 PUSHs(sv_2mortal(newSViv(needents)));
1353 PUTBACK;
864dbfa3 1354 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1355 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1356 else
e24b16f9 1357 retval = &PL_sv_undef;
a0ed51b3 1358 POPSTACK;
1359 FREETMPS;
1360 LEAVE;
e24b16f9 1361 if (PL_curcop == &PL_compiling)
1362 PL_curcop->op_private = PL_hints;
a0ed51b3 1363
dfe13c55 1364 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1365
dfe13c55 1366 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
cea2e8a9 1367 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1368 }
1369
1370 PL_last_swash_hv = hv;
1371 PL_last_swash_klen = klen;
1372 PL_last_swash_tmps = tmps;
1373 PL_last_swash_slen = slen;
1374 if (klen)
1375 Copy(ptr, PL_last_swash_key, klen, U8);
1376 }
1377
9faf8d75 1378 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1379 case 1:
1380 bit = 1 << (off & 7);
1381 off >>= 3;
1382 return (tmps[off] & bit) != 0;
1383 case 8:
1384 return tmps[off];
1385 case 16:
1386 off <<= 1;
1387 return (tmps[off] << 8) + tmps[off + 1] ;
1388 case 32:
1389 off <<= 2;
1390 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1391 }
cea2e8a9 1392 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1393 return 0;
1394}