And that same fool forgot to add the not-really-needed "fuzzy" versions
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
b5f8cc5c 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
901b21bf 27static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
ccfc67b7 29/*
30=head1 Unicode Support
a0ed51b3 31
b851fbc1 32=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 33
1e54db1a 34Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
eebe1485 35of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36bytes available. The return value is the pointer to the byte after the
9041c2e3 37end of the new character. In other words,
eebe1485 38
b851fbc1 39 d = uvuni_to_utf8_flags(d, uv, flags);
40
41or, in most cases,
42
9041c2e3 43 d = uvuni_to_utf8(d, uv);
eebe1485 44
b851fbc1 45(which is equivalent to)
46
47 d = uvuni_to_utf8_flags(d, uv, 0);
48
eebe1485 49is the recommended Unicode-aware way of saying
50
51 *(d++) = uv;
52
53=cut
54*/
55
dfe13c55 56U8 *
b851fbc1 57Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 58{
62961d2e 59 if (ckWARN(WARN_UTF8)) {
b851fbc1 60 if (UNICODE_IS_SURROGATE(uv) &&
61 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 62 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 63 else if (
64 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65 !(flags & UNICODE_ALLOW_FDD0))
66 ||
c867b360 67 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 68 !(flags & UNICODE_ALLOW_FFFF))) &&
69 /* UNICODE_ALLOW_SUPER includes
2a20b9da 70 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 71 ((uv <= PERL_UNICODE_MAX) ||
72 !(flags & UNICODE_ALLOW_SUPER))
73 )
9014280d 74 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 75 "Unicode character 0x%04"UVxf" is illegal", uv);
76 }
c4d5f83a 77 if (UNI_IS_INVARIANT(uv)) {
eb160463 78 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3 79 return d;
80 }
2d331972 81#if defined(EBCDIC)
1d72bdf6 82 else {
83 STRLEN len = UNISKIP(uv);
84 U8 *p = d+len-1;
85 while (p > d) {
eb160463 86 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6 87 uv >>= UTF_ACCUMULATION_SHIFT;
88 }
eb160463 89 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6 90 return d+len;
91 }
92#else /* Non loop style */
a0ed51b3 93 if (uv < 0x800) {
eb160463 94 *d++ = (U8)(( uv >> 6) | 0xc0);
95 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 96 return d;
97 }
98 if (uv < 0x10000) {
eb160463 99 *d++ = (U8)(( uv >> 12) | 0xe0);
100 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
101 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 102 return d;
103 }
104 if (uv < 0x200000) {
eb160463 105 *d++ = (U8)(( uv >> 18) | 0xf0);
106 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
107 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
108 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 109 return d;
110 }
111 if (uv < 0x4000000) {
eb160463 112 *d++ = (U8)(( uv >> 24) | 0xf8);
113 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 117 return d;
118 }
119 if (uv < 0x80000000) {
eb160463 120 *d++ = (U8)(( uv >> 30) | 0xfc);
121 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 126 return d;
127 }
6b8eaf93 128#ifdef HAS_QUAD
d7578b48 129 if (uv < UTF8_QUAD_MAX)
a0ed51b3 130#endif
131 {
eb160463 132 *d++ = 0xfe; /* Can't match U+FEFF! */
133 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
134 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
135 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
136 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
138 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 139 return d;
140 }
6b8eaf93 141#ifdef HAS_QUAD
a0ed51b3 142 {
eb160463 143 *d++ = 0xff; /* Can't match U+FFFE! */
144 *d++ = 0x80; /* 6 Reserved bits */
145 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
146 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
147 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
148 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
155 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 156 return d;
157 }
158#endif
1d72bdf6 159#endif /* Loop style */
a0ed51b3 160}
b851fbc1 161
162U8 *
163Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
164{
165 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
166}
9041c2e3 167
168
169/*
eebe1485 170=for apidoc A|STRLEN|is_utf8_char|U8 *s
171
5da9da9e 172Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01 173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
9041c2e3 176
82686b01 177=cut */
067a85ef 178STRLEN
386d01d6 179Perl_is_utf8_char(pTHX_ U8 *s)
180{
181 U8 u = *s;
067a85ef 182 STRLEN slen, len;
183 UV uv, ouv;
386d01d6 184
1d72bdf6 185 if (UTF8_IS_INVARIANT(u))
386d01d6 186 return 1;
187
60006e79 188 if (!UTF8_IS_START(u))
386d01d6 189 return 0;
190
9f07fdcd 191 len = UTF8SKIP(s);
386d01d6 192
60006e79 193 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 194 return 0;
195
386d01d6 196 slen = len - 1;
197 s++;
6eb6869e 198 u &= UTF_START_MASK(len);
1d72bdf6 199 uv = u;
067a85ef 200 ouv = uv;
386d01d6 201 while (slen--) {
60006e79 202 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 203 return 0;
8850bf83 204 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 205 if (uv < ouv)
067a85ef 206 return 0;
207 ouv = uv;
386d01d6 208 s++;
209 }
067a85ef 210
eb160463 211 if ((STRLEN)UNISKIP(uv) < len)
067a85ef 212 return 0;
213
386d01d6 214 return len;
215}
216
6662521e 217/*
eebe1485 218=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 219
c9ada85f 220Returns true if first C<len> bytes of the given string form a valid
1e54db1a 221UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
222not mean 'a string that contains code points above 0x7F encoded in UTF-8'
223because a valid ASCII string is a valid UTF-8 string.
6662521e 224
225=cut
226*/
227
8e84507e 228bool
6662521e 229Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
230{
067a85ef 231 U8* x = s;
1aa99e6b 232 U8* send;
067a85ef 233 STRLEN c;
234
1aa99e6b 235 if (!len)
6cd5fe39 236 len = strlen((char *)s);
1aa99e6b 237 send = s + len;
238
6662521e 239 while (x < send) {
1acdb0da 240 /* Inline the easy bits of is_utf8_char() here for speed... */
241 if (UTF8_IS_INVARIANT(*x))
242 c = 1;
243 else if (!UTF8_IS_START(*x))
244 return FALSE;
245 else {
246 /* ... and call is_utf8_char() only if really needed. */
247 c = is_utf8_char(x);
248 if (!c)
249 return FALSE;
250 }
6662521e 251 x += c;
6662521e 252 }
60006e79 253 if (x != send)
254 return FALSE;
067a85ef 255
256 return TRUE;
6662521e 257}
258
67e989fb 259/*
81cd54e3 260=for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
261
262Like is_ut8_string but store the location of the failure in
263the last argument.
264
265=cut
266*/
267
268bool
269Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
270{
271 U8* x = s;
272 U8* send;
273 STRLEN c;
274
275 if (!len)
276 len = strlen((char *)s);
277 send = s + len;
278
279 while (x < send) {
280 /* Inline the easy bits of is_utf8_char() here for speed... */
281 if (UTF8_IS_INVARIANT(*x))
282 c = 1;
283 else if (!UTF8_IS_START(*x)) {
284 if (p)
285 *p = x;
286 return FALSE;
287 }
288 else {
289 /* ... and call is_utf8_char() only if really needed. */
290 c = is_utf8_char(x);
291 if (!c) {
292 if (p)
293 *p = x;
294 return FALSE;
295 }
296 }
297 x += c;
298 }
299 if (x != send) {
300 if (p)
301 *p = x;
302 return FALSE;
303 }
304
305 return TRUE;
306}
307
308/*
9041c2e3 309=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 310
9041c2e3 311Bottom level UTF-8 decode routine.
312Returns the unicode code point value of the first character in the string C<s>
1e54db1a 313which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 314C<retlen> will be set to the length, in bytes, of that character.
67e989fb 315
1e54db1a 316If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880 317is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
318it is assumed that the caller will raise a warning, and this function
28d3d195 319will silently just set C<retlen> to C<-1> and return zero. If the
320C<flags> does not contain UTF8_CHECK_ONLY, warnings about
321malformations will be given, C<retlen> will be set to the expected
322length of the UTF-8 character in bytes, and zero will be returned.
323
324The C<flags> can also contain various flags to allow deviations from
325the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 326
9041c2e3 327Most code should use utf8_to_uvchr() rather than call this directly.
328
37607a96 329=cut
330*/
67e989fb 331
a0ed51b3 332UV
37607a96 333Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 334{
097fb8e2 335 U8 *s0 = s;
9c5ffd7c 336 UV uv = *s, ouv = 0;
ba210ebe 337 STRLEN len = 1;
338 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 339 UV startbyte = *s;
ba210ebe 340 STRLEN expectlen = 0;
a0dbb045 341 U32 warning = 0;
342
343/* This list is a superset of the UTF8_ALLOW_XXX. */
344
345#define UTF8_WARN_EMPTY 1
346#define UTF8_WARN_CONTINUATION 2
347#define UTF8_WARN_NON_CONTINUATION 3
348#define UTF8_WARN_FE_FF 4
349#define UTF8_WARN_SHORT 5
350#define UTF8_WARN_OVERFLOW 6
351#define UTF8_WARN_SURROGATE 7
c867b360 352#define UTF8_WARN_LONG 8
353#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 354
355 if (curlen == 0 &&
356 !(flags & UTF8_ALLOW_EMPTY)) {
357 warning = UTF8_WARN_EMPTY;
0c443dc2 358 goto malformed;
359 }
360
1d72bdf6 361 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 362 if (retlen)
363 *retlen = 1;
c4d5f83a 364 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 365 }
67e989fb 366
421a8bf2 367 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 368 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 369 warning = UTF8_WARN_CONTINUATION;
ba210ebe 370 goto malformed;
371 }
372
421a8bf2 373 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 374 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 375 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 376 goto malformed;
377 }
9041c2e3 378
1d72bdf6 379#ifdef EBCDIC
75383841 380 uv = NATIVE_TO_UTF(uv);
1d72bdf6 381#else
fcc8fcf6 382 if ((uv == 0xfe || uv == 0xff) &&
383 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 384 warning = UTF8_WARN_FE_FF;
ba210ebe 385 goto malformed;
a0ed51b3 386 }
1d72bdf6 387#endif
388
ba210ebe 389 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
390 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
391 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
392 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 393#ifdef EBCDIC
394 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
395 else { len = 7; uv &= 0x01; }
396#else
ba210ebe 397 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
398 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 399 else { len = 13; uv = 0; } /* whoa! */
400#endif
401
a0ed51b3 402 if (retlen)
403 *retlen = len;
9041c2e3 404
ba210ebe 405 expectlen = len;
406
fcc8fcf6 407 if ((curlen < expectlen) &&
408 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 409 warning = UTF8_WARN_SHORT;
ba210ebe 410 goto malformed;
411 }
412
413 len--;
a0ed51b3 414 s++;
ba210ebe 415 ouv = uv;
416
a0ed51b3 417 while (len--) {
421a8bf2 418 if (!UTF8_IS_CONTINUATION(*s) &&
419 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 420 s--;
421 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 422 goto malformed;
a0ed51b3 423 }
424 else
8850bf83 425 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 426 if (!(uv > ouv)) {
427 /* These cannot be allowed. */
428 if (uv == ouv) {
429 if (!(flags & UTF8_ALLOW_LONG)) {
430 warning = UTF8_WARN_LONG;
431 goto malformed;
432 }
433 }
434 else { /* uv < ouv */
435 /* This cannot be allowed. */
436 warning = UTF8_WARN_OVERFLOW;
437 goto malformed;
438 }
ba210ebe 439 }
440 s++;
441 ouv = uv;
442 }
443
421a8bf2 444 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 445 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 446 warning = UTF8_WARN_SURROGATE;
ba210ebe 447 goto malformed;
eb160463 448 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 449 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 450 warning = UTF8_WARN_LONG;
ba210ebe 451 goto malformed;
421a8bf2 452 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 453 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 454 warning = UTF8_WARN_FFFF;
a9917092 455 goto malformed;
a0ed51b3 456 }
ba210ebe 457
a0ed51b3 458 return uv;
ba210ebe 459
460malformed:
461
fcc8fcf6 462 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 463 if (retlen)
cc366d4b 464 *retlen = -1;
ba210ebe 465 return 0;
466 }
467
a0dbb045 468 if (dowarn) {
469 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
470
471 switch (warning) {
472 case 0: /* Intentionally empty. */ break;
473 case UTF8_WARN_EMPTY:
474 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
475 break;
476 case UTF8_WARN_CONTINUATION:
097fb8e2 477 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 478 break;
479 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 480 if (s == s0)
481 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
482 (UV)s[1], startbyte);
483 else
484 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
485 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
486
a0dbb045 487 break;
488 case UTF8_WARN_FE_FF:
489 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
490 break;
491 case UTF8_WARN_SHORT:
097fb8e2 492 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
493 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 494 expectlen = curlen; /* distance for caller to skip */
a0dbb045 495 break;
496 case UTF8_WARN_OVERFLOW:
097fb8e2 497 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
498 ouv, *s, startbyte);
a0dbb045 499 break;
500 case UTF8_WARN_SURROGATE:
501 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
502 break;
a0dbb045 503 case UTF8_WARN_LONG:
097fb8e2 504 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
505 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 506 break;
507 case UTF8_WARN_FFFF:
508 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
509 break;
510 default:
511 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
512 break;
513 }
514
515 if (warning) {
516 char *s = SvPVX(sv);
517
518 if (PL_op)
9014280d 519 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 520 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 521 else
9014280d 522 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 523 }
524 }
525
ba210ebe 526 if (retlen)
28d3d195 527 *retlen = expectlen ? expectlen : len;
ba210ebe 528
28d3d195 529 return 0;
a0ed51b3 530}
531
8e84507e 532/*
37607a96 533=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3 534
535Returns the native character value of the first character in the string C<s>
1e54db1a 536which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 537length, in bytes, of that character.
538
1e54db1a 539If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3 540returned and retlen is set, if possible, to -1.
541
542=cut
543*/
544
545UV
37607a96 546Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 547{
872c91ae 548 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
549 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 550}
551
552/*
37607a96 553=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3 554
555Returns the Unicode code point of the first character in the string C<s>
1e54db1a 556which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 557length, in bytes, of that character.
558
559This function should only be used when returned UV is considered
560an index into the Unicode semantic tables (e.g. swashes).
561
1e54db1a 562If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 563returned and retlen is set, if possible, to -1.
8e84507e 564
565=cut
566*/
567
568UV
37607a96 569Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 570{
9041c2e3 571 /* Call the low level routine asking for checks */
872c91ae 572 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
573 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 574}
575
b76347f2 576/*
37607a96 577=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2 578
579Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 580Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
581up past C<e>, croaks.
b76347f2 582
583=cut
584*/
585
586STRLEN
37607a96 587Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2 588{
589 STRLEN len = 0;
590
8850bf83 591 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
592 * the bitops (especially ~) can create illegal UTF-8.
593 * In other words: in Perl UTF-8 is not just for Unicode. */
594
901b21bf 595 if (e < s) {
596 if (ckWARN_d(WARN_UTF8)) {
597 if (PL_op)
598 Perl_warner(aTHX_ packWARN(WARN_UTF8),
599 "%s in %s", unees, OP_DESC(PL_op));
600 else
601 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
602 }
603 return 0;
604 }
b76347f2 605 while (s < e) {
02eb7b47 606 U8 t = UTF8SKIP(s);
b76347f2 607
901b21bf 608 if (e - s < t) {
609 if (ckWARN_d(WARN_UTF8)) {
610 if (PL_op)
611 Perl_warner(aTHX_ packWARN(WARN_UTF8),
612 unees, OP_DESC(PL_op));
613 else
614 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
615 }
616 return len;
617 }
b76347f2 618 s += t;
619 len++;
620 }
621
622 return len;
623}
624
b06226ff 625/*
eebe1485 626=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 627
1e54db1a 628Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff 629and C<b>.
630
631WARNING: use only if you *know* that the pointers point inside the
632same UTF-8 buffer.
633
37607a96 634=cut
635*/
a0ed51b3 636
02eb7b47 637IV
864dbfa3 638Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 639{
02eb7b47 640 IV off = 0;
641
8850bf83 642 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
643 * the bitops (especially ~) can create illegal UTF-8.
644 * In other words: in Perl UTF-8 is not just for Unicode. */
645
a0ed51b3 646 if (a < b) {
647 while (a < b) {
02eb7b47 648 U8 c = UTF8SKIP(a);
649
901b21bf 650 if (b - a < c) {
651 if (ckWARN_d(WARN_UTF8)) {
652 if (PL_op)
653 Perl_warner(aTHX_ packWARN(WARN_UTF8),
654 "%s in %s", unees, OP_DESC(PL_op));
655 else
656 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
657 }
658 return off;
659 }
02eb7b47 660 a += c;
a0ed51b3 661 off--;
662 }
663 }
664 else {
665 while (b < a) {
02eb7b47 666 U8 c = UTF8SKIP(b);
667
901b21bf 668 if (a - b < c) {
669 if (ckWARN_d(WARN_UTF8)) {
670 if (PL_op)
671 Perl_warner(aTHX_ packWARN(WARN_UTF8),
672 "%s in %s", unees, OP_DESC(PL_op));
673 else
674 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
675 }
676 return off;
677 }
02eb7b47 678 b += c;
a0ed51b3 679 off++;
680 }
681 }
02eb7b47 682
a0ed51b3 683 return off;
684}
685
b06226ff 686/*
37607a96 687=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 688
8850bf83 689Return the UTF-8 pointer C<s> displaced by C<off> characters, either
690forward or backward.
b06226ff 691
692WARNING: do not use the following unless you *know* C<off> is within
8850bf83 693the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
694on the first byte of character or just after the last byte of a character.
b06226ff 695
37607a96 696=cut
697*/
a0ed51b3 698
699U8 *
864dbfa3 700Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 701{
8850bf83 702 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
703 * the bitops (especially ~) can create illegal UTF-8.
704 * In other words: in Perl UTF-8 is not just for Unicode. */
705
a0ed51b3 706 if (off >= 0) {
707 while (off--)
708 s += UTF8SKIP(s);
709 }
710 else {
711 while (off++) {
712 s--;
8850bf83 713 while (UTF8_IS_CONTINUATION(*s))
714 s--;
a0ed51b3 715 }
716 }
717 return s;
718}
719
6940069f 720/*
eebe1485 721=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 722
1e54db1a 723Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53 724Unlike C<bytes_to_utf8>, this over-writes the original string, and
725updates len to contain the new length.
67e989fb 726Returns zero on failure, setting C<len> to -1.
6940069f 727
728=cut
729*/
730
731U8 *
37607a96 732Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 733{
6940069f 734 U8 *send;
735 U8 *d;
dcad2880 736 U8 *save = s;
246fae53 737
1e54db1a 738 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880 739 for (send = s + *len; s < send; ) {
740 U8 c = *s++;
741
1d72bdf6 742 if (!UTF8_IS_INVARIANT(c) &&
743 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
744 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 745 *len = -1;
746 return 0;
747 }
246fae53 748 }
dcad2880 749
750 d = s = save;
6940069f 751 while (s < send) {
ed646e6e 752 STRLEN ulen;
9041c2e3 753 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 754 s += ulen;
6940069f 755 }
756 *d = '\0';
246fae53 757 *len = d - save;
6940069f 758 return save;
759}
760
761/*
f9a63242 762=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
763
1e54db1a 764Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
f9a63242 765Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 766the newly-created string, and updates C<len> to contain the new
767length. Returns the original string if no conversion occurs, C<len>
768is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7690 if C<s> is converted or contains all 7bit characters.
f9a63242 770
37607a96 771=cut
772*/
f9a63242 773
774U8 *
37607a96 775Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 776{
f9a63242 777 U8 *d;
778 U8 *start = s;
db42d148 779 U8 *send;
f9a63242 780 I32 count = 0;
781
782 if (!*is_utf8)
783 return start;
784
1e54db1a 785 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 786 for (send = s + *len; s < send;) {
787 U8 c = *s++;
1d72bdf6 788 if (!UTF8_IS_INVARIANT(c)) {
db42d148 789 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
790 (c = *s++) && UTF8_IS_CONTINUATION(c))
791 count++;
792 else
f9a63242 793 return start;
db42d148 794 }
f9a63242 795 }
796
797 *is_utf8 = 0;
798
f9a63242 799 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 800 s = start; start = d;
f9a63242 801 while (s < send) {
802 U8 c = *s++;
c4d5f83a 803 if (!UTF8_IS_INVARIANT(c)) {
804 /* Then it is two-byte encoded */
805 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
806 c = ASCII_TO_NATIVE(c);
807 }
808 *d++ = c;
f9a63242 809 }
810 *d = '\0';
811 *len = d - start;
812 return start;
813}
814
815/*
eebe1485 816=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 817
1e54db1a 818Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e 819Returns a pointer to the newly-created string, and sets C<len> to
820reflect the new length.
6940069f 821
1e54db1a 822If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f 823see sv_recode_to_utf8().
824
497711e7 825=cut
6940069f 826*/
827
828U8*
37607a96 829Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 830{
6940069f 831 U8 *send;
832 U8 *d;
833 U8 *dst;
6662521e 834 send = s + (*len);
6940069f 835
6662521e 836 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 837 dst = d;
838
839 while (s < send) {
db42d148 840 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 841 if (UNI_IS_INVARIANT(uv))
eb160463 842 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 843 else {
eb160463 844 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
845 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f 846 }
847 }
848 *d = '\0';
6662521e 849 *len = d-dst;
6940069f 850 return dst;
851}
852
a0ed51b3 853/*
dea0fc0b 854 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 855 *
856 * Destination must be pre-extended to 3/2 source. Do not use in-place.
857 * We optimize for native, for obvious reasons. */
858
859U8*
dea0fc0b 860Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 861{
dea0fc0b 862 U8* pend;
863 U8* dstart = d;
864
865 if (bytelen & 1)
a7867d0a 866 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 867
868 pend = p + bytelen;
869
a0ed51b3 870 while (p < pend) {
dea0fc0b 871 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
872 p += 2;
a0ed51b3 873 if (uv < 0x80) {
eb160463 874 *d++ = (U8)uv;
a0ed51b3 875 continue;
876 }
877 if (uv < 0x800) {
eb160463 878 *d++ = (U8)(( uv >> 6) | 0xc0);
879 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 880 continue;
881 }
882 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e 883 UV low = (p[0] << 8) + p[1];
884 p += 2;
dea0fc0b 885 if (low < 0xdc00 || low >= 0xdfff)
886 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 887 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
888 }
889 if (uv < 0x10000) {
eb160463 890 *d++ = (U8)(( uv >> 12) | 0xe0);
891 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
892 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 893 continue;
894 }
895 else {
eb160463 896 *d++ = (U8)(( uv >> 18) | 0xf0);
897 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
898 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
899 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 900 continue;
901 }
902 }
dea0fc0b 903 *newlen = d - dstart;
a0ed51b3 904 return d;
905}
906
907/* Note: this one is slightly destructive of the source. */
908
909U8*
dea0fc0b 910Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 911{
912 U8* s = (U8*)p;
913 U8* send = s + bytelen;
914 while (s < send) {
915 U8 tmp = s[0];
916 s[0] = s[1];
917 s[1] = tmp;
918 s += 2;
919 }
dea0fc0b 920 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 921}
922
923/* for now these are all defined (inefficiently) in terms of the utf8 versions */
924
925bool
84afefe6 926Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 927{
ad391ad9 928 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 929 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 930 return is_utf8_alnum(tmpbuf);
931}
932
933bool
84afefe6 934Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 935{
ad391ad9 936 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 937 uvchr_to_utf8(tmpbuf, c);
b8c5462f 938 return is_utf8_alnumc(tmpbuf);
939}
940
941bool
84afefe6 942Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 943{
ad391ad9 944 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 945 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 946 return is_utf8_idfirst(tmpbuf);
947}
948
949bool
84afefe6 950Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 951{
ad391ad9 952 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 953 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 954 return is_utf8_alpha(tmpbuf);
955}
956
957bool
84afefe6 958Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 959{
ad391ad9 960 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 961 uvchr_to_utf8(tmpbuf, c);
4d61ec05 962 return is_utf8_ascii(tmpbuf);
963}
964
965bool
84afefe6 966Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 967{
ad391ad9 968 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 969 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 970 return is_utf8_space(tmpbuf);
971}
972
973bool
84afefe6 974Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 975{
ad391ad9 976 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 977 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 978 return is_utf8_digit(tmpbuf);
979}
980
981bool
84afefe6 982Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 983{
ad391ad9 984 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 985 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 986 return is_utf8_upper(tmpbuf);
987}
988
989bool
84afefe6 990Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 991{
ad391ad9 992 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 993 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 994 return is_utf8_lower(tmpbuf);
995}
996
997bool
84afefe6 998Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 999{
ad391ad9 1000 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1001 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1002 return is_utf8_cntrl(tmpbuf);
1003}
1004
1005bool
84afefe6 1006Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1007{
ad391ad9 1008 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1009 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1010 return is_utf8_graph(tmpbuf);
1011}
1012
1013bool
84afefe6 1014Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1015{
ad391ad9 1016 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1017 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1018 return is_utf8_print(tmpbuf);
1019}
1020
b8c5462f 1021bool
84afefe6 1022Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1023{
ad391ad9 1024 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1025 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1026 return is_utf8_punct(tmpbuf);
1027}
1028
4d61ec05 1029bool
84afefe6 1030Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1031{
e7ae6809 1032 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 1033 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1034 return is_utf8_xdigit(tmpbuf);
1035}
1036
84afefe6 1037UV
1038Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1039{
0ebc6274 1040 uvchr_to_utf8(p, c);
1041 return to_utf8_upper(p, p, lenp);
a0ed51b3 1042}
1043
84afefe6 1044UV
1045Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1046{
0ebc6274 1047 uvchr_to_utf8(p, c);
1048 return to_utf8_title(p, p, lenp);
a0ed51b3 1049}
1050
84afefe6 1051UV
1052Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1053{
0ebc6274 1054 uvchr_to_utf8(p, c);
1055 return to_utf8_lower(p, p, lenp);
a0ed51b3 1056}
1057
84afefe6 1058UV
1059Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1060{
0ebc6274 1061 uvchr_to_utf8(p, c);
1062 return to_utf8_fold(p, p, lenp);
84afefe6 1063}
1064
a0ed51b3 1065/* for now these all assume no locale info available for Unicode > 255 */
1066
1067bool
84afefe6 1068Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1069{
1070 return is_uni_alnum(c); /* XXX no locale support yet */
1071}
1072
1073bool
84afefe6 1074Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1075{
1076 return is_uni_alnumc(c); /* XXX no locale support yet */
1077}
1078
1079bool
84afefe6 1080Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1081{
1082 return is_uni_idfirst(c); /* XXX no locale support yet */
1083}
1084
1085bool
84afefe6 1086Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1087{
1088 return is_uni_alpha(c); /* XXX no locale support yet */
1089}
1090
1091bool
84afefe6 1092Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1093{
1094 return is_uni_ascii(c); /* XXX no locale support yet */
1095}
1096
1097bool
84afefe6 1098Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1099{
1100 return is_uni_space(c); /* XXX no locale support yet */
1101}
1102
1103bool
84afefe6 1104Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1105{
1106 return is_uni_digit(c); /* XXX no locale support yet */
1107}
1108
1109bool
84afefe6 1110Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1111{
1112 return is_uni_upper(c); /* XXX no locale support yet */
1113}
1114
1115bool
84afefe6 1116Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1117{
1118 return is_uni_lower(c); /* XXX no locale support yet */
1119}
1120
1121bool
84afefe6 1122Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1123{
1124 return is_uni_cntrl(c); /* XXX no locale support yet */
1125}
1126
1127bool
84afefe6 1128Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1129{
1130 return is_uni_graph(c); /* XXX no locale support yet */
1131}
1132
1133bool
84afefe6 1134Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1135{
1136 return is_uni_print(c); /* XXX no locale support yet */
1137}
1138
b8c5462f 1139bool
84afefe6 1140Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1141{
1142 return is_uni_punct(c); /* XXX no locale support yet */
1143}
1144
4d61ec05 1145bool
84afefe6 1146Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1147{
1148 return is_uni_xdigit(c); /* XXX no locale support yet */
1149}
1150
b7ac61fa 1151U32
1152Perl_to_uni_upper_lc(pTHX_ U32 c)
1153{
ee099d14 1154 /* XXX returns only the first character -- do not use XXX */
1155 /* XXX no locale support yet */
1156 STRLEN len;
1157 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1158 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1159}
1160
1161U32
1162Perl_to_uni_title_lc(pTHX_ U32 c)
1163{
ee099d14 1164 /* XXX returns only the first character XXX -- do not use XXX */
1165 /* XXX no locale support yet */
1166 STRLEN len;
1167 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1168 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1169}
1170
1171U32
1172Perl_to_uni_lower_lc(pTHX_ U32 c)
1173{
ee099d14 1174 /* XXX returns only the first character -- do not use XXX */
1175 /* XXX no locale support yet */
1176 STRLEN len;
1177 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1178 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1179}
1180
a0ed51b3 1181bool
864dbfa3 1182Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1183{
386d01d6 1184 if (!is_utf8_char(p))
1185 return FALSE;
a0ed51b3 1186 if (!PL_utf8_alnum)
289d4f09 1187 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1188 * descendant of isalnum(3), in other words, it doesn't
1189 * contain the '_'. --jhi */
1190 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1191 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1192/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1193#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1194 if (!PL_utf8_alnum)
1195 PL_utf8_alnum = swash_init("utf8", "",
1196 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1197 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1198#endif
1199}
1200
1201bool
b8c5462f 1202Perl_is_utf8_alnumc(pTHX_ U8 *p)
1203{
386d01d6 1204 if (!is_utf8_char(p))
1205 return FALSE;
b8c5462f 1206 if (!PL_utf8_alnum)
1207 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1208 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1209/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1210#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1211 if (!PL_utf8_alnum)
1212 PL_utf8_alnum = swash_init("utf8", "",
1213 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1214 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1215#endif
1216}
1217
1218bool
82686b01 1219Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
a0ed51b3 1220{
82686b01 1221 if (*p == '_')
1222 return TRUE;
1223 if (!is_utf8_char(p))
1224 return FALSE;
1225 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1226 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1227 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01 1228}
1229
1230bool
1231Perl_is_utf8_idcont(pTHX_ U8 *p)
1232{
1233 if (*p == '_')
1234 return TRUE;
1235 if (!is_utf8_char(p))
1236 return FALSE;
1237 if (!PL_utf8_idcont)
1238 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1239 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3 1240}
1241
1242bool
864dbfa3 1243Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1244{
386d01d6 1245 if (!is_utf8_char(p))
1246 return FALSE;
a0ed51b3 1247 if (!PL_utf8_alpha)
e24b16f9 1248 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1249 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3 1250}
1251
1252bool
b8c5462f 1253Perl_is_utf8_ascii(pTHX_ U8 *p)
1254{
386d01d6 1255 if (!is_utf8_char(p))
1256 return FALSE;
b8c5462f 1257 if (!PL_utf8_ascii)
1258 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1259 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f 1260}
1261
1262bool
864dbfa3 1263Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1264{
386d01d6 1265 if (!is_utf8_char(p))
1266 return FALSE;
a0ed51b3 1267 if (!PL_utf8_space)
3bec3564 1268 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1269 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3 1270}
1271
1272bool
864dbfa3 1273Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1274{
386d01d6 1275 if (!is_utf8_char(p))
1276 return FALSE;
a0ed51b3 1277 if (!PL_utf8_digit)
e24b16f9 1278 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1279 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3 1280}
1281
1282bool
864dbfa3 1283Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1284{
386d01d6 1285 if (!is_utf8_char(p))
1286 return FALSE;
a0ed51b3 1287 if (!PL_utf8_upper)
c65e4d19 1288 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1289 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3 1290}
1291
1292bool
864dbfa3 1293Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1294{
386d01d6 1295 if (!is_utf8_char(p))
1296 return FALSE;
a0ed51b3 1297 if (!PL_utf8_lower)
c65e4d19 1298 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1299 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3 1300}
1301
1302bool
b8c5462f 1303Perl_is_utf8_cntrl(pTHX_ U8 *p)
1304{
386d01d6 1305 if (!is_utf8_char(p))
1306 return FALSE;
b8c5462f 1307 if (!PL_utf8_cntrl)
1308 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1309 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f 1310}
1311
1312bool
1313Perl_is_utf8_graph(pTHX_ U8 *p)
1314{
386d01d6 1315 if (!is_utf8_char(p))
1316 return FALSE;
b8c5462f 1317 if (!PL_utf8_graph)
1318 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1319 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f 1320}
1321
1322bool
864dbfa3 1323Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1324{
386d01d6 1325 if (!is_utf8_char(p))
1326 return FALSE;
a0ed51b3 1327 if (!PL_utf8_print)
e24b16f9 1328 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1329 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3 1330}
1331
1332bool
b8c5462f 1333Perl_is_utf8_punct(pTHX_ U8 *p)
1334{
386d01d6 1335 if (!is_utf8_char(p))
1336 return FALSE;
b8c5462f 1337 if (!PL_utf8_punct)
1338 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1339 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f 1340}
1341
1342bool
1343Perl_is_utf8_xdigit(pTHX_ U8 *p)
1344{
386d01d6 1345 if (!is_utf8_char(p))
1346 return FALSE;
b8c5462f 1347 if (!PL_utf8_xdigit)
1348 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1349 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f 1350}
1351
1352bool
864dbfa3 1353Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1354{
386d01d6 1355 if (!is_utf8_char(p))
1356 return FALSE;
a0ed51b3 1357 if (!PL_utf8_mark)
e24b16f9 1358 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1359 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3 1360}
1361
6b5c0936 1362/*
1363=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1364
1365The "p" contains the pointer to the UTF-8 string encoding
1366the character that is being converted.
1367
1368The "ustrp" is a pointer to the character buffer to put the
1369conversion result to. The "lenp" is a pointer to the length
1370of the result.
1371
0134edef 1372The "swashp" is a pointer to the swash to use.
6b5c0936 1373
0134edef 1374Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1375and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1376but not always, a multicharacter mapping), is tried first.
6b5c0936 1377
0134edef 1378The "special" is a string like "utf8::ToSpecLower", which means the
1379hash %utf8::ToSpecLower. The access to the hash is through
1380Perl_to_utf8_case().
6b5c0936 1381
0134edef 1382The "normal" is a string like "ToLower" which means the swash
1383%utf8::ToLower.
1384
1385=cut */
6b5c0936 1386
2104c8d9 1387UV
a6872d42 1388Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1389{
0134edef 1390 UV uv0, uv1;
2f9475ad 1391 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
0134edef 1392 STRLEN len = 0;
a0ed51b3 1393
1feea2c7 1394 uv0 = utf8_to_uvchr(p, 0);
1395 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1396 * are necessary in EBCDIC, they are redundant no-ops
1397 * in ASCII-ish platforms, and hopefully optimized away. */
1398 uv1 = NATIVE_TO_UNI(uv0);
1399 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1400
1401 if (!*swashp) /* load on-demand */
1402 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1403
b08cf34e 1404 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1405 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1406 /* It might be "special" (sometimes, but not always,
2a37f04d 1407 * a multicharacter mapping) */
983ffd37 1408 HV *hv;
b08cf34e 1409 SV **svp;
1410
1411 if ((hv = get_hv(special, FALSE)) &&
1412 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1413 (*svp)) {
1414 char *s;
47654450 1415
b08cf34e 1416 s = SvPV(*svp, len);
47654450 1417 if (len == 1)
1418 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1419 else {
2f9475ad 1420#ifdef EBCDIC
1421 /* If we have EBCDIC we need to remap the characters
1422 * since any characters in the low 256 are Unicode
1423 * code points, not EBCDIC. */
7cda7a3d 1424 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1425
1426 d = tmpbuf;
b08cf34e 1427 if (SvUTF8(*svp)) {
2f9475ad 1428 STRLEN tlen = 0;
1429
1430 while (t < tend) {
1431 UV c = utf8_to_uvchr(t, &tlen);
1432 if (tlen > 0) {
1433 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1434 t += tlen;
1435 }
1436 else
1437 break;
1438 }
1439 }
1440 else {
36fec512 1441 while (t < tend) {
1442 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1443 t++;
1444 }
2f9475ad 1445 }
1446 len = d - tmpbuf;
1447 Copy(tmpbuf, ustrp, len, U8);
1448#else
d2dcd0fb 1449 Copy(s, ustrp, len, U8);
2f9475ad 1450#endif
29e98929 1451 }
983ffd37 1452 }
0134edef 1453 }
1454
1455 if (!len && *swashp) {
1456 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1457
1458 if (uv2) {
1459 /* It was "normal" (a single character mapping). */
1460 UV uv3 = UNI_TO_NATIVE(uv2);
1461
e9101d72 1462 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1463 }
1464 }
1feea2c7 1465
0134edef 1466 if (!len) /* Neither: just copy. */
1467 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1468
2a37f04d 1469 if (lenp)
1470 *lenp = len;
1471
0134edef 1472 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1473}
1474
d3e79532 1475/*
1476=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1477
1478Convert the UTF-8 encoded character at p to its uppercase version and
1479store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1480that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1481uppercase version may be longer than the original character (up to two
1482characters).
1483
1484The first character of the uppercased version is returned
1485(but note, as explained above, that there may be more.)
1486
1487=cut */
1488
2104c8d9 1489UV
983ffd37 1490Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1491{
983ffd37 1492 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1493 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1494}
a0ed51b3 1495
d3e79532 1496/*
1497=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1498
1499Convert the UTF-8 encoded character at p to its titlecase version and
1500store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1501that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1502titlecase version may be longer than the original character (up to two
1503characters).
1504
1505The first character of the titlecased version is returned
1506(but note, as explained above, that there may be more.)
1507
1508=cut */
1509
983ffd37 1510UV
1511Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1512{
1513 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1514 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1515}
1516
d3e79532 1517/*
1518=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1519
1520Convert the UTF-8 encoded character at p to its lowercase version and
1521store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1522that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1523lowercase version may be longer than the original character (up to two
1524characters).
1525
1526The first character of the lowercased version is returned
1527(but note, as explained above, that there may be more.)
1528
1529=cut */
1530
2104c8d9 1531UV
a2a2844f 1532Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1533{
983ffd37 1534 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1535 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1536}
1537
d3e79532 1538/*
1539=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1540
1541Convert the UTF-8 encoded character at p to its foldcase version and
1542store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1543that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1544foldcase version may be longer than the original character (up to
1545three characters).
1546
1547The first character of the foldcased version is returned
1548(but note, as explained above, that there may be more.)
1549
1550=cut */
1551
b4e400f9 1552UV
1553Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1554{
1555 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1556 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1557}
1558
1559/* a "swash" is a swatch hash */
1560
1561SV*
864dbfa3 1562Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1563{
1564 SV* retval;
bf1fed83 1565 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1566 dSP;
1b026014 1567 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1568 SV* errsv_save;
ce3b816e 1569
1b026014 1570 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1571 ENTER;
f8be5cf0 1572 errsv_save = newSVsv(ERRSV);
ce3b816e 1573 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0 1574 if (!SvTRUE(ERRSV))
1575 sv_setsv(ERRSV, errsv_save);
1576 SvREFCNT_dec(errsv_save);
ce3b816e 1577 LEAVE;
1578 }
1579 SPAGAIN;
a0ed51b3 1580 PUSHSTACKi(PERLSI_MAGIC);
1581 PUSHMARK(SP);
1582 EXTEND(SP,5);
1583 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1584 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1585 PUSHs(listsv);
1586 PUSHs(sv_2mortal(newSViv(minbits)));
1587 PUSHs(sv_2mortal(newSViv(none)));
1588 PUTBACK;
1589 ENTER;
1590 SAVEI32(PL_hints);
1591 PL_hints = 0;
1592 save_re_context();
923e4eb5 1593 if (IN_PERL_COMPILETIME) {
bf1fed83 1594 /* XXX ought to be handled by lex_start */
82686b01 1595 SAVEI32(PL_in_my);
2b4bd638 1596 PL_in_my = 0;
bf1fed83 1597 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1598 }
f8be5cf0 1599 errsv_save = newSVsv(ERRSV);
864dbfa3 1600 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1601 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1602 else
e24b16f9 1603 retval = &PL_sv_undef;
f8be5cf0 1604 if (!SvTRUE(ERRSV))
1605 sv_setsv(ERRSV, errsv_save);
1606 SvREFCNT_dec(errsv_save);
a0ed51b3 1607 LEAVE;
1608 POPSTACK;
923e4eb5 1609 if (IN_PERL_COMPILETIME) {
bf1fed83 1610 STRLEN len;
1611 char* pv = SvPV(tokenbufsv, len);
1612
1613 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1614 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1615 }
bc45ce41 1616 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1617 if (SvPOK(retval))
35c1215d 1618 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1619 retval);
cea2e8a9 1620 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1621 }
a0ed51b3 1622 return retval;
1623}
1624
035d37be 1625
1626/* This API is wrong for special case conversions since we may need to
1627 * return several Unicode characters for a single Unicode character
1628 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1629 * the lower-level routine, and it is similarly broken for returning
1630 * multiple values. --jhi */
a0ed51b3 1631UV
3568d838 1632Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3 1633{
1634 HV* hv = (HV*)SvRV(sv);
3568d838 1635 U32 klen;
1636 U32 off;
a0ed51b3 1637 STRLEN slen;
7d85a32c 1638 STRLEN needents;
4ea42e7f 1639 U8 *tmps = NULL;
a0ed51b3 1640 U32 bit;
1641 SV *retval;
3568d838 1642 U8 tmputf8[2];
1643 UV c = NATIVE_TO_ASCII(*ptr);
1644
1645 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463 1646 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1647 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838 1648 ptr = tmputf8;
1649 }
1650 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1651 * then the "swatch" is a vec() for al the chars which start
1652 * with 0xAA..0xYY
1653 * So the key in the hash (klen) is length of encoded char -1
1654 */
1655 klen = UTF8SKIP(ptr) - 1;
1656 off = ptr[klen];
a0ed51b3 1657
7d85a32c 1658 if (klen == 0)
1659 {
1660 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1661 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1662 */
1663 needents = UTF_CONTINUATION_MARK;
1664 off = NATIVE_TO_UTF(ptr[klen]);
1665 }
1666 else
1667 {
1668 /* If char is encoded then swatch is for the prefix */
1669 needents = (1 << UTF_ACCUMULATION_SHIFT);
1670 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1671 }
1672
a0ed51b3 1673 /*
1674 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1675 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1676 * it's nothing to sniff at.) Pity we usually come through at least
1677 * two function calls to get here...
1678 *
1679 * NB: this code assumes that swatches are never modified, once generated!
1680 */
1681
3568d838 1682 if (hv == PL_last_swash_hv &&
a0ed51b3 1683 klen == PL_last_swash_klen &&
3568d838 1684 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1685 {
1686 tmps = PL_last_swash_tmps;
1687 slen = PL_last_swash_slen;
1688 }
1689 else {
1690 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1691 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1692
1693 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1694 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1695 dSP;
2b9d42f0 1696 /* We use utf8n_to_uvuni() as we want an index into
1697 Unicode tables, not a native character number.
1698 */
872c91ae 1699 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1700 ckWARN(WARN_UTF8) ?
1701 0 : UTF8_ALLOW_ANY);
f8be5cf0 1702 SV *errsv_save;
a0ed51b3 1703 ENTER;
1704 SAVETMPS;
1705 save_re_context();
1706 PUSHSTACKi(PERLSI_MAGIC);
1707 PUSHMARK(SP);
1708 EXTEND(SP,3);
1709 PUSHs((SV*)sv);
ffbc6a93 1710 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1711 PUSHs(sv_2mortal(newSViv((klen) ?
1712 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1713 PUSHs(sv_2mortal(newSViv(needents)));
1714 PUTBACK;
f8be5cf0 1715 errsv_save = newSVsv(ERRSV);
864dbfa3 1716 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1717 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1718 else
e24b16f9 1719 retval = &PL_sv_undef;
f8be5cf0 1720 if (!SvTRUE(ERRSV))
1721 sv_setsv(ERRSV, errsv_save);
1722 SvREFCNT_dec(errsv_save);
a0ed51b3 1723 POPSTACK;
1724 FREETMPS;
1725 LEAVE;
923e4eb5 1726 if (IN_PERL_COMPILETIME)
eb160463 1727 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1728
dfe13c55 1729 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1730
7d85a32c 1731 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1732 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1733 }
1734
1735 PL_last_swash_hv = hv;
1736 PL_last_swash_klen = klen;
1737 PL_last_swash_tmps = tmps;
1738 PL_last_swash_slen = slen;
1739 if (klen)
1740 Copy(ptr, PL_last_swash_key, klen, U8);
1741 }
1742
9faf8d75 1743 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1744 case 1:
1745 bit = 1 << (off & 7);
1746 off >>= 3;
1747 return (tmps[off] & bit) != 0;
1748 case 8:
1749 return tmps[off];
1750 case 16:
1751 off <<= 1;
1752 return (tmps[off] << 8) + tmps[off + 1] ;
1753 case 32:
1754 off <<= 2;
1755 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1756 }
cea2e8a9 1757 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1758 return 0;
1759}
2b9d42f0 1760
1761
1762/*
37607a96 1763=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1764
1e54db1a 1765Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2b9d42f0 1766of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1767bytes available. The return value is the pointer to the byte after the
1768end of the new character. In other words,
1769
1770 d = uvchr_to_utf8(d, uv);
1771
1772is the recommended wide native character-aware way of saying
1773
1774 *(d++) = uv;
1775
1776=cut
1777*/
1778
1779/* On ASCII machines this is normally a macro but we want a
1780 real function in case XS code wants it
1781*/
1782#undef Perl_uvchr_to_utf8
1783U8 *
1784Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1785{
b851fbc1 1786 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0 1787}
1788
b851fbc1 1789U8 *
1790Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1791{
1792 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1793}
2b9d42f0 1794
1795/*
37607a96 1796=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1797
1798Returns the native character value of the first character in the string C<s>
1e54db1a 1799which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0 1800length, in bytes, of that character.
1801
1802Allows length and flags to be passed to low level routine.
1803
1804=cut
1805*/
0a2ef054 1806/* On ASCII machines this is normally a macro but we want
1807 a real function in case XS code wants it
2b9d42f0 1808*/
1809#undef Perl_utf8n_to_uvchr
1810UV
37607a96 1811Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1812{
1813 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1814 return UNI_TO_NATIVE(uv);
1815}
1816
d2cc3551 1817/*
1818=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1819
1820Build to the scalar dsv a displayable version of the string spv,
1821length len, the displayable version being at most pvlim bytes long
1822(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1823
9e55ce06 1824The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1825isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1826to display the \\[nrfta\\] as the backslashed versions (like '\n')
1827(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1828UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1829UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1830
d2cc3551 1831The pointer to the PV of the dsv is returned.
1832
1833=cut */
e6b2e755 1834char *
1835Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1836{
1837 int truncated = 0;
1838 char *s, *e;
1839
1840 sv_setpvn(dsv, "", 0);
1841 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1842 UV u;
c728cb41 1843 bool ok = FALSE;
1844
e6b2e755 1845 if (pvlim && SvCUR(dsv) >= pvlim) {
1846 truncated++;
1847 break;
1848 }
1849 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1850 if (u < 256) {
c728cb41 1851 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1852 switch (u & 0xFF) {
1853 case '\n':
1854 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1855 case '\r':
1856 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1857 case '\t':
1858 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1859 case '\f':
1860 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1861 case '\a':
1862 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1863 case '\\':
d79a7a3d 1864 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
c728cb41 1865 default: break;
1866 }
1867 }
00e86452 1868 /* isPRINT() is the locale-blind version. */
1869 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
2c4547fe 1870 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
0a2ef054 1871 ok = TRUE;
1872 }
c728cb41 1873 }
1874 if (!ok)
9e55ce06 1875 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1876 }
1877 if (truncated)
1878 sv_catpvn(dsv, "...", 3);
1879
1880 return SvPVX(dsv);
1881}
2b9d42f0 1882
d2cc3551 1883/*
1884=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1885
1886Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1887the displayable version being at most pvlim bytes long
d2cc3551 1888(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1889
1890The flags argument is as in pv_uni_display().
1891
d2cc3551 1892The pointer to the PV of the dsv is returned.
1893
1894=cut */
e6b2e755 1895char *
1896Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1897{
701a277b 1898 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1899 pvlim, flags);
1900}
1901
d2cc3551 1902/*
d07ddd77 1903=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1904
1905Return true if the strings s1 and s2 differ case-insensitively, false
1906if not (if they are equal case-insensitively). If u1 is true, the
1907string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1908the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1909are false, the respective string is assumed to be in native 8-bit
1910encoding.
1911
1912If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1913in there (they will point at the beginning of the I<next> character).
1914If the pointers behind pe1 or pe2 are non-NULL, they are the end
1915pointers beyond which scanning will not continue under any
1916circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1917s2+l2 will be used as goal end pointers that will also stop the scan,
1918and which qualify towards defining a successful match: all the scans
1919that define an explicit length must reach their goal pointers for
1920a match to succeed).
d2cc3551 1921
1922For case-insensitiveness, the "casefolding" of Unicode is used
1923instead of upper/lowercasing both the characters, see
1924http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1925
1926=cut */
701a277b 1927I32
d07ddd77 1928Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1929{
5469e704 1930 register U8 *p1 = (U8*)s1;
1931 register U8 *p2 = (U8*)s2;
d07ddd77 1932 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1933 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1934 STRLEN n1 = 0, n2 = 0;
ffce6cc2 1935 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1936 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
d7f013c8 1937 U8 natbuf[1+1];
1938 STRLEN foldlen1, foldlen2;
d07ddd77 1939 bool match;
332ddc25 1940
d07ddd77 1941 if (pe1)
1942 e1 = *(U8**)pe1;
eb160463 1943 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
d07ddd77 1944 f1 = (U8*)s1 + l1;
1945 if (pe2)
1946 e2 = *(U8**)pe2;
eb160463 1947 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
d07ddd77 1948 f2 = (U8*)s2 + l2;
1949
1950 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1951 return 1; /* mismatch; possible infinite loop or false positive */
1952
a6872d42 1953 if (!u1 || !u2)
1954 natbuf[1] = 0; /* Need to terminate the buffer. */
1955
d07ddd77 1956 while ((e1 == 0 || p1 < e1) &&
1957 (f1 == 0 || p1 < f1) &&
1958 (e2 == 0 || p2 < e2) &&
1959 (f2 == 0 || p2 < f2)) {
1960 if (n1 == 0) {
d7f013c8 1961 if (u1)
1962 to_utf8_fold(p1, foldbuf1, &foldlen1);
1963 else {
f5cee151 1964 natbuf[0] = *p1;
d7f013c8 1965 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1966 }
1967 q1 = foldbuf1;
d07ddd77 1968 n1 = foldlen1;
332ddc25 1969 }
d07ddd77 1970 if (n2 == 0) {
d7f013c8 1971 if (u2)
1972 to_utf8_fold(p2, foldbuf2, &foldlen2);
1973 else {
f5cee151 1974 natbuf[0] = *p2;
d7f013c8 1975 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1976 }
1977 q2 = foldbuf2;
d07ddd77 1978 n2 = foldlen2;
332ddc25 1979 }
d07ddd77 1980 while (n1 && n2) {
1981 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1982 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1983 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1984 return 1; /* mismatch */
d07ddd77 1985 n1 -= UTF8SKIP(q1);
d7f013c8 1986 q1 += UTF8SKIP(q1);
d07ddd77 1987 n2 -= UTF8SKIP(q2);
d7f013c8 1988 q2 += UTF8SKIP(q2);
701a277b 1989 }
d07ddd77 1990 if (n1 == 0)
d7f013c8 1991 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1992 if (n2 == 0)
d7f013c8 1993 p2 += u2 ? UTF8SKIP(p2) : 1;
1994
d2cc3551 1995 }
5469e704 1996
d07ddd77 1997 /* A match is defined by all the scans that specified
1998 * an explicit length reaching their final goals. */
1999 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 2000
2001 if (match) {
d07ddd77 2002 if (pe1)
2003 *pe1 = (char*)p1;
2004 if (pe2)
2005 *pe2 = (char*)p2;
5469e704 2006 }
2007
2008 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2009}
701a277b 2010