lib/FileCache.t needs to use valid filename chars
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
be3c0a43 3 * Copyright (c) 1998-2002, Larry Wall
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
901b21bf 27static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
ccfc67b7 29/*
30=head1 Unicode Support
a0ed51b3 31
b851fbc1 32=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 33
34Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
35of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36bytes available. The return value is the pointer to the byte after the
9041c2e3 37end of the new character. In other words,
eebe1485 38
b851fbc1 39 d = uvuni_to_utf8_flags(d, uv, flags);
40
41or, in most cases,
42
9041c2e3 43 d = uvuni_to_utf8(d, uv);
eebe1485 44
b851fbc1 45(which is equivalent to)
46
47 d = uvuni_to_utf8_flags(d, uv, 0);
48
eebe1485 49is the recommended Unicode-aware way of saying
50
51 *(d++) = uv;
52
53=cut
54*/
55
dfe13c55 56U8 *
b851fbc1 57Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 58{
62961d2e 59 if (ckWARN(WARN_UTF8)) {
b851fbc1 60 if (UNICODE_IS_SURROGATE(uv) &&
61 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 62 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 63 else if (
64 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65 !(flags & UNICODE_ALLOW_FDD0))
66 ||
c867b360 67 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 68 !(flags & UNICODE_ALLOW_FFFF))) &&
69 /* UNICODE_ALLOW_SUPER includes
2a20b9da 70 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 71 ((uv <= PERL_UNICODE_MAX) ||
72 !(flags & UNICODE_ALLOW_SUPER))
73 )
9014280d 74 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 75 "Unicode character 0x%04"UVxf" is illegal", uv);
76 }
c4d5f83a 77 if (UNI_IS_INVARIANT(uv)) {
78 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3 79 return d;
80 }
2d331972 81#if defined(EBCDIC)
1d72bdf6 82 else {
83 STRLEN len = UNISKIP(uv);
84 U8 *p = d+len-1;
85 while (p > d) {
86 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
87 uv >>= UTF_ACCUMULATION_SHIFT;
88 }
89 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
90 return d+len;
91 }
92#else /* Non loop style */
a0ed51b3 93 if (uv < 0x800) {
94 *d++ = (( uv >> 6) | 0xc0);
95 *d++ = (( uv & 0x3f) | 0x80);
96 return d;
97 }
98 if (uv < 0x10000) {
99 *d++ = (( uv >> 12) | 0xe0);
100 *d++ = (((uv >> 6) & 0x3f) | 0x80);
101 *d++ = (( uv & 0x3f) | 0x80);
102 return d;
103 }
104 if (uv < 0x200000) {
105 *d++ = (( uv >> 18) | 0xf0);
106 *d++ = (((uv >> 12) & 0x3f) | 0x80);
107 *d++ = (((uv >> 6) & 0x3f) | 0x80);
108 *d++ = (( uv & 0x3f) | 0x80);
109 return d;
110 }
111 if (uv < 0x4000000) {
112 *d++ = (( uv >> 24) | 0xf8);
113 *d++ = (((uv >> 18) & 0x3f) | 0x80);
114 *d++ = (((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (( uv & 0x3f) | 0x80);
117 return d;
118 }
119 if (uv < 0x80000000) {
120 *d++ = (( uv >> 30) | 0xfc);
121 *d++ = (((uv >> 24) & 0x3f) | 0x80);
122 *d++ = (((uv >> 18) & 0x3f) | 0x80);
123 *d++ = (((uv >> 12) & 0x3f) | 0x80);
124 *d++ = (((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (( uv & 0x3f) | 0x80);
126 return d;
127 }
6b8eaf93 128#ifdef HAS_QUAD
d7578b48 129 if (uv < UTF8_QUAD_MAX)
a0ed51b3 130#endif
131 {
132 *d++ = 0xfe; /* Can't match U+FEFF! */
133 *d++ = (((uv >> 30) & 0x3f) | 0x80);
134 *d++ = (((uv >> 24) & 0x3f) | 0x80);
135 *d++ = (((uv >> 18) & 0x3f) | 0x80);
136 *d++ = (((uv >> 12) & 0x3f) | 0x80);
137 *d++ = (((uv >> 6) & 0x3f) | 0x80);
138 *d++ = (( uv & 0x3f) | 0x80);
139 return d;
140 }
6b8eaf93 141#ifdef HAS_QUAD
a0ed51b3 142 {
143 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b 144 *d++ = 0x80; /* 6 Reserved bits */
145 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
146 *d++ = (((uv >> 54) & 0x3f) | 0x80);
147 *d++ = (((uv >> 48) & 0x3f) | 0x80);
148 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3 149 *d++ = (((uv >> 36) & 0x3f) | 0x80);
150 *d++ = (((uv >> 30) & 0x3f) | 0x80);
151 *d++ = (((uv >> 24) & 0x3f) | 0x80);
152 *d++ = (((uv >> 18) & 0x3f) | 0x80);
153 *d++ = (((uv >> 12) & 0x3f) | 0x80);
154 *d++ = (((uv >> 6) & 0x3f) | 0x80);
155 *d++ = (( uv & 0x3f) | 0x80);
156 return d;
157 }
158#endif
1d72bdf6 159#endif /* Loop style */
a0ed51b3 160}
b851fbc1 161
162U8 *
163Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
164{
165 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
166}
9041c2e3 167
168
169/*
eebe1485 170=for apidoc A|STRLEN|is_utf8_char|U8 *s
171
5da9da9e 172Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 173character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e 174The actual number of bytes in the UTF-8 character will be returned if
175it is valid, otherwise 0.
9041c2e3 176
37607a96 177=cut
178*/
067a85ef 179STRLEN
386d01d6 180Perl_is_utf8_char(pTHX_ U8 *s)
181{
182 U8 u = *s;
067a85ef 183 STRLEN slen, len;
184 UV uv, ouv;
386d01d6 185
1d72bdf6 186 if (UTF8_IS_INVARIANT(u))
386d01d6 187 return 1;
188
60006e79 189 if (!UTF8_IS_START(u))
386d01d6 190 return 0;
191
9f07fdcd 192 len = UTF8SKIP(s);
386d01d6 193
60006e79 194 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 195 return 0;
196
386d01d6 197 slen = len - 1;
198 s++;
6eb6869e 199 u &= UTF_START_MASK(len);
1d72bdf6 200 uv = u;
067a85ef 201 ouv = uv;
386d01d6 202 while (slen--) {
60006e79 203 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 204 return 0;
8850bf83 205 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 206 if (uv < ouv)
067a85ef 207 return 0;
208 ouv = uv;
386d01d6 209 s++;
210 }
067a85ef 211
5bbb0b5a 212 if (UNISKIP(uv) < len)
067a85ef 213 return 0;
214
386d01d6 215 return len;
216}
217
6662521e 218/*
eebe1485 219=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 220
5da9da9e 221Returns true if first C<len> bytes of the given string form a valid UTF8
222string, false otherwise. Note that 'a valid UTF8 string' does not mean
223'a string that contains UTF8' because a valid ASCII string is a valid
224UTF8 string.
6662521e 225
226=cut
227*/
228
8e84507e 229bool
6662521e 230Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
231{
067a85ef 232 U8* x = s;
1aa99e6b 233 U8* send;
067a85ef 234 STRLEN c;
235
1aa99e6b 236 if (!len)
6cd5fe39 237 len = strlen((char *)s);
1aa99e6b 238 send = s + len;
239
6662521e 240 while (x < send) {
241 c = is_utf8_char(x);
067a85ef 242 if (!c)
243 return FALSE;
6662521e 244 x += c;
6662521e 245 }
60006e79 246 if (x != send)
247 return FALSE;
067a85ef 248
249 return TRUE;
6662521e 250}
251
67e989fb 252/*
9041c2e3 253=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 254
9041c2e3 255Bottom level UTF-8 decode routine.
256Returns the unicode code point value of the first character in the string C<s>
ba210ebe 257which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 258C<retlen> will be set to the length, in bytes, of that character.
67e989fb 259
260If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880 261is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
262it is assumed that the caller will raise a warning, and this function
28d3d195 263will silently just set C<retlen> to C<-1> and return zero. If the
264C<flags> does not contain UTF8_CHECK_ONLY, warnings about
265malformations will be given, C<retlen> will be set to the expected
266length of the UTF-8 character in bytes, and zero will be returned.
267
268The C<flags> can also contain various flags to allow deviations from
269the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 270
9041c2e3 271Most code should use utf8_to_uvchr() rather than call this directly.
272
37607a96 273=cut
274*/
67e989fb 275
a0ed51b3 276UV
37607a96 277Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 278{
097fb8e2 279 U8 *s0 = s;
9c5ffd7c 280 UV uv = *s, ouv = 0;
ba210ebe 281 STRLEN len = 1;
282 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 283 UV startbyte = *s;
ba210ebe 284 STRLEN expectlen = 0;
a0dbb045 285 U32 warning = 0;
286
287/* This list is a superset of the UTF8_ALLOW_XXX. */
288
289#define UTF8_WARN_EMPTY 1
290#define UTF8_WARN_CONTINUATION 2
291#define UTF8_WARN_NON_CONTINUATION 3
292#define UTF8_WARN_FE_FF 4
293#define UTF8_WARN_SHORT 5
294#define UTF8_WARN_OVERFLOW 6
295#define UTF8_WARN_SURROGATE 7
c867b360 296#define UTF8_WARN_LONG 8
297#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 298
299 if (curlen == 0 &&
300 !(flags & UTF8_ALLOW_EMPTY)) {
301 warning = UTF8_WARN_EMPTY;
0c443dc2 302 goto malformed;
303 }
304
1d72bdf6 305 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 306 if (retlen)
307 *retlen = 1;
c4d5f83a 308 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 309 }
67e989fb 310
421a8bf2 311 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 312 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 313 warning = UTF8_WARN_CONTINUATION;
ba210ebe 314 goto malformed;
315 }
316
421a8bf2 317 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 318 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 319 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 320 goto malformed;
321 }
9041c2e3 322
1d72bdf6 323#ifdef EBCDIC
75383841 324 uv = NATIVE_TO_UTF(uv);
1d72bdf6 325#else
fcc8fcf6 326 if ((uv == 0xfe || uv == 0xff) &&
327 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 328 warning = UTF8_WARN_FE_FF;
ba210ebe 329 goto malformed;
a0ed51b3 330 }
1d72bdf6 331#endif
332
ba210ebe 333 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
334 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
335 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
336 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 337#ifdef EBCDIC
338 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
339 else { len = 7; uv &= 0x01; }
340#else
ba210ebe 341 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
342 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 343 else { len = 13; uv = 0; } /* whoa! */
344#endif
345
a0ed51b3 346 if (retlen)
347 *retlen = len;
9041c2e3 348
ba210ebe 349 expectlen = len;
350
fcc8fcf6 351 if ((curlen < expectlen) &&
352 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 353 warning = UTF8_WARN_SHORT;
ba210ebe 354 goto malformed;
355 }
356
357 len--;
a0ed51b3 358 s++;
ba210ebe 359 ouv = uv;
360
a0ed51b3 361 while (len--) {
421a8bf2 362 if (!UTF8_IS_CONTINUATION(*s) &&
363 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 364 s--;
365 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 366 goto malformed;
a0ed51b3 367 }
368 else
8850bf83 369 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 370 if (!(uv > ouv)) {
371 /* These cannot be allowed. */
372 if (uv == ouv) {
373 if (!(flags & UTF8_ALLOW_LONG)) {
374 warning = UTF8_WARN_LONG;
375 goto malformed;
376 }
377 }
378 else { /* uv < ouv */
379 /* This cannot be allowed. */
380 warning = UTF8_WARN_OVERFLOW;
381 goto malformed;
382 }
ba210ebe 383 }
384 s++;
385 ouv = uv;
386 }
387
421a8bf2 388 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 389 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 390 warning = UTF8_WARN_SURROGATE;
ba210ebe 391 goto malformed;
fcc8fcf6 392 } else if ((expectlen > UNISKIP(uv)) &&
393 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 394 warning = UTF8_WARN_LONG;
ba210ebe 395 goto malformed;
421a8bf2 396 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 397 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 398 warning = UTF8_WARN_FFFF;
a9917092 399 goto malformed;
a0ed51b3 400 }
ba210ebe 401
a0ed51b3 402 return uv;
ba210ebe 403
404malformed:
405
fcc8fcf6 406 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 407 if (retlen)
cc366d4b 408 *retlen = -1;
ba210ebe 409 return 0;
410 }
411
a0dbb045 412 if (dowarn) {
413 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
414
415 switch (warning) {
416 case 0: /* Intentionally empty. */ break;
417 case UTF8_WARN_EMPTY:
418 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
419 break;
420 case UTF8_WARN_CONTINUATION:
097fb8e2 421 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 422 break;
423 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 424 if (s == s0)
425 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
426 (UV)s[1], startbyte);
427 else
428 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
429 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
430
a0dbb045 431 break;
432 case UTF8_WARN_FE_FF:
433 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
434 break;
435 case UTF8_WARN_SHORT:
097fb8e2 436 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
437 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 438 expectlen = curlen; /* distance for caller to skip */
a0dbb045 439 break;
440 case UTF8_WARN_OVERFLOW:
097fb8e2 441 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
442 ouv, *s, startbyte);
a0dbb045 443 break;
444 case UTF8_WARN_SURROGATE:
445 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
446 break;
a0dbb045 447 case UTF8_WARN_LONG:
097fb8e2 448 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
449 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 450 break;
451 case UTF8_WARN_FFFF:
452 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
453 break;
454 default:
455 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
456 break;
457 }
458
459 if (warning) {
460 char *s = SvPVX(sv);
461
462 if (PL_op)
9014280d 463 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 464 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 465 else
9014280d 466 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 467 }
468 }
469
ba210ebe 470 if (retlen)
28d3d195 471 *retlen = expectlen ? expectlen : len;
ba210ebe 472
28d3d195 473 return 0;
a0ed51b3 474}
475
8e84507e 476/*
37607a96 477=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3 478
479Returns the native character value of the first character in the string C<s>
480which is assumed to be in UTF8 encoding; C<retlen> will be set to the
481length, in bytes, of that character.
482
483If C<s> does not point to a well-formed UTF8 character, zero is
484returned and retlen is set, if possible, to -1.
485
486=cut
487*/
488
489UV
37607a96 490Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 491{
872c91ae 492 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
493 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 494}
495
496/*
37607a96 497=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3 498
499Returns the Unicode code point of the first character in the string C<s>
500which is assumed to be in UTF8 encoding; C<retlen> will be set to the
501length, in bytes, of that character.
502
503This function should only be used when returned UV is considered
504an index into the Unicode semantic tables (e.g. swashes).
505
ba210ebe 506If C<s> does not point to a well-formed UTF8 character, zero is
507returned and retlen is set, if possible, to -1.
8e84507e 508
509=cut
510*/
511
512UV
37607a96 513Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 514{
9041c2e3 515 /* Call the low level routine asking for checks */
872c91ae 516 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
517 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 518}
519
b76347f2 520/*
37607a96 521=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2 522
523Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 524Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
525up past C<e>, croaks.
b76347f2 526
527=cut
528*/
529
530STRLEN
37607a96 531Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2 532{
533 STRLEN len = 0;
534
8850bf83 535 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
536 * the bitops (especially ~) can create illegal UTF-8.
537 * In other words: in Perl UTF-8 is not just for Unicode. */
538
901b21bf 539 if (e < s) {
540 if (ckWARN_d(WARN_UTF8)) {
541 if (PL_op)
542 Perl_warner(aTHX_ packWARN(WARN_UTF8),
543 "%s in %s", unees, OP_DESC(PL_op));
544 else
545 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
546 }
547 return 0;
548 }
b76347f2 549 while (s < e) {
02eb7b47 550 U8 t = UTF8SKIP(s);
b76347f2 551
901b21bf 552 if (e - s < t) {
553 if (ckWARN_d(WARN_UTF8)) {
554 if (PL_op)
555 Perl_warner(aTHX_ packWARN(WARN_UTF8),
556 unees, OP_DESC(PL_op));
557 else
558 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
559 }
560 return len;
561 }
b76347f2 562 s += t;
563 len++;
564 }
565
566 return len;
567}
568
b06226ff 569/*
eebe1485 570=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 571
572Returns the number of UTF8 characters between the UTF-8 pointers C<a>
573and C<b>.
574
575WARNING: use only if you *know* that the pointers point inside the
576same UTF-8 buffer.
577
37607a96 578=cut
579*/
a0ed51b3 580
02eb7b47 581IV
864dbfa3 582Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 583{
02eb7b47 584 IV off = 0;
585
8850bf83 586 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
587 * the bitops (especially ~) can create illegal UTF-8.
588 * In other words: in Perl UTF-8 is not just for Unicode. */
589
a0ed51b3 590 if (a < b) {
591 while (a < b) {
02eb7b47 592 U8 c = UTF8SKIP(a);
593
901b21bf 594 if (b - a < c) {
595 if (ckWARN_d(WARN_UTF8)) {
596 if (PL_op)
597 Perl_warner(aTHX_ packWARN(WARN_UTF8),
598 "%s in %s", unees, OP_DESC(PL_op));
599 else
600 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
601 }
602 return off;
603 }
02eb7b47 604 a += c;
a0ed51b3 605 off--;
606 }
607 }
608 else {
609 while (b < a) {
02eb7b47 610 U8 c = UTF8SKIP(b);
611
901b21bf 612 if (a - b < c) {
613 if (ckWARN_d(WARN_UTF8)) {
614 if (PL_op)
615 Perl_warner(aTHX_ packWARN(WARN_UTF8),
616 "%s in %s", unees, OP_DESC(PL_op));
617 else
618 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
619 }
620 return off;
621 }
02eb7b47 622 b += c;
a0ed51b3 623 off++;
624 }
625 }
02eb7b47 626
a0ed51b3 627 return off;
628}
629
b06226ff 630/*
37607a96 631=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 632
8850bf83 633Return the UTF-8 pointer C<s> displaced by C<off> characters, either
634forward or backward.
b06226ff 635
636WARNING: do not use the following unless you *know* C<off> is within
8850bf83 637the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
638on the first byte of character or just after the last byte of a character.
b06226ff 639
37607a96 640=cut
641*/
a0ed51b3 642
643U8 *
864dbfa3 644Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 645{
8850bf83 646 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
647 * the bitops (especially ~) can create illegal UTF-8.
648 * In other words: in Perl UTF-8 is not just for Unicode. */
649
a0ed51b3 650 if (off >= 0) {
651 while (off--)
652 s += UTF8SKIP(s);
653 }
654 else {
655 while (off++) {
656 s--;
8850bf83 657 while (UTF8_IS_CONTINUATION(*s))
658 s--;
a0ed51b3 659 }
660 }
661 return s;
662}
663
6940069f 664/*
eebe1485 665=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 666
246fae53 667Converts a string C<s> of length C<len> from UTF8 into byte encoding.
668Unlike C<bytes_to_utf8>, this over-writes the original string, and
669updates len to contain the new length.
67e989fb 670Returns zero on failure, setting C<len> to -1.
6940069f 671
672=cut
673*/
674
675U8 *
37607a96 676Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 677{
6940069f 678 U8 *send;
679 U8 *d;
dcad2880 680 U8 *save = s;
246fae53 681
682 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880 683 for (send = s + *len; s < send; ) {
684 U8 c = *s++;
685
1d72bdf6 686 if (!UTF8_IS_INVARIANT(c) &&
687 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
688 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 689 *len = -1;
690 return 0;
691 }
246fae53 692 }
dcad2880 693
694 d = s = save;
6940069f 695 while (s < send) {
ed646e6e 696 STRLEN ulen;
9041c2e3 697 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 698 s += ulen;
6940069f 699 }
700 *d = '\0';
246fae53 701 *len = d - save;
6940069f 702 return save;
703}
704
705/*
f9a63242 706=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
707
708Converts a string C<s> of length C<len> from UTF8 into byte encoding.
709Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 710the newly-created string, and updates C<len> to contain the new
711length. Returns the original string if no conversion occurs, C<len>
712is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7130 if C<s> is converted or contains all 7bit characters.
f9a63242 714
37607a96 715=cut
716*/
f9a63242 717
718U8 *
37607a96 719Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 720{
f9a63242 721 U8 *d;
722 U8 *start = s;
db42d148 723 U8 *send;
f9a63242 724 I32 count = 0;
725
726 if (!*is_utf8)
727 return start;
728
ef9edfd0 729 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242 730 for (send = s + *len; s < send;) {
731 U8 c = *s++;
1d72bdf6 732 if (!UTF8_IS_INVARIANT(c)) {
db42d148 733 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
734 (c = *s++) && UTF8_IS_CONTINUATION(c))
735 count++;
736 else
f9a63242 737 return start;
db42d148 738 }
f9a63242 739 }
740
741 *is_utf8 = 0;
742
f9a63242 743 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 744 s = start; start = d;
f9a63242 745 while (s < send) {
746 U8 c = *s++;
c4d5f83a 747 if (!UTF8_IS_INVARIANT(c)) {
748 /* Then it is two-byte encoded */
749 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
750 c = ASCII_TO_NATIVE(c);
751 }
752 *d++ = c;
f9a63242 753 }
754 *d = '\0';
755 *len = d - start;
756 return start;
757}
758
759/*
eebe1485 760=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 761
762Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e 763Returns a pointer to the newly-created string, and sets C<len> to
764reflect the new length.
6940069f 765
497711e7 766=cut
6940069f 767*/
768
769U8*
37607a96 770Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 771{
6940069f 772 U8 *send;
773 U8 *d;
774 U8 *dst;
6662521e 775 send = s + (*len);
6940069f 776
6662521e 777 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 778 dst = d;
779
780 while (s < send) {
db42d148 781 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 782 if (UNI_IS_INVARIANT(uv))
783 *d++ = UTF_TO_NATIVE(uv);
6940069f 784 else {
90f44359 785 *d++ = UTF8_EIGHT_BIT_HI(uv);
786 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f 787 }
788 }
789 *d = '\0';
6662521e 790 *len = d-dst;
6940069f 791 return dst;
792}
793
a0ed51b3 794/*
dea0fc0b 795 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 796 *
797 * Destination must be pre-extended to 3/2 source. Do not use in-place.
798 * We optimize for native, for obvious reasons. */
799
800U8*
dea0fc0b 801Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 802{
dea0fc0b 803 U8* pend;
804 U8* dstart = d;
805
806 if (bytelen & 1)
a7867d0a 807 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 808
809 pend = p + bytelen;
810
a0ed51b3 811 while (p < pend) {
dea0fc0b 812 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
813 p += 2;
a0ed51b3 814 if (uv < 0x80) {
815 *d++ = uv;
816 continue;
817 }
818 if (uv < 0x800) {
819 *d++ = (( uv >> 6) | 0xc0);
820 *d++ = (( uv & 0x3f) | 0x80);
821 continue;
822 }
823 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b 824 UV low = *p++;
825 if (low < 0xdc00 || low >= 0xdfff)
826 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 827 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
828 }
829 if (uv < 0x10000) {
830 *d++ = (( uv >> 12) | 0xe0);
831 *d++ = (((uv >> 6) & 0x3f) | 0x80);
832 *d++ = (( uv & 0x3f) | 0x80);
833 continue;
834 }
835 else {
836 *d++ = (( uv >> 18) | 0xf0);
837 *d++ = (((uv >> 12) & 0x3f) | 0x80);
838 *d++ = (((uv >> 6) & 0x3f) | 0x80);
839 *d++ = (( uv & 0x3f) | 0x80);
840 continue;
841 }
842 }
dea0fc0b 843 *newlen = d - dstart;
a0ed51b3 844 return d;
845}
846
847/* Note: this one is slightly destructive of the source. */
848
849U8*
dea0fc0b 850Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 851{
852 U8* s = (U8*)p;
853 U8* send = s + bytelen;
854 while (s < send) {
855 U8 tmp = s[0];
856 s[0] = s[1];
857 s[1] = tmp;
858 s += 2;
859 }
dea0fc0b 860 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 861}
862
863/* for now these are all defined (inefficiently) in terms of the utf8 versions */
864
865bool
84afefe6 866Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 867{
ad391ad9 868 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 869 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 870 return is_utf8_alnum(tmpbuf);
871}
872
873bool
84afefe6 874Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 875{
ad391ad9 876 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 877 uvchr_to_utf8(tmpbuf, c);
b8c5462f 878 return is_utf8_alnumc(tmpbuf);
879}
880
881bool
84afefe6 882Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 883{
ad391ad9 884 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 885 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 886 return is_utf8_idfirst(tmpbuf);
887}
888
889bool
84afefe6 890Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 891{
ad391ad9 892 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 893 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 894 return is_utf8_alpha(tmpbuf);
895}
896
897bool
84afefe6 898Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 899{
ad391ad9 900 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 901 uvchr_to_utf8(tmpbuf, c);
4d61ec05 902 return is_utf8_ascii(tmpbuf);
903}
904
905bool
84afefe6 906Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 907{
ad391ad9 908 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 909 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 910 return is_utf8_space(tmpbuf);
911}
912
913bool
84afefe6 914Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 915{
ad391ad9 916 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 917 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 918 return is_utf8_digit(tmpbuf);
919}
920
921bool
84afefe6 922Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 923{
ad391ad9 924 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 925 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 926 return is_utf8_upper(tmpbuf);
927}
928
929bool
84afefe6 930Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 931{
ad391ad9 932 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 933 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 934 return is_utf8_lower(tmpbuf);
935}
936
937bool
84afefe6 938Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 939{
ad391ad9 940 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 941 uvchr_to_utf8(tmpbuf, c);
b8c5462f 942 return is_utf8_cntrl(tmpbuf);
943}
944
945bool
84afefe6 946Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 947{
ad391ad9 948 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 949 uvchr_to_utf8(tmpbuf, c);
b8c5462f 950 return is_utf8_graph(tmpbuf);
951}
952
953bool
84afefe6 954Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 955{
ad391ad9 956 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 957 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 958 return is_utf8_print(tmpbuf);
959}
960
b8c5462f 961bool
84afefe6 962Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 963{
ad391ad9 964 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 965 uvchr_to_utf8(tmpbuf, c);
b8c5462f 966 return is_utf8_punct(tmpbuf);
967}
968
4d61ec05 969bool
84afefe6 970Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 971{
e7ae6809 972 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 973 uvchr_to_utf8(tmpbuf, c);
4d61ec05 974 return is_utf8_xdigit(tmpbuf);
975}
976
84afefe6 977UV
978Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 979{
0ebc6274 980 uvchr_to_utf8(p, c);
981 return to_utf8_upper(p, p, lenp);
a0ed51b3 982}
983
84afefe6 984UV
985Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 986{
0ebc6274 987 uvchr_to_utf8(p, c);
988 return to_utf8_title(p, p, lenp);
a0ed51b3 989}
990
84afefe6 991UV
992Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 993{
0ebc6274 994 uvchr_to_utf8(p, c);
995 return to_utf8_lower(p, p, lenp);
a0ed51b3 996}
997
84afefe6 998UV
999Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1000{
0ebc6274 1001 uvchr_to_utf8(p, c);
1002 return to_utf8_fold(p, p, lenp);
84afefe6 1003}
1004
a0ed51b3 1005/* for now these all assume no locale info available for Unicode > 255 */
1006
1007bool
84afefe6 1008Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1009{
1010 return is_uni_alnum(c); /* XXX no locale support yet */
1011}
1012
1013bool
84afefe6 1014Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1015{
1016 return is_uni_alnumc(c); /* XXX no locale support yet */
1017}
1018
1019bool
84afefe6 1020Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1021{
1022 return is_uni_idfirst(c); /* XXX no locale support yet */
1023}
1024
1025bool
84afefe6 1026Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1027{
1028 return is_uni_alpha(c); /* XXX no locale support yet */
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1033{
1034 return is_uni_ascii(c); /* XXX no locale support yet */
1035}
1036
1037bool
84afefe6 1038Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1039{
1040 return is_uni_space(c); /* XXX no locale support yet */
1041}
1042
1043bool
84afefe6 1044Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1045{
1046 return is_uni_digit(c); /* XXX no locale support yet */
1047}
1048
1049bool
84afefe6 1050Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1051{
1052 return is_uni_upper(c); /* XXX no locale support yet */
1053}
1054
1055bool
84afefe6 1056Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1057{
1058 return is_uni_lower(c); /* XXX no locale support yet */
1059}
1060
1061bool
84afefe6 1062Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1063{
1064 return is_uni_cntrl(c); /* XXX no locale support yet */
1065}
1066
1067bool
84afefe6 1068Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1069{
1070 return is_uni_graph(c); /* XXX no locale support yet */
1071}
1072
1073bool
84afefe6 1074Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1075{
1076 return is_uni_print(c); /* XXX no locale support yet */
1077}
1078
b8c5462f 1079bool
84afefe6 1080Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1081{
1082 return is_uni_punct(c); /* XXX no locale support yet */
1083}
1084
4d61ec05 1085bool
84afefe6 1086Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1087{
1088 return is_uni_xdigit(c); /* XXX no locale support yet */
1089}
1090
b7ac61fa 1091U32
1092Perl_to_uni_upper_lc(pTHX_ U32 c)
1093{
ee099d14 1094 /* XXX returns only the first character -- do not use XXX */
1095 /* XXX no locale support yet */
1096 STRLEN len;
1097 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1098 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1099}
1100
1101U32
1102Perl_to_uni_title_lc(pTHX_ U32 c)
1103{
ee099d14 1104 /* XXX returns only the first character XXX -- do not use XXX */
1105 /* XXX no locale support yet */
1106 STRLEN len;
1107 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1108 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1109}
1110
1111U32
1112Perl_to_uni_lower_lc(pTHX_ U32 c)
1113{
ee099d14 1114 /* XXX returns only the first character -- do not use XXX */
1115 /* XXX no locale support yet */
1116 STRLEN len;
1117 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1118 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1119}
1120
a0ed51b3 1121bool
864dbfa3 1122Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1123{
386d01d6 1124 if (!is_utf8_char(p))
1125 return FALSE;
a0ed51b3 1126 if (!PL_utf8_alnum)
289d4f09 1127 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1128 * descendant of isalnum(3), in other words, it doesn't
1129 * contain the '_'. --jhi */
1130 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1131 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1132/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1133#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1134 if (!PL_utf8_alnum)
1135 PL_utf8_alnum = swash_init("utf8", "",
1136 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1137 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1138#endif
1139}
1140
1141bool
b8c5462f 1142Perl_is_utf8_alnumc(pTHX_ U8 *p)
1143{
386d01d6 1144 if (!is_utf8_char(p))
1145 return FALSE;
b8c5462f 1146 if (!PL_utf8_alnum)
1147 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1148 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1149/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1150#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1151 if (!PL_utf8_alnum)
1152 PL_utf8_alnum = swash_init("utf8", "",
1153 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1154 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1155#endif
1156}
1157
1158bool
864dbfa3 1159Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3 1160{
1161 return *p == '_' || is_utf8_alpha(p);
1162}
1163
1164bool
864dbfa3 1165Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1166{
386d01d6 1167 if (!is_utf8_char(p))
1168 return FALSE;
a0ed51b3 1169 if (!PL_utf8_alpha)
e24b16f9 1170 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1171 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3 1172}
1173
1174bool
b8c5462f 1175Perl_is_utf8_ascii(pTHX_ U8 *p)
1176{
386d01d6 1177 if (!is_utf8_char(p))
1178 return FALSE;
b8c5462f 1179 if (!PL_utf8_ascii)
1180 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1181 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f 1182}
1183
1184bool
864dbfa3 1185Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1186{
386d01d6 1187 if (!is_utf8_char(p))
1188 return FALSE;
a0ed51b3 1189 if (!PL_utf8_space)
3bec3564 1190 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1191 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3 1192}
1193
1194bool
864dbfa3 1195Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1196{
386d01d6 1197 if (!is_utf8_char(p))
1198 return FALSE;
a0ed51b3 1199 if (!PL_utf8_digit)
e24b16f9 1200 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1201 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3 1202}
1203
1204bool
864dbfa3 1205Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1206{
386d01d6 1207 if (!is_utf8_char(p))
1208 return FALSE;
a0ed51b3 1209 if (!PL_utf8_upper)
e24b16f9 1210 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1211 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3 1212}
1213
1214bool
864dbfa3 1215Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1216{
386d01d6 1217 if (!is_utf8_char(p))
1218 return FALSE;
a0ed51b3 1219 if (!PL_utf8_lower)
e24b16f9 1220 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1221 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3 1222}
1223
1224bool
b8c5462f 1225Perl_is_utf8_cntrl(pTHX_ U8 *p)
1226{
386d01d6 1227 if (!is_utf8_char(p))
1228 return FALSE;
b8c5462f 1229 if (!PL_utf8_cntrl)
1230 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1231 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f 1232}
1233
1234bool
1235Perl_is_utf8_graph(pTHX_ U8 *p)
1236{
386d01d6 1237 if (!is_utf8_char(p))
1238 return FALSE;
b8c5462f 1239 if (!PL_utf8_graph)
1240 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1241 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f 1242}
1243
1244bool
864dbfa3 1245Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1246{
386d01d6 1247 if (!is_utf8_char(p))
1248 return FALSE;
a0ed51b3 1249 if (!PL_utf8_print)
e24b16f9 1250 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1251 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3 1252}
1253
1254bool
b8c5462f 1255Perl_is_utf8_punct(pTHX_ U8 *p)
1256{
386d01d6 1257 if (!is_utf8_char(p))
1258 return FALSE;
b8c5462f 1259 if (!PL_utf8_punct)
1260 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1261 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f 1262}
1263
1264bool
1265Perl_is_utf8_xdigit(pTHX_ U8 *p)
1266{
386d01d6 1267 if (!is_utf8_char(p))
1268 return FALSE;
b8c5462f 1269 if (!PL_utf8_xdigit)
1270 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1271 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f 1272}
1273
1274bool
864dbfa3 1275Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1276{
386d01d6 1277 if (!is_utf8_char(p))
1278 return FALSE;
a0ed51b3 1279 if (!PL_utf8_mark)
e24b16f9 1280 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1281 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3 1282}
1283
6b5c0936 1284/*
1285=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1286
1287The "p" contains the pointer to the UTF-8 string encoding
1288the character that is being converted.
1289
1290The "ustrp" is a pointer to the character buffer to put the
1291conversion result to. The "lenp" is a pointer to the length
1292of the result.
1293
0134edef 1294The "swashp" is a pointer to the swash to use.
6b5c0936 1295
0134edef 1296Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1297and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1298but not always, a multicharacter mapping), is tried first.
6b5c0936 1299
0134edef 1300The "special" is a string like "utf8::ToSpecLower", which means the
1301hash %utf8::ToSpecLower. The access to the hash is through
1302Perl_to_utf8_case().
6b5c0936 1303
0134edef 1304The "normal" is a string like "ToLower" which means the swash
1305%utf8::ToLower.
1306
1307=cut */
6b5c0936 1308
2104c8d9 1309UV
a6872d42 1310Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1311{
0134edef 1312 UV uv0, uv1;
2f9475ad 1313 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
0134edef 1314 STRLEN len = 0;
a0ed51b3 1315
1feea2c7 1316 uv0 = utf8_to_uvchr(p, 0);
1317 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1318 * are necessary in EBCDIC, they are redundant no-ops
1319 * in ASCII-ish platforms, and hopefully optimized away. */
1320 uv1 = NATIVE_TO_UNI(uv0);
1321 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1322
1323 if (!*swashp) /* load on-demand */
1324 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1325
1326 if (special) {
1327 /* It might be "special" (sometimes, but not always,
2a37f04d 1328 * a multicharacter mapping) */
983ffd37 1329 HV *hv;
1330 SV *keysv;
1331 HE *he;
2a37f04d 1332 SV *val;
2f9475ad 1333
983ffd37 1334 if ((hv = get_hv(special, FALSE)) &&
1feea2c7 1335 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
2a37f04d 1336 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1337 (val = HeVAL(he))) {
47654450 1338 char *s;
47654450 1339
2a37f04d 1340 s = SvPV(val, len);
47654450 1341 if (len == 1)
1342 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1343 else {
2f9475ad 1344#ifdef EBCDIC
1345 /* If we have EBCDIC we need to remap the characters
1346 * since any characters in the low 256 are Unicode
1347 * code points, not EBCDIC. */
7cda7a3d 1348 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1349
1350 d = tmpbuf;
1351 if (SvUTF8(val)) {
1352 STRLEN tlen = 0;
1353
1354 while (t < tend) {
1355 UV c = utf8_to_uvchr(t, &tlen);
1356 if (tlen > 0) {
1357 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1358 t += tlen;
1359 }
1360 else
1361 break;
1362 }
1363 }
1364 else {
36fec512 1365 while (t < tend) {
1366 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1367 t++;
1368 }
2f9475ad 1369 }
1370 len = d - tmpbuf;
1371 Copy(tmpbuf, ustrp, len, U8);
1372#else
d2dcd0fb 1373 Copy(s, ustrp, len, U8);
2f9475ad 1374#endif
29e98929 1375 }
983ffd37 1376 }
0134edef 1377 }
1378
1379 if (!len && *swashp) {
1380 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1381
1382 if (uv2) {
1383 /* It was "normal" (a single character mapping). */
1384 UV uv3 = UNI_TO_NATIVE(uv2);
1385
e9101d72 1386 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1387 }
1388 }
1feea2c7 1389
0134edef 1390 if (!len) /* Neither: just copy. */
1391 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1392
2a37f04d 1393 if (lenp)
1394 *lenp = len;
1395
0134edef 1396 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1397}
1398
d3e79532 1399/*
1400=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1401
1402Convert the UTF-8 encoded character at p to its uppercase version and
1403store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1404that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1405uppercase version may be longer than the original character (up to two
1406characters).
1407
1408The first character of the uppercased version is returned
1409(but note, as explained above, that there may be more.)
1410
1411=cut */
1412
2104c8d9 1413UV
983ffd37 1414Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1415{
983ffd37 1416 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1417 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1418}
a0ed51b3 1419
d3e79532 1420/*
1421=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1422
1423Convert the UTF-8 encoded character at p to its titlecase version and
1424store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1425that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1426titlecase version may be longer than the original character (up to two
1427characters).
1428
1429The first character of the titlecased version is returned
1430(but note, as explained above, that there may be more.)
1431
1432=cut */
1433
983ffd37 1434UV
1435Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1436{
1437 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1438 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1439}
1440
d3e79532 1441/*
1442=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1443
1444Convert the UTF-8 encoded character at p to its lowercase version and
1445store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1446that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1447lowercase version may be longer than the original character (up to two
1448characters).
1449
1450The first character of the lowercased version is returned
1451(but note, as explained above, that there may be more.)
1452
1453=cut */
1454
2104c8d9 1455UV
a2a2844f 1456Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1457{
983ffd37 1458 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1459 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1460}
1461
d3e79532 1462/*
1463=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1464
1465Convert the UTF-8 encoded character at p to its foldcase version and
1466store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1467that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1468foldcase version may be longer than the original character (up to
1469three characters).
1470
1471The first character of the foldcased version is returned
1472(but note, as explained above, that there may be more.)
1473
1474=cut */
1475
b4e400f9 1476UV
1477Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1478{
1479 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1480 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1481}
1482
1483/* a "swash" is a swatch hash */
1484
1485SV*
864dbfa3 1486Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1487{
1488 SV* retval;
bf1fed83 1489 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1490 dSP;
1b026014 1491 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1492 SV* errsv_save;
ce3b816e 1493
1b026014 1494 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1495 ENTER;
f8be5cf0 1496 errsv_save = newSVsv(ERRSV);
ce3b816e 1497 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0 1498 if (!SvTRUE(ERRSV))
1499 sv_setsv(ERRSV, errsv_save);
1500 SvREFCNT_dec(errsv_save);
ce3b816e 1501 LEAVE;
1502 }
1503 SPAGAIN;
a0ed51b3 1504 PUSHSTACKi(PERLSI_MAGIC);
1505 PUSHMARK(SP);
1506 EXTEND(SP,5);
1507 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1508 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1509 PUSHs(listsv);
1510 PUSHs(sv_2mortal(newSViv(minbits)));
1511 PUSHs(sv_2mortal(newSViv(none)));
1512 PUTBACK;
1513 ENTER;
1514 SAVEI32(PL_hints);
1515 PL_hints = 0;
1516 save_re_context();
bf1fed83 1517 if (PL_curcop == &PL_compiling)
1518 /* XXX ought to be handled by lex_start */
1519 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1520 errsv_save = newSVsv(ERRSV);
864dbfa3 1521 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1522 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1523 else
e24b16f9 1524 retval = &PL_sv_undef;
f8be5cf0 1525 if (!SvTRUE(ERRSV))
1526 sv_setsv(ERRSV, errsv_save);
1527 SvREFCNT_dec(errsv_save);
a0ed51b3 1528 LEAVE;
1529 POPSTACK;
e24b16f9 1530 if (PL_curcop == &PL_compiling) {
bf1fed83 1531 STRLEN len;
1532 char* pv = SvPV(tokenbufsv, len);
1533
1534 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1535 PL_curcop->op_private = PL_hints;
a0ed51b3 1536 }
bc45ce41 1537 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1538 if (SvPOK(retval))
1539 Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1540 SvPV_nolen(retval));
cea2e8a9 1541 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1542 }
a0ed51b3 1543 return retval;
1544}
1545
035d37be 1546
1547/* This API is wrong for special case conversions since we may need to
1548 * return several Unicode characters for a single Unicode character
1549 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1550 * the lower-level routine, and it is similarly broken for returning
1551 * multiple values. --jhi */
a0ed51b3 1552UV
3568d838 1553Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3 1554{
1555 HV* hv = (HV*)SvRV(sv);
3568d838 1556 U32 klen;
1557 U32 off;
a0ed51b3 1558 STRLEN slen;
7d85a32c 1559 STRLEN needents;
4ea42e7f 1560 U8 *tmps = NULL;
a0ed51b3 1561 U32 bit;
1562 SV *retval;
3568d838 1563 U8 tmputf8[2];
1564 UV c = NATIVE_TO_ASCII(*ptr);
1565
1566 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1567 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1568 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1569 ptr = tmputf8;
1570 }
1571 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1572 * then the "swatch" is a vec() for al the chars which start
1573 * with 0xAA..0xYY
1574 * So the key in the hash (klen) is length of encoded char -1
1575 */
1576 klen = UTF8SKIP(ptr) - 1;
1577 off = ptr[klen];
a0ed51b3 1578
7d85a32c 1579 if (klen == 0)
1580 {
1581 /* If char in invariant then swatch is for all the invariant chars
1582 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1583 */
1584 needents = UTF_CONTINUATION_MARK;
1585 off = NATIVE_TO_UTF(ptr[klen]);
1586 }
1587 else
1588 {
1589 /* If char is encoded then swatch is for the prefix */
1590 needents = (1 << UTF_ACCUMULATION_SHIFT);
1591 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1592 }
1593
a0ed51b3 1594 /*
1595 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1596 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1597 * it's nothing to sniff at.) Pity we usually come through at least
1598 * two function calls to get here...
1599 *
1600 * NB: this code assumes that swatches are never modified, once generated!
1601 */
1602
3568d838 1603 if (hv == PL_last_swash_hv &&
a0ed51b3 1604 klen == PL_last_swash_klen &&
3568d838 1605 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1606 {
1607 tmps = PL_last_swash_tmps;
1608 slen = PL_last_swash_slen;
1609 }
1610 else {
1611 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1612 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1613
1614 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1615 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1616 dSP;
2b9d42f0 1617 /* We use utf8n_to_uvuni() as we want an index into
1618 Unicode tables, not a native character number.
1619 */
872c91ae 1620 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1621 ckWARN(WARN_UTF8) ?
1622 0 : UTF8_ALLOW_ANY);
f8be5cf0 1623 SV *errsv_save;
a0ed51b3 1624 ENTER;
1625 SAVETMPS;
1626 save_re_context();
1627 PUSHSTACKi(PERLSI_MAGIC);
1628 PUSHMARK(SP);
1629 EXTEND(SP,3);
1630 PUSHs((SV*)sv);
ffbc6a93 1631 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1632 PUSHs(sv_2mortal(newSViv((klen) ?
1633 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1634 PUSHs(sv_2mortal(newSViv(needents)));
1635 PUTBACK;
f8be5cf0 1636 errsv_save = newSVsv(ERRSV);
864dbfa3 1637 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1638 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1639 else
e24b16f9 1640 retval = &PL_sv_undef;
f8be5cf0 1641 if (!SvTRUE(ERRSV))
1642 sv_setsv(ERRSV, errsv_save);
1643 SvREFCNT_dec(errsv_save);
a0ed51b3 1644 POPSTACK;
1645 FREETMPS;
1646 LEAVE;
e24b16f9 1647 if (PL_curcop == &PL_compiling)
1648 PL_curcop->op_private = PL_hints;
a0ed51b3 1649
dfe13c55 1650 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1651
7d85a32c 1652 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1653 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1654 }
1655
1656 PL_last_swash_hv = hv;
1657 PL_last_swash_klen = klen;
1658 PL_last_swash_tmps = tmps;
1659 PL_last_swash_slen = slen;
1660 if (klen)
1661 Copy(ptr, PL_last_swash_key, klen, U8);
1662 }
1663
9faf8d75 1664 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1665 case 1:
1666 bit = 1 << (off & 7);
1667 off >>= 3;
1668 return (tmps[off] & bit) != 0;
1669 case 8:
1670 return tmps[off];
1671 case 16:
1672 off <<= 1;
1673 return (tmps[off] << 8) + tmps[off + 1] ;
1674 case 32:
1675 off <<= 2;
1676 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1677 }
cea2e8a9 1678 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1679 return 0;
1680}
2b9d42f0 1681
1682
1683/*
37607a96 1684=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1685
1686Adds the UTF8 representation of the Native codepoint C<uv> to the end
1687of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1688bytes available. The return value is the pointer to the byte after the
1689end of the new character. In other words,
1690
1691 d = uvchr_to_utf8(d, uv);
1692
1693is the recommended wide native character-aware way of saying
1694
1695 *(d++) = uv;
1696
1697=cut
1698*/
1699
1700/* On ASCII machines this is normally a macro but we want a
1701 real function in case XS code wants it
1702*/
1703#undef Perl_uvchr_to_utf8
1704U8 *
1705Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1706{
b851fbc1 1707 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0 1708}
1709
b851fbc1 1710U8 *
1711Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1712{
1713 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1714}
2b9d42f0 1715
1716/*
37607a96 1717=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1718
1719Returns the native character value of the first character in the string C<s>
1720which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1721length, in bytes, of that character.
1722
1723Allows length and flags to be passed to low level routine.
1724
1725=cut
1726*/
0a2ef054 1727/* On ASCII machines this is normally a macro but we want
1728 a real function in case XS code wants it
2b9d42f0 1729*/
1730#undef Perl_utf8n_to_uvchr
1731UV
37607a96 1732Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1733{
1734 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1735 return UNI_TO_NATIVE(uv);
1736}
1737
d2cc3551 1738/*
1739=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1740
1741Build to the scalar dsv a displayable version of the string spv,
1742length len, the displayable version being at most pvlim bytes long
1743(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1744
9e55ce06 1745The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1746isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1747to display the \\[nrfta\\] as the backslashed versions (like '\n')
1748(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1749UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1750UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1751
d2cc3551 1752The pointer to the PV of the dsv is returned.
1753
1754=cut */
e6b2e755 1755char *
1756Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1757{
1758 int truncated = 0;
1759 char *s, *e;
1760
1761 sv_setpvn(dsv, "", 0);
1762 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1763 UV u;
c728cb41 1764 bool ok = FALSE;
1765
e6b2e755 1766 if (pvlim && SvCUR(dsv) >= pvlim) {
1767 truncated++;
1768 break;
1769 }
1770 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1771 if (u < 256) {
c728cb41 1772 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1773 switch (u & 0xFF) {
1774 case '\n':
1775 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1776 case '\r':
1777 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1778 case '\t':
1779 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1780 case '\f':
1781 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1782 case '\a':
1783 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1784 case '\\':
d79a7a3d 1785 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
c728cb41 1786 default: break;
1787 }
1788 }
00e86452 1789 /* isPRINT() is the locale-blind version. */
1790 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
2c4547fe 1791 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
0a2ef054 1792 ok = TRUE;
1793 }
c728cb41 1794 }
1795 if (!ok)
9e55ce06 1796 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1797 }
1798 if (truncated)
1799 sv_catpvn(dsv, "...", 3);
1800
1801 return SvPVX(dsv);
1802}
2b9d42f0 1803
d2cc3551 1804/*
1805=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1806
1807Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1808the displayable version being at most pvlim bytes long
d2cc3551 1809(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1810
1811The flags argument is as in pv_uni_display().
1812
d2cc3551 1813The pointer to the PV of the dsv is returned.
1814
1815=cut */
e6b2e755 1816char *
1817Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1818{
701a277b 1819 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1820 pvlim, flags);
1821}
1822
d2cc3551 1823/*
d07ddd77 1824=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1825
1826Return true if the strings s1 and s2 differ case-insensitively, false
1827if not (if they are equal case-insensitively). If u1 is true, the
1828string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1829the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1830are false, the respective string is assumed to be in native 8-bit
1831encoding.
1832
1833If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1834in there (they will point at the beginning of the I<next> character).
1835If the pointers behind pe1 or pe2 are non-NULL, they are the end
1836pointers beyond which scanning will not continue under any
1837circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1838s2+l2 will be used as goal end pointers that will also stop the scan,
1839and which qualify towards defining a successful match: all the scans
1840that define an explicit length must reach their goal pointers for
1841a match to succeed).
d2cc3551 1842
1843For case-insensitiveness, the "casefolding" of Unicode is used
1844instead of upper/lowercasing both the characters, see
1845http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1846
1847=cut */
701a277b 1848I32
d07ddd77 1849Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1850{
5469e704 1851 register U8 *p1 = (U8*)s1;
1852 register U8 *p2 = (U8*)s2;
d07ddd77 1853 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1854 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1855 STRLEN n1 = 0, n2 = 0;
ffce6cc2 1856 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1857 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
d7f013c8 1858 U8 natbuf[1+1];
1859 STRLEN foldlen1, foldlen2;
d07ddd77 1860 bool match;
332ddc25 1861
d07ddd77 1862 if (pe1)
1863 e1 = *(U8**)pe1;
1864 if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1865 f1 = (U8*)s1 + l1;
1866 if (pe2)
1867 e2 = *(U8**)pe2;
1868 if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1869 f2 = (U8*)s2 + l2;
1870
1871 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1872 return 1; /* mismatch; possible infinite loop or false positive */
1873
a6872d42 1874 if (!u1 || !u2)
1875 natbuf[1] = 0; /* Need to terminate the buffer. */
1876
d07ddd77 1877 while ((e1 == 0 || p1 < e1) &&
1878 (f1 == 0 || p1 < f1) &&
1879 (e2 == 0 || p2 < e2) &&
1880 (f2 == 0 || p2 < f2)) {
1881 if (n1 == 0) {
d7f013c8 1882 if (u1)
1883 to_utf8_fold(p1, foldbuf1, &foldlen1);
1884 else {
f5cee151 1885 natbuf[0] = *p1;
d7f013c8 1886 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1887 }
1888 q1 = foldbuf1;
d07ddd77 1889 n1 = foldlen1;
332ddc25 1890 }
d07ddd77 1891 if (n2 == 0) {
d7f013c8 1892 if (u2)
1893 to_utf8_fold(p2, foldbuf2, &foldlen2);
1894 else {
f5cee151 1895 natbuf[0] = *p2;
d7f013c8 1896 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1897 }
1898 q2 = foldbuf2;
d07ddd77 1899 n2 = foldlen2;
332ddc25 1900 }
d07ddd77 1901 while (n1 && n2) {
1902 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1903 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1904 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1905 return 1; /* mismatch */
d07ddd77 1906 n1 -= UTF8SKIP(q1);
d7f013c8 1907 q1 += UTF8SKIP(q1);
d07ddd77 1908 n2 -= UTF8SKIP(q2);
d7f013c8 1909 q2 += UTF8SKIP(q2);
701a277b 1910 }
d07ddd77 1911 if (n1 == 0)
d7f013c8 1912 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1913 if (n2 == 0)
d7f013c8 1914 p2 += u2 ? UTF8SKIP(p2) : 1;
1915
d2cc3551 1916 }
5469e704 1917
d07ddd77 1918 /* A match is defined by all the scans that specified
1919 * an explicit length reaching their final goals. */
1920 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 1921
1922 if (match) {
d07ddd77 1923 if (pe1)
1924 *pe1 = (char*)p1;
1925 if (pe2)
1926 *pe2 = (char*)p2;
5469e704 1927 }
1928
1929 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1930}
701a277b 1931