test() in CPAN.pm
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
be3c0a43 3 * Copyright (c) 1998-2002, Larry Wall
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
901b21bf 27static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
ccfc67b7 29/*
30=head1 Unicode Support
a0ed51b3 31
b851fbc1 32=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 33
34Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
35of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36bytes available. The return value is the pointer to the byte after the
9041c2e3 37end of the new character. In other words,
eebe1485 38
b851fbc1 39 d = uvuni_to_utf8_flags(d, uv, flags);
40
41or, in most cases,
42
9041c2e3 43 d = uvuni_to_utf8(d, uv);
eebe1485 44
b851fbc1 45(which is equivalent to)
46
47 d = uvuni_to_utf8_flags(d, uv, 0);
48
eebe1485 49is the recommended Unicode-aware way of saying
50
51 *(d++) = uv;
52
53=cut
54*/
55
dfe13c55 56U8 *
b851fbc1 57Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 58{
62961d2e 59 if (ckWARN(WARN_UTF8)) {
b851fbc1 60 if (UNICODE_IS_SURROGATE(uv) &&
61 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 62 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 63 else if (
64 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65 !(flags & UNICODE_ALLOW_FDD0))
66 ||
67 ((uv & 0xFFFF) == 0xFFFE &&
68 !(flags & UNICODE_ALLOW_FFFE))
69 ||
70 ((uv & 0xFFFF) == 0xFFFF &&
71 !(flags & UNICODE_ALLOW_FFFF))) &&
72 /* UNICODE_ALLOW_SUPER includes
73 * FFFEs and FFFFs beyond 0x10FFFF. */
74 ((uv <= PERL_UNICODE_MAX) ||
75 !(flags & UNICODE_ALLOW_SUPER))
76 )
9014280d 77 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 78 "Unicode character 0x%04"UVxf" is illegal", uv);
79 }
c4d5f83a 80 if (UNI_IS_INVARIANT(uv)) {
81 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3 82 return d;
83 }
2d331972 84#if defined(EBCDIC)
1d72bdf6 85 else {
86 STRLEN len = UNISKIP(uv);
87 U8 *p = d+len-1;
88 while (p > d) {
89 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
90 uv >>= UTF_ACCUMULATION_SHIFT;
91 }
92 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
93 return d+len;
94 }
95#else /* Non loop style */
a0ed51b3 96 if (uv < 0x800) {
97 *d++ = (( uv >> 6) | 0xc0);
98 *d++ = (( uv & 0x3f) | 0x80);
99 return d;
100 }
101 if (uv < 0x10000) {
102 *d++ = (( uv >> 12) | 0xe0);
103 *d++ = (((uv >> 6) & 0x3f) | 0x80);
104 *d++ = (( uv & 0x3f) | 0x80);
105 return d;
106 }
107 if (uv < 0x200000) {
108 *d++ = (( uv >> 18) | 0xf0);
109 *d++ = (((uv >> 12) & 0x3f) | 0x80);
110 *d++ = (((uv >> 6) & 0x3f) | 0x80);
111 *d++ = (( uv & 0x3f) | 0x80);
112 return d;
113 }
114 if (uv < 0x4000000) {
115 *d++ = (( uv >> 24) | 0xf8);
116 *d++ = (((uv >> 18) & 0x3f) | 0x80);
117 *d++ = (((uv >> 12) & 0x3f) | 0x80);
118 *d++ = (((uv >> 6) & 0x3f) | 0x80);
119 *d++ = (( uv & 0x3f) | 0x80);
120 return d;
121 }
122 if (uv < 0x80000000) {
123 *d++ = (( uv >> 30) | 0xfc);
124 *d++ = (((uv >> 24) & 0x3f) | 0x80);
125 *d++ = (((uv >> 18) & 0x3f) | 0x80);
126 *d++ = (((uv >> 12) & 0x3f) | 0x80);
127 *d++ = (((uv >> 6) & 0x3f) | 0x80);
128 *d++ = (( uv & 0x3f) | 0x80);
129 return d;
130 }
6b8eaf93 131#ifdef HAS_QUAD
d7578b48 132 if (uv < UTF8_QUAD_MAX)
a0ed51b3 133#endif
134 {
135 *d++ = 0xfe; /* Can't match U+FEFF! */
136 *d++ = (((uv >> 30) & 0x3f) | 0x80);
137 *d++ = (((uv >> 24) & 0x3f) | 0x80);
138 *d++ = (((uv >> 18) & 0x3f) | 0x80);
139 *d++ = (((uv >> 12) & 0x3f) | 0x80);
140 *d++ = (((uv >> 6) & 0x3f) | 0x80);
141 *d++ = (( uv & 0x3f) | 0x80);
142 return d;
143 }
6b8eaf93 144#ifdef HAS_QUAD
a0ed51b3 145 {
146 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b 147 *d++ = 0x80; /* 6 Reserved bits */
148 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
149 *d++ = (((uv >> 54) & 0x3f) | 0x80);
150 *d++ = (((uv >> 48) & 0x3f) | 0x80);
151 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3 152 *d++ = (((uv >> 36) & 0x3f) | 0x80);
153 *d++ = (((uv >> 30) & 0x3f) | 0x80);
154 *d++ = (((uv >> 24) & 0x3f) | 0x80);
155 *d++ = (((uv >> 18) & 0x3f) | 0x80);
156 *d++ = (((uv >> 12) & 0x3f) | 0x80);
157 *d++ = (((uv >> 6) & 0x3f) | 0x80);
158 *d++ = (( uv & 0x3f) | 0x80);
159 return d;
160 }
161#endif
1d72bdf6 162#endif /* Loop style */
a0ed51b3 163}
b851fbc1 164
165U8 *
166Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
167{
168 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
169}
9041c2e3 170
171
172/*
eebe1485 173=for apidoc A|STRLEN|is_utf8_char|U8 *s
174
5da9da9e 175Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 176character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e 177The actual number of bytes in the UTF-8 character will be returned if
178it is valid, otherwise 0.
9041c2e3 179
37607a96 180=cut
181*/
067a85ef 182STRLEN
386d01d6 183Perl_is_utf8_char(pTHX_ U8 *s)
184{
185 U8 u = *s;
067a85ef 186 STRLEN slen, len;
187 UV uv, ouv;
386d01d6 188
1d72bdf6 189 if (UTF8_IS_INVARIANT(u))
386d01d6 190 return 1;
191
60006e79 192 if (!UTF8_IS_START(u))
386d01d6 193 return 0;
194
9f07fdcd 195 len = UTF8SKIP(s);
386d01d6 196
60006e79 197 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 198 return 0;
199
386d01d6 200 slen = len - 1;
201 s++;
6eb6869e 202 u &= UTF_START_MASK(len);
1d72bdf6 203 uv = u;
067a85ef 204 ouv = uv;
386d01d6 205 while (slen--) {
60006e79 206 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 207 return 0;
8850bf83 208 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 209 if (uv < ouv)
067a85ef 210 return 0;
211 ouv = uv;
386d01d6 212 s++;
213 }
067a85ef 214
5bbb0b5a 215 if (UNISKIP(uv) < len)
067a85ef 216 return 0;
217
386d01d6 218 return len;
219}
220
6662521e 221/*
eebe1485 222=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 223
5da9da9e 224Returns true if first C<len> bytes of the given string form a valid UTF8
225string, false otherwise. Note that 'a valid UTF8 string' does not mean
226'a string that contains UTF8' because a valid ASCII string is a valid
227UTF8 string.
6662521e 228
229=cut
230*/
231
8e84507e 232bool
6662521e 233Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
234{
067a85ef 235 U8* x = s;
1aa99e6b 236 U8* send;
067a85ef 237 STRLEN c;
238
1aa99e6b 239 if (!len)
6cd5fe39 240 len = strlen((char *)s);
1aa99e6b 241 send = s + len;
242
6662521e 243 while (x < send) {
244 c = is_utf8_char(x);
067a85ef 245 if (!c)
246 return FALSE;
6662521e 247 x += c;
6662521e 248 }
60006e79 249 if (x != send)
250 return FALSE;
067a85ef 251
252 return TRUE;
6662521e 253}
254
67e989fb 255/*
9041c2e3 256=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 257
9041c2e3 258Bottom level UTF-8 decode routine.
259Returns the unicode code point value of the first character in the string C<s>
ba210ebe 260which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 261C<retlen> will be set to the length, in bytes, of that character.
67e989fb 262
263If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880 264is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
265it is assumed that the caller will raise a warning, and this function
28d3d195 266will silently just set C<retlen> to C<-1> and return zero. If the
267C<flags> does not contain UTF8_CHECK_ONLY, warnings about
268malformations will be given, C<retlen> will be set to the expected
269length of the UTF-8 character in bytes, and zero will be returned.
270
271The C<flags> can also contain various flags to allow deviations from
272the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 273
9041c2e3 274Most code should use utf8_to_uvchr() rather than call this directly.
275
37607a96 276=cut
277*/
67e989fb 278
a0ed51b3 279UV
37607a96 280Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 281{
097fb8e2 282 U8 *s0 = s;
9c5ffd7c 283 UV uv = *s, ouv = 0;
ba210ebe 284 STRLEN len = 1;
285 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 286 UV startbyte = *s;
ba210ebe 287 STRLEN expectlen = 0;
a0dbb045 288 U32 warning = 0;
289
290/* This list is a superset of the UTF8_ALLOW_XXX. */
291
292#define UTF8_WARN_EMPTY 1
293#define UTF8_WARN_CONTINUATION 2
294#define UTF8_WARN_NON_CONTINUATION 3
295#define UTF8_WARN_FE_FF 4
296#define UTF8_WARN_SHORT 5
297#define UTF8_WARN_OVERFLOW 6
298#define UTF8_WARN_SURROGATE 7
299#define UTF8_WARN_BOM 8
300#define UTF8_WARN_LONG 9
301#define UTF8_WARN_FFFF 10
302
303 if (curlen == 0 &&
304 !(flags & UTF8_ALLOW_EMPTY)) {
305 warning = UTF8_WARN_EMPTY;
0c443dc2 306 goto malformed;
307 }
308
1d72bdf6 309 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 310 if (retlen)
311 *retlen = 1;
c4d5f83a 312 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 313 }
67e989fb 314
421a8bf2 315 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 316 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 317 warning = UTF8_WARN_CONTINUATION;
ba210ebe 318 goto malformed;
319 }
320
421a8bf2 321 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 322 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 323 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 324 goto malformed;
325 }
9041c2e3 326
1d72bdf6 327#ifdef EBCDIC
75383841 328 uv = NATIVE_TO_UTF(uv);
1d72bdf6 329#else
fcc8fcf6 330 if ((uv == 0xfe || uv == 0xff) &&
331 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 332 warning = UTF8_WARN_FE_FF;
ba210ebe 333 goto malformed;
a0ed51b3 334 }
1d72bdf6 335#endif
336
ba210ebe 337 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
338 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
339 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
340 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 341#ifdef EBCDIC
342 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
343 else { len = 7; uv &= 0x01; }
344#else
ba210ebe 345 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
346 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 347 else { len = 13; uv = 0; } /* whoa! */
348#endif
349
a0ed51b3 350 if (retlen)
351 *retlen = len;
9041c2e3 352
ba210ebe 353 expectlen = len;
354
fcc8fcf6 355 if ((curlen < expectlen) &&
356 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 357 warning = UTF8_WARN_SHORT;
ba210ebe 358 goto malformed;
359 }
360
361 len--;
a0ed51b3 362 s++;
ba210ebe 363 ouv = uv;
364
a0ed51b3 365 while (len--) {
421a8bf2 366 if (!UTF8_IS_CONTINUATION(*s) &&
367 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 368 s--;
369 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 370 goto malformed;
a0ed51b3 371 }
372 else
8850bf83 373 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 374 if (!(uv > ouv)) {
375 /* These cannot be allowed. */
376 if (uv == ouv) {
377 if (!(flags & UTF8_ALLOW_LONG)) {
378 warning = UTF8_WARN_LONG;
379 goto malformed;
380 }
381 }
382 else { /* uv < ouv */
383 /* This cannot be allowed. */
384 warning = UTF8_WARN_OVERFLOW;
385 goto malformed;
386 }
ba210ebe 387 }
388 s++;
389 ouv = uv;
390 }
391
421a8bf2 392 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 393 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 394 warning = UTF8_WARN_SURROGATE;
ba210ebe 395 goto malformed;
421a8bf2 396 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 397 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 398 warning = UTF8_WARN_BOM;
ba210ebe 399 goto malformed;
fcc8fcf6 400 } else if ((expectlen > UNISKIP(uv)) &&
401 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 402 warning = UTF8_WARN_LONG;
ba210ebe 403 goto malformed;
421a8bf2 404 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 405 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 406 warning = UTF8_WARN_FFFF;
a9917092 407 goto malformed;
a0ed51b3 408 }
ba210ebe 409
a0ed51b3 410 return uv;
ba210ebe 411
412malformed:
413
fcc8fcf6 414 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 415 if (retlen)
cc366d4b 416 *retlen = -1;
ba210ebe 417 return 0;
418 }
419
a0dbb045 420 if (dowarn) {
421 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
422
423 switch (warning) {
424 case 0: /* Intentionally empty. */ break;
425 case UTF8_WARN_EMPTY:
426 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
427 break;
428 case UTF8_WARN_CONTINUATION:
097fb8e2 429 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 430 break;
431 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 432 if (s == s0)
433 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
434 (UV)s[1], startbyte);
435 else
436 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
437 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
438
a0dbb045 439 break;
440 case UTF8_WARN_FE_FF:
441 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
442 break;
443 case UTF8_WARN_SHORT:
097fb8e2 444 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
445 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 446 expectlen = curlen; /* distance for caller to skip */
a0dbb045 447 break;
448 case UTF8_WARN_OVERFLOW:
097fb8e2 449 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
450 ouv, *s, startbyte);
a0dbb045 451 break;
452 case UTF8_WARN_SURROGATE:
453 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
454 break;
455 case UTF8_WARN_BOM:
456 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
457 break;
458 case UTF8_WARN_LONG:
097fb8e2 459 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
460 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 461 break;
462 case UTF8_WARN_FFFF:
463 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
464 break;
465 default:
466 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
467 break;
468 }
469
470 if (warning) {
471 char *s = SvPVX(sv);
472
473 if (PL_op)
9014280d 474 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 475 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 476 else
9014280d 477 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 478 }
479 }
480
ba210ebe 481 if (retlen)
28d3d195 482 *retlen = expectlen ? expectlen : len;
ba210ebe 483
28d3d195 484 return 0;
a0ed51b3 485}
486
8e84507e 487/*
37607a96 488=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3 489
490Returns the native character value of the first character in the string C<s>
491which is assumed to be in UTF8 encoding; C<retlen> will be set to the
492length, in bytes, of that character.
493
494If C<s> does not point to a well-formed UTF8 character, zero is
495returned and retlen is set, if possible, to -1.
496
497=cut
498*/
499
500UV
37607a96 501Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 502{
503 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
504}
505
506/*
37607a96 507=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3 508
509Returns the Unicode code point of the first character in the string C<s>
510which is assumed to be in UTF8 encoding; C<retlen> will be set to the
511length, in bytes, of that character.
512
513This function should only be used when returned UV is considered
514an index into the Unicode semantic tables (e.g. swashes).
515
ba210ebe 516If C<s> does not point to a well-formed UTF8 character, zero is
517returned and retlen is set, if possible, to -1.
8e84507e 518
519=cut
520*/
521
522UV
37607a96 523Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 524{
9041c2e3 525 /* Call the low level routine asking for checks */
526 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e 527}
528
b76347f2 529/*
37607a96 530=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2 531
532Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 533Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
534up past C<e>, croaks.
b76347f2 535
536=cut
537*/
538
539STRLEN
37607a96 540Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2 541{
542 STRLEN len = 0;
543
8850bf83 544 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
545 * the bitops (especially ~) can create illegal UTF-8.
546 * In other words: in Perl UTF-8 is not just for Unicode. */
547
901b21bf 548 if (e < s) {
549 if (ckWARN_d(WARN_UTF8)) {
550 if (PL_op)
551 Perl_warner(aTHX_ packWARN(WARN_UTF8),
552 "%s in %s", unees, OP_DESC(PL_op));
553 else
554 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
555 }
556 return 0;
557 }
b76347f2 558 while (s < e) {
02eb7b47 559 U8 t = UTF8SKIP(s);
b76347f2 560
901b21bf 561 if (e - s < t) {
562 if (ckWARN_d(WARN_UTF8)) {
563 if (PL_op)
564 Perl_warner(aTHX_ packWARN(WARN_UTF8),
565 unees, OP_DESC(PL_op));
566 else
567 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
568 }
569 return len;
570 }
b76347f2 571 s += t;
572 len++;
573 }
574
575 return len;
576}
577
b06226ff 578/*
eebe1485 579=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 580
581Returns the number of UTF8 characters between the UTF-8 pointers C<a>
582and C<b>.
583
584WARNING: use only if you *know* that the pointers point inside the
585same UTF-8 buffer.
586
37607a96 587=cut
588*/
a0ed51b3 589
02eb7b47 590IV
864dbfa3 591Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 592{
02eb7b47 593 IV off = 0;
594
8850bf83 595 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
596 * the bitops (especially ~) can create illegal UTF-8.
597 * In other words: in Perl UTF-8 is not just for Unicode. */
598
a0ed51b3 599 if (a < b) {
600 while (a < b) {
02eb7b47 601 U8 c = UTF8SKIP(a);
602
901b21bf 603 if (b - a < c) {
604 if (ckWARN_d(WARN_UTF8)) {
605 if (PL_op)
606 Perl_warner(aTHX_ packWARN(WARN_UTF8),
607 "%s in %s", unees, OP_DESC(PL_op));
608 else
609 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
610 }
611 return off;
612 }
02eb7b47 613 a += c;
a0ed51b3 614 off--;
615 }
616 }
617 else {
618 while (b < a) {
02eb7b47 619 U8 c = UTF8SKIP(b);
620
901b21bf 621 if (a - b < c) {
622 if (ckWARN_d(WARN_UTF8)) {
623 if (PL_op)
624 Perl_warner(aTHX_ packWARN(WARN_UTF8),
625 "%s in %s", unees, OP_DESC(PL_op));
626 else
627 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
628 }
629 return off;
630 }
02eb7b47 631 b += c;
a0ed51b3 632 off++;
633 }
634 }
02eb7b47 635
a0ed51b3 636 return off;
637}
638
b06226ff 639/*
37607a96 640=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 641
8850bf83 642Return the UTF-8 pointer C<s> displaced by C<off> characters, either
643forward or backward.
b06226ff 644
645WARNING: do not use the following unless you *know* C<off> is within
8850bf83 646the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
647on the first byte of character or just after the last byte of a character.
b06226ff 648
37607a96 649=cut
650*/
a0ed51b3 651
652U8 *
864dbfa3 653Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 654{
8850bf83 655 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
656 * the bitops (especially ~) can create illegal UTF-8.
657 * In other words: in Perl UTF-8 is not just for Unicode. */
658
a0ed51b3 659 if (off >= 0) {
660 while (off--)
661 s += UTF8SKIP(s);
662 }
663 else {
664 while (off++) {
665 s--;
8850bf83 666 while (UTF8_IS_CONTINUATION(*s))
667 s--;
a0ed51b3 668 }
669 }
670 return s;
671}
672
6940069f 673/*
eebe1485 674=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 675
246fae53 676Converts a string C<s> of length C<len> from UTF8 into byte encoding.
677Unlike C<bytes_to_utf8>, this over-writes the original string, and
678updates len to contain the new length.
67e989fb 679Returns zero on failure, setting C<len> to -1.
6940069f 680
681=cut
682*/
683
684U8 *
37607a96 685Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 686{
6940069f 687 U8 *send;
688 U8 *d;
dcad2880 689 U8 *save = s;
246fae53 690
691 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880 692 for (send = s + *len; s < send; ) {
693 U8 c = *s++;
694
1d72bdf6 695 if (!UTF8_IS_INVARIANT(c) &&
696 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
697 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 698 *len = -1;
699 return 0;
700 }
246fae53 701 }
dcad2880 702
703 d = s = save;
6940069f 704 while (s < send) {
ed646e6e 705 STRLEN ulen;
9041c2e3 706 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 707 s += ulen;
6940069f 708 }
709 *d = '\0';
246fae53 710 *len = d - save;
6940069f 711 return save;
712}
713
714/*
f9a63242 715=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
716
717Converts a string C<s> of length C<len> from UTF8 into byte encoding.
718Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 719the newly-created string, and updates C<len> to contain the new
720length. Returns the original string if no conversion occurs, C<len>
721is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7220 if C<s> is converted or contains all 7bit characters.
f9a63242 723
37607a96 724=cut
725*/
f9a63242 726
727U8 *
37607a96 728Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 729{
f9a63242 730 U8 *d;
731 U8 *start = s;
db42d148 732 U8 *send;
f9a63242 733 I32 count = 0;
734
735 if (!*is_utf8)
736 return start;
737
ef9edfd0 738 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242 739 for (send = s + *len; s < send;) {
740 U8 c = *s++;
1d72bdf6 741 if (!UTF8_IS_INVARIANT(c)) {
db42d148 742 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
743 (c = *s++) && UTF8_IS_CONTINUATION(c))
744 count++;
745 else
f9a63242 746 return start;
db42d148 747 }
f9a63242 748 }
749
750 *is_utf8 = 0;
751
f9a63242 752 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 753 s = start; start = d;
f9a63242 754 while (s < send) {
755 U8 c = *s++;
c4d5f83a 756 if (!UTF8_IS_INVARIANT(c)) {
757 /* Then it is two-byte encoded */
758 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
759 c = ASCII_TO_NATIVE(c);
760 }
761 *d++ = c;
f9a63242 762 }
763 *d = '\0';
764 *len = d - start;
765 return start;
766}
767
768/*
eebe1485 769=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 770
771Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e 772Returns a pointer to the newly-created string, and sets C<len> to
773reflect the new length.
6940069f 774
497711e7 775=cut
6940069f 776*/
777
778U8*
37607a96 779Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 780{
6940069f 781 U8 *send;
782 U8 *d;
783 U8 *dst;
6662521e 784 send = s + (*len);
6940069f 785
6662521e 786 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 787 dst = d;
788
789 while (s < send) {
db42d148 790 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 791 if (UNI_IS_INVARIANT(uv))
792 *d++ = UTF_TO_NATIVE(uv);
6940069f 793 else {
90f44359 794 *d++ = UTF8_EIGHT_BIT_HI(uv);
795 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f 796 }
797 }
798 *d = '\0';
6662521e 799 *len = d-dst;
6940069f 800 return dst;
801}
802
a0ed51b3 803/*
dea0fc0b 804 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 805 *
806 * Destination must be pre-extended to 3/2 source. Do not use in-place.
807 * We optimize for native, for obvious reasons. */
808
809U8*
dea0fc0b 810Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 811{
dea0fc0b 812 U8* pend;
813 U8* dstart = d;
814
815 if (bytelen & 1)
a7867d0a 816 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 817
818 pend = p + bytelen;
819
a0ed51b3 820 while (p < pend) {
dea0fc0b 821 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
822 p += 2;
a0ed51b3 823 if (uv < 0x80) {
824 *d++ = uv;
825 continue;
826 }
827 if (uv < 0x800) {
828 *d++ = (( uv >> 6) | 0xc0);
829 *d++ = (( uv & 0x3f) | 0x80);
830 continue;
831 }
832 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b 833 UV low = *p++;
834 if (low < 0xdc00 || low >= 0xdfff)
835 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 836 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
837 }
838 if (uv < 0x10000) {
839 *d++ = (( uv >> 12) | 0xe0);
840 *d++ = (((uv >> 6) & 0x3f) | 0x80);
841 *d++ = (( uv & 0x3f) | 0x80);
842 continue;
843 }
844 else {
845 *d++ = (( uv >> 18) | 0xf0);
846 *d++ = (((uv >> 12) & 0x3f) | 0x80);
847 *d++ = (((uv >> 6) & 0x3f) | 0x80);
848 *d++ = (( uv & 0x3f) | 0x80);
849 continue;
850 }
851 }
dea0fc0b 852 *newlen = d - dstart;
a0ed51b3 853 return d;
854}
855
856/* Note: this one is slightly destructive of the source. */
857
858U8*
dea0fc0b 859Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 860{
861 U8* s = (U8*)p;
862 U8* send = s + bytelen;
863 while (s < send) {
864 U8 tmp = s[0];
865 s[0] = s[1];
866 s[1] = tmp;
867 s += 2;
868 }
dea0fc0b 869 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 870}
871
872/* for now these are all defined (inefficiently) in terms of the utf8 versions */
873
874bool
84afefe6 875Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 876{
ad391ad9 877 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 878 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 879 return is_utf8_alnum(tmpbuf);
880}
881
882bool
84afefe6 883Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 884{
ad391ad9 885 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 886 uvchr_to_utf8(tmpbuf, c);
b8c5462f 887 return is_utf8_alnumc(tmpbuf);
888}
889
890bool
84afefe6 891Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 892{
ad391ad9 893 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 894 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 895 return is_utf8_idfirst(tmpbuf);
896}
897
898bool
84afefe6 899Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 900{
ad391ad9 901 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 902 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 903 return is_utf8_alpha(tmpbuf);
904}
905
906bool
84afefe6 907Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 908{
ad391ad9 909 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 910 uvchr_to_utf8(tmpbuf, c);
4d61ec05 911 return is_utf8_ascii(tmpbuf);
912}
913
914bool
84afefe6 915Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 916{
ad391ad9 917 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 918 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 919 return is_utf8_space(tmpbuf);
920}
921
922bool
84afefe6 923Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 924{
ad391ad9 925 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 926 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 927 return is_utf8_digit(tmpbuf);
928}
929
930bool
84afefe6 931Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 932{
ad391ad9 933 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 934 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 935 return is_utf8_upper(tmpbuf);
936}
937
938bool
84afefe6 939Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 940{
ad391ad9 941 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 942 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 943 return is_utf8_lower(tmpbuf);
944}
945
946bool
84afefe6 947Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 948{
ad391ad9 949 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 950 uvchr_to_utf8(tmpbuf, c);
b8c5462f 951 return is_utf8_cntrl(tmpbuf);
952}
953
954bool
84afefe6 955Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 956{
ad391ad9 957 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 958 uvchr_to_utf8(tmpbuf, c);
b8c5462f 959 return is_utf8_graph(tmpbuf);
960}
961
962bool
84afefe6 963Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 964{
ad391ad9 965 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 966 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 967 return is_utf8_print(tmpbuf);
968}
969
b8c5462f 970bool
84afefe6 971Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 972{
ad391ad9 973 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 974 uvchr_to_utf8(tmpbuf, c);
b8c5462f 975 return is_utf8_punct(tmpbuf);
976}
977
4d61ec05 978bool
84afefe6 979Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 980{
e7ae6809 981 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 982 uvchr_to_utf8(tmpbuf, c);
4d61ec05 983 return is_utf8_xdigit(tmpbuf);
984}
985
84afefe6 986UV
987Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 988{
0ebc6274 989 uvchr_to_utf8(p, c);
990 return to_utf8_upper(p, p, lenp);
a0ed51b3 991}
992
84afefe6 993UV
994Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 995{
0ebc6274 996 uvchr_to_utf8(p, c);
997 return to_utf8_title(p, p, lenp);
a0ed51b3 998}
999
84afefe6 1000UV
1001Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1002{
0ebc6274 1003 uvchr_to_utf8(p, c);
1004 return to_utf8_lower(p, p, lenp);
a0ed51b3 1005}
1006
84afefe6 1007UV
1008Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1009{
0ebc6274 1010 uvchr_to_utf8(p, c);
1011 return to_utf8_fold(p, p, lenp);
84afefe6 1012}
1013
a0ed51b3 1014/* for now these all assume no locale info available for Unicode > 255 */
1015
1016bool
84afefe6 1017Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1018{
1019 return is_uni_alnum(c); /* XXX no locale support yet */
1020}
1021
1022bool
84afefe6 1023Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1024{
1025 return is_uni_alnumc(c); /* XXX no locale support yet */
1026}
1027
1028bool
84afefe6 1029Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1030{
1031 return is_uni_idfirst(c); /* XXX no locale support yet */
1032}
1033
1034bool
84afefe6 1035Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1036{
1037 return is_uni_alpha(c); /* XXX no locale support yet */
1038}
1039
1040bool
84afefe6 1041Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1042{
1043 return is_uni_ascii(c); /* XXX no locale support yet */
1044}
1045
1046bool
84afefe6 1047Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1048{
1049 return is_uni_space(c); /* XXX no locale support yet */
1050}
1051
1052bool
84afefe6 1053Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1054{
1055 return is_uni_digit(c); /* XXX no locale support yet */
1056}
1057
1058bool
84afefe6 1059Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1060{
1061 return is_uni_upper(c); /* XXX no locale support yet */
1062}
1063
1064bool
84afefe6 1065Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1066{
1067 return is_uni_lower(c); /* XXX no locale support yet */
1068}
1069
1070bool
84afefe6 1071Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1072{
1073 return is_uni_cntrl(c); /* XXX no locale support yet */
1074}
1075
1076bool
84afefe6 1077Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1078{
1079 return is_uni_graph(c); /* XXX no locale support yet */
1080}
1081
1082bool
84afefe6 1083Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1084{
1085 return is_uni_print(c); /* XXX no locale support yet */
1086}
1087
b8c5462f 1088bool
84afefe6 1089Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1090{
1091 return is_uni_punct(c); /* XXX no locale support yet */
1092}
1093
4d61ec05 1094bool
84afefe6 1095Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1096{
1097 return is_uni_xdigit(c); /* XXX no locale support yet */
1098}
1099
b7ac61fa 1100U32
1101Perl_to_uni_upper_lc(pTHX_ U32 c)
1102{
ee099d14 1103 /* XXX returns only the first character -- do not use XXX */
1104 /* XXX no locale support yet */
1105 STRLEN len;
1106 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1107 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1108}
1109
1110U32
1111Perl_to_uni_title_lc(pTHX_ U32 c)
1112{
ee099d14 1113 /* XXX returns only the first character XXX -- do not use XXX */
1114 /* XXX no locale support yet */
1115 STRLEN len;
1116 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1117 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1118}
1119
1120U32
1121Perl_to_uni_lower_lc(pTHX_ U32 c)
1122{
ee099d14 1123 /* XXX returns only the first character -- do not use XXX */
1124 /* XXX no locale support yet */
1125 STRLEN len;
1126 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1127 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1128}
1129
a0ed51b3 1130bool
864dbfa3 1131Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1132{
386d01d6 1133 if (!is_utf8_char(p))
1134 return FALSE;
a0ed51b3 1135 if (!PL_utf8_alnum)
289d4f09 1136 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1137 * descendant of isalnum(3), in other words, it doesn't
1138 * contain the '_'. --jhi */
1139 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1140 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1141/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1142#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1143 if (!PL_utf8_alnum)
1144 PL_utf8_alnum = swash_init("utf8", "",
1145 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1146 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1147#endif
1148}
1149
1150bool
b8c5462f 1151Perl_is_utf8_alnumc(pTHX_ U8 *p)
1152{
386d01d6 1153 if (!is_utf8_char(p))
1154 return FALSE;
b8c5462f 1155 if (!PL_utf8_alnum)
1156 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1157 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1158/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1159#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1160 if (!PL_utf8_alnum)
1161 PL_utf8_alnum = swash_init("utf8", "",
1162 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1163 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1164#endif
1165}
1166
1167bool
864dbfa3 1168Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3 1169{
1170 return *p == '_' || is_utf8_alpha(p);
1171}
1172
1173bool
864dbfa3 1174Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1175{
386d01d6 1176 if (!is_utf8_char(p))
1177 return FALSE;
a0ed51b3 1178 if (!PL_utf8_alpha)
e24b16f9 1179 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1180 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3 1181}
1182
1183bool
b8c5462f 1184Perl_is_utf8_ascii(pTHX_ U8 *p)
1185{
386d01d6 1186 if (!is_utf8_char(p))
1187 return FALSE;
b8c5462f 1188 if (!PL_utf8_ascii)
1189 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1190 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f 1191}
1192
1193bool
864dbfa3 1194Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1195{
386d01d6 1196 if (!is_utf8_char(p))
1197 return FALSE;
a0ed51b3 1198 if (!PL_utf8_space)
3bec3564 1199 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1200 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3 1201}
1202
1203bool
864dbfa3 1204Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1205{
386d01d6 1206 if (!is_utf8_char(p))
1207 return FALSE;
a0ed51b3 1208 if (!PL_utf8_digit)
e24b16f9 1209 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1210 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3 1211}
1212
1213bool
864dbfa3 1214Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1215{
386d01d6 1216 if (!is_utf8_char(p))
1217 return FALSE;
a0ed51b3 1218 if (!PL_utf8_upper)
e24b16f9 1219 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1220 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3 1221}
1222
1223bool
864dbfa3 1224Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1225{
386d01d6 1226 if (!is_utf8_char(p))
1227 return FALSE;
a0ed51b3 1228 if (!PL_utf8_lower)
e24b16f9 1229 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1230 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3 1231}
1232
1233bool
b8c5462f 1234Perl_is_utf8_cntrl(pTHX_ U8 *p)
1235{
386d01d6 1236 if (!is_utf8_char(p))
1237 return FALSE;
b8c5462f 1238 if (!PL_utf8_cntrl)
1239 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1240 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f 1241}
1242
1243bool
1244Perl_is_utf8_graph(pTHX_ U8 *p)
1245{
386d01d6 1246 if (!is_utf8_char(p))
1247 return FALSE;
b8c5462f 1248 if (!PL_utf8_graph)
1249 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1250 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f 1251}
1252
1253bool
864dbfa3 1254Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1255{
386d01d6 1256 if (!is_utf8_char(p))
1257 return FALSE;
a0ed51b3 1258 if (!PL_utf8_print)
e24b16f9 1259 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1260 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3 1261}
1262
1263bool
b8c5462f 1264Perl_is_utf8_punct(pTHX_ U8 *p)
1265{
386d01d6 1266 if (!is_utf8_char(p))
1267 return FALSE;
b8c5462f 1268 if (!PL_utf8_punct)
1269 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1270 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f 1271}
1272
1273bool
1274Perl_is_utf8_xdigit(pTHX_ U8 *p)
1275{
386d01d6 1276 if (!is_utf8_char(p))
1277 return FALSE;
b8c5462f 1278 if (!PL_utf8_xdigit)
1279 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1280 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f 1281}
1282
1283bool
864dbfa3 1284Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1285{
386d01d6 1286 if (!is_utf8_char(p))
1287 return FALSE;
a0ed51b3 1288 if (!PL_utf8_mark)
e24b16f9 1289 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1290 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3 1291}
1292
6b5c0936 1293/*
1294=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1295
1296The "p" contains the pointer to the UTF-8 string encoding
1297the character that is being converted.
1298
1299The "ustrp" is a pointer to the character buffer to put the
1300conversion result to. The "lenp" is a pointer to the length
1301of the result.
1302
0134edef 1303The "swashp" is a pointer to the swash to use.
6b5c0936 1304
0134edef 1305Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1306and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1307but not always, a multicharacter mapping), is tried first.
6b5c0936 1308
0134edef 1309The "special" is a string like "utf8::ToSpecLower", which means the
1310hash %utf8::ToSpecLower. The access to the hash is through
1311Perl_to_utf8_case().
6b5c0936 1312
0134edef 1313The "normal" is a string like "ToLower" which means the swash
1314%utf8::ToLower.
1315
1316=cut */
6b5c0936 1317
2104c8d9 1318UV
a6872d42 1319Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1320{
0134edef 1321 UV uv0, uv1;
2f9475ad 1322 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
0134edef 1323 STRLEN len = 0;
a0ed51b3 1324
1feea2c7 1325 uv0 = utf8_to_uvchr(p, 0);
1326 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1327 * are necessary in EBCDIC, they are redundant no-ops
1328 * in ASCII-ish platforms, and hopefully optimized away. */
1329 uv1 = NATIVE_TO_UNI(uv0);
1330 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1331
1332 if (!*swashp) /* load on-demand */
1333 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1334
1335 if (special) {
1336 /* It might be "special" (sometimes, but not always,
2a37f04d 1337 * a multicharacter mapping) */
983ffd37 1338 HV *hv;
1339 SV *keysv;
1340 HE *he;
2a37f04d 1341 SV *val;
2f9475ad 1342
983ffd37 1343 if ((hv = get_hv(special, FALSE)) &&
1feea2c7 1344 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
2a37f04d 1345 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1346 (val = HeVAL(he))) {
47654450 1347 char *s;
47654450 1348
2a37f04d 1349 s = SvPV(val, len);
47654450 1350 if (len == 1)
1351 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1352 else {
2f9475ad 1353#ifdef EBCDIC
1354 /* If we have EBCDIC we need to remap the characters
1355 * since any characters in the low 256 are Unicode
1356 * code points, not EBCDIC. */
7cda7a3d 1357 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1358
1359 d = tmpbuf;
1360 if (SvUTF8(val)) {
1361 STRLEN tlen = 0;
1362
1363 while (t < tend) {
1364 UV c = utf8_to_uvchr(t, &tlen);
1365 if (tlen > 0) {
1366 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1367 t += tlen;
1368 }
1369 else
1370 break;
1371 }
1372 }
1373 else {
36fec512 1374 while (t < tend) {
1375 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1376 t++;
1377 }
2f9475ad 1378 }
1379 len = d - tmpbuf;
1380 Copy(tmpbuf, ustrp, len, U8);
1381#else
d2dcd0fb 1382 Copy(s, ustrp, len, U8);
2f9475ad 1383#endif
29e98929 1384 }
983ffd37 1385 }
0134edef 1386 }
1387
1388 if (!len && *swashp) {
1389 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1390
1391 if (uv2) {
1392 /* It was "normal" (a single character mapping). */
1393 UV uv3 = UNI_TO_NATIVE(uv2);
1394
e9101d72 1395 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1396 }
1397 }
1feea2c7 1398
0134edef 1399 if (!len) /* Neither: just copy. */
1400 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1401
2a37f04d 1402 if (lenp)
1403 *lenp = len;
1404
0134edef 1405 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1406}
1407
d3e79532 1408/*
1409=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1410
1411Convert the UTF-8 encoded character at p to its uppercase version and
1412store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1413that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1414uppercase version may be longer than the original character (up to two
1415characters).
1416
1417The first character of the uppercased version is returned
1418(but note, as explained above, that there may be more.)
1419
1420=cut */
1421
2104c8d9 1422UV
983ffd37 1423Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1424{
983ffd37 1425 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1426 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1427}
a0ed51b3 1428
d3e79532 1429/*
1430=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1431
1432Convert the UTF-8 encoded character at p to its titlecase version and
1433store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1434that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1435titlecase version may be longer than the original character (up to two
1436characters).
1437
1438The first character of the titlecased version is returned
1439(but note, as explained above, that there may be more.)
1440
1441=cut */
1442
983ffd37 1443UV
1444Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1445{
1446 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1447 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1448}
1449
d3e79532 1450/*
1451=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1452
1453Convert the UTF-8 encoded character at p to its lowercase version and
1454store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1455that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1456lowercase version may be longer than the original character (up to two
1457characters).
1458
1459The first character of the lowercased version is returned
1460(but note, as explained above, that there may be more.)
1461
1462=cut */
1463
2104c8d9 1464UV
a2a2844f 1465Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1466{
983ffd37 1467 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1468 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1469}
1470
d3e79532 1471/*
1472=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1473
1474Convert the UTF-8 encoded character at p to its foldcase version and
1475store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1476that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1477foldcase version may be longer than the original character (up to
1478three characters).
1479
1480The first character of the foldcased version is returned
1481(but note, as explained above, that there may be more.)
1482
1483=cut */
1484
b4e400f9 1485UV
1486Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1487{
1488 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1489 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1490}
1491
1492/* a "swash" is a swatch hash */
1493
1494SV*
864dbfa3 1495Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1496{
1497 SV* retval;
bf1fed83 1498 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1499 dSP;
1b026014 1500 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1501 SV* errsv_save;
ce3b816e 1502
1b026014 1503 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1504 ENTER;
f8be5cf0 1505 errsv_save = newSVsv(ERRSV);
ce3b816e 1506 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0 1507 if (!SvTRUE(ERRSV))
1508 sv_setsv(ERRSV, errsv_save);
1509 SvREFCNT_dec(errsv_save);
ce3b816e 1510 LEAVE;
1511 }
1512 SPAGAIN;
a0ed51b3 1513 PUSHSTACKi(PERLSI_MAGIC);
1514 PUSHMARK(SP);
1515 EXTEND(SP,5);
1516 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1517 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1518 PUSHs(listsv);
1519 PUSHs(sv_2mortal(newSViv(minbits)));
1520 PUSHs(sv_2mortal(newSViv(none)));
1521 PUTBACK;
1522 ENTER;
1523 SAVEI32(PL_hints);
1524 PL_hints = 0;
1525 save_re_context();
bf1fed83 1526 if (PL_curcop == &PL_compiling)
1527 /* XXX ought to be handled by lex_start */
1528 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1529 errsv_save = newSVsv(ERRSV);
864dbfa3 1530 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1531 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1532 else
e24b16f9 1533 retval = &PL_sv_undef;
f8be5cf0 1534 if (!SvTRUE(ERRSV))
1535 sv_setsv(ERRSV, errsv_save);
1536 SvREFCNT_dec(errsv_save);
a0ed51b3 1537 LEAVE;
1538 POPSTACK;
e24b16f9 1539 if (PL_curcop == &PL_compiling) {
bf1fed83 1540 STRLEN len;
1541 char* pv = SvPV(tokenbufsv, len);
1542
1543 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1544 PL_curcop->op_private = PL_hints;
a0ed51b3 1545 }
bc45ce41 1546 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1547 if (SvPOK(retval))
1548 Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1549 SvPV_nolen(retval));
cea2e8a9 1550 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1551 }
a0ed51b3 1552 return retval;
1553}
1554
035d37be 1555
1556/* This API is wrong for special case conversions since we may need to
1557 * return several Unicode characters for a single Unicode character
1558 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1559 * the lower-level routine, and it is similarly broken for returning
1560 * multiple values. --jhi */
a0ed51b3 1561UV
3568d838 1562Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3 1563{
1564 HV* hv = (HV*)SvRV(sv);
3568d838 1565 U32 klen;
1566 U32 off;
a0ed51b3 1567 STRLEN slen;
7d85a32c 1568 STRLEN needents;
4ea42e7f 1569 U8 *tmps = NULL;
a0ed51b3 1570 U32 bit;
1571 SV *retval;
3568d838 1572 U8 tmputf8[2];
1573 UV c = NATIVE_TO_ASCII(*ptr);
1574
1575 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1576 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1577 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1578 ptr = tmputf8;
1579 }
1580 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1581 * then the "swatch" is a vec() for al the chars which start
1582 * with 0xAA..0xYY
1583 * So the key in the hash (klen) is length of encoded char -1
1584 */
1585 klen = UTF8SKIP(ptr) - 1;
1586 off = ptr[klen];
a0ed51b3 1587
7d85a32c 1588 if (klen == 0)
1589 {
1590 /* If char in invariant then swatch is for all the invariant chars
1591 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1592 */
1593 needents = UTF_CONTINUATION_MARK;
1594 off = NATIVE_TO_UTF(ptr[klen]);
1595 }
1596 else
1597 {
1598 /* If char is encoded then swatch is for the prefix */
1599 needents = (1 << UTF_ACCUMULATION_SHIFT);
1600 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1601 }
1602
a0ed51b3 1603 /*
1604 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1605 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1606 * it's nothing to sniff at.) Pity we usually come through at least
1607 * two function calls to get here...
1608 *
1609 * NB: this code assumes that swatches are never modified, once generated!
1610 */
1611
3568d838 1612 if (hv == PL_last_swash_hv &&
a0ed51b3 1613 klen == PL_last_swash_klen &&
3568d838 1614 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1615 {
1616 tmps = PL_last_swash_tmps;
1617 slen = PL_last_swash_slen;
1618 }
1619 else {
1620 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1621 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1622
1623 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1624 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1625 dSP;
2b9d42f0 1626 /* We use utf8n_to_uvuni() as we want an index into
1627 Unicode tables, not a native character number.
1628 */
1629 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
f8be5cf0 1630 SV *errsv_save;
a0ed51b3 1631 ENTER;
1632 SAVETMPS;
1633 save_re_context();
1634 PUSHSTACKi(PERLSI_MAGIC);
1635 PUSHMARK(SP);
1636 EXTEND(SP,3);
1637 PUSHs((SV*)sv);
ffbc6a93 1638 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1639 PUSHs(sv_2mortal(newSViv((klen) ?
1640 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1641 PUSHs(sv_2mortal(newSViv(needents)));
1642 PUTBACK;
f8be5cf0 1643 errsv_save = newSVsv(ERRSV);
864dbfa3 1644 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1645 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1646 else
e24b16f9 1647 retval = &PL_sv_undef;
f8be5cf0 1648 if (!SvTRUE(ERRSV))
1649 sv_setsv(ERRSV, errsv_save);
1650 SvREFCNT_dec(errsv_save);
a0ed51b3 1651 POPSTACK;
1652 FREETMPS;
1653 LEAVE;
e24b16f9 1654 if (PL_curcop == &PL_compiling)
1655 PL_curcop->op_private = PL_hints;
a0ed51b3 1656
dfe13c55 1657 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1658
7d85a32c 1659 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1660 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1661 }
1662
1663 PL_last_swash_hv = hv;
1664 PL_last_swash_klen = klen;
1665 PL_last_swash_tmps = tmps;
1666 PL_last_swash_slen = slen;
1667 if (klen)
1668 Copy(ptr, PL_last_swash_key, klen, U8);
1669 }
1670
9faf8d75 1671 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1672 case 1:
1673 bit = 1 << (off & 7);
1674 off >>= 3;
1675 return (tmps[off] & bit) != 0;
1676 case 8:
1677 return tmps[off];
1678 case 16:
1679 off <<= 1;
1680 return (tmps[off] << 8) + tmps[off + 1] ;
1681 case 32:
1682 off <<= 2;
1683 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1684 }
cea2e8a9 1685 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1686 return 0;
1687}
2b9d42f0 1688
1689
1690/*
37607a96 1691=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1692
1693Adds the UTF8 representation of the Native codepoint C<uv> to the end
1694of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1695bytes available. The return value is the pointer to the byte after the
1696end of the new character. In other words,
1697
1698 d = uvchr_to_utf8(d, uv);
1699
1700is the recommended wide native character-aware way of saying
1701
1702 *(d++) = uv;
1703
1704=cut
1705*/
1706
1707/* On ASCII machines this is normally a macro but we want a
1708 real function in case XS code wants it
1709*/
1710#undef Perl_uvchr_to_utf8
1711U8 *
1712Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1713{
b851fbc1 1714 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0 1715}
1716
b851fbc1 1717U8 *
1718Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1719{
1720 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1721}
2b9d42f0 1722
1723/*
37607a96 1724=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1725
1726Returns the native character value of the first character in the string C<s>
1727which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1728length, in bytes, of that character.
1729
1730Allows length and flags to be passed to low level routine.
1731
1732=cut
1733*/
0a2ef054 1734/* On ASCII machines this is normally a macro but we want
1735 a real function in case XS code wants it
2b9d42f0 1736*/
1737#undef Perl_utf8n_to_uvchr
1738UV
37607a96 1739Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1740{
1741 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1742 return UNI_TO_NATIVE(uv);
1743}
1744
d2cc3551 1745/*
1746=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1747
1748Build to the scalar dsv a displayable version of the string spv,
1749length len, the displayable version being at most pvlim bytes long
1750(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1751
9e55ce06 1752The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1753isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1754to display the \\[nrfta\\] as the backslashed versions (like '\n')
1755(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1756UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1757UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1758
d2cc3551 1759The pointer to the PV of the dsv is returned.
1760
1761=cut */
e6b2e755 1762char *
1763Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1764{
1765 int truncated = 0;
1766 char *s, *e;
1767
1768 sv_setpvn(dsv, "", 0);
1769 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1770 UV u;
c728cb41 1771 bool ok = FALSE;
1772
e6b2e755 1773 if (pvlim && SvCUR(dsv) >= pvlim) {
1774 truncated++;
1775 break;
1776 }
1777 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1778 if (u < 256) {
c728cb41 1779 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1780 switch (u & 0xFF) {
1781 case '\n':
1782 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1783 case '\r':
1784 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1785 case '\t':
1786 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1787 case '\f':
1788 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1789 case '\a':
1790 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1791 case '\\':
d79a7a3d 1792 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
c728cb41 1793 default: break;
1794 }
1795 }
00e86452 1796 /* isPRINT() is the locale-blind version. */
1797 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
2c4547fe 1798 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
0a2ef054 1799 ok = TRUE;
1800 }
c728cb41 1801 }
1802 if (!ok)
9e55ce06 1803 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1804 }
1805 if (truncated)
1806 sv_catpvn(dsv, "...", 3);
1807
1808 return SvPVX(dsv);
1809}
2b9d42f0 1810
d2cc3551 1811/*
1812=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1813
1814Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1815the displayable version being at most pvlim bytes long
d2cc3551 1816(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1817
1818The flags argument is as in pv_uni_display().
1819
d2cc3551 1820The pointer to the PV of the dsv is returned.
1821
1822=cut */
e6b2e755 1823char *
1824Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1825{
701a277b 1826 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1827 pvlim, flags);
1828}
1829
d2cc3551 1830/*
d07ddd77 1831=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1832
1833Return true if the strings s1 and s2 differ case-insensitively, false
1834if not (if they are equal case-insensitively). If u1 is true, the
1835string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1836the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1837are false, the respective string is assumed to be in native 8-bit
1838encoding.
1839
1840If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1841in there (they will point at the beginning of the I<next> character).
1842If the pointers behind pe1 or pe2 are non-NULL, they are the end
1843pointers beyond which scanning will not continue under any
1844circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1845s2+l2 will be used as goal end pointers that will also stop the scan,
1846and which qualify towards defining a successful match: all the scans
1847that define an explicit length must reach their goal pointers for
1848a match to succeed).
d2cc3551 1849
1850For case-insensitiveness, the "casefolding" of Unicode is used
1851instead of upper/lowercasing both the characters, see
1852http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1853
1854=cut */
701a277b 1855I32
d07ddd77 1856Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1857{
5469e704 1858 register U8 *p1 = (U8*)s1;
1859 register U8 *p2 = (U8*)s2;
d07ddd77 1860 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1861 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1862 STRLEN n1 = 0, n2 = 0;
ffce6cc2 1863 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1864 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
d7f013c8 1865 U8 natbuf[1+1];
1866 STRLEN foldlen1, foldlen2;
d07ddd77 1867 bool match;
332ddc25 1868
d07ddd77 1869 if (pe1)
1870 e1 = *(U8**)pe1;
1871 if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1872 f1 = (U8*)s1 + l1;
1873 if (pe2)
1874 e2 = *(U8**)pe2;
1875 if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1876 f2 = (U8*)s2 + l2;
1877
1878 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1879 return 1; /* mismatch; possible infinite loop or false positive */
1880
a6872d42 1881 if (!u1 || !u2)
1882 natbuf[1] = 0; /* Need to terminate the buffer. */
1883
d07ddd77 1884 while ((e1 == 0 || p1 < e1) &&
1885 (f1 == 0 || p1 < f1) &&
1886 (e2 == 0 || p2 < e2) &&
1887 (f2 == 0 || p2 < f2)) {
1888 if (n1 == 0) {
d7f013c8 1889 if (u1)
1890 to_utf8_fold(p1, foldbuf1, &foldlen1);
1891 else {
f5cee151 1892 natbuf[0] = *p1;
d7f013c8 1893 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1894 }
1895 q1 = foldbuf1;
d07ddd77 1896 n1 = foldlen1;
332ddc25 1897 }
d07ddd77 1898 if (n2 == 0) {
d7f013c8 1899 if (u2)
1900 to_utf8_fold(p2, foldbuf2, &foldlen2);
1901 else {
f5cee151 1902 natbuf[0] = *p2;
d7f013c8 1903 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1904 }
1905 q2 = foldbuf2;
d07ddd77 1906 n2 = foldlen2;
332ddc25 1907 }
d07ddd77 1908 while (n1 && n2) {
1909 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1910 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1911 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1912 return 1; /* mismatch */
d07ddd77 1913 n1 -= UTF8SKIP(q1);
d7f013c8 1914 q1 += UTF8SKIP(q1);
d07ddd77 1915 n2 -= UTF8SKIP(q2);
d7f013c8 1916 q2 += UTF8SKIP(q2);
701a277b 1917 }
d07ddd77 1918 if (n1 == 0)
d7f013c8 1919 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1920 if (n2 == 0)
d7f013c8 1921 p2 += u2 ? UTF8SKIP(p2) : 1;
1922
d2cc3551 1923 }
5469e704 1924
d07ddd77 1925 /* A match is defined by all the scans that specified
1926 * an explicit length reaching their final goals. */
1927 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 1928
1929 if (match) {
d07ddd77 1930 if (pe1)
1931 *pe1 = (char*)p1;
1932 if (pe2)
1933 *pe2 = (char*)p2;
5469e704 1934 }
1935
1936 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1937}
701a277b 1938