Not extending enough.
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
be3c0a43 3 * Copyright (c) 1998-2002, Larry Wall
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
ccfc67b7 27/*
28=head1 Unicode Support
a0ed51b3 29
b851fbc1 30=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
b851fbc1 37 d = uvuni_to_utf8_flags(d, uv, flags);
38
39or, in most cases,
40
9041c2e3 41 d = uvuni_to_utf8(d, uv);
eebe1485 42
b851fbc1 43(which is equivalent to)
44
45 d = uvuni_to_utf8_flags(d, uv, 0);
46
eebe1485 47is the recommended Unicode-aware way of saying
48
49 *(d++) = uv;
50
51=cut
52*/
53
dfe13c55 54U8 *
b851fbc1 55Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 56{
62961d2e 57 if (ckWARN(WARN_UTF8)) {
b851fbc1 58 if (UNICODE_IS_SURROGATE(uv) &&
59 !(flags & UNICODE_ALLOW_SURROGATE))
507b9800 60 Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 61 else if (
62 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
63 !(flags & UNICODE_ALLOW_FDD0))
64 ||
65 ((uv & 0xFFFF) == 0xFFFE &&
66 !(flags & UNICODE_ALLOW_FFFE))
67 ||
68 ((uv & 0xFFFF) == 0xFFFF &&
69 !(flags & UNICODE_ALLOW_FFFF))) &&
70 /* UNICODE_ALLOW_SUPER includes
71 * FFFEs and FFFFs beyond 0x10FFFF. */
72 ((uv <= PERL_UNICODE_MAX) ||
73 !(flags & UNICODE_ALLOW_SUPER))
74 )
507b9800 75 Perl_warner(aTHX_ WARN_UTF8,
76 "Unicode character 0x%04"UVxf" is illegal", uv);
77 }
c4d5f83a 78 if (UNI_IS_INVARIANT(uv)) {
79 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3 80 return d;
81 }
2d331972 82#if defined(EBCDIC)
1d72bdf6 83 else {
84 STRLEN len = UNISKIP(uv);
85 U8 *p = d+len-1;
86 while (p > d) {
87 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
88 uv >>= UTF_ACCUMULATION_SHIFT;
89 }
90 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
91 return d+len;
92 }
93#else /* Non loop style */
a0ed51b3 94 if (uv < 0x800) {
95 *d++ = (( uv >> 6) | 0xc0);
96 *d++ = (( uv & 0x3f) | 0x80);
97 return d;
98 }
99 if (uv < 0x10000) {
100 *d++ = (( uv >> 12) | 0xe0);
101 *d++ = (((uv >> 6) & 0x3f) | 0x80);
102 *d++ = (( uv & 0x3f) | 0x80);
103 return d;
104 }
105 if (uv < 0x200000) {
106 *d++ = (( uv >> 18) | 0xf0);
107 *d++ = (((uv >> 12) & 0x3f) | 0x80);
108 *d++ = (((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (( uv & 0x3f) | 0x80);
110 return d;
111 }
112 if (uv < 0x4000000) {
113 *d++ = (( uv >> 24) | 0xf8);
114 *d++ = (((uv >> 18) & 0x3f) | 0x80);
115 *d++ = (((uv >> 12) & 0x3f) | 0x80);
116 *d++ = (((uv >> 6) & 0x3f) | 0x80);
117 *d++ = (( uv & 0x3f) | 0x80);
118 return d;
119 }
120 if (uv < 0x80000000) {
121 *d++ = (( uv >> 30) | 0xfc);
122 *d++ = (((uv >> 24) & 0x3f) | 0x80);
123 *d++ = (((uv >> 18) & 0x3f) | 0x80);
124 *d++ = (((uv >> 12) & 0x3f) | 0x80);
125 *d++ = (((uv >> 6) & 0x3f) | 0x80);
126 *d++ = (( uv & 0x3f) | 0x80);
127 return d;
128 }
6b8eaf93 129#ifdef HAS_QUAD
d7578b48 130 if (uv < UTF8_QUAD_MAX)
a0ed51b3 131#endif
132 {
133 *d++ = 0xfe; /* Can't match U+FEFF! */
134 *d++ = (((uv >> 30) & 0x3f) | 0x80);
135 *d++ = (((uv >> 24) & 0x3f) | 0x80);
136 *d++ = (((uv >> 18) & 0x3f) | 0x80);
137 *d++ = (((uv >> 12) & 0x3f) | 0x80);
138 *d++ = (((uv >> 6) & 0x3f) | 0x80);
139 *d++ = (( uv & 0x3f) | 0x80);
140 return d;
141 }
6b8eaf93 142#ifdef HAS_QUAD
a0ed51b3 143 {
144 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b 145 *d++ = 0x80; /* 6 Reserved bits */
146 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
147 *d++ = (((uv >> 54) & 0x3f) | 0x80);
148 *d++ = (((uv >> 48) & 0x3f) | 0x80);
149 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3 150 *d++ = (((uv >> 36) & 0x3f) | 0x80);
151 *d++ = (((uv >> 30) & 0x3f) | 0x80);
152 *d++ = (((uv >> 24) & 0x3f) | 0x80);
153 *d++ = (((uv >> 18) & 0x3f) | 0x80);
154 *d++ = (((uv >> 12) & 0x3f) | 0x80);
155 *d++ = (((uv >> 6) & 0x3f) | 0x80);
156 *d++ = (( uv & 0x3f) | 0x80);
157 return d;
158 }
159#endif
1d72bdf6 160#endif /* Loop style */
a0ed51b3 161}
b851fbc1 162
163U8 *
164Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
165{
166 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
167}
9041c2e3 168
169
170/*
eebe1485 171=for apidoc A|STRLEN|is_utf8_char|U8 *s
172
5da9da9e 173Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 174character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e 175The actual number of bytes in the UTF-8 character will be returned if
176it is valid, otherwise 0.
9041c2e3 177
37607a96 178=cut
179*/
067a85ef 180STRLEN
386d01d6 181Perl_is_utf8_char(pTHX_ U8 *s)
182{
183 U8 u = *s;
067a85ef 184 STRLEN slen, len;
185 UV uv, ouv;
386d01d6 186
1d72bdf6 187 if (UTF8_IS_INVARIANT(u))
386d01d6 188 return 1;
189
60006e79 190 if (!UTF8_IS_START(u))
386d01d6 191 return 0;
192
9f07fdcd 193 len = UTF8SKIP(s);
386d01d6 194
60006e79 195 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 196 return 0;
197
386d01d6 198 slen = len - 1;
199 s++;
6eb6869e 200 u &= UTF_START_MASK(len);
1d72bdf6 201 uv = u;
067a85ef 202 ouv = uv;
386d01d6 203 while (slen--) {
60006e79 204 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 205 return 0;
8850bf83 206 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 207 if (uv < ouv)
067a85ef 208 return 0;
209 ouv = uv;
386d01d6 210 s++;
211 }
067a85ef 212
5bbb0b5a 213 if (UNISKIP(uv) < len)
067a85ef 214 return 0;
215
386d01d6 216 return len;
217}
218
6662521e 219/*
eebe1485 220=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 221
5da9da9e 222Returns true if first C<len> bytes of the given string form a valid UTF8
223string, false otherwise. Note that 'a valid UTF8 string' does not mean
224'a string that contains UTF8' because a valid ASCII string is a valid
225UTF8 string.
6662521e 226
227=cut
228*/
229
8e84507e 230bool
6662521e 231Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
232{
067a85ef 233 U8* x = s;
1aa99e6b 234 U8* send;
067a85ef 235 STRLEN c;
236
1aa99e6b 237 if (!len)
6cd5fe39 238 len = strlen((char *)s);
1aa99e6b 239 send = s + len;
240
6662521e 241 while (x < send) {
242 c = is_utf8_char(x);
067a85ef 243 if (!c)
244 return FALSE;
6662521e 245 x += c;
6662521e 246 }
60006e79 247 if (x != send)
248 return FALSE;
067a85ef 249
250 return TRUE;
6662521e 251}
252
67e989fb 253/*
9041c2e3 254=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 255
9041c2e3 256Bottom level UTF-8 decode routine.
257Returns the unicode code point value of the first character in the string C<s>
ba210ebe 258which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 259C<retlen> will be set to the length, in bytes, of that character.
67e989fb 260
261If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880 262is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
263it is assumed that the caller will raise a warning, and this function
28d3d195 264will silently just set C<retlen> to C<-1> and return zero. If the
265C<flags> does not contain UTF8_CHECK_ONLY, warnings about
266malformations will be given, C<retlen> will be set to the expected
267length of the UTF-8 character in bytes, and zero will be returned.
268
269The C<flags> can also contain various flags to allow deviations from
270the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 271
9041c2e3 272Most code should use utf8_to_uvchr() rather than call this directly.
273
37607a96 274=cut
275*/
67e989fb 276
a0ed51b3 277UV
37607a96 278Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 279{
097fb8e2 280 U8 *s0 = s;
9c5ffd7c 281 UV uv = *s, ouv = 0;
ba210ebe 282 STRLEN len = 1;
283 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 284 UV startbyte = *s;
ba210ebe 285 STRLEN expectlen = 0;
a0dbb045 286 U32 warning = 0;
287
288/* This list is a superset of the UTF8_ALLOW_XXX. */
289
290#define UTF8_WARN_EMPTY 1
291#define UTF8_WARN_CONTINUATION 2
292#define UTF8_WARN_NON_CONTINUATION 3
293#define UTF8_WARN_FE_FF 4
294#define UTF8_WARN_SHORT 5
295#define UTF8_WARN_OVERFLOW 6
296#define UTF8_WARN_SURROGATE 7
297#define UTF8_WARN_BOM 8
298#define UTF8_WARN_LONG 9
299#define UTF8_WARN_FFFF 10
300
301 if (curlen == 0 &&
302 !(flags & UTF8_ALLOW_EMPTY)) {
303 warning = UTF8_WARN_EMPTY;
0c443dc2 304 goto malformed;
305 }
306
1d72bdf6 307 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 308 if (retlen)
309 *retlen = 1;
c4d5f83a 310 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 311 }
67e989fb 312
421a8bf2 313 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 314 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 315 warning = UTF8_WARN_CONTINUATION;
ba210ebe 316 goto malformed;
317 }
318
421a8bf2 319 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 320 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 321 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 322 goto malformed;
323 }
9041c2e3 324
1d72bdf6 325#ifdef EBCDIC
75383841 326 uv = NATIVE_TO_UTF(uv);
1d72bdf6 327#else
fcc8fcf6 328 if ((uv == 0xfe || uv == 0xff) &&
329 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 330 warning = UTF8_WARN_FE_FF;
ba210ebe 331 goto malformed;
a0ed51b3 332 }
1d72bdf6 333#endif
334
ba210ebe 335 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
336 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
337 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
338 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 339#ifdef EBCDIC
340 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
341 else { len = 7; uv &= 0x01; }
342#else
ba210ebe 343 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
344 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 345 else { len = 13; uv = 0; } /* whoa! */
346#endif
347
a0ed51b3 348 if (retlen)
349 *retlen = len;
9041c2e3 350
ba210ebe 351 expectlen = len;
352
fcc8fcf6 353 if ((curlen < expectlen) &&
354 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 355 warning = UTF8_WARN_SHORT;
ba210ebe 356 goto malformed;
357 }
358
359 len--;
a0ed51b3 360 s++;
ba210ebe 361 ouv = uv;
362
a0ed51b3 363 while (len--) {
421a8bf2 364 if (!UTF8_IS_CONTINUATION(*s) &&
365 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 366 s--;
367 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 368 goto malformed;
a0ed51b3 369 }
370 else
8850bf83 371 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 372 if (!(uv > ouv)) {
373 /* These cannot be allowed. */
374 if (uv == ouv) {
375 if (!(flags & UTF8_ALLOW_LONG)) {
376 warning = UTF8_WARN_LONG;
377 goto malformed;
378 }
379 }
380 else { /* uv < ouv */
381 /* This cannot be allowed. */
382 warning = UTF8_WARN_OVERFLOW;
383 goto malformed;
384 }
ba210ebe 385 }
386 s++;
387 ouv = uv;
388 }
389
421a8bf2 390 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 391 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 392 warning = UTF8_WARN_SURROGATE;
ba210ebe 393 goto malformed;
421a8bf2 394 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 395 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 396 warning = UTF8_WARN_BOM;
ba210ebe 397 goto malformed;
fcc8fcf6 398 } else if ((expectlen > UNISKIP(uv)) &&
399 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 400 warning = UTF8_WARN_LONG;
ba210ebe 401 goto malformed;
421a8bf2 402 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 403 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 404 warning = UTF8_WARN_FFFF;
a9917092 405 goto malformed;
a0ed51b3 406 }
ba210ebe 407
a0ed51b3 408 return uv;
ba210ebe 409
410malformed:
411
fcc8fcf6 412 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 413 if (retlen)
cc366d4b 414 *retlen = -1;
ba210ebe 415 return 0;
416 }
417
a0dbb045 418 if (dowarn) {
419 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
420
421 switch (warning) {
422 case 0: /* Intentionally empty. */ break;
423 case UTF8_WARN_EMPTY:
424 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
425 break;
426 case UTF8_WARN_CONTINUATION:
097fb8e2 427 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 428 break;
429 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 430 if (s == s0)
431 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
432 (UV)s[1], startbyte);
433 else
434 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
435 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
436
a0dbb045 437 break;
438 case UTF8_WARN_FE_FF:
439 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
440 break;
441 case UTF8_WARN_SHORT:
097fb8e2 442 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
443 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 444 expectlen = curlen; /* distance for caller to skip */
a0dbb045 445 break;
446 case UTF8_WARN_OVERFLOW:
097fb8e2 447 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
448 ouv, *s, startbyte);
a0dbb045 449 break;
450 case UTF8_WARN_SURROGATE:
451 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
452 break;
453 case UTF8_WARN_BOM:
454 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
455 break;
456 case UTF8_WARN_LONG:
097fb8e2 457 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
458 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 459 break;
460 case UTF8_WARN_FFFF:
461 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
462 break;
463 default:
464 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
465 break;
466 }
467
468 if (warning) {
469 char *s = SvPVX(sv);
470
471 if (PL_op)
472 Perl_warner(aTHX_ WARN_UTF8,
53e06cf0 473 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 474 else
475 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
476 }
477 }
478
ba210ebe 479 if (retlen)
28d3d195 480 *retlen = expectlen ? expectlen : len;
ba210ebe 481
28d3d195 482 return 0;
a0ed51b3 483}
484
8e84507e 485/*
37607a96 486=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3 487
488Returns the native character value of the first character in the string C<s>
489which is assumed to be in UTF8 encoding; C<retlen> will be set to the
490length, in bytes, of that character.
491
492If C<s> does not point to a well-formed UTF8 character, zero is
493returned and retlen is set, if possible, to -1.
494
495=cut
496*/
497
498UV
37607a96 499Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 500{
501 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
502}
503
504/*
37607a96 505=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3 506
507Returns the Unicode code point of the first character in the string C<s>
508which is assumed to be in UTF8 encoding; C<retlen> will be set to the
509length, in bytes, of that character.
510
511This function should only be used when returned UV is considered
512an index into the Unicode semantic tables (e.g. swashes).
513
ba210ebe 514If C<s> does not point to a well-formed UTF8 character, zero is
515returned and retlen is set, if possible, to -1.
8e84507e 516
517=cut
518*/
519
520UV
37607a96 521Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 522{
9041c2e3 523 /* Call the low level routine asking for checks */
524 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e 525}
526
b76347f2 527/*
37607a96 528=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2 529
530Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 531Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
532up past C<e>, croaks.
b76347f2 533
534=cut
535*/
536
537STRLEN
37607a96 538Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2 539{
540 STRLEN len = 0;
541
8850bf83 542 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
543 * the bitops (especially ~) can create illegal UTF-8.
544 * In other words: in Perl UTF-8 is not just for Unicode. */
545
b76347f2 546 if (e < s)
02eb7b47 547 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 548 while (s < e) {
02eb7b47 549 U8 t = UTF8SKIP(s);
b76347f2 550
551 if (e - s < t)
0064a8a9 552 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2 553 s += t;
554 len++;
555 }
556
557 return len;
558}
559
b06226ff 560/*
eebe1485 561=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 562
563Returns the number of UTF8 characters between the UTF-8 pointers C<a>
564and C<b>.
565
566WARNING: use only if you *know* that the pointers point inside the
567same UTF-8 buffer.
568
37607a96 569=cut
570*/
a0ed51b3 571
02eb7b47 572IV
864dbfa3 573Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 574{
02eb7b47 575 IV off = 0;
576
8850bf83 577 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
578 * the bitops (especially ~) can create illegal UTF-8.
579 * In other words: in Perl UTF-8 is not just for Unicode. */
580
a0ed51b3 581 if (a < b) {
582 while (a < b) {
02eb7b47 583 U8 c = UTF8SKIP(a);
584
585 if (b - a < c)
586 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
587 a += c;
a0ed51b3 588 off--;
589 }
590 }
591 else {
592 while (b < a) {
02eb7b47 593 U8 c = UTF8SKIP(b);
594
595 if (a - b < c)
596 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
597 b += c;
a0ed51b3 598 off++;
599 }
600 }
02eb7b47 601
a0ed51b3 602 return off;
603}
604
b06226ff 605/*
37607a96 606=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 607
8850bf83 608Return the UTF-8 pointer C<s> displaced by C<off> characters, either
609forward or backward.
b06226ff 610
611WARNING: do not use the following unless you *know* C<off> is within
8850bf83 612the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
613on the first byte of character or just after the last byte of a character.
b06226ff 614
37607a96 615=cut
616*/
a0ed51b3 617
618U8 *
864dbfa3 619Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 620{
8850bf83 621 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
622 * the bitops (especially ~) can create illegal UTF-8.
623 * In other words: in Perl UTF-8 is not just for Unicode. */
624
a0ed51b3 625 if (off >= 0) {
626 while (off--)
627 s += UTF8SKIP(s);
628 }
629 else {
630 while (off++) {
631 s--;
8850bf83 632 while (UTF8_IS_CONTINUATION(*s))
633 s--;
a0ed51b3 634 }
635 }
636 return s;
637}
638
6940069f 639/*
eebe1485 640=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 641
246fae53 642Converts a string C<s> of length C<len> from UTF8 into byte encoding.
643Unlike C<bytes_to_utf8>, this over-writes the original string, and
644updates len to contain the new length.
67e989fb 645Returns zero on failure, setting C<len> to -1.
6940069f 646
647=cut
648*/
649
650U8 *
37607a96 651Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 652{
6940069f 653 U8 *send;
654 U8 *d;
dcad2880 655 U8 *save = s;
246fae53 656
657 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880 658 for (send = s + *len; s < send; ) {
659 U8 c = *s++;
660
1d72bdf6 661 if (!UTF8_IS_INVARIANT(c) &&
662 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
663 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 664 *len = -1;
665 return 0;
666 }
246fae53 667 }
dcad2880 668
669 d = s = save;
6940069f 670 while (s < send) {
ed646e6e 671 STRLEN ulen;
9041c2e3 672 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 673 s += ulen;
6940069f 674 }
675 *d = '\0';
246fae53 676 *len = d - save;
6940069f 677 return save;
678}
679
680/*
f9a63242 681=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
682
683Converts a string C<s> of length C<len> from UTF8 into byte encoding.
684Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 685the newly-created string, and updates C<len> to contain the new
686length. Returns the original string if no conversion occurs, C<len>
687is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6880 if C<s> is converted or contains all 7bit characters.
f9a63242 689
37607a96 690=cut
691*/
f9a63242 692
693U8 *
37607a96 694Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 695{
f9a63242 696 U8 *d;
697 U8 *start = s;
db42d148 698 U8 *send;
f9a63242 699 I32 count = 0;
700
701 if (!*is_utf8)
702 return start;
703
ef9edfd0 704 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242 705 for (send = s + *len; s < send;) {
706 U8 c = *s++;
1d72bdf6 707 if (!UTF8_IS_INVARIANT(c)) {
db42d148 708 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
709 (c = *s++) && UTF8_IS_CONTINUATION(c))
710 count++;
711 else
f9a63242 712 return start;
db42d148 713 }
f9a63242 714 }
715
716 *is_utf8 = 0;
717
f9a63242 718 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 719 s = start; start = d;
f9a63242 720 while (s < send) {
721 U8 c = *s++;
c4d5f83a 722 if (!UTF8_IS_INVARIANT(c)) {
723 /* Then it is two-byte encoded */
724 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
725 c = ASCII_TO_NATIVE(c);
726 }
727 *d++ = c;
f9a63242 728 }
729 *d = '\0';
730 *len = d - start;
731 return start;
732}
733
734/*
eebe1485 735=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 736
737Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e 738Returns a pointer to the newly-created string, and sets C<len> to
739reflect the new length.
6940069f 740
497711e7 741=cut
6940069f 742*/
743
744U8*
37607a96 745Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 746{
6940069f 747 U8 *send;
748 U8 *d;
749 U8 *dst;
6662521e 750 send = s + (*len);
6940069f 751
6662521e 752 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 753 dst = d;
754
755 while (s < send) {
db42d148 756 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 757 if (UNI_IS_INVARIANT(uv))
758 *d++ = UTF_TO_NATIVE(uv);
6940069f 759 else {
90f44359 760 *d++ = UTF8_EIGHT_BIT_HI(uv);
761 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f 762 }
763 }
764 *d = '\0';
6662521e 765 *len = d-dst;
6940069f 766 return dst;
767}
768
a0ed51b3 769/*
dea0fc0b 770 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 771 *
772 * Destination must be pre-extended to 3/2 source. Do not use in-place.
773 * We optimize for native, for obvious reasons. */
774
775U8*
dea0fc0b 776Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 777{
dea0fc0b 778 U8* pend;
779 U8* dstart = d;
780
781 if (bytelen & 1)
a7867d0a 782 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 783
784 pend = p + bytelen;
785
a0ed51b3 786 while (p < pend) {
dea0fc0b 787 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
788 p += 2;
a0ed51b3 789 if (uv < 0x80) {
790 *d++ = uv;
791 continue;
792 }
793 if (uv < 0x800) {
794 *d++ = (( uv >> 6) | 0xc0);
795 *d++ = (( uv & 0x3f) | 0x80);
796 continue;
797 }
798 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b 799 UV low = *p++;
800 if (low < 0xdc00 || low >= 0xdfff)
801 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 802 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
803 }
804 if (uv < 0x10000) {
805 *d++ = (( uv >> 12) | 0xe0);
806 *d++ = (((uv >> 6) & 0x3f) | 0x80);
807 *d++ = (( uv & 0x3f) | 0x80);
808 continue;
809 }
810 else {
811 *d++ = (( uv >> 18) | 0xf0);
812 *d++ = (((uv >> 12) & 0x3f) | 0x80);
813 *d++ = (((uv >> 6) & 0x3f) | 0x80);
814 *d++ = (( uv & 0x3f) | 0x80);
815 continue;
816 }
817 }
dea0fc0b 818 *newlen = d - dstart;
a0ed51b3 819 return d;
820}
821
822/* Note: this one is slightly destructive of the source. */
823
824U8*
dea0fc0b 825Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 826{
827 U8* s = (U8*)p;
828 U8* send = s + bytelen;
829 while (s < send) {
830 U8 tmp = s[0];
831 s[0] = s[1];
832 s[1] = tmp;
833 s += 2;
834 }
dea0fc0b 835 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 836}
837
838/* for now these are all defined (inefficiently) in terms of the utf8 versions */
839
840bool
84afefe6 841Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 842{
ad391ad9 843 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 844 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 845 return is_utf8_alnum(tmpbuf);
846}
847
848bool
84afefe6 849Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 850{
ad391ad9 851 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 852 uvchr_to_utf8(tmpbuf, c);
b8c5462f 853 return is_utf8_alnumc(tmpbuf);
854}
855
856bool
84afefe6 857Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 858{
ad391ad9 859 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 860 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 861 return is_utf8_idfirst(tmpbuf);
862}
863
864bool
84afefe6 865Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 866{
ad391ad9 867 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 868 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 869 return is_utf8_alpha(tmpbuf);
870}
871
872bool
84afefe6 873Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 874{
ad391ad9 875 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 876 uvchr_to_utf8(tmpbuf, c);
4d61ec05 877 return is_utf8_ascii(tmpbuf);
878}
879
880bool
84afefe6 881Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 882{
ad391ad9 883 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 884 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 885 return is_utf8_space(tmpbuf);
886}
887
888bool
84afefe6 889Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 890{
ad391ad9 891 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 892 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 893 return is_utf8_digit(tmpbuf);
894}
895
896bool
84afefe6 897Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 898{
ad391ad9 899 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 900 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 901 return is_utf8_upper(tmpbuf);
902}
903
904bool
84afefe6 905Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 906{
ad391ad9 907 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 908 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 909 return is_utf8_lower(tmpbuf);
910}
911
912bool
84afefe6 913Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 914{
ad391ad9 915 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 916 uvchr_to_utf8(tmpbuf, c);
b8c5462f 917 return is_utf8_cntrl(tmpbuf);
918}
919
920bool
84afefe6 921Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 922{
ad391ad9 923 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 924 uvchr_to_utf8(tmpbuf, c);
b8c5462f 925 return is_utf8_graph(tmpbuf);
926}
927
928bool
84afefe6 929Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 930{
ad391ad9 931 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 932 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 933 return is_utf8_print(tmpbuf);
934}
935
b8c5462f 936bool
84afefe6 937Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 938{
ad391ad9 939 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 940 uvchr_to_utf8(tmpbuf, c);
b8c5462f 941 return is_utf8_punct(tmpbuf);
942}
943
4d61ec05 944bool
84afefe6 945Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 946{
e7ae6809 947 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 948 uvchr_to_utf8(tmpbuf, c);
4d61ec05 949 return is_utf8_xdigit(tmpbuf);
950}
951
84afefe6 952UV
953Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 954{
e7ae6809 955 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 956 uvchr_to_utf8(tmpbuf, c);
a2a2844f 957 return to_utf8_upper(tmpbuf, p, lenp);
a0ed51b3 958}
959
84afefe6 960UV
961Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 962{
e7ae6809 963 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 964 uvchr_to_utf8(tmpbuf, c);
a2a2844f 965 return to_utf8_title(tmpbuf, p, lenp);
a0ed51b3 966}
967
84afefe6 968UV
969Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 970{
d90824e8 971 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 972 uvchr_to_utf8(tmpbuf, c);
a2a2844f 973 return to_utf8_lower(tmpbuf, p, lenp);
a0ed51b3 974}
975
84afefe6 976UV
977Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
978{
d90824e8 979 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
230880c1 980 uvchr_to_utf8(tmpbuf, c);
84afefe6 981 return to_utf8_fold(tmpbuf, p, lenp);
982}
983
a0ed51b3 984/* for now these all assume no locale info available for Unicode > 255 */
985
986bool
84afefe6 987Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 988{
989 return is_uni_alnum(c); /* XXX no locale support yet */
990}
991
992bool
84afefe6 993Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 994{
995 return is_uni_alnumc(c); /* XXX no locale support yet */
996}
997
998bool
84afefe6 999Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1000{
1001 return is_uni_idfirst(c); /* XXX no locale support yet */
1002}
1003
1004bool
84afefe6 1005Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1006{
1007 return is_uni_alpha(c); /* XXX no locale support yet */
1008}
1009
1010bool
84afefe6 1011Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1012{
1013 return is_uni_ascii(c); /* XXX no locale support yet */
1014}
1015
1016bool
84afefe6 1017Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1018{
1019 return is_uni_space(c); /* XXX no locale support yet */
1020}
1021
1022bool
84afefe6 1023Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1024{
1025 return is_uni_digit(c); /* XXX no locale support yet */
1026}
1027
1028bool
84afefe6 1029Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1030{
1031 return is_uni_upper(c); /* XXX no locale support yet */
1032}
1033
1034bool
84afefe6 1035Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1036{
1037 return is_uni_lower(c); /* XXX no locale support yet */
1038}
1039
1040bool
84afefe6 1041Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1042{
1043 return is_uni_cntrl(c); /* XXX no locale support yet */
1044}
1045
1046bool
84afefe6 1047Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1048{
1049 return is_uni_graph(c); /* XXX no locale support yet */
1050}
1051
1052bool
84afefe6 1053Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1054{
1055 return is_uni_print(c); /* XXX no locale support yet */
1056}
1057
b8c5462f 1058bool
84afefe6 1059Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1060{
1061 return is_uni_punct(c); /* XXX no locale support yet */
1062}
1063
4d61ec05 1064bool
84afefe6 1065Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1066{
1067 return is_uni_xdigit(c); /* XXX no locale support yet */
1068}
1069
b7ac61fa 1070U32
1071Perl_to_uni_upper_lc(pTHX_ U32 c)
1072{
ee099d14 1073 /* XXX returns only the first character -- do not use XXX */
1074 /* XXX no locale support yet */
1075 STRLEN len;
1076 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1077 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1078}
1079
1080U32
1081Perl_to_uni_title_lc(pTHX_ U32 c)
1082{
ee099d14 1083 /* XXX returns only the first character XXX -- do not use XXX */
1084 /* XXX no locale support yet */
1085 STRLEN len;
1086 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1087 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1088}
1089
1090U32
1091Perl_to_uni_lower_lc(pTHX_ U32 c)
1092{
ee099d14 1093 /* XXX returns only the first character -- do not use XXX */
1094 /* XXX no locale support yet */
1095 STRLEN len;
1096 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1097 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1098}
1099
a0ed51b3 1100bool
864dbfa3 1101Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1102{
386d01d6 1103 if (!is_utf8_char(p))
1104 return FALSE;
a0ed51b3 1105 if (!PL_utf8_alnum)
289d4f09 1106 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1107 * descendant of isalnum(3), in other words, it doesn't
1108 * contain the '_'. --jhi */
1109 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1110 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1111/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1112#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1113 if (!PL_utf8_alnum)
1114 PL_utf8_alnum = swash_init("utf8", "",
1115 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1116 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1117#endif
1118}
1119
1120bool
b8c5462f 1121Perl_is_utf8_alnumc(pTHX_ U8 *p)
1122{
386d01d6 1123 if (!is_utf8_char(p))
1124 return FALSE;
b8c5462f 1125 if (!PL_utf8_alnum)
1126 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1127 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1128/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1129#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1130 if (!PL_utf8_alnum)
1131 PL_utf8_alnum = swash_init("utf8", "",
1132 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1133 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1134#endif
1135}
1136
1137bool
864dbfa3 1138Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3 1139{
1140 return *p == '_' || is_utf8_alpha(p);
1141}
1142
1143bool
864dbfa3 1144Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1145{
386d01d6 1146 if (!is_utf8_char(p))
1147 return FALSE;
a0ed51b3 1148 if (!PL_utf8_alpha)
e24b16f9 1149 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1150 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3 1151}
1152
1153bool
b8c5462f 1154Perl_is_utf8_ascii(pTHX_ U8 *p)
1155{
386d01d6 1156 if (!is_utf8_char(p))
1157 return FALSE;
b8c5462f 1158 if (!PL_utf8_ascii)
1159 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1160 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f 1161}
1162
1163bool
864dbfa3 1164Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1165{
386d01d6 1166 if (!is_utf8_char(p))
1167 return FALSE;
a0ed51b3 1168 if (!PL_utf8_space)
3bec3564 1169 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1170 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3 1171}
1172
1173bool
864dbfa3 1174Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1175{
386d01d6 1176 if (!is_utf8_char(p))
1177 return FALSE;
a0ed51b3 1178 if (!PL_utf8_digit)
e24b16f9 1179 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1180 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3 1181}
1182
1183bool
864dbfa3 1184Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1185{
386d01d6 1186 if (!is_utf8_char(p))
1187 return FALSE;
a0ed51b3 1188 if (!PL_utf8_upper)
e24b16f9 1189 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1190 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3 1191}
1192
1193bool
864dbfa3 1194Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1195{
386d01d6 1196 if (!is_utf8_char(p))
1197 return FALSE;
a0ed51b3 1198 if (!PL_utf8_lower)
e24b16f9 1199 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1200 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3 1201}
1202
1203bool
b8c5462f 1204Perl_is_utf8_cntrl(pTHX_ U8 *p)
1205{
386d01d6 1206 if (!is_utf8_char(p))
1207 return FALSE;
b8c5462f 1208 if (!PL_utf8_cntrl)
1209 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1210 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f 1211}
1212
1213bool
1214Perl_is_utf8_graph(pTHX_ U8 *p)
1215{
386d01d6 1216 if (!is_utf8_char(p))
1217 return FALSE;
b8c5462f 1218 if (!PL_utf8_graph)
1219 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1220 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f 1221}
1222
1223bool
864dbfa3 1224Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1225{
386d01d6 1226 if (!is_utf8_char(p))
1227 return FALSE;
a0ed51b3 1228 if (!PL_utf8_print)
e24b16f9 1229 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1230 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3 1231}
1232
1233bool
b8c5462f 1234Perl_is_utf8_punct(pTHX_ U8 *p)
1235{
386d01d6 1236 if (!is_utf8_char(p))
1237 return FALSE;
b8c5462f 1238 if (!PL_utf8_punct)
1239 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1240 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f 1241}
1242
1243bool
1244Perl_is_utf8_xdigit(pTHX_ U8 *p)
1245{
386d01d6 1246 if (!is_utf8_char(p))
1247 return FALSE;
b8c5462f 1248 if (!PL_utf8_xdigit)
1249 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1250 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f 1251}
1252
1253bool
864dbfa3 1254Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1255{
386d01d6 1256 if (!is_utf8_char(p))
1257 return FALSE;
a0ed51b3 1258 if (!PL_utf8_mark)
e24b16f9 1259 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1260 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3 1261}
1262
6b5c0936 1263/*
1264=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1265
1266The "p" contains the pointer to the UTF-8 string encoding
1267the character that is being converted.
1268
1269The "ustrp" is a pointer to the character buffer to put the
1270conversion result to. The "lenp" is a pointer to the length
1271of the result.
1272
1273The "swash" is a pointer to the swash to use.
1274
1275The "normal" is a string like "ToLower" which means the swash
1276$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1277and loaded by SWASHGET, using lib/utf8_heavy.pl.
1278
1279The "special" is a string like "utf8::ToSpecLower", which means
d1be9408 1280the hash %utf8::ToSpecLower, which is stored in the same file,
6b5c0936 1281lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1282to the hash is by Perl_to_utf8_case().
1283
1284=cut
1285 */
1286
2104c8d9 1287UV
a6872d42 1288Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1289{
1feea2c7 1290 UV uv0, uv1, uv2;
2f9475ad 1291 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1feea2c7 1292 STRLEN len;
a0ed51b3 1293
983ffd37 1294 if (!*swashp)
1295 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1feea2c7 1296 uv0 = utf8_to_uvchr(p, 0);
1297 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1298 * are necessary in EBCDIC, they are redundant no-ops
1299 * in ASCII-ish platforms, and hopefully optimized away. */
1300 uv1 = NATIVE_TO_UNI(uv0);
1301 uvuni_to_utf8(tmpbuf, uv1);
1302 uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1303 if (uv2) {
2a37f04d 1304 /* It was "normal" (a single character mapping). */
2f9475ad 1305 UV uv3 = UNI_TO_NATIVE(uv2);
1306
47654450 1307 len = uvuni_to_utf8(ustrp, uv3) - ustrp;
1feea2c7 1308 }
1309 else {
2a37f04d 1310 /* It might be "special" (sometimes, but not always,
1311 * a multicharacter mapping) */
983ffd37 1312 HV *hv;
1313 SV *keysv;
1314 HE *he;
2a37f04d 1315 SV *val;
2f9475ad 1316
983ffd37 1317 if ((hv = get_hv(special, FALSE)) &&
1feea2c7 1318 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
2a37f04d 1319 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1320 (val = HeVAL(he))) {
47654450 1321 char *s;
47654450 1322
2a37f04d 1323 s = SvPV(val, len);
47654450 1324 if (len == 1)
1325 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1326 else {
2f9475ad 1327#ifdef EBCDIC
1328 /* If we have EBCDIC we need to remap the characters
1329 * since any characters in the low 256 are Unicode
1330 * code points, not EBCDIC. */
7cda7a3d 1331 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1332
1333 d = tmpbuf;
1334 if (SvUTF8(val)) {
1335 STRLEN tlen = 0;
1336
1337 while (t < tend) {
1338 UV c = utf8_to_uvchr(t, &tlen);
1339 if (tlen > 0) {
1340 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1341 t += tlen;
1342 }
1343 else
1344 break;
1345 }
1346 }
1347 else {
47654450 1348 while (t < tend)
1349 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t++));
2f9475ad 1350 }
1351 len = d - tmpbuf;
1352 Copy(tmpbuf, ustrp, len, U8);
1353#else
d2dcd0fb 1354 Copy(s, ustrp, len, U8);
d2dcd0fb 1355 }
2f9475ad 1356#endif
983ffd37 1357 }
2a37f04d 1358 else {
1359 /* It was not "special", either. */
2f9475ad 1360 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
2a37f04d 1361 }
1362 }
1feea2c7 1363
2a37f04d 1364 if (lenp)
1365 *lenp = len;
1366
1367 return utf8_to_uvchr(ustrp, 0);
a0ed51b3 1368}
1369
d3e79532 1370/*
1371=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1372
1373Convert the UTF-8 encoded character at p to its uppercase version and
1374store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1375that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1376uppercase version may be longer than the original character (up to two
1377characters).
1378
1379The first character of the uppercased version is returned
1380(but note, as explained above, that there may be more.)
1381
1382=cut */
1383
2104c8d9 1384UV
983ffd37 1385Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1386{
983ffd37 1387 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1388 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1389}
a0ed51b3 1390
d3e79532 1391/*
1392=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1393
1394Convert the UTF-8 encoded character at p to its titlecase version and
1395store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1396that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1397titlecase version may be longer than the original character (up to two
1398characters).
1399
1400The first character of the titlecased version is returned
1401(but note, as explained above, that there may be more.)
1402
1403=cut */
1404
983ffd37 1405UV
1406Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1407{
1408 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1409 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1410}
1411
d3e79532 1412/*
1413=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1414
1415Convert the UTF-8 encoded character at p to its lowercase version and
1416store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1417that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1418lowercase version may be longer than the original character (up to two
1419characters).
1420
1421The first character of the lowercased version is returned
1422(but note, as explained above, that there may be more.)
1423
1424=cut */
1425
2104c8d9 1426UV
a2a2844f 1427Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1428{
983ffd37 1429 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1430 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1431}
1432
d3e79532 1433/*
1434=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1435
1436Convert the UTF-8 encoded character at p to its foldcase version and
1437store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1438that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1439foldcase version may be longer than the original character (up to
1440three characters).
1441
1442The first character of the foldcased version is returned
1443(but note, as explained above, that there may be more.)
1444
1445=cut */
1446
b4e400f9 1447UV
1448Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1449{
1450 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1451 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1452}
1453
1454/* a "swash" is a swatch hash */
1455
1456SV*
864dbfa3 1457Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1458{
1459 SV* retval;
bf1fed83 1460 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1461 dSP;
1b026014 1462 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1463 SV* errsv_save;
ce3b816e 1464
1b026014 1465 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1466 ENTER;
f8be5cf0 1467 errsv_save = newSVsv(ERRSV);
ce3b816e 1468 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0 1469 if (!SvTRUE(ERRSV))
1470 sv_setsv(ERRSV, errsv_save);
1471 SvREFCNT_dec(errsv_save);
ce3b816e 1472 LEAVE;
1473 }
1474 SPAGAIN;
a0ed51b3 1475 PUSHSTACKi(PERLSI_MAGIC);
1476 PUSHMARK(SP);
1477 EXTEND(SP,5);
1478 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1479 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1480 PUSHs(listsv);
1481 PUSHs(sv_2mortal(newSViv(minbits)));
1482 PUSHs(sv_2mortal(newSViv(none)));
1483 PUTBACK;
1484 ENTER;
1485 SAVEI32(PL_hints);
1486 PL_hints = 0;
1487 save_re_context();
bf1fed83 1488 if (PL_curcop == &PL_compiling)
1489 /* XXX ought to be handled by lex_start */
1490 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1491 errsv_save = newSVsv(ERRSV);
864dbfa3 1492 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1493 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1494 else
e24b16f9 1495 retval = &PL_sv_undef;
f8be5cf0 1496 if (!SvTRUE(ERRSV))
1497 sv_setsv(ERRSV, errsv_save);
1498 SvREFCNT_dec(errsv_save);
a0ed51b3 1499 LEAVE;
1500 POPSTACK;
e24b16f9 1501 if (PL_curcop == &PL_compiling) {
bf1fed83 1502 STRLEN len;
1503 char* pv = SvPV(tokenbufsv, len);
1504
1505 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1506 PL_curcop->op_private = PL_hints;
a0ed51b3 1507 }
1508 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1509 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3 1510 return retval;
1511}
1512
035d37be 1513
1514/* This API is wrong for special case conversions since we may need to
1515 * return several Unicode characters for a single Unicode character
1516 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1517 * the lower-level routine, and it is similarly broken for returning
1518 * multiple values. --jhi */
a0ed51b3 1519UV
3568d838 1520Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3 1521{
1522 HV* hv = (HV*)SvRV(sv);
3568d838 1523 U32 klen;
1524 U32 off;
a0ed51b3 1525 STRLEN slen;
7d85a32c 1526 STRLEN needents;
4ea42e7f 1527 U8 *tmps = NULL;
a0ed51b3 1528 U32 bit;
1529 SV *retval;
3568d838 1530 U8 tmputf8[2];
1531 UV c = NATIVE_TO_ASCII(*ptr);
1532
1533 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1534 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1535 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1536 ptr = tmputf8;
1537 }
1538 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1539 * then the "swatch" is a vec() for al the chars which start
1540 * with 0xAA..0xYY
1541 * So the key in the hash (klen) is length of encoded char -1
1542 */
1543 klen = UTF8SKIP(ptr) - 1;
1544 off = ptr[klen];
a0ed51b3 1545
7d85a32c 1546 if (klen == 0)
1547 {
1548 /* If char in invariant then swatch is for all the invariant chars
1549 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1550 */
1551 needents = UTF_CONTINUATION_MARK;
1552 off = NATIVE_TO_UTF(ptr[klen]);
1553 }
1554 else
1555 {
1556 /* If char is encoded then swatch is for the prefix */
1557 needents = (1 << UTF_ACCUMULATION_SHIFT);
1558 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1559 }
1560
a0ed51b3 1561 /*
1562 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1563 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1564 * it's nothing to sniff at.) Pity we usually come through at least
1565 * two function calls to get here...
1566 *
1567 * NB: this code assumes that swatches are never modified, once generated!
1568 */
1569
3568d838 1570 if (hv == PL_last_swash_hv &&
a0ed51b3 1571 klen == PL_last_swash_klen &&
3568d838 1572 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1573 {
1574 tmps = PL_last_swash_tmps;
1575 slen = PL_last_swash_slen;
1576 }
1577 else {
1578 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1579 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1580
1581 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1582 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1583 dSP;
2b9d42f0 1584 /* We use utf8n_to_uvuni() as we want an index into
1585 Unicode tables, not a native character number.
1586 */
1587 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
f8be5cf0 1588 SV *errsv_save;
a0ed51b3 1589 ENTER;
1590 SAVETMPS;
1591 save_re_context();
1592 PUSHSTACKi(PERLSI_MAGIC);
1593 PUSHMARK(SP);
1594 EXTEND(SP,3);
1595 PUSHs((SV*)sv);
ffbc6a93 1596 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1597 PUSHs(sv_2mortal(newSViv((klen) ?
1598 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1599 PUSHs(sv_2mortal(newSViv(needents)));
1600 PUTBACK;
f8be5cf0 1601 errsv_save = newSVsv(ERRSV);
864dbfa3 1602 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1603 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1604 else
e24b16f9 1605 retval = &PL_sv_undef;
f8be5cf0 1606 if (!SvTRUE(ERRSV))
1607 sv_setsv(ERRSV, errsv_save);
1608 SvREFCNT_dec(errsv_save);
a0ed51b3 1609 POPSTACK;
1610 FREETMPS;
1611 LEAVE;
e24b16f9 1612 if (PL_curcop == &PL_compiling)
1613 PL_curcop->op_private = PL_hints;
a0ed51b3 1614
dfe13c55 1615 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1616
7d85a32c 1617 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1618 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1619 }
1620
1621 PL_last_swash_hv = hv;
1622 PL_last_swash_klen = klen;
1623 PL_last_swash_tmps = tmps;
1624 PL_last_swash_slen = slen;
1625 if (klen)
1626 Copy(ptr, PL_last_swash_key, klen, U8);
1627 }
1628
9faf8d75 1629 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1630 case 1:
1631 bit = 1 << (off & 7);
1632 off >>= 3;
1633 return (tmps[off] & bit) != 0;
1634 case 8:
1635 return tmps[off];
1636 case 16:
1637 off <<= 1;
1638 return (tmps[off] << 8) + tmps[off + 1] ;
1639 case 32:
1640 off <<= 2;
1641 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1642 }
cea2e8a9 1643 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1644 return 0;
1645}
2b9d42f0 1646
1647
1648/*
37607a96 1649=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1650
1651Adds the UTF8 representation of the Native codepoint C<uv> to the end
1652of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1653bytes available. The return value is the pointer to the byte after the
1654end of the new character. In other words,
1655
1656 d = uvchr_to_utf8(d, uv);
1657
1658is the recommended wide native character-aware way of saying
1659
1660 *(d++) = uv;
1661
1662=cut
1663*/
1664
1665/* On ASCII machines this is normally a macro but we want a
1666 real function in case XS code wants it
1667*/
1668#undef Perl_uvchr_to_utf8
1669U8 *
1670Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1671{
b851fbc1 1672 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0 1673}
1674
b851fbc1 1675U8 *
1676Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1677{
1678 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1679}
2b9d42f0 1680
1681/*
37607a96 1682=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1683
1684Returns the native character value of the first character in the string C<s>
1685which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1686length, in bytes, of that character.
1687
1688Allows length and flags to be passed to low level routine.
1689
1690=cut
1691*/
0a2ef054 1692/* On ASCII machines this is normally a macro but we want
1693 a real function in case XS code wants it
2b9d42f0 1694*/
1695#undef Perl_utf8n_to_uvchr
1696UV
37607a96 1697Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1698{
1699 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1700 return UNI_TO_NATIVE(uv);
1701}
1702
d2cc3551 1703/*
1704=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1705
1706Build to the scalar dsv a displayable version of the string spv,
1707length len, the displayable version being at most pvlim bytes long
1708(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1709
9e55ce06 1710The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1711isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1712to display the \\[nrfta\\] as the backslashed versions (like '\n')
1713(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1714UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1715UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1716
d2cc3551 1717The pointer to the PV of the dsv is returned.
1718
1719=cut */
e6b2e755 1720char *
1721Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1722{
1723 int truncated = 0;
1724 char *s, *e;
1725
1726 sv_setpvn(dsv, "", 0);
1727 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1728 UV u;
c728cb41 1729 bool ok = FALSE;
1730
e6b2e755 1731 if (pvlim && SvCUR(dsv) >= pvlim) {
1732 truncated++;
1733 break;
1734 }
1735 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1736 if (u < 256) {
c728cb41 1737 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1738 switch (u & 0xFF) {
1739 case '\n':
1740 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1741 case '\r':
1742 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1743 case '\t':
1744 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1745 case '\f':
1746 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1747 case '\a':
1748 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1749 case '\\':
1750 Perl_sv_catpvf(aTHX_ dsv, "\\" ); ok = TRUE; break;
1751 default: break;
1752 }
1753 }
00e86452 1754 /* isPRINT() is the locale-blind version. */
1755 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
2c4547fe 1756 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
0a2ef054 1757 ok = TRUE;
1758 }
c728cb41 1759 }
1760 if (!ok)
9e55ce06 1761 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1762 }
1763 if (truncated)
1764 sv_catpvn(dsv, "...", 3);
1765
1766 return SvPVX(dsv);
1767}
2b9d42f0 1768
d2cc3551 1769/*
1770=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1771
1772Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1773the displayable version being at most pvlim bytes long
d2cc3551 1774(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1775
1776The flags argument is as in pv_uni_display().
1777
d2cc3551 1778The pointer to the PV of the dsv is returned.
1779
1780=cut */
e6b2e755 1781char *
1782Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1783{
701a277b 1784 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1785 pvlim, flags);
1786}
1787
d2cc3551 1788/*
d07ddd77 1789=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1790
1791Return true if the strings s1 and s2 differ case-insensitively, false
1792if not (if they are equal case-insensitively). If u1 is true, the
1793string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1794the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1795are false, the respective string is assumed to be in native 8-bit
1796encoding.
1797
1798If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1799in there (they will point at the beginning of the I<next> character).
1800If the pointers behind pe1 or pe2 are non-NULL, they are the end
1801pointers beyond which scanning will not continue under any
1802circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1803s2+l2 will be used as goal end pointers that will also stop the scan,
1804and which qualify towards defining a successful match: all the scans
1805that define an explicit length must reach their goal pointers for
1806a match to succeed).
d2cc3551 1807
1808For case-insensitiveness, the "casefolding" of Unicode is used
1809instead of upper/lowercasing both the characters, see
1810http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1811
1812=cut */
701a277b 1813I32
d07ddd77 1814Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1815{
5469e704 1816 register U8 *p1 = (U8*)s1;
1817 register U8 *p2 = (U8*)s2;
d07ddd77 1818 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1819 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1820 STRLEN n1 = 0, n2 = 0;
ffce6cc2 1821 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1822 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
d7f013c8 1823 U8 natbuf[1+1];
1824 STRLEN foldlen1, foldlen2;
d07ddd77 1825 bool match;
332ddc25 1826
d07ddd77 1827 if (pe1)
1828 e1 = *(U8**)pe1;
1829 if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1830 f1 = (U8*)s1 + l1;
1831 if (pe2)
1832 e2 = *(U8**)pe2;
1833 if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1834 f2 = (U8*)s2 + l2;
1835
1836 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1837 return 1; /* mismatch; possible infinite loop or false positive */
1838
a6872d42 1839 if (!u1 || !u2)
1840 natbuf[1] = 0; /* Need to terminate the buffer. */
1841
d07ddd77 1842 while ((e1 == 0 || p1 < e1) &&
1843 (f1 == 0 || p1 < f1) &&
1844 (e2 == 0 || p2 < e2) &&
1845 (f2 == 0 || p2 < f2)) {
1846 if (n1 == 0) {
d7f013c8 1847 if (u1)
1848 to_utf8_fold(p1, foldbuf1, &foldlen1);
1849 else {
f5cee151 1850 natbuf[0] = *p1;
d7f013c8 1851 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1852 }
1853 q1 = foldbuf1;
d07ddd77 1854 n1 = foldlen1;
332ddc25 1855 }
d07ddd77 1856 if (n2 == 0) {
d7f013c8 1857 if (u2)
1858 to_utf8_fold(p2, foldbuf2, &foldlen2);
1859 else {
f5cee151 1860 natbuf[0] = *p2;
d7f013c8 1861 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1862 }
1863 q2 = foldbuf2;
d07ddd77 1864 n2 = foldlen2;
332ddc25 1865 }
d07ddd77 1866 while (n1 && n2) {
1867 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1868 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1869 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1870 return 1; /* mismatch */
d07ddd77 1871 n1 -= UTF8SKIP(q1);
d7f013c8 1872 q1 += UTF8SKIP(q1);
d07ddd77 1873 n2 -= UTF8SKIP(q2);
d7f013c8 1874 q2 += UTF8SKIP(q2);
701a277b 1875 }
d07ddd77 1876 if (n1 == 0)
d7f013c8 1877 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1878 if (n2 == 0)
d7f013c8 1879 p2 += u2 ? UTF8SKIP(p2) : 1;
1880
d2cc3551 1881 }
5469e704 1882
d07ddd77 1883 /* A match is defined by all the scans that specified
1884 * an explicit length reaching their final goals. */
1885 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 1886
1887 if (match) {
d07ddd77 1888 if (pe1)
1889 *pe1 = (char*)p1;
1890 if (pe2)
1891 *pe2 = (char*)p2;
5469e704 1892 }
1893
1894 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1895}
701a277b 1896