Upgrade to Archive-Tar-1.30. Since change #27571 is not included,
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
b94e2f88 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 * by Larry Wall and others
a0ed51b3 5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3 26#include "perl.h"
27
27da23d5 28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7 31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29 34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7 37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1 47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1 53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485 57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1 68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3 87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6 90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6 95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6 98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463 102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 104 return d;
105 }
106 if (uv < 0x10000) {
eb160463 107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 110 return d;
111 }
112 if (uv < 0x200000) {
eb160463 113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463 120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463 128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3 138#endif
139 {
eb160463 140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463 151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
9041c2e3 169
646ca15d 170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
c053b435 188STATIC STRLEN
5f66b61c 189S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d 190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
77263263 206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
646ca15d 209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
9041c2e3 227
228/*
7fc63493 229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 230
5da9da9e 231Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01 232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
9041c2e3 235
82686b01 236=cut */
067a85ef 237STRLEN
7fc63493 238Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 239{
44f8325f 240 const STRLEN len = UTF8SKIP(s);
96a5add6 241 PERL_UNUSED_CONTEXT;
3b0fc154 242#ifdef IS_UTF8_CHAR
768c67ee 243 if (IS_UTF8_CHAR_FAST(len))
3b0fc154 244 return IS_UTF8_CHAR(s, len) ? len : 0;
245#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 246 return is_utf8_char_slow(s, len);
386d01d6 247}
248
6662521e 249/*
7fc63493 250=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 251
c9ada85f 252Returns true if first C<len> bytes of the given string form a valid
1e54db1a 253UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
254not mean 'a string that contains code points above 0x7F encoded in UTF-8'
255because a valid ASCII string is a valid UTF-8 string.
6662521e 256
768c67ee 257See also is_utf8_string_loclen() and is_utf8_string_loc().
258
6662521e 259=cut
260*/
261
8e84507e 262bool
7fc63493 263Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 264{
35da51f7 265 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 266 const U8* x = s;
067a85ef 267
96a5add6 268 PERL_UNUSED_CONTEXT;
1aa99e6b 269
6662521e 270 while (x < send) {
a3b680e6 271 STRLEN c;
1acdb0da 272 /* Inline the easy bits of is_utf8_char() here for speed... */
273 if (UTF8_IS_INVARIANT(*x))
274 c = 1;
275 else if (!UTF8_IS_START(*x))
768c67ee 276 goto out;
1acdb0da 277 else {
278 /* ... and call is_utf8_char() only if really needed. */
646ca15d 279#ifdef IS_UTF8_CHAR
280 c = UTF8SKIP(x);
768c67ee 281 if (IS_UTF8_CHAR_FAST(c)) {
282 if (!IS_UTF8_CHAR(x, c))
3c614e38 283 c = 0;
284 }
285 else
286 c = is_utf8_char_slow(x, c);
646ca15d 287#else
288 c = is_utf8_char(x);
289#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 290 if (!c)
768c67ee 291 goto out;
1acdb0da 292 }
6662521e 293 x += c;
6662521e 294 }
768c67ee 295
296 out:
60006e79 297 if (x != send)
298 return FALSE;
067a85ef 299
300 return TRUE;
6662521e 301}
302
67e989fb 303/*
814fafa7 304Implemented as a macro in utf8.h
305
306=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
307
308Like is_utf8_string() but stores the location of the failure (in the
309case of "utf8ness failure") or the location s+len (in the case of
310"utf8ness success") in the C<ep>.
311
312See also is_utf8_string_loclen() and is_utf8_string().
313
768c67ee 314=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 315
e3e4599f 316Like is_utf8_string() but stores the location of the failure (in the
768c67ee 317case of "utf8ness failure") or the location s+len (in the case of
318"utf8ness success") in the C<ep>, and the number of UTF-8
319encoded characters in the C<el>.
320
321See also is_utf8_string_loc() and is_utf8_string().
81cd54e3 322
323=cut
324*/
325
326bool
768c67ee 327Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 328{
35da51f7 329 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 330 const U8* x = s;
81cd54e3 331 STRLEN c;
3ebfea28 332 STRLEN outlen = 0;
96a5add6 333 PERL_UNUSED_CONTEXT;
81cd54e3 334
81cd54e3 335 while (x < send) {
336 /* Inline the easy bits of is_utf8_char() here for speed... */
337 if (UTF8_IS_INVARIANT(*x))
768c67ee 338 c = 1;
339 else if (!UTF8_IS_START(*x))
340 goto out;
81cd54e3 341 else {
768c67ee 342 /* ... and call is_utf8_char() only if really needed. */
343#ifdef IS_UTF8_CHAR
344 c = UTF8SKIP(x);
345 if (IS_UTF8_CHAR_FAST(c)) {
346 if (!IS_UTF8_CHAR(x, c))
347 c = 0;
348 } else
349 c = is_utf8_char_slow(x, c);
350#else
351 c = is_utf8_char(x);
352#endif /* #ifdef IS_UTF8_CHAR */
353 if (!c)
354 goto out;
81cd54e3 355 }
768c67ee 356 x += c;
3ebfea28 357 outlen++;
81cd54e3 358 }
768c67ee 359
360 out:
3ebfea28 361 if (el)
362 *el = outlen;
363
768c67ee 364 if (ep)
365 *ep = x;
3ebfea28 366 return (x == send);
81cd54e3 367}
368
369/*
768c67ee 370
7fc63493 371=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 372
9041c2e3 373Bottom level UTF-8 decode routine.
374Returns the unicode code point value of the first character in the string C<s>
1e54db1a 375which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 376C<retlen> will be set to the length, in bytes, of that character.
67e989fb 377
1e54db1a 378If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880 379is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
380it is assumed that the caller will raise a warning, and this function
28d3d195 381will silently just set C<retlen> to C<-1> and return zero. If the
382C<flags> does not contain UTF8_CHECK_ONLY, warnings about
383malformations will be given, C<retlen> will be set to the expected
384length of the UTF-8 character in bytes, and zero will be returned.
385
386The C<flags> can also contain various flags to allow deviations from
387the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 388
9041c2e3 389Most code should use utf8_to_uvchr() rather than call this directly.
390
37607a96 391=cut
392*/
67e989fb 393
a0ed51b3 394UV
7fc63493 395Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 396{
97aff369 397 dVAR;
d4c19fe8 398 const U8 * const s0 = s;
9c5ffd7c 399 UV uv = *s, ouv = 0;
ba210ebe 400 STRLEN len = 1;
7fc63493 401 const bool dowarn = ckWARN_d(WARN_UTF8);
402 const UV startbyte = *s;
ba210ebe 403 STRLEN expectlen = 0;
a0dbb045 404 U32 warning = 0;
405
406/* This list is a superset of the UTF8_ALLOW_XXX. */
407
408#define UTF8_WARN_EMPTY 1
409#define UTF8_WARN_CONTINUATION 2
410#define UTF8_WARN_NON_CONTINUATION 3
411#define UTF8_WARN_FE_FF 4
412#define UTF8_WARN_SHORT 5
413#define UTF8_WARN_OVERFLOW 6
414#define UTF8_WARN_SURROGATE 7
c867b360 415#define UTF8_WARN_LONG 8
416#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 417
418 if (curlen == 0 &&
419 !(flags & UTF8_ALLOW_EMPTY)) {
420 warning = UTF8_WARN_EMPTY;
0c443dc2 421 goto malformed;
422 }
423
1d72bdf6 424 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 425 if (retlen)
426 *retlen = 1;
c4d5f83a 427 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 428 }
67e989fb 429
421a8bf2 430 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 431 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 432 warning = UTF8_WARN_CONTINUATION;
ba210ebe 433 goto malformed;
434 }
435
421a8bf2 436 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 437 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 438 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 439 goto malformed;
440 }
9041c2e3 441
1d72bdf6 442#ifdef EBCDIC
75383841 443 uv = NATIVE_TO_UTF(uv);
1d72bdf6 444#else
fcc8fcf6 445 if ((uv == 0xfe || uv == 0xff) &&
446 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 447 warning = UTF8_WARN_FE_FF;
ba210ebe 448 goto malformed;
a0ed51b3 449 }
1d72bdf6 450#endif
451
ba210ebe 452 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
453 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
454 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
455 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 456#ifdef EBCDIC
457 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
458 else { len = 7; uv &= 0x01; }
459#else
ba210ebe 460 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
461 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 462 else { len = 13; uv = 0; } /* whoa! */
463#endif
464
a0ed51b3 465 if (retlen)
466 *retlen = len;
9041c2e3 467
ba210ebe 468 expectlen = len;
469
fcc8fcf6 470 if ((curlen < expectlen) &&
471 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 472 warning = UTF8_WARN_SHORT;
ba210ebe 473 goto malformed;
474 }
475
476 len--;
a0ed51b3 477 s++;
ba210ebe 478 ouv = uv;
479
a0ed51b3 480 while (len--) {
421a8bf2 481 if (!UTF8_IS_CONTINUATION(*s) &&
482 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 483 s--;
484 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 485 goto malformed;
a0ed51b3 486 }
487 else
8850bf83 488 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 489 if (!(uv > ouv)) {
490 /* These cannot be allowed. */
491 if (uv == ouv) {
75dbc644 492 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 493 warning = UTF8_WARN_LONG;
494 goto malformed;
495 }
496 }
497 else { /* uv < ouv */
498 /* This cannot be allowed. */
499 warning = UTF8_WARN_OVERFLOW;
500 goto malformed;
501 }
ba210ebe 502 }
503 s++;
504 ouv = uv;
505 }
506
421a8bf2 507 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 508 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 509 warning = UTF8_WARN_SURROGATE;
ba210ebe 510 goto malformed;
eb160463 511 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 512 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 513 warning = UTF8_WARN_LONG;
ba210ebe 514 goto malformed;
421a8bf2 515 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 516 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 517 warning = UTF8_WARN_FFFF;
a9917092 518 goto malformed;
a0ed51b3 519 }
ba210ebe 520
a0ed51b3 521 return uv;
ba210ebe 522
523malformed:
524
fcc8fcf6 525 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 526 if (retlen)
cc366d4b 527 *retlen = -1;
ba210ebe 528 return 0;
529 }
530
a0dbb045 531 if (dowarn) {
396482e1 532 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
a0dbb045 533
534 switch (warning) {
535 case 0: /* Intentionally empty. */ break;
536 case UTF8_WARN_EMPTY:
396482e1 537 sv_catpvs(sv, "(empty string)");
a0dbb045 538 break;
539 case UTF8_WARN_CONTINUATION:
097fb8e2 540 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 541 break;
542 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 543 if (s == s0)
544 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
545 (UV)s[1], startbyte);
551405c4 546 else {
547 const int len = (int)(s-s0);
097fb8e2 548 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4 549 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
550 }
551
a0dbb045 552 break;
553 case UTF8_WARN_FE_FF:
554 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
555 break;
556 case UTF8_WARN_SHORT:
097fb8e2 557 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 558 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 559 expectlen = curlen; /* distance for caller to skip */
a0dbb045 560 break;
561 case UTF8_WARN_OVERFLOW:
097fb8e2 562 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
563 ouv, *s, startbyte);
a0dbb045 564 break;
565 case UTF8_WARN_SURROGATE:
566 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
567 break;
a0dbb045 568 case UTF8_WARN_LONG:
097fb8e2 569 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 570 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 571 break;
572 case UTF8_WARN_FFFF:
573 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
574 break;
575 default:
396482e1 576 sv_catpvs(sv, "(unknown reason)");
a0dbb045 577 break;
578 }
579
580 if (warning) {
44f8325f 581 const char * const s = SvPVX_const(sv);
a0dbb045 582
583 if (PL_op)
9014280d 584 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 585 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 586 else
9014280d 587 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 588 }
589 }
590
ba210ebe 591 if (retlen)
28d3d195 592 *retlen = expectlen ? expectlen : len;
ba210ebe 593
28d3d195 594 return 0;
a0ed51b3 595}
596
8e84507e 597/*
7fc63493 598=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3 599
600Returns the native character value of the first character in the string C<s>
1e54db1a 601which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 602length, in bytes, of that character.
603
1e54db1a 604If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3 605returned and retlen is set, if possible, to -1.
606
607=cut
608*/
609
610UV
7fc63493 611Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 612{
1754c1a1 613 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
614 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 615}
616
617/*
7fc63493 618=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3 619
620Returns the Unicode code point of the first character in the string C<s>
1e54db1a 621which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 622length, in bytes, of that character.
623
624This function should only be used when returned UV is considered
625an index into the Unicode semantic tables (e.g. swashes).
626
1e54db1a 627If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 628returned and retlen is set, if possible, to -1.
8e84507e 629
630=cut
631*/
632
633UV
7fc63493 634Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 635{
9041c2e3 636 /* Call the low level routine asking for checks */
89ebb4a3 637 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 638 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 639}
640
b76347f2 641/*
35a4481c 642=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2 643
644Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 645Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
646up past C<e>, croaks.
b76347f2 647
648=cut
649*/
650
651STRLEN
35a4481c 652Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 653{
97aff369 654 dVAR;
b76347f2 655 STRLEN len = 0;
656
8850bf83 657 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
658 * the bitops (especially ~) can create illegal UTF-8.
659 * In other words: in Perl UTF-8 is not just for Unicode. */
660
a3b680e6 661 if (e < s)
662 goto warn_and_return;
b76347f2 663 while (s < e) {
4373e329 664 const U8 t = UTF8SKIP(s);
901b21bf 665 if (e - s < t) {
a3b680e6 666 warn_and_return:
901b21bf 667 if (ckWARN_d(WARN_UTF8)) {
668 if (PL_op)
669 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 670 "%s in %s", unees, OP_DESC(PL_op));
901b21bf 671 else
672 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
673 }
674 return len;
675 }
b76347f2 676 s += t;
677 len++;
678 }
679
680 return len;
681}
682
b06226ff 683/*
35a4481c 684=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 685
1e54db1a 686Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff 687and C<b>.
688
689WARNING: use only if you *know* that the pointers point inside the
690same UTF-8 buffer.
691
37607a96 692=cut
693*/
a0ed51b3 694
02eb7b47 695IV
35a4481c 696Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 697{
bf1665bc 698 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3 699}
700
b06226ff 701/*
37607a96 702=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 703
8850bf83 704Return the UTF-8 pointer C<s> displaced by C<off> characters, either
705forward or backward.
b06226ff 706
707WARNING: do not use the following unless you *know* C<off> is within
8850bf83 708the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
709on the first byte of character or just after the last byte of a character.
b06226ff 710
37607a96 711=cut
712*/
a0ed51b3 713
714U8 *
4373e329 715Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 716{
96a5add6 717 PERL_UNUSED_CONTEXT;
8850bf83 718 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
719 * the bitops (especially ~) can create illegal UTF-8.
720 * In other words: in Perl UTF-8 is not just for Unicode. */
721
a0ed51b3 722 if (off >= 0) {
723 while (off--)
724 s += UTF8SKIP(s);
725 }
726 else {
727 while (off++) {
728 s--;
8850bf83 729 while (UTF8_IS_CONTINUATION(*s))
730 s--;
a0ed51b3 731 }
732 }
4373e329 733 return (U8 *)s;
a0ed51b3 734}
735
6940069f 736/*
eebe1485 737=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 738
1e54db1a 739Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53 740Unlike C<bytes_to_utf8>, this over-writes the original string, and
741updates len to contain the new length.
67e989fb 742Returns zero on failure, setting C<len> to -1.
6940069f 743
95be277c 744If you need a copy of the string, see C<bytes_from_utf8>.
745
6940069f 746=cut
747*/
748
749U8 *
37607a96 750Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 751{
d4c19fe8 752 U8 * const save = s;
753 U8 * const send = s + *len;
6940069f 754 U8 *d;
246fae53 755
1e54db1a 756 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 757 while (s < send) {
dcad2880 758 U8 c = *s++;
759
1d72bdf6 760 if (!UTF8_IS_INVARIANT(c) &&
761 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
762 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 763 *len = -1;
764 return 0;
765 }
246fae53 766 }
dcad2880 767
768 d = s = save;
6940069f 769 while (s < send) {
ed646e6e 770 STRLEN ulen;
9041c2e3 771 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 772 s += ulen;
6940069f 773 }
774 *d = '\0';
246fae53 775 *len = d - save;
6940069f 776 return save;
777}
778
779/*
e1ec3a88 780=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 781
1e54db1a 782Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 783Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 784the newly-created string, and updates C<len> to contain the new
785length. Returns the original string if no conversion occurs, C<len>
786is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7870 if C<s> is converted or contains all 7bit characters.
f9a63242 788
37607a96 789=cut
790*/
f9a63242 791
792U8 *
e1ec3a88 793Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 794{
f9a63242 795 U8 *d;
e1ec3a88 796 const U8 *start = s;
797 const U8 *send;
f9a63242 798 I32 count = 0;
799
96a5add6 800 PERL_UNUSED_CONTEXT;
f9a63242 801 if (!*is_utf8)
73d840c0 802 return (U8 *)start;
f9a63242 803
1e54db1a 804 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 805 for (send = s + *len; s < send;) {
e1ec3a88 806 U8 c = *s++;
1d72bdf6 807 if (!UTF8_IS_INVARIANT(c)) {
db42d148 808 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
809 (c = *s++) && UTF8_IS_CONTINUATION(c))
810 count++;
811 else
73d840c0 812 return (U8 *)start;
db42d148 813 }
f9a63242 814 }
815
35da51f7 816 *is_utf8 = FALSE;
f9a63242 817
212542aa 818 Newx(d, (*len) - count + 1, U8);
ef9edfd0 819 s = start; start = d;
f9a63242 820 while (s < send) {
821 U8 c = *s++;
c4d5f83a 822 if (!UTF8_IS_INVARIANT(c)) {
823 /* Then it is two-byte encoded */
824 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
825 c = ASCII_TO_NATIVE(c);
826 }
827 *d++ = c;
f9a63242 828 }
829 *d = '\0';
830 *len = d - start;
73d840c0 831 return (U8 *)start;
f9a63242 832}
833
834/*
35a4481c 835=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 836
1e54db1a 837Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e 838Returns a pointer to the newly-created string, and sets C<len> to
839reflect the new length.
6940069f 840
1e54db1a 841If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f 842see sv_recode_to_utf8().
843
497711e7 844=cut
6940069f 845*/
846
847U8*
35a4481c 848Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 849{
35a4481c 850 const U8 * const send = s + (*len);
6940069f 851 U8 *d;
852 U8 *dst;
96a5add6 853 PERL_UNUSED_CONTEXT;
6940069f 854
212542aa 855 Newx(d, (*len) * 2 + 1, U8);
6940069f 856 dst = d;
857
858 while (s < send) {
35a4481c 859 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 860 if (UNI_IS_INVARIANT(uv))
eb160463 861 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 862 else {
eb160463 863 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
864 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f 865 }
866 }
867 *d = '\0';
6662521e 868 *len = d-dst;
6940069f 869 return dst;
870}
871
a0ed51b3 872/*
dea0fc0b 873 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 874 *
875 * Destination must be pre-extended to 3/2 source. Do not use in-place.
876 * We optimize for native, for obvious reasons. */
877
878U8*
dea0fc0b 879Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 880{
dea0fc0b 881 U8* pend;
882 U8* dstart = d;
883
1de9afcd 884 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
885 d[0] = 0;
886 *newlen = 1;
887 return d;
888 }
889
dea0fc0b 890 if (bytelen & 1)
014ead4b 891 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b 892
893 pend = p + bytelen;
894
a0ed51b3 895 while (p < pend) {
dea0fc0b 896 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
897 p += 2;
a0ed51b3 898 if (uv < 0x80) {
e294cc5d 899#ifdef EBCDIC
900 *d++ = UNI_TO_NATIVE(uv);
901#else
eb160463 902 *d++ = (U8)uv;
e294cc5d 903#endif
a0ed51b3 904 continue;
905 }
906 if (uv < 0x800) {
eb160463 907 *d++ = (U8)(( uv >> 6) | 0xc0);
908 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 909 continue;
910 }
911 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e 912 UV low = (p[0] << 8) + p[1];
913 p += 2;
dea0fc0b 914 if (low < 0xdc00 || low >= 0xdfff)
915 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 916 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
917 }
918 if (uv < 0x10000) {
eb160463 919 *d++ = (U8)(( uv >> 12) | 0xe0);
920 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
921 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 922 continue;
923 }
924 else {
eb160463 925 *d++ = (U8)(( uv >> 18) | 0xf0);
926 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
927 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
928 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 929 continue;
930 }
931 }
dea0fc0b 932 *newlen = d - dstart;
a0ed51b3 933 return d;
934}
935
936/* Note: this one is slightly destructive of the source. */
937
938U8*
dea0fc0b 939Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 940{
941 U8* s = (U8*)p;
d4c19fe8 942 U8* const send = s + bytelen;
a0ed51b3 943 while (s < send) {
d4c19fe8 944 const U8 tmp = s[0];
a0ed51b3 945 s[0] = s[1];
946 s[1] = tmp;
947 s += 2;
948 }
dea0fc0b 949 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 950}
951
952/* for now these are all defined (inefficiently) in terms of the utf8 versions */
953
954bool
84afefe6 955Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 956{
89ebb4a3 957 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 958 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 959 return is_utf8_alnum(tmpbuf);
960}
961
962bool
84afefe6 963Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 964{
89ebb4a3 965 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 966 uvchr_to_utf8(tmpbuf, c);
b8c5462f 967 return is_utf8_alnumc(tmpbuf);
968}
969
970bool
84afefe6 971Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 972{
89ebb4a3 973 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 974 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 975 return is_utf8_idfirst(tmpbuf);
976}
977
978bool
84afefe6 979Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 980{
89ebb4a3 981 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 982 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 983 return is_utf8_alpha(tmpbuf);
984}
985
986bool
84afefe6 987Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 988{
89ebb4a3 989 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 990 uvchr_to_utf8(tmpbuf, c);
4d61ec05 991 return is_utf8_ascii(tmpbuf);
992}
993
994bool
84afefe6 995Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 996{
89ebb4a3 997 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 998 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 999 return is_utf8_space(tmpbuf);
1000}
1001
1002bool
84afefe6 1003Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1004{
89ebb4a3 1005 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1006 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1007 return is_utf8_digit(tmpbuf);
1008}
1009
1010bool
84afefe6 1011Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1012{
89ebb4a3 1013 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1014 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1015 return is_utf8_upper(tmpbuf);
1016}
1017
1018bool
84afefe6 1019Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1020{
89ebb4a3 1021 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1022 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1023 return is_utf8_lower(tmpbuf);
1024}
1025
1026bool
84afefe6 1027Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1028{
89ebb4a3 1029 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1030 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1031 return is_utf8_cntrl(tmpbuf);
1032}
1033
1034bool
84afefe6 1035Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1036{
89ebb4a3 1037 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1038 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1039 return is_utf8_graph(tmpbuf);
1040}
1041
1042bool
84afefe6 1043Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1044{
89ebb4a3 1045 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1046 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1047 return is_utf8_print(tmpbuf);
1048}
1049
b8c5462f 1050bool
84afefe6 1051Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1052{
89ebb4a3 1053 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1054 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1055 return is_utf8_punct(tmpbuf);
1056}
1057
4d61ec05 1058bool
84afefe6 1059Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1060{
89ebb4a3 1061 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1062 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1063 return is_utf8_xdigit(tmpbuf);
1064}
1065
84afefe6 1066UV
1067Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1068{
0ebc6274 1069 uvchr_to_utf8(p, c);
1070 return to_utf8_upper(p, p, lenp);
a0ed51b3 1071}
1072
84afefe6 1073UV
1074Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1075{
0ebc6274 1076 uvchr_to_utf8(p, c);
1077 return to_utf8_title(p, p, lenp);
a0ed51b3 1078}
1079
84afefe6 1080UV
1081Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1082{
0ebc6274 1083 uvchr_to_utf8(p, c);
1084 return to_utf8_lower(p, p, lenp);
a0ed51b3 1085}
1086
84afefe6 1087UV
1088Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1089{
0ebc6274 1090 uvchr_to_utf8(p, c);
1091 return to_utf8_fold(p, p, lenp);
84afefe6 1092}
1093
a0ed51b3 1094/* for now these all assume no locale info available for Unicode > 255 */
1095
1096bool
84afefe6 1097Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1098{
1099 return is_uni_alnum(c); /* XXX no locale support yet */
1100}
1101
1102bool
84afefe6 1103Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1104{
1105 return is_uni_alnumc(c); /* XXX no locale support yet */
1106}
1107
1108bool
84afefe6 1109Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1110{
1111 return is_uni_idfirst(c); /* XXX no locale support yet */
1112}
1113
1114bool
84afefe6 1115Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1116{
1117 return is_uni_alpha(c); /* XXX no locale support yet */
1118}
1119
1120bool
84afefe6 1121Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1122{
1123 return is_uni_ascii(c); /* XXX no locale support yet */
1124}
1125
1126bool
84afefe6 1127Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1128{
1129 return is_uni_space(c); /* XXX no locale support yet */
1130}
1131
1132bool
84afefe6 1133Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1134{
1135 return is_uni_digit(c); /* XXX no locale support yet */
1136}
1137
1138bool
84afefe6 1139Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1140{
1141 return is_uni_upper(c); /* XXX no locale support yet */
1142}
1143
1144bool
84afefe6 1145Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1146{
1147 return is_uni_lower(c); /* XXX no locale support yet */
1148}
1149
1150bool
84afefe6 1151Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1152{
1153 return is_uni_cntrl(c); /* XXX no locale support yet */
1154}
1155
1156bool
84afefe6 1157Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1158{
1159 return is_uni_graph(c); /* XXX no locale support yet */
1160}
1161
1162bool
84afefe6 1163Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1164{
1165 return is_uni_print(c); /* XXX no locale support yet */
1166}
1167
b8c5462f 1168bool
84afefe6 1169Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1170{
1171 return is_uni_punct(c); /* XXX no locale support yet */
1172}
1173
4d61ec05 1174bool
84afefe6 1175Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1176{
1177 return is_uni_xdigit(c); /* XXX no locale support yet */
1178}
1179
b7ac61fa 1180U32
1181Perl_to_uni_upper_lc(pTHX_ U32 c)
1182{
ee099d14 1183 /* XXX returns only the first character -- do not use XXX */
1184 /* XXX no locale support yet */
1185 STRLEN len;
89ebb4a3 1186 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1187 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1188}
1189
1190U32
1191Perl_to_uni_title_lc(pTHX_ U32 c)
1192{
ee099d14 1193 /* XXX returns only the first character XXX -- do not use XXX */
1194 /* XXX no locale support yet */
1195 STRLEN len;
89ebb4a3 1196 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1197 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1198}
1199
1200U32
1201Perl_to_uni_lower_lc(pTHX_ U32 c)
1202{
ee099d14 1203 /* XXX returns only the first character -- do not use XXX */
1204 /* XXX no locale support yet */
1205 STRLEN len;
89ebb4a3 1206 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1207 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1208}
1209
7452cf6a 1210static bool
5141f98e 1211S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d 1212 const char *const swashname)
1213{
97aff369 1214 dVAR;
bde6a22d 1215 if (!is_utf8_char(p))
1216 return FALSE;
1217 if (!*swash)
711a919c 1218 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d 1219 return swash_fetch(*swash, p, TRUE) != 0;
1220}
1221
1222bool
7fc63493 1223Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1224{
97aff369 1225 dVAR;
671c33bf 1226 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1227 * descendant of isalnum(3), in other words, it doesn't
1228 * contain the '_'. --jhi */
d4c19fe8 1229 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3 1230}
1231
1232bool
7fc63493 1233Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1234{
97aff369 1235 dVAR;
d4c19fe8 1236 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f 1237}
1238
1239bool
7fc63493 1240Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1241{
97aff369 1242 dVAR;
82686b01 1243 if (*p == '_')
1244 return TRUE;
bde6a22d 1245 /* is_utf8_idstart would be more logical. */
d4c19fe8 1246 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01 1247}
1248
1249bool
7fc63493 1250Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1251{
97aff369 1252 dVAR;
82686b01 1253 if (*p == '_')
1254 return TRUE;
d4c19fe8 1255 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3 1256}
1257
1258bool
7fc63493 1259Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1260{
97aff369 1261 dVAR;
d4c19fe8 1262 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3 1263}
1264
1265bool
7fc63493 1266Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1267{
97aff369 1268 dVAR;
d4c19fe8 1269 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f 1270}
1271
1272bool
7fc63493 1273Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1274{
97aff369 1275 dVAR;
d4c19fe8 1276 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3 1277}
1278
1279bool
7fc63493 1280Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1281{
97aff369 1282 dVAR;
d4c19fe8 1283 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3 1284}
1285
1286bool
7fc63493 1287Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1288{
97aff369 1289 dVAR;
d4c19fe8 1290 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3 1291}
1292
1293bool
7fc63493 1294Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1295{
97aff369 1296 dVAR;
d4c19fe8 1297 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3 1298}
1299
1300bool
7fc63493 1301Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1302{
97aff369 1303 dVAR;
d4c19fe8 1304 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f 1305}
1306
1307bool
7fc63493 1308Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1309{
97aff369 1310 dVAR;
d4c19fe8 1311 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f 1312}
1313
1314bool
7fc63493 1315Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1316{
97aff369 1317 dVAR;
d4c19fe8 1318 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3 1319}
1320
1321bool
7fc63493 1322Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1323{
97aff369 1324 dVAR;
d4c19fe8 1325 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f 1326}
1327
1328bool
7fc63493 1329Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1330{
97aff369 1331 dVAR;
d4c19fe8 1332 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f 1333}
1334
1335bool
7fc63493 1336Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1337{
97aff369 1338 dVAR;
d4c19fe8 1339 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3 1340}
1341
6b5c0936 1342/*
1343=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1344
1345The "p" contains the pointer to the UTF-8 string encoding
1346the character that is being converted.
1347
1348The "ustrp" is a pointer to the character buffer to put the
1349conversion result to. The "lenp" is a pointer to the length
1350of the result.
1351
0134edef 1352The "swashp" is a pointer to the swash to use.
6b5c0936 1353
0134edef 1354Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1355and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1356but not always, a multicharacter mapping), is tried first.
6b5c0936 1357
0134edef 1358The "special" is a string like "utf8::ToSpecLower", which means the
1359hash %utf8::ToSpecLower. The access to the hash is through
1360Perl_to_utf8_case().
6b5c0936 1361
0134edef 1362The "normal" is a string like "ToLower" which means the swash
1363%utf8::ToLower.
1364
1365=cut */
6b5c0936 1366
2104c8d9 1367UV
9a957fbc 1368Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1369 SV **swashp, const char *normal, const char *special)
a0ed51b3 1370{
97aff369 1371 dVAR;
89ebb4a3 1372 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1373 STRLEN len = 0;
a0ed51b3 1374
aec46f14 1375 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7 1376 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1377 * are necessary in EBCDIC, they are redundant no-ops
1378 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1379 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1380 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1381
1382 if (!*swashp) /* load on-demand */
1383 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1384
b08cf34e 1385 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1386 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1387 /* It might be "special" (sometimes, but not always,
2a37f04d 1388 * a multicharacter mapping) */
35da51f7 1389 HV * const hv = get_hv(special, FALSE);
b08cf34e 1390 SV **svp;
1391
35da51f7 1392 if (hv &&
b08cf34e 1393 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1394 (*svp)) {
cfd0369c 1395 const char *s;
47654450 1396
cfd0369c 1397 s = SvPV_const(*svp, len);
47654450 1398 if (len == 1)
1399 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1400 else {
2f9475ad 1401#ifdef EBCDIC
1402 /* If we have EBCDIC we need to remap the characters
1403 * since any characters in the low 256 are Unicode
1404 * code points, not EBCDIC. */
7cda7a3d 1405 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1406
1407 d = tmpbuf;
b08cf34e 1408 if (SvUTF8(*svp)) {
2f9475ad 1409 STRLEN tlen = 0;
1410
1411 while (t < tend) {
d4c19fe8 1412 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad 1413 if (tlen > 0) {
1414 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1415 t += tlen;
1416 }
1417 else
1418 break;
1419 }
1420 }
1421 else {
36fec512 1422 while (t < tend) {
1423 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1424 t++;
1425 }
2f9475ad 1426 }
1427 len = d - tmpbuf;
1428 Copy(tmpbuf, ustrp, len, U8);
1429#else
d2dcd0fb 1430 Copy(s, ustrp, len, U8);
2f9475ad 1431#endif
29e98929 1432 }
983ffd37 1433 }
0134edef 1434 }
1435
1436 if (!len && *swashp) {
d4c19fe8 1437 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1438
0134edef 1439 if (uv2) {
1440 /* It was "normal" (a single character mapping). */
d4c19fe8 1441 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1442 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1443 }
1444 }
1feea2c7 1445
0134edef 1446 if (!len) /* Neither: just copy. */
1447 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1448
2a37f04d 1449 if (lenp)
1450 *lenp = len;
1451
0134edef 1452 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1453}
1454
d3e79532 1455/*
7fc63493 1456=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1457
1458Convert the UTF-8 encoded character at p to its uppercase version and
1459store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1460that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1461the uppercase version may be longer than the original character.
d3e79532 1462
1463The first character of the uppercased version is returned
1464(but note, as explained above, that there may be more.)
1465
1466=cut */
1467
2104c8d9 1468UV
7fc63493 1469Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1470{
97aff369 1471 dVAR;
983ffd37 1472 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1473 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1474}
a0ed51b3 1475
d3e79532 1476/*
7fc63493 1477=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1478
1479Convert the UTF-8 encoded character at p to its titlecase version and
1480store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1481that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1482titlecase version may be longer than the original character.
d3e79532 1483
1484The first character of the titlecased version is returned
1485(but note, as explained above, that there may be more.)
1486
1487=cut */
1488
983ffd37 1489UV
7fc63493 1490Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1491{
97aff369 1492 dVAR;
983ffd37 1493 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1494 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1495}
1496
d3e79532 1497/*
7fc63493 1498=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1499
1500Convert the UTF-8 encoded character at p to its lowercase version and
1501store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1502that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1503lowercase version may be longer than the original character.
d3e79532 1504
1505The first character of the lowercased version is returned
1506(but note, as explained above, that there may be more.)
1507
1508=cut */
1509
2104c8d9 1510UV
7fc63493 1511Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1512{
97aff369 1513 dVAR;
983ffd37 1514 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1515 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1516}
1517
d3e79532 1518/*
7fc63493 1519=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1520
1521Convert the UTF-8 encoded character at p to its foldcase version and
1522store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1523that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532 1524foldcase version may be longer than the original character (up to
1525three characters).
1526
1527The first character of the foldcased version is returned
1528(but note, as explained above, that there may be more.)
1529
1530=cut */
1531
b4e400f9 1532UV
7fc63493 1533Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1534{
97aff369 1535 dVAR;
b4e400f9 1536 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1537 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1538}
1539
711a919c 1540/* Note:
1541 * A "swash" is a swatch hash.
1542 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1543 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1544 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1545 */
a0ed51b3 1546SV*
7fc63493 1547Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1548{
27da23d5 1549 dVAR;
a0ed51b3 1550 SV* retval;
9a957fbc 1551 SV* const tokenbufsv = sv_newmortal();
8e84507e 1552 dSP;
7fc63493 1553 const size_t pkg_len = strlen(pkg);
1554 const size_t name_len = strlen(name);
aec46f14 1555 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1556 SV* errsv_save;
ce3b816e 1557
96ca9f55 1558 PUSHSTACKi(PERLSI_MAGIC);
1559 ENTER;
1560 SAVEI32(PL_hints);
1561 PL_hints = 0;
1562 save_re_context();
1b026014 1563 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1564 ENTER;
f8be5cf0 1565 errsv_save = newSVsv(ERRSV);
dc0c6abb 1566 /* It is assumed that callers of this routine are not passing in any
1567 user derived data. */
1568 /* Need to do this after save_re_context() as it will set PL_tainted to
1569 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1570 Even line to create errsv_save can turn on PL_tainted. */
1571 SAVEBOOL(PL_tainted);
1572 PL_tainted = 0;
71bed85a 1573 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1574 NULL);
f8be5cf0 1575 if (!SvTRUE(ERRSV))
1576 sv_setsv(ERRSV, errsv_save);
1577 SvREFCNT_dec(errsv_save);
ce3b816e 1578 LEAVE;
1579 }
1580 SPAGAIN;
a0ed51b3 1581 PUSHMARK(SP);
1582 EXTEND(SP,5);
71bed85a 1583 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1584 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3 1585 PUSHs(listsv);
1586 PUSHs(sv_2mortal(newSViv(minbits)));
1587 PUSHs(sv_2mortal(newSViv(none)));
1588 PUTBACK;
923e4eb5 1589 if (IN_PERL_COMPILETIME) {
bf1fed83 1590 /* XXX ought to be handled by lex_start */
82686b01 1591 SAVEI32(PL_in_my);
2b4bd638 1592 PL_in_my = 0;
bf1fed83 1593 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1594 }
f8be5cf0 1595 errsv_save = newSVsv(ERRSV);
864dbfa3 1596 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1597 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1598 else
e24b16f9 1599 retval = &PL_sv_undef;
f8be5cf0 1600 if (!SvTRUE(ERRSV))
1601 sv_setsv(ERRSV, errsv_save);
1602 SvREFCNT_dec(errsv_save);
a0ed51b3 1603 LEAVE;
1604 POPSTACK;
923e4eb5 1605 if (IN_PERL_COMPILETIME) {
bf1fed83 1606 STRLEN len;
aec46f14 1607 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83 1608
1609 Copy(pv, PL_tokenbuf, len+1, char);
623e6609 1610 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1611 }
bc45ce41 1612 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1613 if (SvPOK(retval))
35c1215d 1614 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
95b63a38 1615 (void*)retval);
cea2e8a9 1616 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1617 }
a0ed51b3 1618 return retval;
1619}
1620
035d37be 1621
1622/* This API is wrong for special case conversions since we may need to
1623 * return several Unicode characters for a single Unicode character
1624 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1625 * the lower-level routine, and it is similarly broken for returning
1626 * multiple values. --jhi */
979f2922 1627/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c 1628
1629/* Note:
1630 * Returns the value of property/mapping C<swash> for the first character
1631 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1632 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1633 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1634 */
a0ed51b3 1635UV
680c470c 1636Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1637{
27da23d5 1638 dVAR;
680c470c 1639 HV* const hv = (HV*)SvRV(swash);
3568d838 1640 U32 klen;
1641 U32 off;
a0ed51b3 1642 STRLEN slen;
7d85a32c 1643 STRLEN needents;
cfd0369c 1644 const U8 *tmps = NULL;
a0ed51b3 1645 U32 bit;
979f2922 1646 SV *swatch;
3568d838 1647 U8 tmputf8[2];
35da51f7 1648 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838 1649
1650 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922 1651 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1652 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1653 ptr = tmputf8;
3568d838 1654 }
1655 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1656 * then the "swatch" is a vec() for al the chars which start
1657 * with 0xAA..0xYY
1658 * So the key in the hash (klen) is length of encoded char -1
1659 */
1660 klen = UTF8SKIP(ptr) - 1;
1661 off = ptr[klen];
a0ed51b3 1662
979f2922 1663 if (klen == 0) {
7d85a32c 1664 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1665 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1666 */
979f2922 1667 needents = UTF_CONTINUATION_MARK;
1668 off = NATIVE_TO_UTF(ptr[klen]);
1669 }
1670 else {
7d85a32c 1671 /* If char is encoded then swatch is for the prefix */
979f2922 1672 needents = (1 << UTF_ACCUMULATION_SHIFT);
1673 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1674 }
7d85a32c 1675
a0ed51b3 1676 /*
1677 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1678 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1679 * it's nothing to sniff at.) Pity we usually come through at least
1680 * two function calls to get here...
1681 *
1682 * NB: this code assumes that swatches are never modified, once generated!
1683 */
1684
3568d838 1685 if (hv == PL_last_swash_hv &&
a0ed51b3 1686 klen == PL_last_swash_klen &&
27da23d5 1687 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1688 {
1689 tmps = PL_last_swash_tmps;
1690 slen = PL_last_swash_slen;
1691 }
1692 else {
1693 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1694 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1695
979f2922 1696 /* If not cached, generate it via swash_get */
1697 if (!svp || !SvPOK(*svp)
1698 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0 1699 /* We use utf8n_to_uvuni() as we want an index into
1700 Unicode tables, not a native character number.
1701 */
aec46f14 1702 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae 1703 ckWARN(WARN_UTF8) ?
1704 0 : UTF8_ALLOW_ANY);
680c470c 1705 swatch = swash_get(swash,
979f2922 1706 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1707 (klen) ? (code_point & ~(needents - 1)) : 0,
1708 needents);
1709
923e4eb5 1710 if (IN_PERL_COMPILETIME)
623e6609 1711 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1712
979f2922 1713 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1714
979f2922 1715 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1716 || (slen << 3) < needents)
660a4616 1717 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3 1718 }
1719
1720 PL_last_swash_hv = hv;
1721 PL_last_swash_klen = klen;
cfd0369c 1722 /* FIXME change interpvar.h? */
1723 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3 1724 PL_last_swash_slen = slen;
1725 if (klen)
1726 Copy(ptr, PL_last_swash_key, klen, U8);
1727 }
1728
9faf8d75 1729 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1730 case 1:
1731 bit = 1 << (off & 7);
1732 off >>= 3;
1733 return (tmps[off] & bit) != 0;
1734 case 8:
1735 return tmps[off];
1736 case 16:
1737 off <<= 1;
1738 return (tmps[off] << 8) + tmps[off + 1] ;
1739 case 32:
1740 off <<= 2;
1741 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1742 }
660a4616 1743 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
670f1322 1744 NORETURN_FUNCTION_END;
a0ed51b3 1745}
2b9d42f0 1746
979f2922 1747/* Note:
1748 * Returns a swatch (a bit vector string) for a code point sequence
1749 * that starts from the value C<start> and comprises the number C<span>.
1750 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1751 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1752 */
1753STATIC SV*
1754S_swash_get(pTHX_ SV* swash, UV start, UV span)
1755{
1756 SV *swatch;
711a919c 1757 U8 *l, *lend, *x, *xend, *s;
979f2922 1758 STRLEN lcur, xcur, scur;
1759
1760 HV* const hv = (HV*)SvRV(swash);
017a3ce5 1761 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1762 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1763 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1764 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1765 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802 1766 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1767 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1768 const STRLEN bits = SvUV(*bitssvp);
1769 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1770 const UV none = SvUV(*nonesvp);
1771 const UV end = start + span;
979f2922 1772
1773 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616 1774 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1775 (UV)bits);
979f2922 1776 }
1777
1778 /* create and initialize $swatch */
396482e1 1779 swatch = newSVpvs("");
979f2922 1780 scur = octets ? (span * octets) : (span + 7) / 8;
1781 SvGROW(swatch, scur + 1);
1782 s = (U8*)SvPVX(swatch);
1783 if (octets && none) {
0bd48802 1784 const U8* const e = s + scur;
979f2922 1785 while (s < e) {
1786 if (bits == 8)
1787 *s++ = (U8)(none & 0xff);
1788 else if (bits == 16) {
1789 *s++ = (U8)((none >> 8) & 0xff);
1790 *s++ = (U8)( none & 0xff);
1791 }
1792 else if (bits == 32) {
1793 *s++ = (U8)((none >> 24) & 0xff);
1794 *s++ = (U8)((none >> 16) & 0xff);
1795 *s++ = (U8)((none >> 8) & 0xff);
1796 *s++ = (U8)( none & 0xff);
1797 }
1798 }
1799 *s = '\0';
1800 }
1801 else {
1802 (void)memzero((U8*)s, scur + 1);
1803 }
1804 SvCUR_set(swatch, scur);
1805 s = (U8*)SvPVX(swatch);
1806
1807 /* read $swash->{LIST} */
1808 l = (U8*)SvPV(*listsvp, lcur);
1809 lend = l + lcur;
1810 while (l < lend) {
35da51f7 1811 UV min, max, val;
979f2922 1812 STRLEN numlen;
1813 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1814
0bd48802 1815 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922 1816
1817 numlen = lend - l;
1818 min = grok_hex((char *)l, &numlen, &flags, NULL);
1819 if (numlen)
1820 l += numlen;
1821 else if (nl) {
1822 l = nl + 1; /* 1 is length of "\n" */
1823 continue;
1824 }
1825 else {
1826 l = lend; /* to LIST's end at which \n is not found */
1827 break;
1828 }
1829
1830 if (isBLANK(*l)) {
1831 ++l;
1832 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1833 numlen = lend - l;
1834 max = grok_hex((char *)l, &numlen, &flags, NULL);
1835 if (numlen)
1836 l += numlen;
1837 else
1838 max = min;
1839
1840 if (octets) {
1841 if (isBLANK(*l)) {
1842 ++l;
1843 flags = PERL_SCAN_SILENT_ILLDIGIT |
1844 PERL_SCAN_DISALLOW_PREFIX;
1845 numlen = lend - l;
1846 val = grok_hex((char *)l, &numlen, &flags, NULL);
1847 if (numlen)
1848 l += numlen;
1849 else
1850 val = 0;
1851 }
1852 else {
1853 val = 0;
1854 if (typeto) {
1855 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1856 typestr, l);
1857 }
1858 }
1859 }
711a919c 1860 else
1861 val = 0; /* bits == 1, then val should be ignored */
979f2922 1862 }
1863 else {
1864 max = min;
1865 if (octets) {
1866 val = 0;
1867 if (typeto) {
1868 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1869 }
1870 }
711a919c 1871 else
1872 val = 0; /* bits == 1, then val should be ignored */
979f2922 1873 }
1874
1875 if (nl)
1876 l = nl + 1;
1877 else
1878 l = lend;
1879
1880 if (max < start)
1881 continue;
1882
1883 if (octets) {
35da51f7 1884 UV key;
979f2922 1885 if (min < start) {
1886 if (!none || val < none) {
1887 val += start - min;
1888 }
1889 min = start;
1890 }
1891 for (key = min; key <= max; key++) {
1892 STRLEN offset;
1893 if (key >= end)
1894 goto go_out_list;
1895 /* offset must be non-negative (start <= min <= key < end) */
1896 offset = octets * (key - start);
1897 if (bits == 8)
1898 s[offset] = (U8)(val & 0xff);
1899 else if (bits == 16) {
1900 s[offset ] = (U8)((val >> 8) & 0xff);
1901 s[offset + 1] = (U8)( val & 0xff);
1902 }
1903 else if (bits == 32) {
1904 s[offset ] = (U8)((val >> 24) & 0xff);
1905 s[offset + 1] = (U8)((val >> 16) & 0xff);
1906 s[offset + 2] = (U8)((val >> 8) & 0xff);
1907 s[offset + 3] = (U8)( val & 0xff);
1908 }
1909
1910 if (!none || val < none)
1911 ++val;
1912 }
1913 }
711a919c 1914 else { /* bits == 1, then val should be ignored */
35da51f7 1915 UV key;
979f2922 1916 if (min < start)
1917 min = start;
1918 for (key = min; key <= max; key++) {
0bd48802 1919 const STRLEN offset = (STRLEN)(key - start);
979f2922 1920 if (key >= end)
1921 goto go_out_list;
1922 s[offset >> 3] |= 1 << (offset & 7);
1923 }
1924 }
1925 } /* while */
1926 go_out_list:
1927
1928 /* read $swash->{EXTRAS} */
1929 x = (U8*)SvPV(*extssvp, xcur);
1930 xend = x + xcur;
1931 while (x < xend) {
1932 STRLEN namelen;
1933 U8 *namestr;
1934 SV** othersvp;
1935 HV* otherhv;
1936 STRLEN otherbits;
1937 SV **otherbitssvp, *other;
711a919c 1938 U8 *s, *o, *nl;
979f2922 1939 STRLEN slen, olen;
1940
35da51f7 1941 const U8 opc = *x++;
979f2922 1942 if (opc == '\n')
1943 continue;
1944
1945 nl = (U8*)memchr(x, '\n', xend - x);
1946
1947 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1948 if (nl) {
1949 x = nl + 1; /* 1 is length of "\n" */
1950 continue;
1951 }
1952 else {
1953 x = xend; /* to EXTRAS' end at which \n is not found */
1954 break;
1955 }
1956 }
1957
1958 namestr = x;
1959 if (nl) {
1960 namelen = nl - namestr;
1961 x = nl + 1;
1962 }
1963 else {
1964 namelen = xend - namestr;
1965 x = xend;
1966 }
1967
1968 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
660a4616 1969 otherhv = (HV*)SvRV(*othersvp);
017a3ce5 1970 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922 1971 otherbits = (STRLEN)SvUV(*otherbitssvp);
1972 if (bits < otherbits)
660a4616 1973 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922 1974
1975 /* The "other" swatch must be destroyed after. */
1976 other = swash_get(*othersvp, start, span);
1977 o = (U8*)SvPV(other, olen);
1978
1979 if (!olen)
660a4616 1980 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922 1981
1982 s = (U8*)SvPV(swatch, slen);
1983 if (bits == 1 && otherbits == 1) {
1984 if (slen != olen)
660a4616 1985 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922 1986
1987 switch (opc) {
1988 case '+':
1989 while (slen--)
1990 *s++ |= *o++;
1991 break;
1992 case '!':
1993 while (slen--)
1994 *s++ |= ~*o++;
1995 break;
1996 case '-':
1997 while (slen--)
1998 *s++ &= ~*o++;
1999 break;
2000 case '&':
2001 while (slen--)
2002 *s++ &= *o++;
2003 break;
2004 default:
2005 break;
2006 }
2007 }
711a919c 2008 else {
979f2922 2009 STRLEN otheroctets = otherbits >> 3;
2010 STRLEN offset = 0;
35da51f7 2011 U8* const send = s + slen;
979f2922 2012
2013 while (s < send) {
2014 UV otherval = 0;
2015
2016 if (otherbits == 1) {
2017 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2018 ++offset;
2019 }
2020 else {
2021 STRLEN vlen = otheroctets;
2022 otherval = *o++;
2023 while (--vlen) {
2024 otherval <<= 8;
2025 otherval |= *o++;
2026 }
2027 }
2028
711a919c 2029 if (opc == '+' && otherval)
6f207bd3 2030 NOOP; /* replace with otherval */
979f2922 2031 else if (opc == '!' && !otherval)
2032 otherval = 1;
2033 else if (opc == '-' && otherval)
2034 otherval = 0;
2035 else if (opc == '&' && !otherval)
2036 otherval = 0;
2037 else {
711a919c 2038 s += octets; /* no replacement */
979f2922 2039 continue;
2040 }
2041
2042 if (bits == 8)
2043 *s++ = (U8)( otherval & 0xff);
2044 else if (bits == 16) {
2045 *s++ = (U8)((otherval >> 8) & 0xff);
2046 *s++ = (U8)( otherval & 0xff);
2047 }
2048 else if (bits == 32) {
2049 *s++ = (U8)((otherval >> 24) & 0xff);
2050 *s++ = (U8)((otherval >> 16) & 0xff);
2051 *s++ = (U8)((otherval >> 8) & 0xff);
2052 *s++ = (U8)( otherval & 0xff);
2053 }
2054 }
2055 }
2056 sv_free(other); /* through with it! */
2057 } /* while */
2058 return swatch;
2059}
2060
0f830e0b 2061/*
2062=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2063
2064Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2065of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2066bytes available. The return value is the pointer to the byte after the
2067end of the new character. In other words,
2068
2069 d = uvchr_to_utf8(d, uv);
2070
2071is the recommended wide native character-aware way of saying
2072
2073 *(d++) = uv;
2074
2075=cut
2076*/
2077
2078/* On ASCII machines this is normally a macro but we want a
2079 real function in case XS code wants it
2080*/
0f830e0b 2081U8 *
2082Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2083{
2084 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2085}
2086
b851fbc1 2087U8 *
2088Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2089{
2090 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2091}
2b9d42f0 2092
2093/*
0f830e0b 2094=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2095flags
2096
2097Returns the native character value of the first character in the string
2098C<s>
2099which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2100length, in bytes, of that character.
2101
2102Allows length and flags to be passed to low level routine.
2103
2104=cut
2105*/
2106/* On ASCII machines this is normally a macro but we want
2107 a real function in case XS code wants it
2108*/
0f830e0b 2109UV
2110Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2111U32 flags)
2112{
2113 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2114 return UNI_TO_NATIVE(uv);
2115}
2116
2117/*
d2cc3551 2118=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2119
2120Build to the scalar dsv a displayable version of the string spv,
2121length len, the displayable version being at most pvlim bytes long
2122(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2123
9e55ce06 2124The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2125isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 2126to display the \\[nrfta\\] as the backslashed versions (like '\n')
2127(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2128UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2129UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2130
d2cc3551 2131The pointer to the PV of the dsv is returned.
2132
2133=cut */
e6b2e755 2134char *
e1ec3a88 2135Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755 2136{
2137 int truncated = 0;
e1ec3a88 2138 const char *s, *e;
e6b2e755 2139
2140 sv_setpvn(dsv, "", 0);
e1ec3a88 2141 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2142 UV u;
a49f32c6 2143 /* This serves double duty as a flag and a character to print after
2144 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2145 */
2146 char ok = 0;
c728cb41 2147
e6b2e755 2148 if (pvlim && SvCUR(dsv) >= pvlim) {
2149 truncated++;
2150 break;
2151 }
2152 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2153 if (u < 256) {
a3b680e6 2154 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2155 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2156 switch (c) {
c728cb41 2157 case '\n':
a49f32c6 2158 ok = 'n'; break;
c728cb41 2159 case '\r':
a49f32c6 2160 ok = 'r'; break;
c728cb41 2161 case '\t':
a49f32c6 2162 ok = 't'; break;
c728cb41 2163 case '\f':
a49f32c6 2164 ok = 'f'; break;
c728cb41 2165 case '\a':
a49f32c6 2166 ok = 'a'; break;
c728cb41 2167 case '\\':
a49f32c6 2168 ok = '\\'; break;
c728cb41 2169 default: break;
2170 }
a49f32c6 2171 if (ok) {
2172 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2173 }
c728cb41 2174 }
00e86452 2175 /* isPRINT() is the locale-blind version. */
a49f32c6 2176 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2177 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2178 ok = 1;
0a2ef054 2179 }
c728cb41 2180 }
2181 if (!ok)
9e55ce06 2182 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 2183 }
2184 if (truncated)
396482e1 2185 sv_catpvs(dsv, "...");
e6b2e755 2186
2187 return SvPVX(dsv);
2188}
2b9d42f0 2189
d2cc3551 2190/*
2191=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2192
2193Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2194the displayable version being at most pvlim bytes long
d2cc3551 2195(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2196
2197The flags argument is as in pv_uni_display().
2198
d2cc3551 2199The pointer to the PV of the dsv is returned.
2200
d4c19fe8 2201=cut
2202*/
e6b2e755 2203char *
2204Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2205{
cfd0369c 2206 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2207 SvCUR(ssv), pvlim, flags);
701a277b 2208}
2209
d2cc3551 2210/*
d07ddd77 2211=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 2212
2213Return true if the strings s1 and s2 differ case-insensitively, false
2214if not (if they are equal case-insensitively). If u1 is true, the
2215string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 2216the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2217are false, the respective string is assumed to be in native 8-bit
2218encoding.
2219
2220If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2221in there (they will point at the beginning of the I<next> character).
2222If the pointers behind pe1 or pe2 are non-NULL, they are the end
2223pointers beyond which scanning will not continue under any
4cdaeff7 2224circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77 2225s2+l2 will be used as goal end pointers that will also stop the scan,
2226and which qualify towards defining a successful match: all the scans
2227that define an explicit length must reach their goal pointers for
2228a match to succeed).
d2cc3551 2229
2230For case-insensitiveness, the "casefolding" of Unicode is used
2231instead of upper/lowercasing both the characters, see
2232http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2233
2234=cut */
701a277b 2235I32
d07ddd77 2236Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2237{
97aff369 2238 dVAR;
e1ec3a88 2239 register const U8 *p1 = (const U8*)s1;
2240 register const U8 *p2 = (const U8*)s2;
cbbf8932 2241 register const U8 *f1 = NULL;
2f73348c 2242 register const U8 *f2 = NULL;
cbbf8932 2243 register U8 *e1 = NULL;
2244 register U8 *q1 = NULL;
2245 register U8 *e2 = NULL;
2246 register U8 *q2 = NULL;
d07ddd77 2247 STRLEN n1 = 0, n2 = 0;
89ebb4a3 2248 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2249 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8 2250 U8 natbuf[1+1];
2251 STRLEN foldlen1, foldlen2;
d07ddd77 2252 bool match;
332ddc25 2253
d07ddd77 2254 if (pe1)
2255 e1 = *(U8**)pe1;
e1ec3a88 2256 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2257 f1 = (const U8*)s1 + l1;
d07ddd77 2258 if (pe2)
2259 e2 = *(U8**)pe2;
e1ec3a88 2260 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2261 f2 = (const U8*)s2 + l2;
d07ddd77 2262
2263 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2264 return 1; /* mismatch; possible infinite loop or false positive */
2265
a6872d42 2266 if (!u1 || !u2)
2267 natbuf[1] = 0; /* Need to terminate the buffer. */
2268
d07ddd77 2269 while ((e1 == 0 || p1 < e1) &&
2270 (f1 == 0 || p1 < f1) &&
2271 (e2 == 0 || p2 < e2) &&
2272 (f2 == 0 || p2 < f2)) {
2273 if (n1 == 0) {
d7f013c8 2274 if (u1)
2275 to_utf8_fold(p1, foldbuf1, &foldlen1);
2276 else {
809e8e66 2277 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8 2278 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2279 }
2280 q1 = foldbuf1;
d07ddd77 2281 n1 = foldlen1;
332ddc25 2282 }
d07ddd77 2283 if (n2 == 0) {
d7f013c8 2284 if (u2)
2285 to_utf8_fold(p2, foldbuf2, &foldlen2);
2286 else {
809e8e66 2287 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8 2288 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2289 }
2290 q2 = foldbuf2;
d07ddd77 2291 n2 = foldlen2;
332ddc25 2292 }
d07ddd77 2293 while (n1 && n2) {
2294 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2295 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2296 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2297 return 1; /* mismatch */
d07ddd77 2298 n1 -= UTF8SKIP(q1);
d7f013c8 2299 q1 += UTF8SKIP(q1);
d07ddd77 2300 n2 -= UTF8SKIP(q2);
d7f013c8 2301 q2 += UTF8SKIP(q2);
701a277b 2302 }
d07ddd77 2303 if (n1 == 0)
d7f013c8 2304 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2305 if (n2 == 0)
d7f013c8 2306 p2 += u2 ? UTF8SKIP(p2) : 1;
2307
d2cc3551 2308 }
5469e704 2309
d07ddd77 2310 /* A match is defined by all the scans that specified
2311 * an explicit length reaching their final goals. */
2312 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 2313
2314 if (match) {
d07ddd77 2315 if (pe1)
2316 *pe1 = (char*)p1;
2317 if (pe2)
2318 *pe2 = (char*)p2;
5469e704 2319 }
2320
2321 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2322}
701a277b 2323
a49f32c6 2324/*
2325 * Local variables:
2326 * c-indentation-style: bsd
2327 * c-basic-offset: 4
2328 * indent-tabs-mode: t
2329 * End:
2330 *
37442d52 2331 * ex: set ts=8 sts=4 sw=4 noet:
2332 */