Silence a const warning
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
af3babe4 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3 5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3 26#include "perl.h"
27
27da23d5 28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7 31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29 34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7 37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1 47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1 53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485 57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1 68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3 87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6 90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6 95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6 98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463 102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 104 return d;
105 }
106 if (uv < 0x10000) {
eb160463 107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 110 return d;
111 }
112 if (uv < 0x200000) {
eb160463 113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463 120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463 128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3 138#endif
139 {
eb160463 140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463 151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
b851fbc1 169
170U8 *
171Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172{
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174}
9041c2e3 175
176
177/*
7fc63493 178=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 179
5da9da9e 180Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01 181character. Note that an INVARIANT (i.e. ASCII) character is a valid
182UTF-8 character. The actual number of bytes in the UTF-8 character
183will be returned if it is valid, otherwise 0.
9041c2e3 184
82686b01 185=cut */
067a85ef 186STRLEN
7fc63493 187Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 188{
3b0fc154 189 STRLEN len;
190#ifdef IS_UTF8_CHAR
9f07fdcd 191 len = UTF8SKIP(s);
3b0fc154 192 if (len <= 4)
193 return IS_UTF8_CHAR(s, len) ? len : 0;
194#endif /* #ifdef IS_UTF8_CHAR */
195 {
196 U8 u = *s;
197 STRLEN slen;
198 UV uv, ouv;
199
200 if (UTF8_IS_INVARIANT(u))
201 return 1;
202
203 if (!UTF8_IS_START(u))
204 return 0;
205
206 len = UTF8SKIP(s);
207
208 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
209 return 0;
210
211 slen = len - 1;
212 s++;
213 u &= UTF_START_MASK(len);
214 uv = u;
215 ouv = uv;
216 while (slen--) {
217 if (!UTF8_IS_CONTINUATION(*s))
218 return 0;
219 uv = UTF8_ACCUMULATE(uv, *s);
220 if (uv < ouv)
221 return 0;
222 ouv = uv;
223 s++;
224 }
225
226 if ((STRLEN)UNISKIP(uv) < len)
227 return 0;
228
229 return len;
386d01d6 230 }
386d01d6 231}
232
6662521e 233/*
7fc63493 234=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 235
c9ada85f 236Returns true if first C<len> bytes of the given string form a valid
1e54db1a 237UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
238not mean 'a string that contains code points above 0x7F encoded in UTF-8'
239because a valid ASCII string is a valid UTF-8 string.
6662521e 240
241=cut
242*/
243
8e84507e 244bool
7fc63493 245Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 246{
7fc63493 247 const U8* x = s;
248 const U8* send;
067a85ef 249
61468b03 250 if (!len && s)
e1ec3a88 251 len = strlen((const char *)s);
1aa99e6b 252 send = s + len;
253
6662521e 254 while (x < send) {
a3b680e6 255 STRLEN c;
1acdb0da 256 /* Inline the easy bits of is_utf8_char() here for speed... */
257 if (UTF8_IS_INVARIANT(*x))
258 c = 1;
259 else if (!UTF8_IS_START(*x))
260 return FALSE;
261 else {
262 /* ... and call is_utf8_char() only if really needed. */
263 c = is_utf8_char(x);
264 if (!c)
265 return FALSE;
266 }
6662521e 267 x += c;
6662521e 268 }
60006e79 269 if (x != send)
270 return FALSE;
067a85ef 271
272 return TRUE;
6662521e 273}
274
67e989fb 275/*
7fc63493 276=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **p
81cd54e3 277
278Like is_ut8_string but store the location of the failure in
279the last argument.
280
281=cut
282*/
283
284bool
7fc63493 285Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p)
81cd54e3 286{
7fc63493 287 const U8* x = s;
288 const U8* send;
81cd54e3 289 STRLEN c;
290
61468b03 291 if (!len && s)
e1ec3a88 292 len = strlen((const char *)s);
81cd54e3 293 send = s + len;
294
295 while (x < send) {
296 /* Inline the easy bits of is_utf8_char() here for speed... */
297 if (UTF8_IS_INVARIANT(*x))
298 c = 1;
299 else if (!UTF8_IS_START(*x)) {
300 if (p)
301 *p = x;
302 return FALSE;
303 }
304 else {
305 /* ... and call is_utf8_char() only if really needed. */
306 c = is_utf8_char(x);
307 if (!c) {
308 if (p)
309 *p = x;
310 return FALSE;
311 }
312 }
313 x += c;
314 }
315 if (x != send) {
316 if (p)
317 *p = x;
318 return FALSE;
319 }
320
321 return TRUE;
322}
323
324/*
7fc63493 325=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 326
9041c2e3 327Bottom level UTF-8 decode routine.
328Returns the unicode code point value of the first character in the string C<s>
1e54db1a 329which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 330C<retlen> will be set to the length, in bytes, of that character.
67e989fb 331
1e54db1a 332If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880 333is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
334it is assumed that the caller will raise a warning, and this function
28d3d195 335will silently just set C<retlen> to C<-1> and return zero. If the
336C<flags> does not contain UTF8_CHECK_ONLY, warnings about
337malformations will be given, C<retlen> will be set to the expected
338length of the UTF-8 character in bytes, and zero will be returned.
339
340The C<flags> can also contain various flags to allow deviations from
341the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 342
9041c2e3 343Most code should use utf8_to_uvchr() rather than call this directly.
344
37607a96 345=cut
346*/
67e989fb 347
a0ed51b3 348UV
7fc63493 349Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 350{
7fc63493 351 const U8 *s0 = s;
9c5ffd7c 352 UV uv = *s, ouv = 0;
ba210ebe 353 STRLEN len = 1;
7fc63493 354 const bool dowarn = ckWARN_d(WARN_UTF8);
355 const UV startbyte = *s;
ba210ebe 356 STRLEN expectlen = 0;
a0dbb045 357 U32 warning = 0;
358
359/* This list is a superset of the UTF8_ALLOW_XXX. */
360
361#define UTF8_WARN_EMPTY 1
362#define UTF8_WARN_CONTINUATION 2
363#define UTF8_WARN_NON_CONTINUATION 3
364#define UTF8_WARN_FE_FF 4
365#define UTF8_WARN_SHORT 5
366#define UTF8_WARN_OVERFLOW 6
367#define UTF8_WARN_SURROGATE 7
c867b360 368#define UTF8_WARN_LONG 8
369#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 370
371 if (curlen == 0 &&
372 !(flags & UTF8_ALLOW_EMPTY)) {
373 warning = UTF8_WARN_EMPTY;
0c443dc2 374 goto malformed;
375 }
376
1d72bdf6 377 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 378 if (retlen)
379 *retlen = 1;
c4d5f83a 380 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 381 }
67e989fb 382
421a8bf2 383 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 384 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 385 warning = UTF8_WARN_CONTINUATION;
ba210ebe 386 goto malformed;
387 }
388
421a8bf2 389 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 390 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 391 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 392 goto malformed;
393 }
9041c2e3 394
1d72bdf6 395#ifdef EBCDIC
75383841 396 uv = NATIVE_TO_UTF(uv);
1d72bdf6 397#else
fcc8fcf6 398 if ((uv == 0xfe || uv == 0xff) &&
399 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 400 warning = UTF8_WARN_FE_FF;
ba210ebe 401 goto malformed;
a0ed51b3 402 }
1d72bdf6 403#endif
404
ba210ebe 405 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
406 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
407 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
408 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 409#ifdef EBCDIC
410 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
411 else { len = 7; uv &= 0x01; }
412#else
ba210ebe 413 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
414 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 415 else { len = 13; uv = 0; } /* whoa! */
416#endif
417
a0ed51b3 418 if (retlen)
419 *retlen = len;
9041c2e3 420
ba210ebe 421 expectlen = len;
422
fcc8fcf6 423 if ((curlen < expectlen) &&
424 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 425 warning = UTF8_WARN_SHORT;
ba210ebe 426 goto malformed;
427 }
428
429 len--;
a0ed51b3 430 s++;
ba210ebe 431 ouv = uv;
432
a0ed51b3 433 while (len--) {
421a8bf2 434 if (!UTF8_IS_CONTINUATION(*s) &&
435 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 436 s--;
437 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 438 goto malformed;
a0ed51b3 439 }
440 else
8850bf83 441 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 442 if (!(uv > ouv)) {
443 /* These cannot be allowed. */
444 if (uv == ouv) {
75dbc644 445 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 446 warning = UTF8_WARN_LONG;
447 goto malformed;
448 }
449 }
450 else { /* uv < ouv */
451 /* This cannot be allowed. */
452 warning = UTF8_WARN_OVERFLOW;
453 goto malformed;
454 }
ba210ebe 455 }
456 s++;
457 ouv = uv;
458 }
459
421a8bf2 460 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 461 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 462 warning = UTF8_WARN_SURROGATE;
ba210ebe 463 goto malformed;
eb160463 464 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 465 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 466 warning = UTF8_WARN_LONG;
ba210ebe 467 goto malformed;
421a8bf2 468 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 469 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 470 warning = UTF8_WARN_FFFF;
a9917092 471 goto malformed;
a0ed51b3 472 }
ba210ebe 473
a0ed51b3 474 return uv;
ba210ebe 475
476malformed:
477
fcc8fcf6 478 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 479 if (retlen)
cc366d4b 480 *retlen = -1;
ba210ebe 481 return 0;
482 }
483
a0dbb045 484 if (dowarn) {
485 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
486
487 switch (warning) {
488 case 0: /* Intentionally empty. */ break;
489 case UTF8_WARN_EMPTY:
54667de8 490 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045 491 break;
492 case UTF8_WARN_CONTINUATION:
097fb8e2 493 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 494 break;
495 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 496 if (s == s0)
497 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
498 (UV)s[1], startbyte);
499 else
500 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
5d7488b2 501 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, (int)expectlen);
097fb8e2 502
a0dbb045 503 break;
504 case UTF8_WARN_FE_FF:
505 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
506 break;
507 case UTF8_WARN_SHORT:
097fb8e2 508 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 509 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 510 expectlen = curlen; /* distance for caller to skip */
a0dbb045 511 break;
512 case UTF8_WARN_OVERFLOW:
097fb8e2 513 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
514 ouv, *s, startbyte);
a0dbb045 515 break;
516 case UTF8_WARN_SURROGATE:
517 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
518 break;
a0dbb045 519 case UTF8_WARN_LONG:
097fb8e2 520 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 521 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 522 break;
523 case UTF8_WARN_FFFF:
524 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
525 break;
526 default:
54667de8 527 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045 528 break;
529 }
530
531 if (warning) {
504618e9 532 const char *s = SvPVX_const(sv);
a0dbb045 533
534 if (PL_op)
9014280d 535 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 536 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 537 else
9014280d 538 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 539 }
540 }
541
ba210ebe 542 if (retlen)
28d3d195 543 *retlen = expectlen ? expectlen : len;
ba210ebe 544
28d3d195 545 return 0;
a0ed51b3 546}
547
8e84507e 548/*
7fc63493 549=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3 550
551Returns the native character value of the first character in the string C<s>
1e54db1a 552which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 553length, in bytes, of that character.
554
1e54db1a 555If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3 556returned and retlen is set, if possible, to -1.
557
558=cut
559*/
560
561UV
7fc63493 562Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 563{
89ebb4a3 564 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 565 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 566}
567
568/*
7fc63493 569=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3 570
571Returns the Unicode code point of the first character in the string C<s>
1e54db1a 572which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 573length, in bytes, of that character.
574
575This function should only be used when returned UV is considered
576an index into the Unicode semantic tables (e.g. swashes).
577
1e54db1a 578If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 579returned and retlen is set, if possible, to -1.
8e84507e 580
581=cut
582*/
583
584UV
7fc63493 585Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 586{
9041c2e3 587 /* Call the low level routine asking for checks */
89ebb4a3 588 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 589 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 590}
591
b76347f2 592/*
35a4481c 593=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2 594
595Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 596Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
597up past C<e>, croaks.
b76347f2 598
599=cut
600*/
601
602STRLEN
35a4481c 603Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 604{
605 STRLEN len = 0;
606
8850bf83 607 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
608 * the bitops (especially ~) can create illegal UTF-8.
609 * In other words: in Perl UTF-8 is not just for Unicode. */
610
a3b680e6 611 if (e < s)
612 goto warn_and_return;
b76347f2 613 while (s < e) {
4373e329 614 const U8 t = UTF8SKIP(s);
901b21bf 615 if (e - s < t) {
a3b680e6 616 warn_and_return:
901b21bf 617 if (ckWARN_d(WARN_UTF8)) {
618 if (PL_op)
619 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 620 "%s in %s", unees, OP_DESC(PL_op));
901b21bf 621 else
622 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
623 }
624 return len;
625 }
b76347f2 626 s += t;
627 len++;
628 }
629
630 return len;
631}
632
b06226ff 633/*
35a4481c 634=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 635
1e54db1a 636Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff 637and C<b>.
638
639WARNING: use only if you *know* that the pointers point inside the
640same UTF-8 buffer.
641
37607a96 642=cut
643*/
a0ed51b3 644
02eb7b47 645IV
35a4481c 646Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 647{
02eb7b47 648 IV off = 0;
649
8850bf83 650 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
651 * the bitops (especially ~) can create illegal UTF-8.
652 * In other words: in Perl UTF-8 is not just for Unicode. */
653
a0ed51b3 654 if (a < b) {
655 while (a < b) {
35a4481c 656 const U8 c = UTF8SKIP(a);
a3b680e6 657 if (b - a < c)
658 goto warn_and_return;
02eb7b47 659 a += c;
a0ed51b3 660 off--;
661 }
662 }
663 else {
664 while (b < a) {
4373e329 665 const U8 c = UTF8SKIP(b);
02eb7b47 666
901b21bf 667 if (a - b < c) {
a3b680e6 668 warn_and_return:
901b21bf 669 if (ckWARN_d(WARN_UTF8)) {
670 if (PL_op)
671 Perl_warner(aTHX_ packWARN(WARN_UTF8),
672 "%s in %s", unees, OP_DESC(PL_op));
673 else
674 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
675 }
676 return off;
677 }
02eb7b47 678 b += c;
a0ed51b3 679 off++;
680 }
681 }
02eb7b47 682
a0ed51b3 683 return off;
684}
685
b06226ff 686/*
37607a96 687=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 688
8850bf83 689Return the UTF-8 pointer C<s> displaced by C<off> characters, either
690forward or backward.
b06226ff 691
692WARNING: do not use the following unless you *know* C<off> is within
8850bf83 693the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
694on the first byte of character or just after the last byte of a character.
b06226ff 695
37607a96 696=cut
697*/
a0ed51b3 698
699U8 *
4373e329 700Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 701{
8850bf83 702 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
703 * the bitops (especially ~) can create illegal UTF-8.
704 * In other words: in Perl UTF-8 is not just for Unicode. */
705
a0ed51b3 706 if (off >= 0) {
707 while (off--)
708 s += UTF8SKIP(s);
709 }
710 else {
711 while (off++) {
712 s--;
8850bf83 713 while (UTF8_IS_CONTINUATION(*s))
714 s--;
a0ed51b3 715 }
716 }
4373e329 717 return (U8 *)s;
a0ed51b3 718}
719
6940069f 720/*
eebe1485 721=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 722
1e54db1a 723Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53 724Unlike C<bytes_to_utf8>, this over-writes the original string, and
725updates len to contain the new length.
67e989fb 726Returns zero on failure, setting C<len> to -1.
6940069f 727
728=cut
729*/
730
731U8 *
37607a96 732Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 733{
6940069f 734 U8 *send;
735 U8 *d;
dcad2880 736 U8 *save = s;
246fae53 737
1e54db1a 738 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880 739 for (send = s + *len; s < send; ) {
740 U8 c = *s++;
741
1d72bdf6 742 if (!UTF8_IS_INVARIANT(c) &&
743 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
744 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 745 *len = -1;
746 return 0;
747 }
246fae53 748 }
dcad2880 749
750 d = s = save;
6940069f 751 while (s < send) {
ed646e6e 752 STRLEN ulen;
9041c2e3 753 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 754 s += ulen;
6940069f 755 }
756 *d = '\0';
246fae53 757 *len = d - save;
6940069f 758 return save;
759}
760
761/*
e1ec3a88 762=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 763
1e54db1a 764Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 765Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 766the newly-created string, and updates C<len> to contain the new
767length. Returns the original string if no conversion occurs, C<len>
768is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7690 if C<s> is converted or contains all 7bit characters.
f9a63242 770
37607a96 771=cut
772*/
f9a63242 773
774U8 *
e1ec3a88 775Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 776{
f9a63242 777 U8 *d;
e1ec3a88 778 const U8 *start = s;
779 const U8 *send;
f9a63242 780 I32 count = 0;
781
782 if (!*is_utf8)
73d840c0 783 return (U8 *)start;
f9a63242 784
1e54db1a 785 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 786 for (send = s + *len; s < send;) {
e1ec3a88 787 U8 c = *s++;
1d72bdf6 788 if (!UTF8_IS_INVARIANT(c)) {
db42d148 789 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
790 (c = *s++) && UTF8_IS_CONTINUATION(c))
791 count++;
792 else
73d840c0 793 return (U8 *)start;
db42d148 794 }
f9a63242 795 }
796
797 *is_utf8 = 0;
798
f9a63242 799 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 800 s = start; start = d;
f9a63242 801 while (s < send) {
802 U8 c = *s++;
c4d5f83a 803 if (!UTF8_IS_INVARIANT(c)) {
804 /* Then it is two-byte encoded */
805 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
806 c = ASCII_TO_NATIVE(c);
807 }
808 *d++ = c;
f9a63242 809 }
810 *d = '\0';
811 *len = d - start;
73d840c0 812 return (U8 *)start;
f9a63242 813}
814
815/*
35a4481c 816=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 817
1e54db1a 818Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e 819Returns a pointer to the newly-created string, and sets C<len> to
820reflect the new length.
6940069f 821
1e54db1a 822If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f 823see sv_recode_to_utf8().
824
497711e7 825=cut
6940069f 826*/
827
828U8*
35a4481c 829Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 830{
35a4481c 831 const U8 * const send = s + (*len);
6940069f 832 U8 *d;
833 U8 *dst;
6940069f 834
6662521e 835 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 836 dst = d;
837
838 while (s < send) {
35a4481c 839 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 840 if (UNI_IS_INVARIANT(uv))
eb160463 841 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 842 else {
eb160463 843 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
844 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f 845 }
846 }
847 *d = '\0';
6662521e 848 *len = d-dst;
6940069f 849 return dst;
850}
851
a0ed51b3 852/*
dea0fc0b 853 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 854 *
855 * Destination must be pre-extended to 3/2 source. Do not use in-place.
856 * We optimize for native, for obvious reasons. */
857
858U8*
dea0fc0b 859Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 860{
dea0fc0b 861 U8* pend;
862 U8* dstart = d;
863
1de9afcd 864 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
865 d[0] = 0;
866 *newlen = 1;
867 return d;
868 }
869
dea0fc0b 870 if (bytelen & 1)
014ead4b 871 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b 872
873 pend = p + bytelen;
874
a0ed51b3 875 while (p < pend) {
dea0fc0b 876 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
877 p += 2;
a0ed51b3 878 if (uv < 0x80) {
eb160463 879 *d++ = (U8)uv;
a0ed51b3 880 continue;
881 }
882 if (uv < 0x800) {
eb160463 883 *d++ = (U8)(( uv >> 6) | 0xc0);
884 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 885 continue;
886 }
887 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e 888 UV low = (p[0] << 8) + p[1];
889 p += 2;
dea0fc0b 890 if (low < 0xdc00 || low >= 0xdfff)
891 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 892 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
893 }
894 if (uv < 0x10000) {
eb160463 895 *d++ = (U8)(( uv >> 12) | 0xe0);
896 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
897 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 898 continue;
899 }
900 else {
eb160463 901 *d++ = (U8)(( uv >> 18) | 0xf0);
902 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
903 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
904 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 905 continue;
906 }
907 }
dea0fc0b 908 *newlen = d - dstart;
a0ed51b3 909 return d;
910}
911
912/* Note: this one is slightly destructive of the source. */
913
914U8*
dea0fc0b 915Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 916{
917 U8* s = (U8*)p;
918 U8* send = s + bytelen;
919 while (s < send) {
920 U8 tmp = s[0];
921 s[0] = s[1];
922 s[1] = tmp;
923 s += 2;
924 }
dea0fc0b 925 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 926}
927
928/* for now these are all defined (inefficiently) in terms of the utf8 versions */
929
930bool
84afefe6 931Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 932{
89ebb4a3 933 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 934 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 935 return is_utf8_alnum(tmpbuf);
936}
937
938bool
84afefe6 939Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 940{
89ebb4a3 941 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 942 uvchr_to_utf8(tmpbuf, c);
b8c5462f 943 return is_utf8_alnumc(tmpbuf);
944}
945
946bool
84afefe6 947Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 948{
89ebb4a3 949 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 950 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 951 return is_utf8_idfirst(tmpbuf);
952}
953
954bool
84afefe6 955Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 956{
89ebb4a3 957 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 958 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 959 return is_utf8_alpha(tmpbuf);
960}
961
962bool
84afefe6 963Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 964{
89ebb4a3 965 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 966 uvchr_to_utf8(tmpbuf, c);
4d61ec05 967 return is_utf8_ascii(tmpbuf);
968}
969
970bool
84afefe6 971Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 972{
89ebb4a3 973 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 974 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 975 return is_utf8_space(tmpbuf);
976}
977
978bool
84afefe6 979Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 980{
89ebb4a3 981 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 982 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 983 return is_utf8_digit(tmpbuf);
984}
985
986bool
84afefe6 987Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 988{
89ebb4a3 989 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 990 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 991 return is_utf8_upper(tmpbuf);
992}
993
994bool
84afefe6 995Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 996{
89ebb4a3 997 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 998 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 999 return is_utf8_lower(tmpbuf);
1000}
1001
1002bool
84afefe6 1003Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1004{
89ebb4a3 1005 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1006 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1007 return is_utf8_cntrl(tmpbuf);
1008}
1009
1010bool
84afefe6 1011Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1012{
89ebb4a3 1013 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1014 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1015 return is_utf8_graph(tmpbuf);
1016}
1017
1018bool
84afefe6 1019Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1020{
89ebb4a3 1021 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1022 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1023 return is_utf8_print(tmpbuf);
1024}
1025
b8c5462f 1026bool
84afefe6 1027Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1028{
89ebb4a3 1029 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1030 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1031 return is_utf8_punct(tmpbuf);
1032}
1033
4d61ec05 1034bool
84afefe6 1035Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1036{
89ebb4a3 1037 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1038 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1039 return is_utf8_xdigit(tmpbuf);
1040}
1041
84afefe6 1042UV
1043Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1044{
0ebc6274 1045 uvchr_to_utf8(p, c);
1046 return to_utf8_upper(p, p, lenp);
a0ed51b3 1047}
1048
84afefe6 1049UV
1050Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1051{
0ebc6274 1052 uvchr_to_utf8(p, c);
1053 return to_utf8_title(p, p, lenp);
a0ed51b3 1054}
1055
84afefe6 1056UV
1057Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1058{
0ebc6274 1059 uvchr_to_utf8(p, c);
1060 return to_utf8_lower(p, p, lenp);
a0ed51b3 1061}
1062
84afefe6 1063UV
1064Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1065{
0ebc6274 1066 uvchr_to_utf8(p, c);
1067 return to_utf8_fold(p, p, lenp);
84afefe6 1068}
1069
a0ed51b3 1070/* for now these all assume no locale info available for Unicode > 255 */
1071
1072bool
84afefe6 1073Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1074{
1075 return is_uni_alnum(c); /* XXX no locale support yet */
1076}
1077
1078bool
84afefe6 1079Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1080{
1081 return is_uni_alnumc(c); /* XXX no locale support yet */
1082}
1083
1084bool
84afefe6 1085Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1086{
1087 return is_uni_idfirst(c); /* XXX no locale support yet */
1088}
1089
1090bool
84afefe6 1091Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1092{
1093 return is_uni_alpha(c); /* XXX no locale support yet */
1094}
1095
1096bool
84afefe6 1097Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1098{
1099 return is_uni_ascii(c); /* XXX no locale support yet */
1100}
1101
1102bool
84afefe6 1103Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1104{
1105 return is_uni_space(c); /* XXX no locale support yet */
1106}
1107
1108bool
84afefe6 1109Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1110{
1111 return is_uni_digit(c); /* XXX no locale support yet */
1112}
1113
1114bool
84afefe6 1115Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1116{
1117 return is_uni_upper(c); /* XXX no locale support yet */
1118}
1119
1120bool
84afefe6 1121Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1122{
1123 return is_uni_lower(c); /* XXX no locale support yet */
1124}
1125
1126bool
84afefe6 1127Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1128{
1129 return is_uni_cntrl(c); /* XXX no locale support yet */
1130}
1131
1132bool
84afefe6 1133Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1134{
1135 return is_uni_graph(c); /* XXX no locale support yet */
1136}
1137
1138bool
84afefe6 1139Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1140{
1141 return is_uni_print(c); /* XXX no locale support yet */
1142}
1143
b8c5462f 1144bool
84afefe6 1145Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1146{
1147 return is_uni_punct(c); /* XXX no locale support yet */
1148}
1149
4d61ec05 1150bool
84afefe6 1151Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1152{
1153 return is_uni_xdigit(c); /* XXX no locale support yet */
1154}
1155
b7ac61fa 1156U32
1157Perl_to_uni_upper_lc(pTHX_ U32 c)
1158{
ee099d14 1159 /* XXX returns only the first character -- do not use XXX */
1160 /* XXX no locale support yet */
1161 STRLEN len;
89ebb4a3 1162 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1163 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1164}
1165
1166U32
1167Perl_to_uni_title_lc(pTHX_ U32 c)
1168{
ee099d14 1169 /* XXX returns only the first character XXX -- do not use XXX */
1170 /* XXX no locale support yet */
1171 STRLEN len;
89ebb4a3 1172 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1173 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1174}
1175
1176U32
1177Perl_to_uni_lower_lc(pTHX_ U32 c)
1178{
ee099d14 1179 /* XXX returns only the first character -- do not use XXX */
1180 /* XXX no locale support yet */
1181 STRLEN len;
89ebb4a3 1182 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1183 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1184}
1185
a0ed51b3 1186bool
7fc63493 1187Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1188{
386d01d6 1189 if (!is_utf8_char(p))
1190 return FALSE;
a0ed51b3 1191 if (!PL_utf8_alnum)
289d4f09 1192 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1193 * descendant of isalnum(3), in other words, it doesn't
1194 * contain the '_'. --jhi */
1195 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1196 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1197/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1198#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1199 if (!PL_utf8_alnum)
1200 PL_utf8_alnum = swash_init("utf8", "",
1201 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1202 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1203#endif
1204}
1205
1206bool
7fc63493 1207Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1208{
386d01d6 1209 if (!is_utf8_char(p))
1210 return FALSE;
b8c5462f 1211 if (!PL_utf8_alnum)
1212 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1213 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1214/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1215#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1216 if (!PL_utf8_alnum)
1217 PL_utf8_alnum = swash_init("utf8", "",
1218 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1219 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1220#endif
1221}
1222
1223bool
7fc63493 1224Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1225{
82686b01 1226 if (*p == '_')
1227 return TRUE;
1228 if (!is_utf8_char(p))
1229 return FALSE;
1230 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1231 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1232 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01 1233}
1234
1235bool
7fc63493 1236Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1237{
1238 if (*p == '_')
1239 return TRUE;
1240 if (!is_utf8_char(p))
1241 return FALSE;
1242 if (!PL_utf8_idcont)
1243 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1244 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3 1245}
1246
1247bool
7fc63493 1248Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1249{
386d01d6 1250 if (!is_utf8_char(p))
1251 return FALSE;
a0ed51b3 1252 if (!PL_utf8_alpha)
e24b16f9 1253 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1254 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3 1255}
1256
1257bool
7fc63493 1258Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1259{
386d01d6 1260 if (!is_utf8_char(p))
1261 return FALSE;
b8c5462f 1262 if (!PL_utf8_ascii)
1263 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1264 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f 1265}
1266
1267bool
7fc63493 1268Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1269{
386d01d6 1270 if (!is_utf8_char(p))
1271 return FALSE;
a0ed51b3 1272 if (!PL_utf8_space)
3bec3564 1273 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1274 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3 1275}
1276
1277bool
7fc63493 1278Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1279{
386d01d6 1280 if (!is_utf8_char(p))
1281 return FALSE;
a0ed51b3 1282 if (!PL_utf8_digit)
e24b16f9 1283 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1284 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3 1285}
1286
1287bool
7fc63493 1288Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1289{
386d01d6 1290 if (!is_utf8_char(p))
1291 return FALSE;
a0ed51b3 1292 if (!PL_utf8_upper)
c65e4d19 1293 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1294 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3 1295}
1296
1297bool
7fc63493 1298Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1299{
386d01d6 1300 if (!is_utf8_char(p))
1301 return FALSE;
a0ed51b3 1302 if (!PL_utf8_lower)
c65e4d19 1303 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1304 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3 1305}
1306
1307bool
7fc63493 1308Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1309{
386d01d6 1310 if (!is_utf8_char(p))
1311 return FALSE;
b8c5462f 1312 if (!PL_utf8_cntrl)
1313 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1314 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f 1315}
1316
1317bool
7fc63493 1318Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1319{
386d01d6 1320 if (!is_utf8_char(p))
1321 return FALSE;
b8c5462f 1322 if (!PL_utf8_graph)
1323 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1324 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f 1325}
1326
1327bool
7fc63493 1328Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1329{
386d01d6 1330 if (!is_utf8_char(p))
1331 return FALSE;
a0ed51b3 1332 if (!PL_utf8_print)
e24b16f9 1333 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1334 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3 1335}
1336
1337bool
7fc63493 1338Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1339{
386d01d6 1340 if (!is_utf8_char(p))
1341 return FALSE;
b8c5462f 1342 if (!PL_utf8_punct)
1343 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1344 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f 1345}
1346
1347bool
7fc63493 1348Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1349{
386d01d6 1350 if (!is_utf8_char(p))
1351 return FALSE;
b8c5462f 1352 if (!PL_utf8_xdigit)
1353 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1354 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f 1355}
1356
1357bool
7fc63493 1358Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1359{
386d01d6 1360 if (!is_utf8_char(p))
1361 return FALSE;
a0ed51b3 1362 if (!PL_utf8_mark)
e24b16f9 1363 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1364 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3 1365}
1366
6b5c0936 1367/*
1368=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1369
1370The "p" contains the pointer to the UTF-8 string encoding
1371the character that is being converted.
1372
1373The "ustrp" is a pointer to the character buffer to put the
1374conversion result to. The "lenp" is a pointer to the length
1375of the result.
1376
0134edef 1377The "swashp" is a pointer to the swash to use.
6b5c0936 1378
0134edef 1379Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1380and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1381but not always, a multicharacter mapping), is tried first.
6b5c0936 1382
0134edef 1383The "special" is a string like "utf8::ToSpecLower", which means the
1384hash %utf8::ToSpecLower. The access to the hash is through
1385Perl_to_utf8_case().
6b5c0936 1386
0134edef 1387The "normal" is a string like "ToLower" which means the swash
1388%utf8::ToLower.
1389
1390=cut */
6b5c0936 1391
2104c8d9 1392UV
e1ec3a88 1393Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special)
a0ed51b3 1394{
89ebb4a3 1395 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1396 STRLEN len = 0;
a0ed51b3 1397
7fc63493 1398 const UV uv0 = utf8_to_uvchr(p, 0);
1feea2c7 1399 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1400 * are necessary in EBCDIC, they are redundant no-ops
1401 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1402 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1403 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1404
1405 if (!*swashp) /* load on-demand */
1406 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1407
b08cf34e 1408 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1409 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1410 /* It might be "special" (sometimes, but not always,
2a37f04d 1411 * a multicharacter mapping) */
983ffd37 1412 HV *hv;
b08cf34e 1413 SV **svp;
1414
1415 if ((hv = get_hv(special, FALSE)) &&
1416 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1417 (*svp)) {
1418 char *s;
47654450 1419
b08cf34e 1420 s = SvPV(*svp, len);
47654450 1421 if (len == 1)
1422 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1423 else {
2f9475ad 1424#ifdef EBCDIC
1425 /* If we have EBCDIC we need to remap the characters
1426 * since any characters in the low 256 are Unicode
1427 * code points, not EBCDIC. */
7cda7a3d 1428 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1429
1430 d = tmpbuf;
b08cf34e 1431 if (SvUTF8(*svp)) {
2f9475ad 1432 STRLEN tlen = 0;
1433
1434 while (t < tend) {
1435 UV c = utf8_to_uvchr(t, &tlen);
1436 if (tlen > 0) {
1437 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1438 t += tlen;
1439 }
1440 else
1441 break;
1442 }
1443 }
1444 else {
36fec512 1445 while (t < tend) {
1446 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1447 t++;
1448 }
2f9475ad 1449 }
1450 len = d - tmpbuf;
1451 Copy(tmpbuf, ustrp, len, U8);
1452#else
d2dcd0fb 1453 Copy(s, ustrp, len, U8);
2f9475ad 1454#endif
29e98929 1455 }
983ffd37 1456 }
0134edef 1457 }
1458
1459 if (!len && *swashp) {
1460 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1461
1462 if (uv2) {
1463 /* It was "normal" (a single character mapping). */
1464 UV uv3 = UNI_TO_NATIVE(uv2);
1465
e9101d72 1466 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1467 }
1468 }
1feea2c7 1469
0134edef 1470 if (!len) /* Neither: just copy. */
1471 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1472
2a37f04d 1473 if (lenp)
1474 *lenp = len;
1475
0134edef 1476 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1477}
1478
d3e79532 1479/*
7fc63493 1480=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1481
1482Convert the UTF-8 encoded character at p to its uppercase version and
1483store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1484that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1485the uppercase version may be longer than the original character.
d3e79532 1486
1487The first character of the uppercased version is returned
1488(but note, as explained above, that there may be more.)
1489
1490=cut */
1491
2104c8d9 1492UV
7fc63493 1493Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1494{
983ffd37 1495 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1496 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1497}
a0ed51b3 1498
d3e79532 1499/*
7fc63493 1500=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1501
1502Convert the UTF-8 encoded character at p to its titlecase version and
1503store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1504that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1505titlecase version may be longer than the original character.
d3e79532 1506
1507The first character of the titlecased version is returned
1508(but note, as explained above, that there may be more.)
1509
1510=cut */
1511
983ffd37 1512UV
7fc63493 1513Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1514{
1515 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1516 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1517}
1518
d3e79532 1519/*
7fc63493 1520=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1521
1522Convert the UTF-8 encoded character at p to its lowercase version and
1523store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1524that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1525lowercase version may be longer than the original character.
d3e79532 1526
1527The first character of the lowercased version is returned
1528(but note, as explained above, that there may be more.)
1529
1530=cut */
1531
2104c8d9 1532UV
7fc63493 1533Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1534{
983ffd37 1535 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1536 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1537}
1538
d3e79532 1539/*
7fc63493 1540=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1541
1542Convert the UTF-8 encoded character at p to its foldcase version and
1543store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1544that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532 1545foldcase version may be longer than the original character (up to
1546three characters).
1547
1548The first character of the foldcased version is returned
1549(but note, as explained above, that there may be more.)
1550
1551=cut */
1552
b4e400f9 1553UV
7fc63493 1554Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1555{
1556 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1557 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1558}
1559
1560/* a "swash" is a swatch hash */
1561
1562SV*
7fc63493 1563Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1564{
27da23d5 1565 dVAR;
a0ed51b3 1566 SV* retval;
243b1711 1567 SV* tokenbufsv = sv_newmortal();
8e84507e 1568 dSP;
7fc63493 1569 const size_t pkg_len = strlen(pkg);
1570 const size_t name_len = strlen(name);
71bed85a 1571 HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1572 SV* errsv_save;
ce3b816e 1573
96ca9f55 1574 PUSHSTACKi(PERLSI_MAGIC);
1575 ENTER;
1576 SAVEI32(PL_hints);
1577 PL_hints = 0;
1578 save_re_context();
1b026014 1579 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1580 ENTER;
f8be5cf0 1581 errsv_save = newSVsv(ERRSV);
71bed85a 1582 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1583 Nullsv);
f8be5cf0 1584 if (!SvTRUE(ERRSV))
1585 sv_setsv(ERRSV, errsv_save);
1586 SvREFCNT_dec(errsv_save);
ce3b816e 1587 LEAVE;
1588 }
1589 SPAGAIN;
a0ed51b3 1590 PUSHMARK(SP);
1591 EXTEND(SP,5);
71bed85a 1592 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1593 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3 1594 PUSHs(listsv);
1595 PUSHs(sv_2mortal(newSViv(minbits)));
1596 PUSHs(sv_2mortal(newSViv(none)));
1597 PUTBACK;
923e4eb5 1598 if (IN_PERL_COMPILETIME) {
bf1fed83 1599 /* XXX ought to be handled by lex_start */
82686b01 1600 SAVEI32(PL_in_my);
2b4bd638 1601 PL_in_my = 0;
bf1fed83 1602 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1603 }
f8be5cf0 1604 errsv_save = newSVsv(ERRSV);
864dbfa3 1605 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1606 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1607 else
e24b16f9 1608 retval = &PL_sv_undef;
f8be5cf0 1609 if (!SvTRUE(ERRSV))
1610 sv_setsv(ERRSV, errsv_save);
1611 SvREFCNT_dec(errsv_save);
a0ed51b3 1612 LEAVE;
1613 POPSTACK;
923e4eb5 1614 if (IN_PERL_COMPILETIME) {
bf1fed83 1615 STRLEN len;
73d840c0 1616 const char* pv = SvPV(tokenbufsv, len);
bf1fed83 1617
1618 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1619 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1620 }
bc45ce41 1621 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1622 if (SvPOK(retval))
35c1215d 1623 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1624 retval);
cea2e8a9 1625 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1626 }
a0ed51b3 1627 return retval;
1628}
1629
035d37be 1630
1631/* This API is wrong for special case conversions since we may need to
1632 * return several Unicode characters for a single Unicode character
1633 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1634 * the lower-level routine, and it is similarly broken for returning
1635 * multiple values. --jhi */
a0ed51b3 1636UV
7fc63493 1637Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1638{
27da23d5 1639 dVAR;
a0ed51b3 1640 HV* hv = (HV*)SvRV(sv);
3568d838 1641 U32 klen;
1642 U32 off;
a0ed51b3 1643 STRLEN slen;
7d85a32c 1644 STRLEN needents;
4ea42e7f 1645 U8 *tmps = NULL;
a0ed51b3 1646 U32 bit;
1647 SV *retval;
3568d838 1648 U8 tmputf8[2];
1649 UV c = NATIVE_TO_ASCII(*ptr);
1650
1651 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463 1652 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1653 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838 1654 ptr = tmputf8;
1655 }
1656 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1657 * then the "swatch" is a vec() for al the chars which start
1658 * with 0xAA..0xYY
1659 * So the key in the hash (klen) is length of encoded char -1
1660 */
1661 klen = UTF8SKIP(ptr) - 1;
1662 off = ptr[klen];
a0ed51b3 1663
7d85a32c 1664 if (klen == 0)
1665 {
1666 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1667 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1668 */
1669 needents = UTF_CONTINUATION_MARK;
1670 off = NATIVE_TO_UTF(ptr[klen]);
1671 }
1672 else
1673 {
1674 /* If char is encoded then swatch is for the prefix */
1675 needents = (1 << UTF_ACCUMULATION_SHIFT);
1676 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1677 }
1678
a0ed51b3 1679 /*
1680 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1681 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1682 * it's nothing to sniff at.) Pity we usually come through at least
1683 * two function calls to get here...
1684 *
1685 * NB: this code assumes that swatches are never modified, once generated!
1686 */
1687
3568d838 1688 if (hv == PL_last_swash_hv &&
a0ed51b3 1689 klen == PL_last_swash_klen &&
27da23d5 1690 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1691 {
1692 tmps = PL_last_swash_tmps;
1693 slen = PL_last_swash_slen;
1694 }
1695 else {
1696 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1697 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1698
1699 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1700 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1701 dSP;
2b9d42f0 1702 /* We use utf8n_to_uvuni() as we want an index into
1703 Unicode tables, not a native character number.
1704 */
89ebb4a3 1705 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae 1706 ckWARN(WARN_UTF8) ?
1707 0 : UTF8_ALLOW_ANY);
f8be5cf0 1708 SV *errsv_save;
a0ed51b3 1709 ENTER;
1710 SAVETMPS;
1711 save_re_context();
1712 PUSHSTACKi(PERLSI_MAGIC);
1713 PUSHMARK(SP);
1714 EXTEND(SP,3);
1715 PUSHs((SV*)sv);
ffbc6a93 1716 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1717 PUSHs(sv_2mortal(newSViv((klen) ?
1718 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1719 PUSHs(sv_2mortal(newSViv(needents)));
1720 PUTBACK;
f8be5cf0 1721 errsv_save = newSVsv(ERRSV);
864dbfa3 1722 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1723 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1724 else
e24b16f9 1725 retval = &PL_sv_undef;
f8be5cf0 1726 if (!SvTRUE(ERRSV))
1727 sv_setsv(ERRSV, errsv_save);
1728 SvREFCNT_dec(errsv_save);
a0ed51b3 1729 POPSTACK;
1730 FREETMPS;
1731 LEAVE;
923e4eb5 1732 if (IN_PERL_COMPILETIME)
eb160463 1733 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1734
e1ec3a88 1735 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1736
7d85a32c 1737 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1738 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1739 }
1740
1741 PL_last_swash_hv = hv;
1742 PL_last_swash_klen = klen;
1743 PL_last_swash_tmps = tmps;
1744 PL_last_swash_slen = slen;
1745 if (klen)
1746 Copy(ptr, PL_last_swash_key, klen, U8);
1747 }
1748
9faf8d75 1749 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1750 case 1:
1751 bit = 1 << (off & 7);
1752 off >>= 3;
1753 return (tmps[off] & bit) != 0;
1754 case 8:
1755 return tmps[off];
1756 case 16:
1757 off <<= 1;
1758 return (tmps[off] << 8) + tmps[off + 1] ;
1759 case 32:
1760 off <<= 2;
1761 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1762 }
cea2e8a9 1763 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1764 return 0;
1765}
2b9d42f0 1766
1767
1768/*
37607a96 1769=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1770
1e54db1a 1771Adds the UTF-8 representation of the Native codepoint C<uv> to the end
89ebb4a3 1772of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2b9d42f0 1773bytes available. The return value is the pointer to the byte after the
1774end of the new character. In other words,
1775
1776 d = uvchr_to_utf8(d, uv);
1777
1778is the recommended wide native character-aware way of saying
1779
1780 *(d++) = uv;
1781
1782=cut
1783*/
1784
1785/* On ASCII machines this is normally a macro but we want a
1786 real function in case XS code wants it
1787*/
1788#undef Perl_uvchr_to_utf8
1789U8 *
1790Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1791{
b851fbc1 1792 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0 1793}
1794
b851fbc1 1795U8 *
1796Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1797{
1798 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1799}
2b9d42f0 1800
1801/*
37607a96 1802=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1803
1804Returns the native character value of the first character in the string C<s>
1e54db1a 1805which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0 1806length, in bytes, of that character.
1807
1808Allows length and flags to be passed to low level routine.
1809
1810=cut
1811*/
0a2ef054 1812/* On ASCII machines this is normally a macro but we want
1813 a real function in case XS code wants it
2b9d42f0 1814*/
1815#undef Perl_utf8n_to_uvchr
1816UV
7fc63493 1817Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1818{
1819 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1820 return UNI_TO_NATIVE(uv);
1821}
1822
d2cc3551 1823/*
1824=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1825
1826Build to the scalar dsv a displayable version of the string spv,
1827length len, the displayable version being at most pvlim bytes long
1828(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1829
9e55ce06 1830The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1831isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1832to display the \\[nrfta\\] as the backslashed versions (like '\n')
1833(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1834UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1835UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1836
d2cc3551 1837The pointer to the PV of the dsv is returned.
1838
1839=cut */
e6b2e755 1840char *
e1ec3a88 1841Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755 1842{
1843 int truncated = 0;
e1ec3a88 1844 const char *s, *e;
e6b2e755 1845
1846 sv_setpvn(dsv, "", 0);
e1ec3a88 1847 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1848 UV u;
a49f32c6 1849 /* This serves double duty as a flag and a character to print after
1850 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1851 */
1852 char ok = 0;
c728cb41 1853
e6b2e755 1854 if (pvlim && SvCUR(dsv) >= pvlim) {
1855 truncated++;
1856 break;
1857 }
1858 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1859 if (u < 256) {
a3b680e6 1860 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1861 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1862 switch (c) {
c728cb41 1863 case '\n':
a49f32c6 1864 ok = 'n'; break;
c728cb41 1865 case '\r':
a49f32c6 1866 ok = 'r'; break;
c728cb41 1867 case '\t':
a49f32c6 1868 ok = 't'; break;
c728cb41 1869 case '\f':
a49f32c6 1870 ok = 'f'; break;
c728cb41 1871 case '\a':
a49f32c6 1872 ok = 'a'; break;
c728cb41 1873 case '\\':
a49f32c6 1874 ok = '\\'; break;
c728cb41 1875 default: break;
1876 }
a49f32c6 1877 if (ok) {
1878 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1879 }
c728cb41 1880 }
00e86452 1881 /* isPRINT() is the locale-blind version. */
a49f32c6 1882 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1883 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1884 ok = 1;
0a2ef054 1885 }
c728cb41 1886 }
1887 if (!ok)
9e55ce06 1888 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1889 }
1890 if (truncated)
1891 sv_catpvn(dsv, "...", 3);
1892
1893 return SvPVX(dsv);
1894}
2b9d42f0 1895
d2cc3551 1896/*
1897=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1898
1899Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1900the displayable version being at most pvlim bytes long
d2cc3551 1901(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1902
1903The flags argument is as in pv_uni_display().
1904
d2cc3551 1905The pointer to the PV of the dsv is returned.
1906
1907=cut */
e6b2e755 1908char *
1909Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1910{
701a277b 1911 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1912 pvlim, flags);
1913}
1914
d2cc3551 1915/*
d07ddd77 1916=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1917
1918Return true if the strings s1 and s2 differ case-insensitively, false
1919if not (if they are equal case-insensitively). If u1 is true, the
1920string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1921the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1922are false, the respective string is assumed to be in native 8-bit
1923encoding.
1924
1925If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1926in there (they will point at the beginning of the I<next> character).
1927If the pointers behind pe1 or pe2 are non-NULL, they are the end
1928pointers beyond which scanning will not continue under any
4cdaeff7 1929circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77 1930s2+l2 will be used as goal end pointers that will also stop the scan,
1931and which qualify towards defining a successful match: all the scans
1932that define an explicit length must reach their goal pointers for
1933a match to succeed).
d2cc3551 1934
1935For case-insensitiveness, the "casefolding" of Unicode is used
1936instead of upper/lowercasing both the characters, see
1937http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1938
1939=cut */
701a277b 1940I32
d07ddd77 1941Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1942{
e1ec3a88 1943 register const U8 *p1 = (const U8*)s1;
1944 register const U8 *p2 = (const U8*)s2;
1945 register const U8 *f1 = 0, *f2 = 0;
1946 register U8 *e1 = 0, *q1 = 0;
1947 register U8 *e2 = 0, *q2 = 0;
d07ddd77 1948 STRLEN n1 = 0, n2 = 0;
89ebb4a3 1949 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1950 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8 1951 U8 natbuf[1+1];
1952 STRLEN foldlen1, foldlen2;
d07ddd77 1953 bool match;
332ddc25 1954
d07ddd77 1955 if (pe1)
1956 e1 = *(U8**)pe1;
e1ec3a88 1957 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1958 f1 = (const U8*)s1 + l1;
d07ddd77 1959 if (pe2)
1960 e2 = *(U8**)pe2;
e1ec3a88 1961 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1962 f2 = (const U8*)s2 + l2;
d07ddd77 1963
1964 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1965 return 1; /* mismatch; possible infinite loop or false positive */
1966
a6872d42 1967 if (!u1 || !u2)
1968 natbuf[1] = 0; /* Need to terminate the buffer. */
1969
d07ddd77 1970 while ((e1 == 0 || p1 < e1) &&
1971 (f1 == 0 || p1 < f1) &&
1972 (e2 == 0 || p2 < e2) &&
1973 (f2 == 0 || p2 < f2)) {
1974 if (n1 == 0) {
d7f013c8 1975 if (u1)
1976 to_utf8_fold(p1, foldbuf1, &foldlen1);
1977 else {
f5cee151 1978 natbuf[0] = *p1;
d7f013c8 1979 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1980 }
1981 q1 = foldbuf1;
d07ddd77 1982 n1 = foldlen1;
332ddc25 1983 }
d07ddd77 1984 if (n2 == 0) {
d7f013c8 1985 if (u2)
1986 to_utf8_fold(p2, foldbuf2, &foldlen2);
1987 else {
f5cee151 1988 natbuf[0] = *p2;
d7f013c8 1989 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1990 }
1991 q2 = foldbuf2;
d07ddd77 1992 n2 = foldlen2;
332ddc25 1993 }
d07ddd77 1994 while (n1 && n2) {
1995 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1996 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1997 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1998 return 1; /* mismatch */
d07ddd77 1999 n1 -= UTF8SKIP(q1);
d7f013c8 2000 q1 += UTF8SKIP(q1);
d07ddd77 2001 n2 -= UTF8SKIP(q2);
d7f013c8 2002 q2 += UTF8SKIP(q2);
701a277b 2003 }
d07ddd77 2004 if (n1 == 0)
d7f013c8 2005 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2006 if (n2 == 0)
d7f013c8 2007 p2 += u2 ? UTF8SKIP(p2) : 1;
2008
d2cc3551 2009 }
5469e704 2010
d07ddd77 2011 /* A match is defined by all the scans that specified
2012 * an explicit length reaching their final goals. */
2013 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 2014
2015 if (match) {
d07ddd77 2016 if (pe1)
2017 *pe1 = (char*)p1;
2018 if (pe2)
2019 *pe2 = (char*)p2;
5469e704 2020 }
2021
2022 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2023}
701a277b 2024
a49f32c6 2025/*
2026 * Local variables:
2027 * c-indentation-style: bsd
2028 * c-basic-offset: 4
2029 * indent-tabs-mode: t
2030 * End:
2031 *
37442d52 2032 * ex: set ts=8 sts=4 sw=4 noet:
2033 */