[perl #24254] Attempt to free unreferenced scalar
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
af3babe4 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3 5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3 26#include "perl.h"
27
27da23d5 28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7 31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29 34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7 37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1 47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1 53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485 57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1 68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3 87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6 90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6 95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6 98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463 102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 104 return d;
105 }
106 if (uv < 0x10000) {
eb160463 107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 110 return d;
111 }
112 if (uv < 0x200000) {
eb160463 113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463 120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463 128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3 138#endif
139 {
eb160463 140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463 151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
9041c2e3 169
646ca15d 170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
c053b435 188STATIC STRLEN
646ca15d 189S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
77263263 206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
646ca15d 209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
9041c2e3 227
228/*
7fc63493 229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 230
5da9da9e 231Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01 232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
9041c2e3 235
82686b01 236=cut */
067a85ef 237STRLEN
7fc63493 238Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 239{
44f8325f 240 const STRLEN len = UTF8SKIP(s);
3b0fc154 241#ifdef IS_UTF8_CHAR
768c67ee 242 if (IS_UTF8_CHAR_FAST(len))
3b0fc154 243 return IS_UTF8_CHAR(s, len) ? len : 0;
244#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 245 return is_utf8_char_slow(s, len);
386d01d6 246}
247
6662521e 248/*
7fc63493 249=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 250
c9ada85f 251Returns true if first C<len> bytes of the given string form a valid
1e54db1a 252UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
253not mean 'a string that contains code points above 0x7F encoded in UTF-8'
254because a valid ASCII string is a valid UTF-8 string.
6662521e 255
768c67ee 256See also is_utf8_string_loclen() and is_utf8_string_loc().
257
6662521e 258=cut
259*/
260
8e84507e 261bool
7fc63493 262Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 263{
7fc63493 264 const U8* x = s;
265 const U8* send;
067a85ef 266
44f8325f 267 if (!len)
e1ec3a88 268 len = strlen((const char *)s);
1aa99e6b 269 send = s + len;
270
6662521e 271 while (x < send) {
a3b680e6 272 STRLEN c;
1acdb0da 273 /* Inline the easy bits of is_utf8_char() here for speed... */
274 if (UTF8_IS_INVARIANT(*x))
275 c = 1;
276 else if (!UTF8_IS_START(*x))
768c67ee 277 goto out;
1acdb0da 278 else {
279 /* ... and call is_utf8_char() only if really needed. */
646ca15d 280#ifdef IS_UTF8_CHAR
281 c = UTF8SKIP(x);
768c67ee 282 if (IS_UTF8_CHAR_FAST(c)) {
283 if (!IS_UTF8_CHAR(x, c))
284 goto out;
285 } else if (!is_utf8_char_slow(x, c))
286 goto out;
646ca15d 287#else
288 c = is_utf8_char(x);
289#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 290 if (!c)
768c67ee 291 goto out;
1acdb0da 292 }
6662521e 293 x += c;
6662521e 294 }
768c67ee 295
296 out:
60006e79 297 if (x != send)
298 return FALSE;
067a85ef 299
300 return TRUE;
6662521e 301}
302
67e989fb 303/*
814fafa7 304Implemented as a macro in utf8.h
305
306=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
307
308Like is_utf8_string() but stores the location of the failure (in the
309case of "utf8ness failure") or the location s+len (in the case of
310"utf8ness success") in the C<ep>.
311
312See also is_utf8_string_loclen() and is_utf8_string().
313
768c67ee 314=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 315
e3e4599f 316Like is_utf8_string() but stores the location of the failure (in the
768c67ee 317case of "utf8ness failure") or the location s+len (in the case of
318"utf8ness success") in the C<ep>, and the number of UTF-8
319encoded characters in the C<el>.
320
321See also is_utf8_string_loc() and is_utf8_string().
81cd54e3 322
323=cut
324*/
325
326bool
768c67ee 327Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 328{
7fc63493 329 const U8* x = s;
330 const U8* send;
81cd54e3 331 STRLEN c;
332
44f8325f 333 if (!len)
e1ec3a88 334 len = strlen((const char *)s);
81cd54e3 335 send = s + len;
768c67ee 336 if (el)
337 *el = 0;
81cd54e3 338
339 while (x < send) {
340 /* Inline the easy bits of is_utf8_char() here for speed... */
341 if (UTF8_IS_INVARIANT(*x))
768c67ee 342 c = 1;
343 else if (!UTF8_IS_START(*x))
344 goto out;
81cd54e3 345 else {
768c67ee 346 /* ... and call is_utf8_char() only if really needed. */
347#ifdef IS_UTF8_CHAR
348 c = UTF8SKIP(x);
349 if (IS_UTF8_CHAR_FAST(c)) {
350 if (!IS_UTF8_CHAR(x, c))
351 c = 0;
352 } else
353 c = is_utf8_char_slow(x, c);
354#else
355 c = is_utf8_char(x);
356#endif /* #ifdef IS_UTF8_CHAR */
357 if (!c)
358 goto out;
81cd54e3 359 }
768c67ee 360 x += c;
361 if (el)
362 (*el)++;
81cd54e3 363 }
768c67ee 364
365 out:
366 if (ep)
367 *ep = x;
368 if (x != send)
81cd54e3 369 return FALSE;
81cd54e3 370
371 return TRUE;
372}
373
374/*
768c67ee 375
7fc63493 376=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 377
9041c2e3 378Bottom level UTF-8 decode routine.
379Returns the unicode code point value of the first character in the string C<s>
1e54db1a 380which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 381C<retlen> will be set to the length, in bytes, of that character.
67e989fb 382
1e54db1a 383If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880 384is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
385it is assumed that the caller will raise a warning, and this function
28d3d195 386will silently just set C<retlen> to C<-1> and return zero. If the
387C<flags> does not contain UTF8_CHECK_ONLY, warnings about
388malformations will be given, C<retlen> will be set to the expected
389length of the UTF-8 character in bytes, and zero will be returned.
390
391The C<flags> can also contain various flags to allow deviations from
392the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 393
9041c2e3 394Most code should use utf8_to_uvchr() rather than call this directly.
395
37607a96 396=cut
397*/
67e989fb 398
a0ed51b3 399UV
7fc63493 400Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 401{
7fc63493 402 const U8 *s0 = s;
9c5ffd7c 403 UV uv = *s, ouv = 0;
ba210ebe 404 STRLEN len = 1;
7fc63493 405 const bool dowarn = ckWARN_d(WARN_UTF8);
406 const UV startbyte = *s;
ba210ebe 407 STRLEN expectlen = 0;
a0dbb045 408 U32 warning = 0;
409
410/* This list is a superset of the UTF8_ALLOW_XXX. */
411
412#define UTF8_WARN_EMPTY 1
413#define UTF8_WARN_CONTINUATION 2
414#define UTF8_WARN_NON_CONTINUATION 3
415#define UTF8_WARN_FE_FF 4
416#define UTF8_WARN_SHORT 5
417#define UTF8_WARN_OVERFLOW 6
418#define UTF8_WARN_SURROGATE 7
c867b360 419#define UTF8_WARN_LONG 8
420#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 421
422 if (curlen == 0 &&
423 !(flags & UTF8_ALLOW_EMPTY)) {
424 warning = UTF8_WARN_EMPTY;
0c443dc2 425 goto malformed;
426 }
427
1d72bdf6 428 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 429 if (retlen)
430 *retlen = 1;
c4d5f83a 431 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 432 }
67e989fb 433
421a8bf2 434 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 435 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 436 warning = UTF8_WARN_CONTINUATION;
ba210ebe 437 goto malformed;
438 }
439
421a8bf2 440 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 441 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 442 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 443 goto malformed;
444 }
9041c2e3 445
1d72bdf6 446#ifdef EBCDIC
75383841 447 uv = NATIVE_TO_UTF(uv);
1d72bdf6 448#else
fcc8fcf6 449 if ((uv == 0xfe || uv == 0xff) &&
450 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 451 warning = UTF8_WARN_FE_FF;
ba210ebe 452 goto malformed;
a0ed51b3 453 }
1d72bdf6 454#endif
455
ba210ebe 456 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
457 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
458 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
459 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 460#ifdef EBCDIC
461 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
462 else { len = 7; uv &= 0x01; }
463#else
ba210ebe 464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
465 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 466 else { len = 13; uv = 0; } /* whoa! */
467#endif
468
a0ed51b3 469 if (retlen)
470 *retlen = len;
9041c2e3 471
ba210ebe 472 expectlen = len;
473
fcc8fcf6 474 if ((curlen < expectlen) &&
475 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 476 warning = UTF8_WARN_SHORT;
ba210ebe 477 goto malformed;
478 }
479
480 len--;
a0ed51b3 481 s++;
ba210ebe 482 ouv = uv;
483
a0ed51b3 484 while (len--) {
421a8bf2 485 if (!UTF8_IS_CONTINUATION(*s) &&
486 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 487 s--;
488 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 489 goto malformed;
a0ed51b3 490 }
491 else
8850bf83 492 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 493 if (!(uv > ouv)) {
494 /* These cannot be allowed. */
495 if (uv == ouv) {
75dbc644 496 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 497 warning = UTF8_WARN_LONG;
498 goto malformed;
499 }
500 }
501 else { /* uv < ouv */
502 /* This cannot be allowed. */
503 warning = UTF8_WARN_OVERFLOW;
504 goto malformed;
505 }
ba210ebe 506 }
507 s++;
508 ouv = uv;
509 }
510
421a8bf2 511 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 512 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 513 warning = UTF8_WARN_SURROGATE;
ba210ebe 514 goto malformed;
eb160463 515 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 516 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 517 warning = UTF8_WARN_LONG;
ba210ebe 518 goto malformed;
421a8bf2 519 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 520 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 521 warning = UTF8_WARN_FFFF;
a9917092 522 goto malformed;
a0ed51b3 523 }
ba210ebe 524
a0ed51b3 525 return uv;
ba210ebe 526
527malformed:
528
fcc8fcf6 529 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 530 if (retlen)
cc366d4b 531 *retlen = -1;
ba210ebe 532 return 0;
533 }
534
a0dbb045 535 if (dowarn) {
44f8325f 536 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
a0dbb045 537
538 switch (warning) {
539 case 0: /* Intentionally empty. */ break;
540 case UTF8_WARN_EMPTY:
54667de8 541 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045 542 break;
543 case UTF8_WARN_CONTINUATION:
097fb8e2 544 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 545 break;
546 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 547 if (s == s0)
548 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
549 (UV)s[1], startbyte);
551405c4 550 else {
551 const int len = (int)(s-s0);
097fb8e2 552 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4 553 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
554 }
555
a0dbb045 556 break;
557 case UTF8_WARN_FE_FF:
558 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
559 break;
560 case UTF8_WARN_SHORT:
097fb8e2 561 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 562 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 563 expectlen = curlen; /* distance for caller to skip */
a0dbb045 564 break;
565 case UTF8_WARN_OVERFLOW:
097fb8e2 566 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
567 ouv, *s, startbyte);
a0dbb045 568 break;
569 case UTF8_WARN_SURROGATE:
570 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
571 break;
a0dbb045 572 case UTF8_WARN_LONG:
097fb8e2 573 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 574 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 575 break;
576 case UTF8_WARN_FFFF:
577 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
578 break;
579 default:
54667de8 580 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045 581 break;
582 }
583
584 if (warning) {
44f8325f 585 const char * const s = SvPVX_const(sv);
a0dbb045 586
587 if (PL_op)
9014280d 588 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 589 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 590 else
9014280d 591 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 592 }
593 }
594
ba210ebe 595 if (retlen)
28d3d195 596 *retlen = expectlen ? expectlen : len;
ba210ebe 597
28d3d195 598 return 0;
a0ed51b3 599}
600
8e84507e 601/*
7fc63493 602=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3 603
604Returns the native character value of the first character in the string C<s>
1e54db1a 605which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 606length, in bytes, of that character.
607
1e54db1a 608If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3 609returned and retlen is set, if possible, to -1.
610
611=cut
612*/
613
614UV
7fc63493 615Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 616{
1754c1a1 617 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
618 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 619}
620
621/*
7fc63493 622=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3 623
624Returns the Unicode code point of the first character in the string C<s>
1e54db1a 625which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 626length, in bytes, of that character.
627
628This function should only be used when returned UV is considered
629an index into the Unicode semantic tables (e.g. swashes).
630
1e54db1a 631If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 632returned and retlen is set, if possible, to -1.
8e84507e 633
634=cut
635*/
636
637UV
7fc63493 638Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 639{
9041c2e3 640 /* Call the low level routine asking for checks */
89ebb4a3 641 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 642 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 643}
644
b76347f2 645/*
35a4481c 646=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2 647
648Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 649Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
650up past C<e>, croaks.
b76347f2 651
652=cut
653*/
654
655STRLEN
35a4481c 656Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 657{
658 STRLEN len = 0;
659
8850bf83 660 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
661 * the bitops (especially ~) can create illegal UTF-8.
662 * In other words: in Perl UTF-8 is not just for Unicode. */
663
a3b680e6 664 if (e < s)
665 goto warn_and_return;
b76347f2 666 while (s < e) {
4373e329 667 const U8 t = UTF8SKIP(s);
901b21bf 668 if (e - s < t) {
a3b680e6 669 warn_and_return:
901b21bf 670 if (ckWARN_d(WARN_UTF8)) {
671 if (PL_op)
672 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 673 "%s in %s", unees, OP_DESC(PL_op));
901b21bf 674 else
675 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
676 }
677 return len;
678 }
b76347f2 679 s += t;
680 len++;
681 }
682
683 return len;
684}
685
b06226ff 686/*
35a4481c 687=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 688
1e54db1a 689Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff 690and C<b>.
691
692WARNING: use only if you *know* that the pointers point inside the
693same UTF-8 buffer.
694
37607a96 695=cut
696*/
a0ed51b3 697
02eb7b47 698IV
35a4481c 699Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 700{
02eb7b47 701 IV off = 0;
702
8850bf83 703 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
704 * the bitops (especially ~) can create illegal UTF-8.
705 * In other words: in Perl UTF-8 is not just for Unicode. */
706
a0ed51b3 707 if (a < b) {
708 while (a < b) {
35a4481c 709 const U8 c = UTF8SKIP(a);
a3b680e6 710 if (b - a < c)
711 goto warn_and_return;
02eb7b47 712 a += c;
a0ed51b3 713 off--;
714 }
715 }
716 else {
717 while (b < a) {
4373e329 718 const U8 c = UTF8SKIP(b);
02eb7b47 719
901b21bf 720 if (a - b < c) {
a3b680e6 721 warn_and_return:
901b21bf 722 if (ckWARN_d(WARN_UTF8)) {
723 if (PL_op)
724 Perl_warner(aTHX_ packWARN(WARN_UTF8),
725 "%s in %s", unees, OP_DESC(PL_op));
726 else
727 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
728 }
729 return off;
730 }
02eb7b47 731 b += c;
a0ed51b3 732 off++;
733 }
734 }
02eb7b47 735
a0ed51b3 736 return off;
737}
738
b06226ff 739/*
37607a96 740=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 741
8850bf83 742Return the UTF-8 pointer C<s> displaced by C<off> characters, either
743forward or backward.
b06226ff 744
745WARNING: do not use the following unless you *know* C<off> is within
8850bf83 746the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
747on the first byte of character or just after the last byte of a character.
b06226ff 748
37607a96 749=cut
750*/
a0ed51b3 751
752U8 *
4373e329 753Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 754{
8850bf83 755 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
756 * the bitops (especially ~) can create illegal UTF-8.
757 * In other words: in Perl UTF-8 is not just for Unicode. */
758
a0ed51b3 759 if (off >= 0) {
760 while (off--)
761 s += UTF8SKIP(s);
762 }
763 else {
764 while (off++) {
765 s--;
8850bf83 766 while (UTF8_IS_CONTINUATION(*s))
767 s--;
a0ed51b3 768 }
769 }
4373e329 770 return (U8 *)s;
a0ed51b3 771}
772
6940069f 773/*
eebe1485 774=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 775
1e54db1a 776Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53 777Unlike C<bytes_to_utf8>, this over-writes the original string, and
778updates len to contain the new length.
67e989fb 779Returns zero on failure, setting C<len> to -1.
6940069f 780
781=cut
782*/
783
784U8 *
37607a96 785Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 786{
6940069f 787 U8 *send;
788 U8 *d;
dcad2880 789 U8 *save = s;
246fae53 790
1e54db1a 791 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880 792 for (send = s + *len; s < send; ) {
793 U8 c = *s++;
794
1d72bdf6 795 if (!UTF8_IS_INVARIANT(c) &&
796 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
797 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 798 *len = -1;
799 return 0;
800 }
246fae53 801 }
dcad2880 802
803 d = s = save;
6940069f 804 while (s < send) {
ed646e6e 805 STRLEN ulen;
9041c2e3 806 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 807 s += ulen;
6940069f 808 }
809 *d = '\0';
246fae53 810 *len = d - save;
6940069f 811 return save;
812}
813
814/*
e1ec3a88 815=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 816
1e54db1a 817Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 818Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 819the newly-created string, and updates C<len> to contain the new
820length. Returns the original string if no conversion occurs, C<len>
821is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8220 if C<s> is converted or contains all 7bit characters.
f9a63242 823
37607a96 824=cut
825*/
f9a63242 826
827U8 *
e1ec3a88 828Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 829{
f9a63242 830 U8 *d;
e1ec3a88 831 const U8 *start = s;
832 const U8 *send;
f9a63242 833 I32 count = 0;
834
835 if (!*is_utf8)
73d840c0 836 return (U8 *)start;
f9a63242 837
1e54db1a 838 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 839 for (send = s + *len; s < send;) {
e1ec3a88 840 U8 c = *s++;
1d72bdf6 841 if (!UTF8_IS_INVARIANT(c)) {
db42d148 842 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
843 (c = *s++) && UTF8_IS_CONTINUATION(c))
844 count++;
845 else
73d840c0 846 return (U8 *)start;
db42d148 847 }
f9a63242 848 }
849
850 *is_utf8 = 0;
851
a02a5408 852 Newxz(d, (*len) - count + 1, U8);
ef9edfd0 853 s = start; start = d;
f9a63242 854 while (s < send) {
855 U8 c = *s++;
c4d5f83a 856 if (!UTF8_IS_INVARIANT(c)) {
857 /* Then it is two-byte encoded */
858 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
859 c = ASCII_TO_NATIVE(c);
860 }
861 *d++ = c;
f9a63242 862 }
863 *d = '\0';
864 *len = d - start;
73d840c0 865 return (U8 *)start;
f9a63242 866}
867
868/*
35a4481c 869=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 870
1e54db1a 871Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e 872Returns a pointer to the newly-created string, and sets C<len> to
873reflect the new length.
6940069f 874
1e54db1a 875If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f 876see sv_recode_to_utf8().
877
497711e7 878=cut
6940069f 879*/
880
881U8*
35a4481c 882Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 883{
35a4481c 884 const U8 * const send = s + (*len);
6940069f 885 U8 *d;
886 U8 *dst;
6940069f 887
a02a5408 888 Newxz(d, (*len) * 2 + 1, U8);
6940069f 889 dst = d;
890
891 while (s < send) {
35a4481c 892 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 893 if (UNI_IS_INVARIANT(uv))
eb160463 894 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 895 else {
eb160463 896 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
897 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f 898 }
899 }
900 *d = '\0';
6662521e 901 *len = d-dst;
6940069f 902 return dst;
903}
904
a0ed51b3 905/*
dea0fc0b 906 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 907 *
908 * Destination must be pre-extended to 3/2 source. Do not use in-place.
909 * We optimize for native, for obvious reasons. */
910
911U8*
dea0fc0b 912Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 913{
dea0fc0b 914 U8* pend;
915 U8* dstart = d;
916
1de9afcd 917 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
918 d[0] = 0;
919 *newlen = 1;
920 return d;
921 }
922
dea0fc0b 923 if (bytelen & 1)
014ead4b 924 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b 925
926 pend = p + bytelen;
927
a0ed51b3 928 while (p < pend) {
dea0fc0b 929 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
930 p += 2;
a0ed51b3 931 if (uv < 0x80) {
eb160463 932 *d++ = (U8)uv;
a0ed51b3 933 continue;
934 }
935 if (uv < 0x800) {
eb160463 936 *d++ = (U8)(( uv >> 6) | 0xc0);
937 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 938 continue;
939 }
940 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e 941 UV low = (p[0] << 8) + p[1];
942 p += 2;
dea0fc0b 943 if (low < 0xdc00 || low >= 0xdfff)
944 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 945 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
946 }
947 if (uv < 0x10000) {
eb160463 948 *d++ = (U8)(( uv >> 12) | 0xe0);
949 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
950 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 951 continue;
952 }
953 else {
eb160463 954 *d++ = (U8)(( uv >> 18) | 0xf0);
955 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
956 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
957 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 958 continue;
959 }
960 }
dea0fc0b 961 *newlen = d - dstart;
a0ed51b3 962 return d;
963}
964
965/* Note: this one is slightly destructive of the source. */
966
967U8*
dea0fc0b 968Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 969{
970 U8* s = (U8*)p;
971 U8* send = s + bytelen;
972 while (s < send) {
973 U8 tmp = s[0];
974 s[0] = s[1];
975 s[1] = tmp;
976 s += 2;
977 }
dea0fc0b 978 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 979}
980
981/* for now these are all defined (inefficiently) in terms of the utf8 versions */
982
983bool
84afefe6 984Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 985{
89ebb4a3 986 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 987 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 988 return is_utf8_alnum(tmpbuf);
989}
990
991bool
84afefe6 992Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 993{
89ebb4a3 994 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 995 uvchr_to_utf8(tmpbuf, c);
b8c5462f 996 return is_utf8_alnumc(tmpbuf);
997}
998
999bool
84afefe6 1000Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1001{
89ebb4a3 1002 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1003 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1004 return is_utf8_idfirst(tmpbuf);
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1009{
89ebb4a3 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1012 return is_utf8_alpha(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1017{
89ebb4a3 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1020 return is_utf8_ascii(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1025{
89ebb4a3 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1028 return is_utf8_space(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1033{
89ebb4a3 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1036 return is_utf8_digit(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1041{
89ebb4a3 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1044 return is_utf8_upper(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1049{
89ebb4a3 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1052 return is_utf8_lower(tmpbuf);
1053}
1054
1055bool
84afefe6 1056Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1057{
89ebb4a3 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1060 return is_utf8_cntrl(tmpbuf);
1061}
1062
1063bool
84afefe6 1064Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1065{
89ebb4a3 1066 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1068 return is_utf8_graph(tmpbuf);
1069}
1070
1071bool
84afefe6 1072Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1073{
89ebb4a3 1074 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1075 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1076 return is_utf8_print(tmpbuf);
1077}
1078
b8c5462f 1079bool
84afefe6 1080Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1081{
89ebb4a3 1082 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1083 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1084 return is_utf8_punct(tmpbuf);
1085}
1086
4d61ec05 1087bool
84afefe6 1088Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1089{
89ebb4a3 1090 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1091 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1092 return is_utf8_xdigit(tmpbuf);
1093}
1094
84afefe6 1095UV
1096Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1097{
0ebc6274 1098 uvchr_to_utf8(p, c);
1099 return to_utf8_upper(p, p, lenp);
a0ed51b3 1100}
1101
84afefe6 1102UV
1103Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1104{
0ebc6274 1105 uvchr_to_utf8(p, c);
1106 return to_utf8_title(p, p, lenp);
a0ed51b3 1107}
1108
84afefe6 1109UV
1110Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1111{
0ebc6274 1112 uvchr_to_utf8(p, c);
1113 return to_utf8_lower(p, p, lenp);
a0ed51b3 1114}
1115
84afefe6 1116UV
1117Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1118{
0ebc6274 1119 uvchr_to_utf8(p, c);
1120 return to_utf8_fold(p, p, lenp);
84afefe6 1121}
1122
a0ed51b3 1123/* for now these all assume no locale info available for Unicode > 255 */
1124
1125bool
84afefe6 1126Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1127{
1128 return is_uni_alnum(c); /* XXX no locale support yet */
1129}
1130
1131bool
84afefe6 1132Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1133{
1134 return is_uni_alnumc(c); /* XXX no locale support yet */
1135}
1136
1137bool
84afefe6 1138Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1139{
1140 return is_uni_idfirst(c); /* XXX no locale support yet */
1141}
1142
1143bool
84afefe6 1144Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1145{
1146 return is_uni_alpha(c); /* XXX no locale support yet */
1147}
1148
1149bool
84afefe6 1150Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1151{
1152 return is_uni_ascii(c); /* XXX no locale support yet */
1153}
1154
1155bool
84afefe6 1156Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1157{
1158 return is_uni_space(c); /* XXX no locale support yet */
1159}
1160
1161bool
84afefe6 1162Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1163{
1164 return is_uni_digit(c); /* XXX no locale support yet */
1165}
1166
1167bool
84afefe6 1168Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1169{
1170 return is_uni_upper(c); /* XXX no locale support yet */
1171}
1172
1173bool
84afefe6 1174Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1175{
1176 return is_uni_lower(c); /* XXX no locale support yet */
1177}
1178
1179bool
84afefe6 1180Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1181{
1182 return is_uni_cntrl(c); /* XXX no locale support yet */
1183}
1184
1185bool
84afefe6 1186Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1187{
1188 return is_uni_graph(c); /* XXX no locale support yet */
1189}
1190
1191bool
84afefe6 1192Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1193{
1194 return is_uni_print(c); /* XXX no locale support yet */
1195}
1196
b8c5462f 1197bool
84afefe6 1198Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1199{
1200 return is_uni_punct(c); /* XXX no locale support yet */
1201}
1202
4d61ec05 1203bool
84afefe6 1204Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1205{
1206 return is_uni_xdigit(c); /* XXX no locale support yet */
1207}
1208
b7ac61fa 1209U32
1210Perl_to_uni_upper_lc(pTHX_ U32 c)
1211{
ee099d14 1212 /* XXX returns only the first character -- do not use XXX */
1213 /* XXX no locale support yet */
1214 STRLEN len;
89ebb4a3 1215 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1216 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1217}
1218
1219U32
1220Perl_to_uni_title_lc(pTHX_ U32 c)
1221{
ee099d14 1222 /* XXX returns only the first character XXX -- do not use XXX */
1223 /* XXX no locale support yet */
1224 STRLEN len;
89ebb4a3 1225 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1226 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1227}
1228
1229U32
1230Perl_to_uni_lower_lc(pTHX_ U32 c)
1231{
ee099d14 1232 /* XXX returns only the first character -- do not use XXX */
1233 /* XXX no locale support yet */
1234 STRLEN len;
89ebb4a3 1235 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1236 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1237}
1238
a0ed51b3 1239bool
5141f98e 1240S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d 1241 const char *const swashname)
1242{
1243 if (!is_utf8_char(p))
1244 return FALSE;
1245 if (!*swash)
1246 *swash = swash_init("utf8", swashname, &PL_sv_undef, 0, 0);
1247 return swash_fetch(*swash, p, TRUE) != 0;
1248}
1249
1250bool
7fc63493 1251Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1252{
671c33bf 1253 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1254 * descendant of isalnum(3), in other words, it doesn't
1255 * contain the '_'. --jhi */
1256 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
a0ed51b3 1257}
1258
1259bool
7fc63493 1260Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1261{
671c33bf 1262 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f 1263}
1264
1265bool
7fc63493 1266Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1267{
82686b01 1268 if (*p == '_')
1269 return TRUE;
bde6a22d 1270 /* is_utf8_idstart would be more logical. */
1271 return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
82686b01 1272}
1273
1274bool
7fc63493 1275Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1276{
1277 if (*p == '_')
1278 return TRUE;
bde6a22d 1279 return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
a0ed51b3 1280}
1281
1282bool
7fc63493 1283Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1284{
bde6a22d 1285 return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3 1286}
1287
1288bool
7fc63493 1289Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1290{
bde6a22d 1291 return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
b8c5462f 1292}
1293
1294bool
7fc63493 1295Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1296{
bde6a22d 1297 return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3 1298}
1299
1300bool
7fc63493 1301Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1302{
bde6a22d 1303 return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
a0ed51b3 1304}
1305
1306bool
7fc63493 1307Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1308{
bde6a22d 1309 return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
a0ed51b3 1310}
1311
1312bool
7fc63493 1313Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1314{
bde6a22d 1315 return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
a0ed51b3 1316}
1317
1318bool
7fc63493 1319Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1320{
bde6a22d 1321 return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f 1322}
1323
1324bool
7fc63493 1325Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1326{
bde6a22d 1327 return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
b8c5462f 1328}
1329
1330bool
7fc63493 1331Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1332{
bde6a22d 1333 return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
a0ed51b3 1334}
1335
1336bool
7fc63493 1337Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1338{
bde6a22d 1339 return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
b8c5462f 1340}
1341
1342bool
7fc63493 1343Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1344{
bde6a22d 1345 return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f 1346}
1347
1348bool
7fc63493 1349Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1350{
bde6a22d 1351 return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
a0ed51b3 1352}
1353
6b5c0936 1354/*
1355=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1356
1357The "p" contains the pointer to the UTF-8 string encoding
1358the character that is being converted.
1359
1360The "ustrp" is a pointer to the character buffer to put the
1361conversion result to. The "lenp" is a pointer to the length
1362of the result.
1363
0134edef 1364The "swashp" is a pointer to the swash to use.
6b5c0936 1365
0134edef 1366Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1367and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1368but not always, a multicharacter mapping), is tried first.
6b5c0936 1369
0134edef 1370The "special" is a string like "utf8::ToSpecLower", which means the
1371hash %utf8::ToSpecLower. The access to the hash is through
1372Perl_to_utf8_case().
6b5c0936 1373
0134edef 1374The "normal" is a string like "ToLower" which means the swash
1375%utf8::ToLower.
1376
1377=cut */
6b5c0936 1378
2104c8d9 1379UV
9a957fbc 1380Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1381 SV **swashp, const char *normal, const char *special)
a0ed51b3 1382{
89ebb4a3 1383 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1384 STRLEN len = 0;
a0ed51b3 1385
aec46f14 1386 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7 1387 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1388 * are necessary in EBCDIC, they are redundant no-ops
1389 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1390 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1391 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1392
1393 if (!*swashp) /* load on-demand */
1394 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1395
b08cf34e 1396 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1397 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1398 /* It might be "special" (sometimes, but not always,
2a37f04d 1399 * a multicharacter mapping) */
983ffd37 1400 HV *hv;
b08cf34e 1401 SV **svp;
1402
1403 if ((hv = get_hv(special, FALSE)) &&
1404 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1405 (*svp)) {
cfd0369c 1406 const char *s;
47654450 1407
cfd0369c 1408 s = SvPV_const(*svp, len);
47654450 1409 if (len == 1)
1410 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1411 else {
2f9475ad 1412#ifdef EBCDIC
1413 /* If we have EBCDIC we need to remap the characters
1414 * since any characters in the low 256 are Unicode
1415 * code points, not EBCDIC. */
7cda7a3d 1416 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1417
1418 d = tmpbuf;
b08cf34e 1419 if (SvUTF8(*svp)) {
2f9475ad 1420 STRLEN tlen = 0;
1421
1422 while (t < tend) {
1423 UV c = utf8_to_uvchr(t, &tlen);
1424 if (tlen > 0) {
1425 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1426 t += tlen;
1427 }
1428 else
1429 break;
1430 }
1431 }
1432 else {
36fec512 1433 while (t < tend) {
1434 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1435 t++;
1436 }
2f9475ad 1437 }
1438 len = d - tmpbuf;
1439 Copy(tmpbuf, ustrp, len, U8);
1440#else
d2dcd0fb 1441 Copy(s, ustrp, len, U8);
2f9475ad 1442#endif
29e98929 1443 }
983ffd37 1444 }
0134edef 1445 }
1446
1447 if (!len && *swashp) {
1448 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1449
1450 if (uv2) {
1451 /* It was "normal" (a single character mapping). */
1452 UV uv3 = UNI_TO_NATIVE(uv2);
1453
e9101d72 1454 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1455 }
1456 }
1feea2c7 1457
0134edef 1458 if (!len) /* Neither: just copy. */
1459 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1460
2a37f04d 1461 if (lenp)
1462 *lenp = len;
1463
0134edef 1464 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1465}
1466
d3e79532 1467/*
7fc63493 1468=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1469
1470Convert the UTF-8 encoded character at p to its uppercase version and
1471store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1472that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1473the uppercase version may be longer than the original character.
d3e79532 1474
1475The first character of the uppercased version is returned
1476(but note, as explained above, that there may be more.)
1477
1478=cut */
1479
2104c8d9 1480UV
7fc63493 1481Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1482{
983ffd37 1483 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1484 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1485}
a0ed51b3 1486
d3e79532 1487/*
7fc63493 1488=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1489
1490Convert the UTF-8 encoded character at p to its titlecase version and
1491store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1492that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1493titlecase version may be longer than the original character.
d3e79532 1494
1495The first character of the titlecased version is returned
1496(but note, as explained above, that there may be more.)
1497
1498=cut */
1499
983ffd37 1500UV
7fc63493 1501Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1502{
1503 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1504 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1505}
1506
d3e79532 1507/*
7fc63493 1508=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1509
1510Convert the UTF-8 encoded character at p to its lowercase version and
1511store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1512that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1513lowercase version may be longer than the original character.
d3e79532 1514
1515The first character of the lowercased version is returned
1516(but note, as explained above, that there may be more.)
1517
1518=cut */
1519
2104c8d9 1520UV
7fc63493 1521Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1522{
983ffd37 1523 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1524 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1525}
1526
d3e79532 1527/*
7fc63493 1528=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1529
1530Convert the UTF-8 encoded character at p to its foldcase version and
1531store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1532that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532 1533foldcase version may be longer than the original character (up to
1534three characters).
1535
1536The first character of the foldcased version is returned
1537(but note, as explained above, that there may be more.)
1538
1539=cut */
1540
b4e400f9 1541UV
7fc63493 1542Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1543{
1544 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1546}
1547
1548/* a "swash" is a swatch hash */
1549
1550SV*
7fc63493 1551Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1552{
27da23d5 1553 dVAR;
a0ed51b3 1554 SV* retval;
9a957fbc 1555 SV* const tokenbufsv = sv_newmortal();
8e84507e 1556 dSP;
7fc63493 1557 const size_t pkg_len = strlen(pkg);
1558 const size_t name_len = strlen(name);
aec46f14 1559 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1560 SV* errsv_save;
ce3b816e 1561
96ca9f55 1562 PUSHSTACKi(PERLSI_MAGIC);
1563 ENTER;
1564 SAVEI32(PL_hints);
1565 PL_hints = 0;
1566 save_re_context();
1b026014 1567 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1568 ENTER;
f8be5cf0 1569 errsv_save = newSVsv(ERRSV);
71bed85a 1570 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1571 Nullsv);
f8be5cf0 1572 if (!SvTRUE(ERRSV))
1573 sv_setsv(ERRSV, errsv_save);
1574 SvREFCNT_dec(errsv_save);
ce3b816e 1575 LEAVE;
1576 }
1577 SPAGAIN;
a0ed51b3 1578 PUSHMARK(SP);
1579 EXTEND(SP,5);
71bed85a 1580 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1581 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3 1582 PUSHs(listsv);
1583 PUSHs(sv_2mortal(newSViv(minbits)));
1584 PUSHs(sv_2mortal(newSViv(none)));
1585 PUTBACK;
923e4eb5 1586 if (IN_PERL_COMPILETIME) {
bf1fed83 1587 /* XXX ought to be handled by lex_start */
82686b01 1588 SAVEI32(PL_in_my);
2b4bd638 1589 PL_in_my = 0;
bf1fed83 1590 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1591 }
f8be5cf0 1592 errsv_save = newSVsv(ERRSV);
864dbfa3 1593 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1594 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1595 else
e24b16f9 1596 retval = &PL_sv_undef;
f8be5cf0 1597 if (!SvTRUE(ERRSV))
1598 sv_setsv(ERRSV, errsv_save);
1599 SvREFCNT_dec(errsv_save);
a0ed51b3 1600 LEAVE;
1601 POPSTACK;
923e4eb5 1602 if (IN_PERL_COMPILETIME) {
bf1fed83 1603 STRLEN len;
aec46f14 1604 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83 1605
1606 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1607 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1608 }
bc45ce41 1609 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1610 if (SvPOK(retval))
35c1215d 1611 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1612 retval);
cea2e8a9 1613 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1614 }
a0ed51b3 1615 return retval;
1616}
1617
035d37be 1618
1619/* This API is wrong for special case conversions since we may need to
1620 * return several Unicode characters for a single Unicode character
1621 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1622 * the lower-level routine, and it is similarly broken for returning
1623 * multiple values. --jhi */
a0ed51b3 1624UV
7fc63493 1625Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1626{
27da23d5 1627 dVAR;
aec46f14 1628 HV* const hv = (HV*)SvRV(sv);
3568d838 1629 U32 klen;
1630 U32 off;
a0ed51b3 1631 STRLEN slen;
7d85a32c 1632 STRLEN needents;
cfd0369c 1633 const U8 *tmps = NULL;
a0ed51b3 1634 U32 bit;
1635 SV *retval;
3568d838 1636 U8 tmputf8[2];
1637 UV c = NATIVE_TO_ASCII(*ptr);
1638
1639 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463 1640 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1641 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838 1642 ptr = tmputf8;
1643 }
1644 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1645 * then the "swatch" is a vec() for al the chars which start
1646 * with 0xAA..0xYY
1647 * So the key in the hash (klen) is length of encoded char -1
1648 */
1649 klen = UTF8SKIP(ptr) - 1;
1650 off = ptr[klen];
a0ed51b3 1651
7d85a32c 1652 if (klen == 0)
1653 {
1654 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1655 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1656 */
1657 needents = UTF_CONTINUATION_MARK;
1658 off = NATIVE_TO_UTF(ptr[klen]);
1659 }
1660 else
1661 {
1662 /* If char is encoded then swatch is for the prefix */
1663 needents = (1 << UTF_ACCUMULATION_SHIFT);
1664 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1665 }
1666
a0ed51b3 1667 /*
1668 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1669 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1670 * it's nothing to sniff at.) Pity we usually come through at least
1671 * two function calls to get here...
1672 *
1673 * NB: this code assumes that swatches are never modified, once generated!
1674 */
1675
3568d838 1676 if (hv == PL_last_swash_hv &&
a0ed51b3 1677 klen == PL_last_swash_klen &&
27da23d5 1678 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1679 {
1680 tmps = PL_last_swash_tmps;
1681 slen = PL_last_swash_slen;
1682 }
1683 else {
1684 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1685 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1686
1687 /* If not cached, generate it via utf8::SWASHGET */
cfd0369c 1688 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
a0ed51b3 1689 dSP;
2b9d42f0 1690 /* We use utf8n_to_uvuni() as we want an index into
1691 Unicode tables, not a native character number.
1692 */
aec46f14 1693 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae 1694 ckWARN(WARN_UTF8) ?
1695 0 : UTF8_ALLOW_ANY);
f8be5cf0 1696 SV *errsv_save;
a0ed51b3 1697 ENTER;
1698 SAVETMPS;
1699 save_re_context();
1700 PUSHSTACKi(PERLSI_MAGIC);
1701 PUSHMARK(SP);
1702 EXTEND(SP,3);
1703 PUSHs((SV*)sv);
ffbc6a93 1704 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1705 PUSHs(sv_2mortal(newSViv((klen) ?
1706 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1707 PUSHs(sv_2mortal(newSViv(needents)));
1708 PUTBACK;
f8be5cf0 1709 errsv_save = newSVsv(ERRSV);
864dbfa3 1710 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1711 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1712 else
e24b16f9 1713 retval = &PL_sv_undef;
f8be5cf0 1714 if (!SvTRUE(ERRSV))
1715 sv_setsv(ERRSV, errsv_save);
1716 SvREFCNT_dec(errsv_save);
a0ed51b3 1717 POPSTACK;
1718 FREETMPS;
1719 LEAVE;
923e4eb5 1720 if (IN_PERL_COMPILETIME)
eb160463 1721 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1722
e1ec3a88 1723 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1724
7d85a32c 1725 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1726 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1727 }
1728
1729 PL_last_swash_hv = hv;
1730 PL_last_swash_klen = klen;
cfd0369c 1731 /* FIXME change interpvar.h? */
1732 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3 1733 PL_last_swash_slen = slen;
1734 if (klen)
1735 Copy(ptr, PL_last_swash_key, klen, U8);
1736 }
1737
9faf8d75 1738 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1739 case 1:
1740 bit = 1 << (off & 7);
1741 off >>= 3;
1742 return (tmps[off] & bit) != 0;
1743 case 8:
1744 return tmps[off];
1745 case 16:
1746 off <<= 1;
1747 return (tmps[off] << 8) + tmps[off + 1] ;
1748 case 32:
1749 off <<= 2;
1750 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1751 }
cea2e8a9 1752 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1753 return 0;
1754}
2b9d42f0 1755
0f830e0b 1756/*
1757=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1758
1759Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1760of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
1761bytes available. The return value is the pointer to the byte after the
1762end of the new character. In other words,
1763
1764 d = uvchr_to_utf8(d, uv);
1765
1766is the recommended wide native character-aware way of saying
1767
1768 *(d++) = uv;
1769
1770=cut
1771*/
1772
1773/* On ASCII machines this is normally a macro but we want a
1774 real function in case XS code wants it
1775*/
0f830e0b 1776U8 *
1777Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1778{
1779 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1780}
1781
b851fbc1 1782U8 *
1783Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1784{
1785 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1786}
2b9d42f0 1787
1788/*
0f830e0b 1789=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
1790flags
1791
1792Returns the native character value of the first character in the string
1793C<s>
1794which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1795length, in bytes, of that character.
1796
1797Allows length and flags to be passed to low level routine.
1798
1799=cut
1800*/
1801/* On ASCII machines this is normally a macro but we want
1802 a real function in case XS code wants it
1803*/
0f830e0b 1804UV
1805Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
1806U32 flags)
1807{
1808 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1809 return UNI_TO_NATIVE(uv);
1810}
1811
1812/*
d2cc3551 1813=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1814
1815Build to the scalar dsv a displayable version of the string spv,
1816length len, the displayable version being at most pvlim bytes long
1817(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1818
9e55ce06 1819The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1820isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1821to display the \\[nrfta\\] as the backslashed versions (like '\n')
1822(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1823UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1824UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1825
d2cc3551 1826The pointer to the PV of the dsv is returned.
1827
1828=cut */
e6b2e755 1829char *
e1ec3a88 1830Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755 1831{
1832 int truncated = 0;
e1ec3a88 1833 const char *s, *e;
e6b2e755 1834
1835 sv_setpvn(dsv, "", 0);
e1ec3a88 1836 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1837 UV u;
a49f32c6 1838 /* This serves double duty as a flag and a character to print after
1839 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1840 */
1841 char ok = 0;
c728cb41 1842
e6b2e755 1843 if (pvlim && SvCUR(dsv) >= pvlim) {
1844 truncated++;
1845 break;
1846 }
1847 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1848 if (u < 256) {
a3b680e6 1849 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1850 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1851 switch (c) {
c728cb41 1852 case '\n':
a49f32c6 1853 ok = 'n'; break;
c728cb41 1854 case '\r':
a49f32c6 1855 ok = 'r'; break;
c728cb41 1856 case '\t':
a49f32c6 1857 ok = 't'; break;
c728cb41 1858 case '\f':
a49f32c6 1859 ok = 'f'; break;
c728cb41 1860 case '\a':
a49f32c6 1861 ok = 'a'; break;
c728cb41 1862 case '\\':
a49f32c6 1863 ok = '\\'; break;
c728cb41 1864 default: break;
1865 }
a49f32c6 1866 if (ok) {
1867 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1868 }
c728cb41 1869 }
00e86452 1870 /* isPRINT() is the locale-blind version. */
a49f32c6 1871 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1872 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1873 ok = 1;
0a2ef054 1874 }
c728cb41 1875 }
1876 if (!ok)
9e55ce06 1877 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1878 }
1879 if (truncated)
1880 sv_catpvn(dsv, "...", 3);
1881
1882 return SvPVX(dsv);
1883}
2b9d42f0 1884
d2cc3551 1885/*
1886=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1887
1888Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1889the displayable version being at most pvlim bytes long
d2cc3551 1890(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1891
1892The flags argument is as in pv_uni_display().
1893
d2cc3551 1894The pointer to the PV of the dsv is returned.
1895
1896=cut */
e6b2e755 1897char *
1898Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1899{
cfd0369c 1900 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1901 SvCUR(ssv), pvlim, flags);
701a277b 1902}
1903
d2cc3551 1904/*
d07ddd77 1905=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1906
1907Return true if the strings s1 and s2 differ case-insensitively, false
1908if not (if they are equal case-insensitively). If u1 is true, the
1909string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1910the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1911are false, the respective string is assumed to be in native 8-bit
1912encoding.
1913
1914If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1915in there (they will point at the beginning of the I<next> character).
1916If the pointers behind pe1 or pe2 are non-NULL, they are the end
1917pointers beyond which scanning will not continue under any
4cdaeff7 1918circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77 1919s2+l2 will be used as goal end pointers that will also stop the scan,
1920and which qualify towards defining a successful match: all the scans
1921that define an explicit length must reach their goal pointers for
1922a match to succeed).
d2cc3551 1923
1924For case-insensitiveness, the "casefolding" of Unicode is used
1925instead of upper/lowercasing both the characters, see
1926http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1927
1928=cut */
701a277b 1929I32
d07ddd77 1930Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1931{
e1ec3a88 1932 register const U8 *p1 = (const U8*)s1;
1933 register const U8 *p2 = (const U8*)s2;
1934 register const U8 *f1 = 0, *f2 = 0;
1935 register U8 *e1 = 0, *q1 = 0;
1936 register U8 *e2 = 0, *q2 = 0;
d07ddd77 1937 STRLEN n1 = 0, n2 = 0;
89ebb4a3 1938 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1939 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8 1940 U8 natbuf[1+1];
1941 STRLEN foldlen1, foldlen2;
d07ddd77 1942 bool match;
332ddc25 1943
d07ddd77 1944 if (pe1)
1945 e1 = *(U8**)pe1;
e1ec3a88 1946 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1947 f1 = (const U8*)s1 + l1;
d07ddd77 1948 if (pe2)
1949 e2 = *(U8**)pe2;
e1ec3a88 1950 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1951 f2 = (const U8*)s2 + l2;
d07ddd77 1952
1953 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1954 return 1; /* mismatch; possible infinite loop or false positive */
1955
a6872d42 1956 if (!u1 || !u2)
1957 natbuf[1] = 0; /* Need to terminate the buffer. */
1958
d07ddd77 1959 while ((e1 == 0 || p1 < e1) &&
1960 (f1 == 0 || p1 < f1) &&
1961 (e2 == 0 || p2 < e2) &&
1962 (f2 == 0 || p2 < f2)) {
1963 if (n1 == 0) {
d7f013c8 1964 if (u1)
1965 to_utf8_fold(p1, foldbuf1, &foldlen1);
1966 else {
809e8e66 1967 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8 1968 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1969 }
1970 q1 = foldbuf1;
d07ddd77 1971 n1 = foldlen1;
332ddc25 1972 }
d07ddd77 1973 if (n2 == 0) {
d7f013c8 1974 if (u2)
1975 to_utf8_fold(p2, foldbuf2, &foldlen2);
1976 else {
809e8e66 1977 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8 1978 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1979 }
1980 q2 = foldbuf2;
d07ddd77 1981 n2 = foldlen2;
332ddc25 1982 }
d07ddd77 1983 while (n1 && n2) {
1984 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1985 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1986 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1987 return 1; /* mismatch */
d07ddd77 1988 n1 -= UTF8SKIP(q1);
d7f013c8 1989 q1 += UTF8SKIP(q1);
d07ddd77 1990 n2 -= UTF8SKIP(q2);
d7f013c8 1991 q2 += UTF8SKIP(q2);
701a277b 1992 }
d07ddd77 1993 if (n1 == 0)
d7f013c8 1994 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1995 if (n2 == 0)
d7f013c8 1996 p2 += u2 ? UTF8SKIP(p2) : 1;
1997
d2cc3551 1998 }
5469e704 1999
d07ddd77 2000 /* A match is defined by all the scans that specified
2001 * an explicit length reaching their final goals. */
2002 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 2003
2004 if (match) {
d07ddd77 2005 if (pe1)
2006 *pe1 = (char*)p1;
2007 if (pe2)
2008 *pe2 = (char*)p2;
5469e704 2009 }
2010
2011 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2012}
701a277b 2013
a49f32c6 2014/*
2015 * Local variables:
2016 * c-indentation-style: bsd
2017 * c-basic-offset: 4
2018 * indent-tabs-mode: t
2019 * End:
2020 *
37442d52 2021 * ex: set ts=8 sts=4 sw=4 noet:
2022 */