Updated to Pod::Simple 3.11 from CPAN [perl #71004]
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3 5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550 12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3 17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3 24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550 27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3 29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
34
a0c21aa1 35#ifndef EBCDIC
36/* Separate prototypes are needed because on ASCII systems these are
37 * usually macros, but they are still compiled as real functions, too. */
38PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
40#endif
41
/* Warning text for a UTF-8 sequence that is cut short by the end of the
 * string; Perl_utf8_length() in this file emits it. */
27da23d5 42static const char unees[] =
43 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 44
ccfc67b7 45/*
46=head1 Unicode Support
a0ed51b3 47
166f8a29 48This file contains various utility functions for manipulating UTF8-encoded
49strings. For the uninitiated, this is a method of representing arbitrary
61296642 50Unicode characters as a variable number of bytes, in such a way that
56da48f7 51characters in the ASCII range are unmodified, and a zero byte never appears
52within non-zero characters.
166f8a29 53
eaf7a4d2 54=cut
55*/
56
57/*
58=for apidoc is_ascii_string
59
60Returns true if first C<len> bytes of the given string are ASCII (i.e. none
61of them even raise the question of UTF-8-ness).
62
63See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
64
65=cut
66*/
67
68bool
668b6d8d 69Perl_is_ascii_string(const U8 *s, STRLEN len)
eaf7a4d2 70{
71 const U8* const send = s + (len ? len : strlen((const char *)s));
72 const U8* x = s;
73
74 PERL_ARGS_ASSERT_IS_ASCII_STRING;
eaf7a4d2 75
76 for (; x < send; ++x) {
77 if (!UTF8_IS_INVARIANT(*x))
78 break;
79 }
80
81 return x == send;
82}
83
84/*
87cea99e 85=for apidoc uvuni_to_utf8_flags
eebe1485 86
1e54db1a 87Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 88of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
eebe1485 89bytes available. The return value is the pointer to the byte after the
9041c2e3 90end of the new character. In other words,
eebe1485 91
b851fbc1 92 d = uvuni_to_utf8_flags(d, uv, flags);
93
94or, in most cases,
95
9041c2e3 96 d = uvuni_to_utf8(d, uv);
eebe1485 97
b851fbc1 98(which is equivalent to)
99
100 d = uvuni_to_utf8_flags(d, uv, 0);
101
eebe1485 102is the recommended Unicode-aware way of saying
103
104 *(d++) = uv;
105
106=cut
107*/
108
dfe13c55 109U8 *
b851fbc1 110Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 111{
7918f24d 112 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
113
62961d2e 114 if (ckWARN(WARN_UTF8)) {
b851fbc1 115 if (UNICODE_IS_SURROGATE(uv) &&
116 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 117 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 118 else if (
119 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
120 !(flags & UNICODE_ALLOW_FDD0))
121 ||
c867b360 122 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 123 !(flags & UNICODE_ALLOW_FFFF))) &&
124 /* UNICODE_ALLOW_SUPER includes
2a20b9da 125 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 126 ((uv <= PERL_UNICODE_MAX) ||
127 !(flags & UNICODE_ALLOW_SUPER))
128 )
9014280d 129 Perl_warner(aTHX_ packWARN(WARN_UTF8),
6f6ac1de 130 "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
507b9800 131 }
c4d5f83a 132 if (UNI_IS_INVARIANT(uv)) {
eb160463 133 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3 134 return d;
135 }
2d331972 136#if defined(EBCDIC)
1d72bdf6 137 else {
138 STRLEN len = UNISKIP(uv);
139 U8 *p = d+len-1;
140 while (p > d) {
eb160463 141 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6 142 uv >>= UTF_ACCUMULATION_SHIFT;
143 }
eb160463 144 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6 145 return d+len;
146 }
147#else /* Non loop style */
a0ed51b3 148 if (uv < 0x800) {
eb160463 149 *d++ = (U8)(( uv >> 6) | 0xc0);
150 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 151 return d;
152 }
153 if (uv < 0x10000) {
eb160463 154 *d++ = (U8)(( uv >> 12) | 0xe0);
155 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
156 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 157 return d;
158 }
159 if (uv < 0x200000) {
eb160463 160 *d++ = (U8)(( uv >> 18) | 0xf0);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 164 return d;
165 }
166 if (uv < 0x4000000) {
eb160463 167 *d++ = (U8)(( uv >> 24) | 0xf8);
168 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
169 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
170 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
171 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 172 return d;
173 }
174 if (uv < 0x80000000) {
eb160463 175 *d++ = (U8)(( uv >> 30) | 0xfc);
176 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
177 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
178 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
179 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
180 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 181 return d;
182 }
6b8eaf93 183#ifdef HAS_QUAD
d7578b48 184 if (uv < UTF8_QUAD_MAX)
a0ed51b3 185#endif
186 {
eb160463 187 *d++ = 0xfe; /* Can't match U+FEFF! */
188 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
189 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
190 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
191 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
192 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
193 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 194 return d;
195 }
6b8eaf93 196#ifdef HAS_QUAD
a0ed51b3 197 {
eb160463 198 *d++ = 0xff; /* Can't match U+FFFE! */
199 *d++ = 0x80; /* 6 Reserved bits */
200 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
201 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
202 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
203 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
204 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
205 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
206 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
207 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
208 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
209 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
210 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 211 return d;
212 }
213#endif
1d72bdf6 214#endif /* Loop style */
a0ed51b3 215}
9041c2e3 216
646ca15d 217/*
218
219Tests if some arbitrary number of bytes begins in a valid UTF-8
220character. Note that an INVARIANT (i.e. ASCII) character is a valid
221UTF-8 character. The actual number of bytes in the UTF-8 character
222will be returned if it is valid, otherwise 0.
223
224This is the "slow" version as opposed to the "fast" version which is
225the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
226difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
227or less you should use the IS_UTF8_CHAR(), for lengths of five or more
228you should use the _slow(). In practice this means that the _slow()
229will be used very rarely, since the maximum Unicode code point (as of
230Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
231the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
232five bytes or more.
233
234=cut */
c053b435 235STATIC STRLEN
5f66b61c 236S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d 237{
238 U8 u = *s;
239 STRLEN slen;
240 UV uv, ouv;
241
7918f24d 242 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
243
646ca15d 244 if (UTF8_IS_INVARIANT(u))
245 return 1;
246
247 if (!UTF8_IS_START(u))
248 return 0;
249
250 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
251 return 0;
252
253 slen = len - 1;
254 s++;
77263263 255#ifdef EBCDIC
256 u = NATIVE_TO_UTF(u);
257#endif
646ca15d 258 u &= UTF_START_MASK(len);
259 uv = u;
260 ouv = uv;
261 while (slen--) {
262 if (!UTF8_IS_CONTINUATION(*s))
263 return 0;
264 uv = UTF8_ACCUMULATE(uv, *s);
265 if (uv < ouv)
266 return 0;
267 ouv = uv;
268 s++;
269 }
270
271 if ((STRLEN)UNISKIP(uv) < len)
272 return 0;
273
274 return len;
275}
9041c2e3 276
277/*
87cea99e 278=for apidoc is_utf8_char
eebe1485 279
5da9da9e 280Tests if some arbitrary number of bytes begins in a valid UTF-8
2bbc8d55 281character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
282character is a valid UTF-8 character. The actual number of bytes in the UTF-8
283character will be returned if it is valid, otherwise 0.
9041c2e3 284
82686b01 285=cut */
067a85ef 286STRLEN
668b6d8d 287Perl_is_utf8_char(const U8 *s)
386d01d6 288{
44f8325f 289 const STRLEN len = UTF8SKIP(s);
7918f24d 290
291 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
3b0fc154 292#ifdef IS_UTF8_CHAR
768c67ee 293 if (IS_UTF8_CHAR_FAST(len))
3b0fc154 294 return IS_UTF8_CHAR(s, len) ? len : 0;
295#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 296 return is_utf8_char_slow(s, len);
386d01d6 297}
298
eaf7a4d2 299
6662521e 300/*
87cea99e 301=for apidoc is_utf8_string
6662521e 302
c9ada85f 303Returns true if first C<len> bytes of the given string form a valid
1e54db1a 304UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
305not mean 'a string that contains code points above 0x7F encoded in UTF-8'
306because a valid ASCII string is a valid UTF-8 string.
6662521e 307
eaf7a4d2 308See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
768c67ee 309
6662521e 310=cut
311*/
312
8e84507e 313bool
668b6d8d 314Perl_is_utf8_string(const U8 *s, STRLEN len)
6662521e 315{
35da51f7 316 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 317 const U8* x = s;
067a85ef 318
7918f24d 319 PERL_ARGS_ASSERT_IS_UTF8_STRING;
1aa99e6b 320
6662521e 321 while (x < send) {
a3b680e6 322 STRLEN c;
1acdb0da 323 /* Inline the easy bits of is_utf8_char() here for speed... */
324 if (UTF8_IS_INVARIANT(*x))
325 c = 1;
326 else if (!UTF8_IS_START(*x))
768c67ee 327 goto out;
1acdb0da 328 else {
329 /* ... and call is_utf8_char() only if really needed. */
646ca15d 330#ifdef IS_UTF8_CHAR
331 c = UTF8SKIP(x);
768c67ee 332 if (IS_UTF8_CHAR_FAST(c)) {
333 if (!IS_UTF8_CHAR(x, c))
3c614e38 334 c = 0;
335 }
336 else
337 c = is_utf8_char_slow(x, c);
646ca15d 338#else
339 c = is_utf8_char(x);
340#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 341 if (!c)
768c67ee 342 goto out;
1acdb0da 343 }
6662521e 344 x += c;
6662521e 345 }
768c67ee 346
347 out:
60006e79 348 if (x != send)
349 return FALSE;
067a85ef 350
351 return TRUE;
6662521e 352}
353
67e989fb 354/*
814fafa7 355Implemented as a macro in utf8.h
356
87cea99e 357=for apidoc is_utf8_string_loc
814fafa7 358
359Like is_utf8_string() but stores the location of the failure (in the
360case of "utf8ness failure") or the location s+len (in the case of
361"utf8ness success") in the C<ep>.
362
363See also is_utf8_string_loclen() and is_utf8_string().
364
87cea99e 365=for apidoc is_utf8_string_loclen
81cd54e3 366
e3e4599f 367Like is_utf8_string() but stores the location of the failure (in the
768c67ee 368case of "utf8ness failure") or the location s+len (in the case of
369"utf8ness success") in the C<ep>, and the number of UTF-8
370encoded characters in the C<el>.
371
372See also is_utf8_string_loc() and is_utf8_string().
81cd54e3 373
374=cut
375*/
376
377bool
668b6d8d 378Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 379{
35da51f7 380 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 381 const U8* x = s;
81cd54e3 382 STRLEN c;
3ebfea28 383 STRLEN outlen = 0;
7918f24d 384
385 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
81cd54e3 386
81cd54e3 387 while (x < send) {
388 /* Inline the easy bits of is_utf8_char() here for speed... */
389 if (UTF8_IS_INVARIANT(*x))
768c67ee 390 c = 1;
391 else if (!UTF8_IS_START(*x))
392 goto out;
81cd54e3 393 else {
768c67ee 394 /* ... and call is_utf8_char() only if really needed. */
395#ifdef IS_UTF8_CHAR
396 c = UTF8SKIP(x);
397 if (IS_UTF8_CHAR_FAST(c)) {
398 if (!IS_UTF8_CHAR(x, c))
399 c = 0;
400 } else
401 c = is_utf8_char_slow(x, c);
402#else
403 c = is_utf8_char(x);
404#endif /* #ifdef IS_UTF8_CHAR */
405 if (!c)
406 goto out;
81cd54e3 407 }
768c67ee 408 x += c;
3ebfea28 409 outlen++;
81cd54e3 410 }
768c67ee 411
412 out:
3ebfea28 413 if (el)
414 *el = outlen;
415
768c67ee 416 if (ep)
417 *ep = x;
3ebfea28 418 return (x == send);
81cd54e3 419}
420
421/*
768c67ee 422
87cea99e 423=for apidoc utf8n_to_uvuni
67e989fb 424
9041c2e3 425Bottom level UTF-8 decode routine.
38a44b82 426Returns the Unicode code point value of the first character in the string C<s>
1e54db1a 427which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 428C<retlen> will be set to the length, in bytes, of that character.
67e989fb 429
1e54db1a 430If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880 431is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
432it is assumed that the caller will raise a warning, and this function
28d3d195 433will silently just set C<retlen> to C<-1> and return zero. If the
434C<flags> does not contain UTF8_CHECK_ONLY, warnings about
435malformations will be given, C<retlen> will be set to the expected
436length of the UTF-8 character in bytes, and zero will be returned.
437
438The C<flags> can also contain various flags to allow deviations from
439the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 440
9041c2e3 441Most code should use utf8_to_uvchr() rather than call this directly.
442
37607a96 443=cut
444*/
67e989fb 445
/* Decodes the UTF-8 character at s, which may occupy at most curlen bytes.
 *
 * Success: returns the decoded value; *retlen (when retlen is non-NULL) is
 * the length derived from the start byte, or 1 for an invariant byte.
 *
 * Failure: every malformation jumps to the "malformed" label with a
 * UTF8_WARN_* code in 'warning'.  There, with UTF8_CHECK_ONLY in flags,
 * *retlen becomes (STRLEN)-1 and 0 is returned silently.  Otherwise a
 * warning is issued if 'dowarn' is true, *retlen becomes expectlen (or len
 * when expectlen is still 0), and 0 is returned.
 *
 * Each check is skipped when its UTF8_ALLOW_* bit is set in flags, except
 * the overflow check inside the loop, which is unconditional. */
a0ed51b3 446UV
7fc63493 447Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 448{
97aff369 449 dVAR;
d4c19fe8 450 const U8 * const s0 = s;
/* NOTE(review): *s is read here and for startbyte below, before the
 * curlen == 0 test further down has run. */
9c5ffd7c 451 UV uv = *s, ouv = 0;
ba210ebe 452 STRLEN len = 1;
7fc63493 453 const bool dowarn = ckWARN_d(WARN_UTF8);
454 const UV startbyte = *s;
ba210ebe 455 STRLEN expectlen = 0;
a0dbb045 456 U32 warning = 0;
457
7918f24d 458 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
459
a0dbb045 460/* This list is a superset of the UTF8_ALLOW_XXX. */
/* These codes select the message built in the switch under "malformed".
 * They are not #undef'd afterwards, so they stay defined for the rest of
 * the translation unit. */
461
462#define UTF8_WARN_EMPTY 1
463#define UTF8_WARN_CONTINUATION 2
464#define UTF8_WARN_NON_CONTINUATION 3
465#define UTF8_WARN_FE_FF 4
466#define UTF8_WARN_SHORT 5
467#define UTF8_WARN_OVERFLOW 6
468#define UTF8_WARN_SURROGATE 7
c867b360 469#define UTF8_WARN_LONG 8
470#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 471
472 if (curlen == 0 &&
473 !(flags & UTF8_ALLOW_EMPTY)) {
474 warning = UTF8_WARN_EMPTY;
0c443dc2 475 goto malformed;
476 }
477
/* Fast path: an invariant byte is a complete one-byte character. */
1d72bdf6 478 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 479 if (retlen)
480 *retlen = 1;
c4d5f83a 481 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 482 }
67e989fb 483
/* A continuation byte cannot begin a character. */
421a8bf2 484 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 485 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 486 warning = UTF8_WARN_CONTINUATION;
ba210ebe 487 goto malformed;
488 }
489
/* Early look at the byte right after the start byte; s still equals s0
 * here, which the message code below uses to pick its wording. */
421a8bf2 490 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 491 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 492 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 493 goto malformed;
494 }
9041c2e3 495
1d72bdf6 496#ifdef EBCDIC
75383841 497 uv = NATIVE_TO_UTF(uv);
1d72bdf6 498#else
fcc8fcf6 499 if ((uv == 0xfe || uv == 0xff) &&
500 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 501 warning = UTF8_WARN_FE_FF;
ba210ebe 502 goto malformed;
a0ed51b3 503 }
1d72bdf6 504#endif
505
/* Derive the sequence length from the start byte and reduce uv to that
 * byte's payload bits. */
ba210ebe 506 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
507 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
508 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
509 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 510#ifdef EBCDIC
511 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
512 else { len = 7; uv &= 0x01; }
513#else
ba210ebe 514 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
515 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 516 else { len = 13; uv = 0; } /* whoa! */
517#endif
518
/* *retlen is set now; the malformed path overwrites it if a later check
 * fails. */
a0ed51b3 519 if (retlen)
520 *retlen = len;
9041c2e3 521
ba210ebe 522 expectlen = len;
523
fcc8fcf6 524 if ((curlen < expectlen) &&
525 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 526 warning = UTF8_WARN_SHORT;
ba210ebe 527 goto malformed;
528 }
529
/* From here on len counts the continuation bytes still to be consumed. */
530 len--;
a0ed51b3 531 s++;
ba210ebe 532 ouv = uv;
533
/* NOTE(review): with UTF8_ALLOW_SHORT set and curlen < expectlen this loop
 * still consumes expectlen - 1 bytes -- confirm callers guarantee them. */
a0ed51b3 534 while (len--) {
421a8bf2 535 if (!UTF8_IS_CONTINUATION(*s) &&
536 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
/* Step back one byte so that s[1] in the message code below is the
 * offending byte. */
a0dbb045 537 s--;
538 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 539 goto malformed;
a0ed51b3 540 }
541 else
8850bf83 542 uv = UTF8_ACCUMULATE(uv, *s);
/* uv == ouv is reported as an overlong form unless expectlen is 13 or
 * UTF8_ALLOW_LONG is set; uv < ouv is always reported as overflow. */
a0dbb045 543 if (!(uv > ouv)) {
544 /* These cannot be allowed. */
545 if (uv == ouv) {
75dbc644 546 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 547 warning = UTF8_WARN_LONG;
548 goto malformed;
549 }
550 }
551 else { /* uv < ouv */
552 /* This cannot be allowed. */
553 warning = UTF8_WARN_OVERFLOW;
554 goto malformed;
555 }
ba210ebe 556 }
557 s++;
558 ouv = uv;
559 }
560
/* Checks on the fully decoded value: surrogate, more bytes than UNISKIP(uv)
 * requires, and code points matched by UNICODE_IS_ILLEGAL(). */
421a8bf2 561 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 562 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 563 warning = UTF8_WARN_SURROGATE;
ba210ebe 564 goto malformed;
eb160463 565 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 566 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 567 warning = UTF8_WARN_LONG;
ba210ebe 568 goto malformed;
421a8bf2 569 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 570 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 571 warning = UTF8_WARN_FFFF;
a9917092 572 goto malformed;
a0ed51b3 573 }
ba210ebe 574
a0ed51b3 575 return uv;
ba210ebe 576
577malformed:
578
/* UTF8_CHECK_ONLY: no warning; the failure is signalled through *retlen. */
fcc8fcf6 579 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 580 if (retlen)
10edeb5d 581 *retlen = ((STRLEN) -1);
ba210ebe 582 return 0;
583 }
584
a0dbb045 585 if (dowarn) {
/* The message is assembled in an SV created with SVs_TEMP. */
84bafc02 586 SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
a0dbb045 587
588 switch (warning) {
589 case 0: /* Intentionally empty. */ break;
590 case UTF8_WARN_EMPTY:
396482e1 591 sv_catpvs(sv, "(empty string)");
a0dbb045 592 break;
593 case UTF8_WARN_CONTINUATION:
097fb8e2 594 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 595 break;
596 case UTF8_WARN_NON_CONTINUATION:
/* s == s0 means the byte right after the start byte was at fault. */
097fb8e2 597 if (s == s0)
598 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
599 (UV)s[1], startbyte);
551405c4 600 else {
/* This 'len' shadows the function-level len inside this block only. */
601 const int len = (int)(s-s0);
097fb8e2 602 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4 603 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
604 }
605
a0dbb045 606 break;
607 case UTF8_WARN_FE_FF:
608 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
609 break;
610 case UTF8_WARN_SHORT:
097fb8e2 611 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 612 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
/* This adjustment happens only on this path, i.e. only when dowarn is
 * true. */
b31f83c2 613 expectlen = curlen; /* distance for caller to skip */
a0dbb045 614 break;
615 case UTF8_WARN_OVERFLOW:
097fb8e2 616 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
617 ouv, *s, startbyte);
a0dbb045 618 break;
619 case UTF8_WARN_SURROGATE:
620 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
621 break;
a0dbb045 622 case UTF8_WARN_LONG:
097fb8e2 623 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 624 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 625 break;
626 case UTF8_WARN_FFFF:
627 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
628 break;
629 default:
396482e1 630 sv_catpvs(sv, "(unknown reason)");
a0dbb045 631 break;
632 }
633
634 if (warning) {
/* This 's' shadows the parameter s; here it is the message text. */
44f8325f 635 const char * const s = SvPVX_const(sv);
a0dbb045 636
637 if (PL_op)
9014280d 638 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 639 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 640 else
9014280d 641 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 642 }
643 }
644
/* Report how far the caller should advance: expectlen when it is non-zero,
 * otherwise the current value of len. */
ba210ebe 645 if (retlen)
28d3d195 646 *retlen = expectlen ? expectlen : len;
ba210ebe 647
28d3d195 648 return 0;
a0ed51b3 649}
650
8e84507e 651/*
87cea99e 652=for apidoc utf8_to_uvchr
9041c2e3 653
654Returns the native character value of the first character in the string C<s>
1e54db1a 655which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 656length, in bytes, of that character.
657
1e54db1a 658If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3 659returned and retlen is set, if possible, to -1.
660
661=cut
662*/
663
664UV
7fc63493 665Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 666{
7918f24d 667 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
668
1754c1a1 669 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
670 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 671}
672
673/*
87cea99e 674=for apidoc utf8_to_uvuni
9041c2e3 675
676Returns the Unicode code point of the first character in the string C<s>
1e54db1a 677which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 678length, in bytes, of that character.
679
2bbc8d55 680This function should only be used when the returned UV is considered
9041c2e3 681an index into the Unicode semantic tables (e.g. swashes).
682
1e54db1a 683If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 684returned and retlen is set, if possible, to -1.
8e84507e 685
686=cut
687*/
688
689UV
7fc63493 690Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 691{
7918f24d 692 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
693
9041c2e3 694 /* Call the low level routine asking for checks */
89ebb4a3 695 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 696 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 697}
698
b76347f2 699/*
87cea99e 700=for apidoc utf8_length
b76347f2 701
702Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 703Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
704up past C<e>, a warning is raised (it does not croak).
b76347f2 705
706=cut
707*/
708
709STRLEN
35a4481c 710Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 711{
97aff369 712 dVAR;
b76347f2 713 STRLEN len = 0;
714
7918f24d 715 PERL_ARGS_ASSERT_UTF8_LENGTH;
716
8850bf83 717 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
718 * the bitops (especially ~) can create illegal UTF-8.
719 * In other words: in Perl UTF-8 is not just for Unicode. */
720
a3b680e6 721 if (e < s)
722 goto warn_and_return;
b76347f2 723 while (s < e) {
8e91ec7f 724 if (!UTF8_IS_INVARIANT(*s))
725 s += UTF8SKIP(s);
726 else
727 s++;
728 len++;
729 }
730
731 if (e != s) {
732 len--;
733 warn_and_return:
9b387841 734 if (PL_op)
735 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
736 "%s in %s", unees, OP_DESC(PL_op));
737 else
738 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
b76347f2 739 }
740
741 return len;
742}
743
b06226ff 744/*
87cea99e 745=for apidoc utf8_distance
b06226ff 746
1e54db1a 747Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff 748and C<b>.
749
750WARNING: use only if you *know* that the pointers point inside the
751same UTF-8 buffer.
752
37607a96 753=cut
754*/
a0ed51b3 755
02eb7b47 756IV
35a4481c 757Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 758{
7918f24d 759 PERL_ARGS_ASSERT_UTF8_DISTANCE;
760
bf1665bc 761 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3 762}
763
b06226ff 764/*
87cea99e 765=for apidoc utf8_hop
b06226ff 766
8850bf83 767Return the UTF-8 pointer C<s> displaced by C<off> characters, either
768forward or backward.
b06226ff 769
770WARNING: do not use the following unless you *know* C<off> is within
8850bf83 771the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
772on the first byte of character or just after the last byte of a character.
b06226ff 773
37607a96 774=cut
775*/
a0ed51b3 776
777U8 *
4373e329 778Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 779{
7918f24d 780 PERL_ARGS_ASSERT_UTF8_HOP;
781
96a5add6 782 PERL_UNUSED_CONTEXT;
8850bf83 783 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
784 * the bitops (especially ~) can create illegal UTF-8.
785 * In other words: in Perl UTF-8 is not just for Unicode. */
786
a0ed51b3 787 if (off >= 0) {
788 while (off--)
789 s += UTF8SKIP(s);
790 }
791 else {
792 while (off++) {
793 s--;
8850bf83 794 while (UTF8_IS_CONTINUATION(*s))
795 s--;
a0ed51b3 796 }
797 }
4373e329 798 return (U8 *)s;
a0ed51b3 799}
800
6940069f 801/*
87cea99e 802=for apidoc utf8_to_bytes
6940069f 803
2bbc8d55 804Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
246fae53 805Unlike C<bytes_to_utf8>, this over-writes the original string, and
806updates len to contain the new length.
67e989fb 807Returns zero on failure, setting C<len> to -1.
6940069f 808
95be277c 809If you need a copy of the string, see C<bytes_from_utf8>.
810
6940069f 811=cut
812*/
813
814U8 *
37607a96 815Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 816{
d4c19fe8 817 U8 * const save = s;
818 U8 * const send = s + *len;
6940069f 819 U8 *d;
246fae53 820
7918f24d 821 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
822
1e54db1a 823 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 824 while (s < send) {
dcad2880 825 U8 c = *s++;
826
1d72bdf6 827 if (!UTF8_IS_INVARIANT(c) &&
828 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
829 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
10edeb5d 830 *len = ((STRLEN) -1);
dcad2880 831 return 0;
832 }
246fae53 833 }
dcad2880 834
835 d = s = save;
6940069f 836 while (s < send) {
ed646e6e 837 STRLEN ulen;
9041c2e3 838 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 839 s += ulen;
6940069f 840 }
841 *d = '\0';
246fae53 842 *len = d - save;
6940069f 843 return save;
844}
845
846/*
87cea99e 847=for apidoc bytes_from_utf8
f9a63242 848
2bbc8d55 849Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
35a4481c 850Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 851the newly-created string, and updates C<len> to contain the new
852length. Returns the original string if no conversion occurs, C<len>
853is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55 8540 if C<s> is converted or consisted entirely of characters that are invariant
855in utf8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 856
37607a96 857=cut
858*/
f9a63242 859
860U8 *
e1ec3a88 861Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 862{
f9a63242 863 U8 *d;
e1ec3a88 864 const U8 *start = s;
865 const U8 *send;
f9a63242 866 I32 count = 0;
867
7918f24d 868 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
869
96a5add6 870 PERL_UNUSED_CONTEXT;
f9a63242 871 if (!*is_utf8)
73d840c0 872 return (U8 *)start;
f9a63242 873
1e54db1a 874 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 875 for (send = s + *len; s < send;) {
e1ec3a88 876 U8 c = *s++;
1d72bdf6 877 if (!UTF8_IS_INVARIANT(c)) {
db42d148 878 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
879 (c = *s++) && UTF8_IS_CONTINUATION(c))
880 count++;
881 else
73d840c0 882 return (U8 *)start;
db42d148 883 }
f9a63242 884 }
885
35da51f7 886 *is_utf8 = FALSE;
f9a63242 887
212542aa 888 Newx(d, (*len) - count + 1, U8);
ef9edfd0 889 s = start; start = d;
f9a63242 890 while (s < send) {
891 U8 c = *s++;
c4d5f83a 892 if (!UTF8_IS_INVARIANT(c)) {
893 /* Then it is two-byte encoded */
894 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
895 c = ASCII_TO_NATIVE(c);
896 }
897 *d++ = c;
f9a63242 898 }
899 *d = '\0';
900 *len = d - start;
73d840c0 901 return (U8 *)start;
f9a63242 902}
903
904/*
87cea99e 905=for apidoc bytes_to_utf8
6940069f 906
2bbc8d55 907Converts a string C<s> of length C<len> from the native encoding into UTF-8.
6662521e 908Returns a pointer to the newly-created string, and sets C<len> to
909reflect the new length.
6940069f 910
2bbc8d55 911A NUL character will be written after the end of the string.
912
913If you want to convert to UTF-8 from encodings other than
914the native (Latin1 or EBCDIC),
c9ada85f 915see sv_recode_to_utf8().
916
497711e7 917=cut
6940069f 918*/
919
920U8*
35a4481c 921Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 922{
35a4481c 923 const U8 * const send = s + (*len);
6940069f 924 U8 *d;
925 U8 *dst;
7918f24d 926
927 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 928 PERL_UNUSED_CONTEXT;
6940069f 929
212542aa 930 Newx(d, (*len) * 2 + 1, U8);
6940069f 931 dst = d;
932
933 while (s < send) {
35a4481c 934 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 935 if (UNI_IS_INVARIANT(uv))
eb160463 936 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 937 else {
eb160463 938 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
939 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f 940 }
941 }
942 *d = '\0';
6662521e 943 *len = d-dst;
6940069f 944 return dst;
945}
946
a0ed51b3 947/*
dea0fc0b 948 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 949 *
950 * Destination must be pre-extended to 3/2 source. Do not use in-place.
951 * We optimize for native, for obvious reasons. */
952
953U8*
dea0fc0b 954Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 955{
dea0fc0b 956 U8* pend;
957 U8* dstart = d;
958
7918f24d 959 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
960
dea0fc0b 961 if (bytelen & 1)
f5992bc4 962 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b 963
964 pend = p + bytelen;
965
a0ed51b3 966 while (p < pend) {
dea0fc0b 967 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
968 p += 2;
a0ed51b3 969 if (uv < 0x80) {
e294cc5d 970#ifdef EBCDIC
971 *d++ = UNI_TO_NATIVE(uv);
972#else
eb160463 973 *d++ = (U8)uv;
e294cc5d 974#endif
a0ed51b3 975 continue;
976 }
977 if (uv < 0x800) {
eb160463 978 *d++ = (U8)(( uv >> 6) | 0xc0);
979 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 980 continue;
981 }
52b9aa85 982 if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
01ea242b 983 if (p >= pend) {
dea0fc0b 984 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
01ea242b 985 } else {
986 UV low = (p[0] << 8) + p[1];
987 p += 2;
52b9aa85 988 if (low < 0xdc00 || low > 0xdfff)
01ea242b 989 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
990 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
991 }
dbde1951 992 } else if (uv >= 0xdc00 && uv <= 0xdfff) {
993 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 994 }
995 if (uv < 0x10000) {
eb160463 996 *d++ = (U8)(( uv >> 12) | 0xe0);
997 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
998 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 999 continue;
1000 }
1001 else {
eb160463 1002 *d++ = (U8)(( uv >> 18) | 0xf0);
1003 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1004 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1005 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 1006 continue;
1007 }
1008 }
dea0fc0b 1009 *newlen = d - dstart;
a0ed51b3 1010 return d;
1011}
1012
1013/* Note: this one is slightly destructive of the source. */
1014
1015U8*
dea0fc0b 1016Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1017{
1018 U8* s = (U8*)p;
d4c19fe8 1019 U8* const send = s + bytelen;
7918f24d 1020
1021 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1022
e0ea5e2d 1023 if (bytelen & 1)
1024 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1025 (UV)bytelen);
1026
a0ed51b3 1027 while (s < send) {
d4c19fe8 1028 const U8 tmp = s[0];
a0ed51b3 1029 s[0] = s[1];
1030 s[1] = tmp;
1031 s += 2;
1032 }
dea0fc0b 1033 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 1034}
1035
1036/* for now these are all defined (inefficiently) in terms of the utf8 versions */
1037
1038bool
84afefe6 1039Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 1040{
89ebb4a3 1041 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1042 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1043 return is_utf8_alnum(tmpbuf);
1044}
1045
1046bool
84afefe6 1047Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1048{
89ebb4a3 1049 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1050 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1051 return is_utf8_idfirst(tmpbuf);
1052}
1053
1054bool
84afefe6 1055Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1056{
89ebb4a3 1057 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1058 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1059 return is_utf8_alpha(tmpbuf);
1060}
1061
1062bool
84afefe6 1063Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1064{
89ebb4a3 1065 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1066 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1067 return is_utf8_ascii(tmpbuf);
1068}
1069
1070bool
84afefe6 1071Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1072{
89ebb4a3 1073 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1074 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1075 return is_utf8_space(tmpbuf);
1076}
1077
1078bool
84afefe6 1079Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1080{
89ebb4a3 1081 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1082 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1083 return is_utf8_digit(tmpbuf);
1084}
1085
1086bool
84afefe6 1087Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1088{
89ebb4a3 1089 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1090 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1091 return is_utf8_upper(tmpbuf);
1092}
1093
1094bool
84afefe6 1095Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1096{
89ebb4a3 1097 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1098 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1099 return is_utf8_lower(tmpbuf);
1100}
1101
1102bool
84afefe6 1103Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1104{
89ebb4a3 1105 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1106 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1107 return is_utf8_cntrl(tmpbuf);
1108}
1109
1110bool
84afefe6 1111Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1112{
89ebb4a3 1113 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1114 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1115 return is_utf8_graph(tmpbuf);
1116}
1117
1118bool
84afefe6 1119Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1120{
89ebb4a3 1121 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1122 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1123 return is_utf8_print(tmpbuf);
1124}
1125
b8c5462f 1126bool
84afefe6 1127Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1128{
89ebb4a3 1129 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1130 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1131 return is_utf8_punct(tmpbuf);
1132}
1133
4d61ec05 1134bool
84afefe6 1135Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1136{
89ebb4a3 1137 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1138 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1139 return is_utf8_xdigit(tmpbuf);
1140}
1141
84afefe6 1142UV
1143Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1144{
7918f24d 1145 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1146
0ebc6274 1147 uvchr_to_utf8(p, c);
1148 return to_utf8_upper(p, p, lenp);
a0ed51b3 1149}
1150
84afefe6 1151UV
1152Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1153{
7918f24d 1154 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1155
0ebc6274 1156 uvchr_to_utf8(p, c);
1157 return to_utf8_title(p, p, lenp);
a0ed51b3 1158}
1159
84afefe6 1160UV
1161Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1162{
7918f24d 1163 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1164
0ebc6274 1165 uvchr_to_utf8(p, c);
1166 return to_utf8_lower(p, p, lenp);
a0ed51b3 1167}
1168
84afefe6 1169UV
1170Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1171{
7918f24d 1172 PERL_ARGS_ASSERT_TO_UNI_FOLD;
1173
0ebc6274 1174 uvchr_to_utf8(p, c);
1175 return to_utf8_fold(p, p, lenp);
84afefe6 1176}
1177
a0ed51b3 1178/* for now these all assume no locale info available for Unicode > 255 */
1179
1180bool
84afefe6 1181Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1182{
1183 return is_uni_alnum(c); /* XXX no locale support yet */
1184}
1185
1186bool
84afefe6 1187Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1188{
1189 return is_uni_idfirst(c); /* XXX no locale support yet */
1190}
1191
1192bool
84afefe6 1193Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1194{
1195 return is_uni_alpha(c); /* XXX no locale support yet */
1196}
1197
1198bool
84afefe6 1199Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1200{
1201 return is_uni_ascii(c); /* XXX no locale support yet */
1202}
1203
1204bool
84afefe6 1205Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1206{
1207 return is_uni_space(c); /* XXX no locale support yet */
1208}
1209
1210bool
84afefe6 1211Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1212{
1213 return is_uni_digit(c); /* XXX no locale support yet */
1214}
1215
1216bool
84afefe6 1217Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1218{
1219 return is_uni_upper(c); /* XXX no locale support yet */
1220}
1221
1222bool
84afefe6 1223Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1224{
1225 return is_uni_lower(c); /* XXX no locale support yet */
1226}
1227
1228bool
84afefe6 1229Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1230{
1231 return is_uni_cntrl(c); /* XXX no locale support yet */
1232}
1233
1234bool
84afefe6 1235Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1236{
1237 return is_uni_graph(c); /* XXX no locale support yet */
1238}
1239
1240bool
84afefe6 1241Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1242{
1243 return is_uni_print(c); /* XXX no locale support yet */
1244}
1245
b8c5462f 1246bool
84afefe6 1247Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1248{
1249 return is_uni_punct(c); /* XXX no locale support yet */
1250}
1251
4d61ec05 1252bool
84afefe6 1253Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1254{
1255 return is_uni_xdigit(c); /* XXX no locale support yet */
1256}
1257
b7ac61fa 1258U32
1259Perl_to_uni_upper_lc(pTHX_ U32 c)
1260{
ee099d14 1261 /* XXX returns only the first character -- do not use XXX */
1262 /* XXX no locale support yet */
1263 STRLEN len;
89ebb4a3 1264 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1265 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1266}
1267
1268U32
1269Perl_to_uni_title_lc(pTHX_ U32 c)
1270{
ee099d14 1271 /* XXX returns only the first character XXX -- do not use XXX */
1272 /* XXX no locale support yet */
1273 STRLEN len;
89ebb4a3 1274 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1275 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1276}
1277
1278U32
1279Perl_to_uni_lower_lc(pTHX_ U32 c)
1280{
ee099d14 1281 /* XXX returns only the first character -- do not use XXX */
1282 /* XXX no locale support yet */
1283 STRLEN len;
89ebb4a3 1284 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1285 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1286}
1287
7452cf6a 1288static bool
5141f98e 1289S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d 1290 const char *const swashname)
1291{
97aff369 1292 dVAR;
7918f24d 1293
1294 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1295
bde6a22d 1296 if (!is_utf8_char(p))
1297 return FALSE;
1298 if (!*swash)
711a919c 1299 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d 1300 return swash_fetch(*swash, p, TRUE) != 0;
1301}
1302
1303bool
7fc63493 1304Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1305{
97aff369 1306 dVAR;
7918f24d 1307
1308 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1309
671c33bf 1310 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1311 * descendant of isalnum(3), in other words, it doesn't
1312 * contain the '_'. --jhi */
d4c19fe8 1313 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3 1314}
1315
1316bool
7fc63493 1317Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1318{
97aff369 1319 dVAR;
7918f24d 1320
1321 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1322
82686b01 1323 if (*p == '_')
1324 return TRUE;
bde6a22d 1325 /* is_utf8_idstart would be more logical. */
d4c19fe8 1326 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01 1327}
1328
1329bool
7fc63493 1330Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1331{
97aff369 1332 dVAR;
7918f24d 1333
1334 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1335
82686b01 1336 if (*p == '_')
1337 return TRUE;
d4c19fe8 1338 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3 1339}
1340
1341bool
7fc63493 1342Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1343{
97aff369 1344 dVAR;
7918f24d 1345
1346 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1347
d4c19fe8 1348 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3 1349}
1350
1351bool
7fc63493 1352Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1353{
97aff369 1354 dVAR;
7918f24d 1355
1356 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1357
d4c19fe8 1358 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f 1359}
1360
1361bool
7fc63493 1362Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1363{
97aff369 1364 dVAR;
7918f24d 1365
1366 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1367
d4c19fe8 1368 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3 1369}
1370
1371bool
d1eb3177 1372Perl_is_utf8_perl_space(pTHX_ const U8 *p)
1373{
1374 dVAR;
1375
1376 PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
1377
1378 return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
1379}
1380
1381bool
1382Perl_is_utf8_perl_word(pTHX_ const U8 *p)
1383{
1384 dVAR;
1385
1386 PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
1387
1388 return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
1389}
1390
1391bool
7fc63493 1392Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1393{
97aff369 1394 dVAR;
7918f24d 1395
1396 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1397
d4c19fe8 1398 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3 1399}
1400
1401bool
d1eb3177 1402Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
1403{
1404 dVAR;
1405
1406 PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
1407
1408 return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
1409}
1410
1411bool
7fc63493 1412Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1413{
97aff369 1414 dVAR;
7918f24d 1415
1416 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1417
d4c19fe8 1418 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3 1419}
1420
1421bool
7fc63493 1422Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1423{
97aff369 1424 dVAR;
7918f24d 1425
1426 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1427
d4c19fe8 1428 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3 1429}
1430
1431bool
7fc63493 1432Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1433{
97aff369 1434 dVAR;
7918f24d 1435
1436 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1437
d4c19fe8 1438 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f 1439}
1440
1441bool
7fc63493 1442Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1443{
97aff369 1444 dVAR;
7918f24d 1445
1446 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1447
d4c19fe8 1448 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f 1449}
1450
1451bool
7fc63493 1452Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1453{
97aff369 1454 dVAR;
7918f24d 1455
1456 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1457
d4c19fe8 1458 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3 1459}
1460
1461bool
7fc63493 1462Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1463{
97aff369 1464 dVAR;
7918f24d 1465
1466 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1467
d4c19fe8 1468 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f 1469}
1470
1471bool
7fc63493 1472Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1473{
97aff369 1474 dVAR;
7918f24d 1475
1476 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1477
d1eb3177 1478 return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
b8c5462f 1479}
1480
1481bool
7fc63493 1482Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1483{
97aff369 1484 dVAR;
7918f24d 1485
1486 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1487
d4c19fe8 1488 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3 1489}
1490
6b5c0936 1491/*
87cea99e 1492=for apidoc to_utf8_case
6b5c0936 1493
1494The "p" contains the pointer to the UTF-8 string encoding
1495the character that is being converted.
1496
1497The "ustrp" is a pointer to the character buffer to put the
1498conversion result to. The "lenp" is a pointer to the length
1499of the result.
1500
0134edef 1501The "swashp" is a pointer to the swash to use.
6b5c0936 1502
0134edef 1503Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
8fe4d5b2 1504and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1505but not always, a multicharacter mapping), is tried first.
6b5c0936 1506
0134edef 1507The "special" is a string like "utf8::ToSpecLower", which means the
1508hash %utf8::ToSpecLower. The access to the hash is through
1509Perl_to_utf8_case().
6b5c0936 1510
0134edef 1511The "normal" is a string like "ToLower" which means the swash
1512%utf8::ToLower.
1513
1514=cut */
6b5c0936 1515
2104c8d9 1516UV
9a957fbc 1517Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1518 SV **swashp, const char *normal, const char *special)
a0ed51b3 1519{
97aff369 1520 dVAR;
89ebb4a3 1521 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1522 STRLEN len = 0;
aec46f14 1523 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7 1524 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1525 * are necessary in EBCDIC, they are redundant no-ops
1526 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1527 const UV uv1 = NATIVE_TO_UNI(uv0);
7918f24d 1528
1529 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1530
1feea2c7 1531 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1532
1533 if (!*swashp) /* load on-demand */
1534 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1535
b08cf34e 1536 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1537 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1538 /* It might be "special" (sometimes, but not always,
2a37f04d 1539 * a multicharacter mapping) */
6673a63c 1540 HV * const hv = get_hv(special, 0);
b08cf34e 1541 SV **svp;
1542
35da51f7 1543 if (hv &&
b08cf34e 1544 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1545 (*svp)) {
cfd0369c 1546 const char *s;
47654450 1547
cfd0369c 1548 s = SvPV_const(*svp, len);
47654450 1549 if (len == 1)
1550 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1551 else {
2f9475ad 1552#ifdef EBCDIC
1553 /* If we have EBCDIC we need to remap the characters
1554 * since any characters in the low 256 are Unicode
1555 * code points, not EBCDIC. */
7cda7a3d 1556 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1557
1558 d = tmpbuf;
b08cf34e 1559 if (SvUTF8(*svp)) {
2f9475ad 1560 STRLEN tlen = 0;
1561
1562 while (t < tend) {
d4c19fe8 1563 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad 1564 if (tlen > 0) {
1565 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1566 t += tlen;
1567 }
1568 else
1569 break;
1570 }
1571 }
1572 else {
36fec512 1573 while (t < tend) {
1574 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1575 t++;
1576 }
2f9475ad 1577 }
1578 len = d - tmpbuf;
1579 Copy(tmpbuf, ustrp, len, U8);
1580#else
d2dcd0fb 1581 Copy(s, ustrp, len, U8);
2f9475ad 1582#endif
29e98929 1583 }
983ffd37 1584 }
0134edef 1585 }
1586
1587 if (!len && *swashp) {
d4c19fe8 1588 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1589
0134edef 1590 if (uv2) {
1591 /* It was "normal" (a single character mapping). */
d4c19fe8 1592 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1593 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1594 }
1595 }
1feea2c7 1596
0134edef 1597 if (!len) /* Neither: just copy. */
1598 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1599
2a37f04d 1600 if (lenp)
1601 *lenp = len;
1602
0134edef 1603 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1604}
1605
d3e79532 1606/*
87cea99e 1607=for apidoc to_utf8_upper
d3e79532 1608
1609Convert the UTF-8 encoded character at p to its uppercase version and
1610store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1611that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1612the uppercase version may be longer than the original character.
d3e79532 1613
1614The first character of the uppercased version is returned
1615(but note, as explained above, that there may be more.)
1616
1617=cut */
1618
2104c8d9 1619UV
7fc63493 1620Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1621{
97aff369 1622 dVAR;
7918f24d 1623
1624 PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1625
983ffd37 1626 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1627 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1628}
a0ed51b3 1629
d3e79532 1630/*
87cea99e 1631=for apidoc to_utf8_title
d3e79532 1632
1633Convert the UTF-8 encoded character at p to its titlecase version and
1634store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1635that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1636titlecase version may be longer than the original character.
d3e79532 1637
1638The first character of the titlecased version is returned
1639(but note, as explained above, that there may be more.)
1640
1641=cut */
1642
983ffd37 1643UV
7fc63493 1644Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1645{
97aff369 1646 dVAR;
7918f24d 1647
1648 PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1649
983ffd37 1650 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1651 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1652}
1653
d3e79532 1654/*
87cea99e 1655=for apidoc to_utf8_lower
d3e79532 1656
1657Convert the UTF-8 encoded character at p to its lowercase version and
1658store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1659that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1660lowercase version may be longer than the original character.
d3e79532 1661
1662The first character of the lowercased version is returned
1663(but note, as explained above, that there may be more.)
1664
1665=cut */
1666
2104c8d9 1667UV
7fc63493 1668Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1669{
97aff369 1670 dVAR;
7918f24d 1671
1672 PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1673
983ffd37 1674 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1675 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1676}
1677
d3e79532 1678/*
87cea99e 1679=for apidoc to_utf8_fold
d3e79532 1680
1681Convert the UTF-8 encoded character at p to its foldcase version and
1682store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1683that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532 1684foldcase version may be longer than the original character (up to
1685three characters).
1686
1687The first character of the foldcased version is returned
1688(but note, as explained above, that there may be more.)
1689
1690=cut */
1691
b4e400f9 1692UV
7fc63493 1693Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1694{
97aff369 1695 dVAR;
7918f24d 1696
1697 PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1698
b4e400f9 1699 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1700 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1701}
1702
711a919c 1703/* Note:
1704 * A "swash" is a swatch hash.
1705 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1706 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1707 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1708 */
a0ed51b3 1709SV*
7fc63493 1710Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1711{
27da23d5 1712 dVAR;
a0ed51b3 1713 SV* retval;
8e84507e 1714 dSP;
7fc63493 1715 const size_t pkg_len = strlen(pkg);
1716 const size_t name_len = strlen(name);
da51bb9b 1717 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
f8be5cf0 1718 SV* errsv_save;
ce3b816e 1719
7918f24d 1720 PERL_ARGS_ASSERT_SWASH_INIT;
1721
96ca9f55 1722 PUSHSTACKi(PERLSI_MAGIC);
1723 ENTER;
1724 SAVEI32(PL_hints);
1725 PL_hints = 0;
1726 save_re_context();
1b026014 1727 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1728 ENTER;
f8be5cf0 1729 errsv_save = newSVsv(ERRSV);
dc0c6abb 1730 /* It is assumed that callers of this routine are not passing in any
1731 user derived data. */
1732 /* Need to do this after save_re_context() as it will set PL_tainted to
1733 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1734 Even line to create errsv_save can turn on PL_tainted. */
1735 SAVEBOOL(PL_tainted);
1736 PL_tainted = 0;
71bed85a 1737 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1738 NULL);
f8be5cf0 1739 if (!SvTRUE(ERRSV))
1740 sv_setsv(ERRSV, errsv_save);
1741 SvREFCNT_dec(errsv_save);
ce3b816e 1742 LEAVE;
1743 }
1744 SPAGAIN;
a0ed51b3 1745 PUSHMARK(SP);
1746 EXTEND(SP,5);
6e449a3a 1747 mPUSHp(pkg, pkg_len);
1748 mPUSHp(name, name_len);
a0ed51b3 1749 PUSHs(listsv);
6e449a3a 1750 mPUSHi(minbits);
1751 mPUSHi(none);
a0ed51b3 1752 PUTBACK;
f8be5cf0 1753 errsv_save = newSVsv(ERRSV);
864dbfa3 1754 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1755 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1756 else
e24b16f9 1757 retval = &PL_sv_undef;
f8be5cf0 1758 if (!SvTRUE(ERRSV))
1759 sv_setsv(ERRSV, errsv_save);
1760 SvREFCNT_dec(errsv_save);
a0ed51b3 1761 LEAVE;
1762 POPSTACK;
923e4eb5 1763 if (IN_PERL_COMPILETIME) {
623e6609 1764 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1765 }
bc45ce41 1766 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1767 if (SvPOK(retval))
35c1215d 1768 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
be2597df 1769 SVfARG(retval));
cea2e8a9 1770 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1771 }
a0ed51b3 1772 return retval;
1773}
1774
035d37be 1775
1776/* This API is wrong for special case conversions since we may need to
1777 * return several Unicode characters for a single Unicode character
1778 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1779 * the lower-level routine, and it is similarly broken for returning
1780 * multiple values. --jhi */
979f2922 1781/* Now SWASHGET is recast as S_swash_get in this file. */
680c470c 1782
1783/* Note:
1784 * Returns the value of property/mapping C<swash> for the first character
1785 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1786 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1787 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1788 */
a0ed51b3 1789UV
680c470c 1790Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1791{
27da23d5 1792 dVAR;
ef8f7699 1793 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838 1794 U32 klen;
1795 U32 off;
a0ed51b3 1796 STRLEN slen;
7d85a32c 1797 STRLEN needents;
cfd0369c 1798 const U8 *tmps = NULL;
a0ed51b3 1799 U32 bit;
979f2922 1800 SV *swatch;
3568d838 1801 U8 tmputf8[2];
35da51f7 1802 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838 1803
7918f24d 1804 PERL_ARGS_ASSERT_SWASH_FETCH;
1805
3568d838 1806 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922 1807 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1808 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1809 ptr = tmputf8;
3568d838 1810 }
1811 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1812 * then the "swatch" is a vec() for al the chars which start
1813 * with 0xAA..0xYY
1814 * So the key in the hash (klen) is length of encoded char -1
1815 */
1816 klen = UTF8SKIP(ptr) - 1;
1817 off = ptr[klen];
a0ed51b3 1818
979f2922 1819 if (klen == 0) {
7d85a32c 1820 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1821 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1822 */
979f2922 1823 needents = UTF_CONTINUATION_MARK;
1824 off = NATIVE_TO_UTF(ptr[klen]);
1825 }
1826 else {
7d85a32c 1827 /* If char is encoded then swatch is for the prefix */
979f2922 1828 needents = (1 << UTF_ACCUMULATION_SHIFT);
1829 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1830 }
7d85a32c 1831
a0ed51b3 1832 /*
1833 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1834 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1835 * it's nothing to sniff at.) Pity we usually come through at least
1836 * two function calls to get here...
1837 *
1838 * NB: this code assumes that swatches are never modified, once generated!
1839 */
1840
3568d838 1841 if (hv == PL_last_swash_hv &&
a0ed51b3 1842 klen == PL_last_swash_klen &&
27da23d5 1843 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1844 {
1845 tmps = PL_last_swash_tmps;
1846 slen = PL_last_swash_slen;
1847 }
1848 else {
1849 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1850 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1851
979f2922 1852 /* If not cached, generate it via swash_get */
1853 if (!svp || !SvPOK(*svp)
1854 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0 1855 /* We use utf8n_to_uvuni() as we want an index into
1856 Unicode tables, not a native character number.
1857 */
aec46f14 1858 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae 1859 ckWARN(WARN_UTF8) ?
1860 0 : UTF8_ALLOW_ANY);
680c470c 1861 swatch = swash_get(swash,
979f2922 1862 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1863 (klen) ? (code_point & ~(needents - 1)) : 0,
1864 needents);
1865
923e4eb5 1866 if (IN_PERL_COMPILETIME)
623e6609 1867 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1868
979f2922 1869 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1870
979f2922 1871 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1872 || (slen << 3) < needents)
660a4616 1873 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3 1874 }
1875
1876 PL_last_swash_hv = hv;
16d8f38a 1877 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 1878 PL_last_swash_klen = (U8)klen;
cfd0369c 1879 /* FIXME change interpvar.h? */
1880 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3 1881 PL_last_swash_slen = slen;
1882 if (klen)
1883 Copy(ptr, PL_last_swash_key, klen, U8);
1884 }
1885
9faf8d75 1886 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1887 case 1:
1888 bit = 1 << (off & 7);
1889 off >>= 3;
1890 return (tmps[off] & bit) != 0;
1891 case 8:
1892 return tmps[off];
1893 case 16:
1894 off <<= 1;
1895 return (tmps[off] << 8) + tmps[off + 1] ;
1896 case 32:
1897 off <<= 2;
1898 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1899 }
660a4616 1900 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
670f1322 1901 NORETURN_FUNCTION_END;
a0ed51b3 1902}
2b9d42f0 1903
979f2922 1904/* Note:
1905 * Returns a swatch (a bit vector string) for a code point sequence
1906 * that starts from the value C<start> and comprises the number C<span>.
1907 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1908 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1909 */
1910STATIC SV*
1911S_swash_get(pTHX_ SV* swash, UV start, UV span)
1912{
1913 SV *swatch;
711a919c 1914 U8 *l, *lend, *x, *xend, *s;
979f2922 1915 STRLEN lcur, xcur, scur;
ef8f7699 1916 HV *const hv = MUTABLE_HV(SvRV(swash));
017a3ce5 1917 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1918 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1919 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1920 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1921 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802 1922 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1923 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1924 const STRLEN bits = SvUV(*bitssvp);
1925 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1926 const UV none = SvUV(*nonesvp);
1927 const UV end = start + span;
979f2922 1928
7918f24d 1929 PERL_ARGS_ASSERT_SWASH_GET;
1930
979f2922 1931 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616 1932 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1933 (UV)bits);
979f2922 1934 }
1935
1936 /* create and initialize $swatch */
979f2922 1937 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40 1938 swatch = newSV(scur);
1939 SvPOK_on(swatch);
979f2922 1940 s = (U8*)SvPVX(swatch);
1941 if (octets && none) {
0bd48802 1942 const U8* const e = s + scur;
979f2922 1943 while (s < e) {
1944 if (bits == 8)
1945 *s++ = (U8)(none & 0xff);
1946 else if (bits == 16) {
1947 *s++ = (U8)((none >> 8) & 0xff);
1948 *s++ = (U8)( none & 0xff);
1949 }
1950 else if (bits == 32) {
1951 *s++ = (U8)((none >> 24) & 0xff);
1952 *s++ = (U8)((none >> 16) & 0xff);
1953 *s++ = (U8)((none >> 8) & 0xff);
1954 *s++ = (U8)( none & 0xff);
1955 }
1956 }
1957 *s = '\0';
1958 }
1959 else {
1960 (void)memzero((U8*)s, scur + 1);
1961 }
1962 SvCUR_set(swatch, scur);
1963 s = (U8*)SvPVX(swatch);
1964
1965 /* read $swash->{LIST} */
1966 l = (U8*)SvPV(*listsvp, lcur);
1967 lend = l + lcur;
1968 while (l < lend) {
35da51f7 1969 UV min, max, val;
979f2922 1970 STRLEN numlen;
1971 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1972
0bd48802 1973 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922 1974
1975 numlen = lend - l;
1976 min = grok_hex((char *)l, &numlen, &flags, NULL);
1977 if (numlen)
1978 l += numlen;
1979 else if (nl) {
1980 l = nl + 1; /* 1 is length of "\n" */
1981 continue;
1982 }
1983 else {
1984 l = lend; /* to LIST's end at which \n is not found */
1985 break;
1986 }
1987
1988 if (isBLANK(*l)) {
1989 ++l;
1990 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1991 numlen = lend - l;
1992 max = grok_hex((char *)l, &numlen, &flags, NULL);
1993 if (numlen)
1994 l += numlen;
1995 else
1996 max = min;
1997
1998 if (octets) {
1999 if (isBLANK(*l)) {
2000 ++l;
2001 flags = PERL_SCAN_SILENT_ILLDIGIT |
2002 PERL_SCAN_DISALLOW_PREFIX;
2003 numlen = lend - l;
2004 val = grok_hex((char *)l, &numlen, &flags, NULL);
2005 if (numlen)
2006 l += numlen;
2007 else
2008 val = 0;
2009 }
2010 else {
2011 val = 0;
2012 if (typeto) {
2013 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2014 typestr, l);
2015 }
2016 }
2017 }
711a919c 2018 else
2019 val = 0; /* bits == 1, then val should be ignored */
979f2922 2020 }
2021 else {
2022 max = min;
2023 if (octets) {
2024 val = 0;
2025 if (typeto) {
2026 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2027 }
2028 }
711a919c 2029 else
2030 val = 0; /* bits == 1, then val should be ignored */
979f2922 2031 }
2032
2033 if (nl)
2034 l = nl + 1;
2035 else
2036 l = lend;
2037
2038 if (max < start)
2039 continue;
2040
2041 if (octets) {
35da51f7 2042 UV key;
979f2922 2043 if (min < start) {
2044 if (!none || val < none) {
2045 val += start - min;
2046 }
2047 min = start;
2048 }
2049 for (key = min; key <= max; key++) {
2050 STRLEN offset;
2051 if (key >= end)
2052 goto go_out_list;
2053 /* offset must be non-negative (start <= min <= key < end) */
2054 offset = octets * (key - start);
2055 if (bits == 8)
2056 s[offset] = (U8)(val & 0xff);
2057 else if (bits == 16) {
2058 s[offset ] = (U8)((val >> 8) & 0xff);
2059 s[offset + 1] = (U8)( val & 0xff);
2060 }
2061 else if (bits == 32) {
2062 s[offset ] = (U8)((val >> 24) & 0xff);
2063 s[offset + 1] = (U8)((val >> 16) & 0xff);
2064 s[offset + 2] = (U8)((val >> 8) & 0xff);
2065 s[offset + 3] = (U8)( val & 0xff);
2066 }
2067
2068 if (!none || val < none)
2069 ++val;
2070 }
2071 }
711a919c 2072 else { /* bits == 1, then val should be ignored */
35da51f7 2073 UV key;
979f2922 2074 if (min < start)
2075 min = start;
2076 for (key = min; key <= max; key++) {
0bd48802 2077 const STRLEN offset = (STRLEN)(key - start);
979f2922 2078 if (key >= end)
2079 goto go_out_list;
2080 s[offset >> 3] |= 1 << (offset & 7);
2081 }
2082 }
2083 } /* while */
2084 go_out_list:
2085
2086 /* read $swash->{EXTRAS} */
2087 x = (U8*)SvPV(*extssvp, xcur);
2088 xend = x + xcur;
2089 while (x < xend) {
2090 STRLEN namelen;
2091 U8 *namestr;
2092 SV** othersvp;
2093 HV* otherhv;
2094 STRLEN otherbits;
2095 SV **otherbitssvp, *other;
711a919c 2096 U8 *s, *o, *nl;
979f2922 2097 STRLEN slen, olen;
2098
35da51f7 2099 const U8 opc = *x++;
979f2922 2100 if (opc == '\n')
2101 continue;
2102
2103 nl = (U8*)memchr(x, '\n', xend - x);
2104
2105 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2106 if (nl) {
2107 x = nl + 1; /* 1 is length of "\n" */
2108 continue;
2109 }
2110 else {
2111 x = xend; /* to EXTRAS' end at which \n is not found */
2112 break;
2113 }
2114 }
2115
2116 namestr = x;
2117 if (nl) {
2118 namelen = nl - namestr;
2119 x = nl + 1;
2120 }
2121 else {
2122 namelen = xend - namestr;
2123 x = xend;
2124 }
2125
2126 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 2127 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 2128 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922 2129 otherbits = (STRLEN)SvUV(*otherbitssvp);
2130 if (bits < otherbits)
660a4616 2131 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922 2132
2133 /* The "other" swatch must be destroyed after. */
2134 other = swash_get(*othersvp, start, span);
2135 o = (U8*)SvPV(other, olen);
2136
2137 if (!olen)
660a4616 2138 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922 2139
2140 s = (U8*)SvPV(swatch, slen);
2141 if (bits == 1 && otherbits == 1) {
2142 if (slen != olen)
660a4616 2143 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922 2144
2145 switch (opc) {
2146 case '+':
2147 while (slen--)
2148 *s++ |= *o++;
2149 break;
2150 case '!':
2151 while (slen--)
2152 *s++ |= ~*o++;
2153 break;
2154 case '-':
2155 while (slen--)
2156 *s++ &= ~*o++;
2157 break;
2158 case '&':
2159 while (slen--)
2160 *s++ &= *o++;
2161 break;
2162 default:
2163 break;
2164 }
2165 }
711a919c 2166 else {
979f2922 2167 STRLEN otheroctets = otherbits >> 3;
2168 STRLEN offset = 0;
35da51f7 2169 U8* const send = s + slen;
979f2922 2170
2171 while (s < send) {
2172 UV otherval = 0;
2173
2174 if (otherbits == 1) {
2175 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2176 ++offset;
2177 }
2178 else {
2179 STRLEN vlen = otheroctets;
2180 otherval = *o++;
2181 while (--vlen) {
2182 otherval <<= 8;
2183 otherval |= *o++;
2184 }
2185 }
2186
711a919c 2187 if (opc == '+' && otherval)
6f207bd3 2188 NOOP; /* replace with otherval */
979f2922 2189 else if (opc == '!' && !otherval)
2190 otherval = 1;
2191 else if (opc == '-' && otherval)
2192 otherval = 0;
2193 else if (opc == '&' && !otherval)
2194 otherval = 0;
2195 else {
711a919c 2196 s += octets; /* no replacement */
979f2922 2197 continue;
2198 }
2199
2200 if (bits == 8)
2201 *s++ = (U8)( otherval & 0xff);
2202 else if (bits == 16) {
2203 *s++ = (U8)((otherval >> 8) & 0xff);
2204 *s++ = (U8)( otherval & 0xff);
2205 }
2206 else if (bits == 32) {
2207 *s++ = (U8)((otherval >> 24) & 0xff);
2208 *s++ = (U8)((otherval >> 16) & 0xff);
2209 *s++ = (U8)((otherval >> 8) & 0xff);
2210 *s++ = (U8)( otherval & 0xff);
2211 }
2212 }
2213 }
2214 sv_free(other); /* through with it! */
2215 } /* while */
2216 return swatch;
2217}
2218
0f830e0b 2219/*
87cea99e 2220=for apidoc uvchr_to_utf8
0f830e0b 2221
2222Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2223of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
2224bytes available. The return value is the pointer to the byte after the
2225end of the new character. In other words,
2226
2227 d = uvchr_to_utf8(d, uv);
2228
2229is the recommended wide native character-aware way of saying
2230
2231 *(d++) = uv;
2232
2233=cut
2234*/
2235
2236/* On ASCII machines this is normally a macro but we want a
2237 real function in case XS code wants it
2238*/
0f830e0b 2239U8 *
2240Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2241{
7918f24d 2242 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2243
0f830e0b 2244 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2245}
2246
b851fbc1 2247U8 *
2248Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2249{
7918f24d 2250 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2251
b851fbc1 2252 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2253}
2b9d42f0 2254
2255/*
87cea99e 2256=for apidoc utf8n_to_uvchr
0f830e0b 2257flags
2258
2259Returns the native character value of the first character in the string
2260C<s>
2261which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2262length, in bytes, of that character.
2263
2264Allows length and flags to be passed to low level routine.
2265
2266=cut
2267*/
2268/* On ASCII machines this is normally a macro but we want
2269 a real function in case XS code wants it
2270*/
0f830e0b 2271UV
2272Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2273U32 flags)
2274{
2275 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
7918f24d 2276
2277 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2278
0f830e0b 2279 return UNI_TO_NATIVE(uv);
2280}
2281
2282/*
87cea99e 2283=for apidoc pv_uni_display
d2cc3551 2284
2285Build to the scalar dsv a displayable version of the string spv,
2286length len, the displayable version being at most pvlim bytes long
2287(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2288
9e55ce06 2289The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2290isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 2291to display the \\[nrfta\\] as the backslashed versions (like '\n')
2292(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2293UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2294UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2295
d2cc3551 2296The pointer to the PV of the dsv is returned.
2297
2298=cut */
e6b2e755 2299char *
e1ec3a88 2300Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755 2301{
2302 int truncated = 0;
e1ec3a88 2303 const char *s, *e;
e6b2e755 2304
7918f24d 2305 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2306
76f68e9b 2307 sv_setpvs(dsv, "");
7fddd944 2308 SvUTF8_off(dsv);
e1ec3a88 2309 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2310 UV u;
a49f32c6 2311 /* This serves double duty as a flag and a character to print after
2312 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2313 */
2314 char ok = 0;
c728cb41 2315
e6b2e755 2316 if (pvlim && SvCUR(dsv) >= pvlim) {
2317 truncated++;
2318 break;
2319 }
2320 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2321 if (u < 256) {
a3b680e6 2322 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2323 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2324 switch (c) {
c728cb41 2325 case '\n':
a49f32c6 2326 ok = 'n'; break;
c728cb41 2327 case '\r':
a49f32c6 2328 ok = 'r'; break;
c728cb41 2329 case '\t':
a49f32c6 2330 ok = 't'; break;
c728cb41 2331 case '\f':
a49f32c6 2332 ok = 'f'; break;
c728cb41 2333 case '\a':
a49f32c6 2334 ok = 'a'; break;
c728cb41 2335 case '\\':
a49f32c6 2336 ok = '\\'; break;
c728cb41 2337 default: break;
2338 }
a49f32c6 2339 if (ok) {
88c9ea1e 2340 const char string = ok;
76f68e9b 2341 sv_catpvs(dsv, "\\");
5e7aa789 2342 sv_catpvn(dsv, &string, 1);
a49f32c6 2343 }
c728cb41 2344 }
00e86452 2345 /* isPRINT() is the locale-blind version. */
a49f32c6 2346 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
88c9ea1e 2347 const char string = c;
5e7aa789 2348 sv_catpvn(dsv, &string, 1);
a49f32c6 2349 ok = 1;
0a2ef054 2350 }
c728cb41 2351 }
2352 if (!ok)
9e55ce06 2353 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 2354 }
2355 if (truncated)
396482e1 2356 sv_catpvs(dsv, "...");
e6b2e755 2357
2358 return SvPVX(dsv);
2359}
2b9d42f0 2360
d2cc3551 2361/*
87cea99e 2362=for apidoc sv_uni_display
d2cc3551 2363
2364Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2365the displayable version being at most pvlim bytes long
d2cc3551 2366(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2367
2368The flags argument is as in pv_uni_display().
2369
d2cc3551 2370The pointer to the PV of the dsv is returned.
2371
d4c19fe8 2372=cut
2373*/
e6b2e755 2374char *
2375Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2376{
7918f24d 2377 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2378
cfd0369c 2379 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2380 SvCUR(ssv), pvlim, flags);
701a277b 2381}
2382
d2cc3551 2383/*
87cea99e 2384=for apidoc ibcmp_utf8
d2cc3551 2385
2386Return true if the strings s1 and s2 differ case-insensitively, false
2387if not (if they are equal case-insensitively). If u1 is true, the
2388string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 2389the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2390are false, the respective string is assumed to be in native 8-bit
2391encoding.
2392
2393If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2394in there (they will point at the beginning of the I<next> character).
2395If the pointers behind pe1 or pe2 are non-NULL, they are the end
2396pointers beyond which scanning will not continue under any
4cdaeff7 2397circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77 2398s2+l2 will be used as goal end pointers that will also stop the scan,
2399and which qualify towards defining a successful match: all the scans
2400that define an explicit length must reach their goal pointers for
2401a match to succeed.
d2cc3551 2402
2403For case-insensitiveness, the "casefolding" of Unicode is used
2404instead of upper/lowercasing both the characters, see
2405http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2406
2407=cut */
701a277b 2408I32
d07ddd77 2409Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2410{
97aff369 2411 dVAR;
e1ec3a88 2412 register const U8 *p1 = (const U8*)s1;
2413 register const U8 *p2 = (const U8*)s2;
cbbf8932 2414 register const U8 *f1 = NULL;
2f73348c 2415 register const U8 *f2 = NULL;
cbbf8932 2416 register U8 *e1 = NULL;
2417 register U8 *q1 = NULL;
2418 register U8 *e2 = NULL;
2419 register U8 *q2 = NULL;
d07ddd77 2420 STRLEN n1 = 0, n2 = 0;
89ebb4a3 2421 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2422 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8 2423 U8 natbuf[1+1];
2424 STRLEN foldlen1, foldlen2;
d07ddd77 2425 bool match;
7918f24d 2426
2427 PERL_ARGS_ASSERT_IBCMP_UTF8;
332ddc25 2428
d07ddd77 2429 if (pe1)
2430 e1 = *(U8**)pe1;
a0a388a1 2431 /* assert(e1 || l1); */
e1ec3a88 2432 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2433 f1 = (const U8*)s1 + l1;
d07ddd77 2434 if (pe2)
2435 e2 = *(U8**)pe2;
a0a388a1 2436 /* assert(e2 || l2); */
e1ec3a88 2437 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2438 f2 = (const U8*)s2 + l2;
d07ddd77 2439
a0a388a1 2440 /* This shouldn't happen. However, putting an assert() there makes some
2441 * tests fail. */
2442 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
d07ddd77 2443 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2444 return 1; /* mismatch; possible infinite loop or false positive */
2445
a6872d42 2446 if (!u1 || !u2)
2447 natbuf[1] = 0; /* Need to terminate the buffer. */
2448
d07ddd77 2449 while ((e1 == 0 || p1 < e1) &&
2450 (f1 == 0 || p1 < f1) &&
2451 (e2 == 0 || p2 < e2) &&
2452 (f2 == 0 || p2 < f2)) {
2453 if (n1 == 0) {
d7f013c8 2454 if (u1)
2455 to_utf8_fold(p1, foldbuf1, &foldlen1);
2456 else {
809e8e66 2457 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8 2458 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2459 }
2460 q1 = foldbuf1;
d07ddd77 2461 n1 = foldlen1;
332ddc25 2462 }
d07ddd77 2463 if (n2 == 0) {
d7f013c8 2464 if (u2)
2465 to_utf8_fold(p2, foldbuf2, &foldlen2);
2466 else {
809e8e66 2467 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8 2468 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2469 }
2470 q2 = foldbuf2;
d07ddd77 2471 n2 = foldlen2;
332ddc25 2472 }
d07ddd77 2473 while (n1 && n2) {
2474 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2475 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2476 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2477 return 1; /* mismatch */
d07ddd77 2478 n1 -= UTF8SKIP(q1);
d7f013c8 2479 q1 += UTF8SKIP(q1);
d07ddd77 2480 n2 -= UTF8SKIP(q2);
d7f013c8 2481 q2 += UTF8SKIP(q2);
701a277b 2482 }
d07ddd77 2483 if (n1 == 0)
d7f013c8 2484 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2485 if (n2 == 0)
d7f013c8 2486 p2 += u2 ? UTF8SKIP(p2) : 1;
2487
d2cc3551 2488 }
5469e704 2489
d07ddd77 2490 /* A match is defined by all the scans that specified
2491 * an explicit length reaching their final goals. */
2492 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 2493
2494 if (match) {
d07ddd77 2495 if (pe1)
2496 *pe1 = (char*)p1;
2497 if (pe2)
2498 *pe2 = (char*)p2;
5469e704 2499 }
2500
2501 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2502}
701a277b 2503
a49f32c6 2504/*
2505 * Local variables:
2506 * c-indentation-style: bsd
2507 * c-basic-offset: 4
2508 * indent-tabs-mode: t
2509 * End:
2510 *
37442d52 2511 * ex: set ts=8 sts=4 sw=4 noet:
2512 */