Add a new file, mathoms.c, to hold old code kept around for binary
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
af3babe4 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3 5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3 26#include "perl.h"
27
27da23d5 28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7 31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29 34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7 37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1 47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1 53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485 57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1 68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3 87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6 90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6 95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6 98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463 102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 104 return d;
105 }
106 if (uv < 0x10000) {
eb160463 107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 110 return d;
111 }
112 if (uv < 0x200000) {
eb160463 113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463 120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463 128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3 138#endif
139 {
eb160463 140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463 151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
b851fbc1 169
170U8 *
171Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172{
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174}
9041c2e3 175
646ca15d 176/*
177
178Tests if some arbitrary number of bytes begins in a valid UTF-8
179character. Note that an INVARIANT (i.e. ASCII) character is a valid
180UTF-8 character. The actual number of bytes in the UTF-8 character
181will be returned if it is valid, otherwise 0.
182
183This is the "slow" version as opposed to the "fast" version which is
184the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
185difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
186or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187you should use the _slow(). In practice this means that the _slow()
188will be used very rarely, since the maximum Unicode code point (as of
189Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
190the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
191five bytes or more.
192
193=cut */
c053b435 194STATIC STRLEN
646ca15d 195S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
196{
197 U8 u = *s;
198 STRLEN slen;
199 UV uv, ouv;
200
201 if (UTF8_IS_INVARIANT(u))
202 return 1;
203
204 if (!UTF8_IS_START(u))
205 return 0;
206
207 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
208 return 0;
209
210 slen = len - 1;
211 s++;
77263263 212#ifdef EBCDIC
213 u = NATIVE_TO_UTF(u);
214#endif
646ca15d 215 u &= UTF_START_MASK(len);
216 uv = u;
217 ouv = uv;
218 while (slen--) {
219 if (!UTF8_IS_CONTINUATION(*s))
220 return 0;
221 uv = UTF8_ACCUMULATE(uv, *s);
222 if (uv < ouv)
223 return 0;
224 ouv = uv;
225 s++;
226 }
227
228 if ((STRLEN)UNISKIP(uv) < len)
229 return 0;
230
231 return len;
232}
9041c2e3 233
234/*
7fc63493 235=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 236
5da9da9e 237Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01 238character. Note that an INVARIANT (i.e. ASCII) character is a valid
239UTF-8 character. The actual number of bytes in the UTF-8 character
240will be returned if it is valid, otherwise 0.
9041c2e3 241
82686b01 242=cut */
067a85ef 243STRLEN
7fc63493 244Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 245{
44f8325f 246 const STRLEN len = UTF8SKIP(s);
3b0fc154 247#ifdef IS_UTF8_CHAR
768c67ee 248 if (IS_UTF8_CHAR_FAST(len))
3b0fc154 249 return IS_UTF8_CHAR(s, len) ? len : 0;
250#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 251 return is_utf8_char_slow(s, len);
386d01d6 252}
253
6662521e 254/*
7fc63493 255=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 256
c9ada85f 257Returns true if first C<len> bytes of the given string form a valid
1e54db1a 258UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
259not mean 'a string that contains code points above 0x7F encoded in UTF-8'
260because a valid ASCII string is a valid UTF-8 string.
6662521e 261
768c67ee 262See also is_utf8_string_loclen() and is_utf8_string_loc().
263
6662521e 264=cut
265*/
266
8e84507e 267bool
7fc63493 268Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 269{
7fc63493 270 const U8* x = s;
271 const U8* send;
067a85ef 272
44f8325f 273 if (!len)
e1ec3a88 274 len = strlen((const char *)s);
1aa99e6b 275 send = s + len;
276
6662521e 277 while (x < send) {
a3b680e6 278 STRLEN c;
1acdb0da 279 /* Inline the easy bits of is_utf8_char() here for speed... */
280 if (UTF8_IS_INVARIANT(*x))
281 c = 1;
282 else if (!UTF8_IS_START(*x))
768c67ee 283 goto out;
1acdb0da 284 else {
285 /* ... and call is_utf8_char() only if really needed. */
646ca15d 286#ifdef IS_UTF8_CHAR
287 c = UTF8SKIP(x);
768c67ee 288 if (IS_UTF8_CHAR_FAST(c)) {
289 if (!IS_UTF8_CHAR(x, c))
290 goto out;
291 } else if (!is_utf8_char_slow(x, c))
292 goto out;
646ca15d 293#else
294 c = is_utf8_char(x);
295#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 296 if (!c)
768c67ee 297 goto out;
1acdb0da 298 }
6662521e 299 x += c;
6662521e 300 }
768c67ee 301
302 out:
60006e79 303 if (x != send)
304 return FALSE;
067a85ef 305
306 return TRUE;
6662521e 307}
308
67e989fb 309/*
768c67ee 310=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 311
768c67ee 312Like is_ut8_string() but stores the location of the failure (in the
313case of "utf8ness failure") or the location s+len (in the case of
314"utf8ness success") in the C<ep>, and the number of UTF-8
315encoded characters in the C<el>.
316
317See also is_utf8_string_loc() and is_utf8_string().
81cd54e3 318
319=cut
320*/
321
322bool
768c67ee 323Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 324{
7fc63493 325 const U8* x = s;
326 const U8* send;
81cd54e3 327 STRLEN c;
328
44f8325f 329 if (!len)
e1ec3a88 330 len = strlen((const char *)s);
81cd54e3 331 send = s + len;
768c67ee 332 if (el)
333 *el = 0;
81cd54e3 334
335 while (x < send) {
336 /* Inline the easy bits of is_utf8_char() here for speed... */
337 if (UTF8_IS_INVARIANT(*x))
768c67ee 338 c = 1;
339 else if (!UTF8_IS_START(*x))
340 goto out;
81cd54e3 341 else {
768c67ee 342 /* ... and call is_utf8_char() only if really needed. */
343#ifdef IS_UTF8_CHAR
344 c = UTF8SKIP(x);
345 if (IS_UTF8_CHAR_FAST(c)) {
346 if (!IS_UTF8_CHAR(x, c))
347 c = 0;
348 } else
349 c = is_utf8_char_slow(x, c);
350#else
351 c = is_utf8_char(x);
352#endif /* #ifdef IS_UTF8_CHAR */
353 if (!c)
354 goto out;
81cd54e3 355 }
768c67ee 356 x += c;
357 if (el)
358 (*el)++;
81cd54e3 359 }
768c67ee 360
361 out:
362 if (ep)
363 *ep = x;
364 if (x != send)
81cd54e3 365 return FALSE;
81cd54e3 366
367 return TRUE;
368}
369
370/*
768c67ee 371=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
372
373Like is_ut8_string() but stores the location of the failure (in the
374case of "utf8ness failure") or the location s+len (in the case of
375"utf8ness success") in the C<ep>.
376
377See also is_utf8_string_loclen() and is_utf8_string().
378
379=cut
380*/
381
382bool
383Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep)
384{
385 return is_utf8_string_loclen(s, len, ep, 0);
386}
387
388/*
7fc63493 389=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 390
9041c2e3 391Bottom level UTF-8 decode routine.
392Returns the unicode code point value of the first character in the string C<s>
1e54db1a 393which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 394C<retlen> will be set to the length, in bytes, of that character.
67e989fb 395
1e54db1a 396If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880 397is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
398it is assumed that the caller will raise a warning, and this function
28d3d195 399will silently just set C<retlen> to C<-1> and return zero. If the
400C<flags> does not contain UTF8_CHECK_ONLY, warnings about
401malformations will be given, C<retlen> will be set to the expected
402length of the UTF-8 character in bytes, and zero will be returned.
403
404The C<flags> can also contain various flags to allow deviations from
405the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 406
9041c2e3 407Most code should use utf8_to_uvchr() rather than call this directly.
408
37607a96 409=cut
410*/
67e989fb 411
a0ed51b3 412UV
7fc63493 413Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 414{
7fc63493 415 const U8 *s0 = s;
9c5ffd7c 416 UV uv = *s, ouv = 0;
ba210ebe 417 STRLEN len = 1;
7fc63493 418 const bool dowarn = ckWARN_d(WARN_UTF8);
419 const UV startbyte = *s;
ba210ebe 420 STRLEN expectlen = 0;
a0dbb045 421 U32 warning = 0;
422
423/* This list is a superset of the UTF8_ALLOW_XXX. */
424
425#define UTF8_WARN_EMPTY 1
426#define UTF8_WARN_CONTINUATION 2
427#define UTF8_WARN_NON_CONTINUATION 3
428#define UTF8_WARN_FE_FF 4
429#define UTF8_WARN_SHORT 5
430#define UTF8_WARN_OVERFLOW 6
431#define UTF8_WARN_SURROGATE 7
c867b360 432#define UTF8_WARN_LONG 8
433#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 434
435 if (curlen == 0 &&
436 !(flags & UTF8_ALLOW_EMPTY)) {
437 warning = UTF8_WARN_EMPTY;
0c443dc2 438 goto malformed;
439 }
440
1d72bdf6 441 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 442 if (retlen)
443 *retlen = 1;
c4d5f83a 444 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 445 }
67e989fb 446
421a8bf2 447 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 448 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 449 warning = UTF8_WARN_CONTINUATION;
ba210ebe 450 goto malformed;
451 }
452
421a8bf2 453 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 454 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 455 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 456 goto malformed;
457 }
9041c2e3 458
1d72bdf6 459#ifdef EBCDIC
75383841 460 uv = NATIVE_TO_UTF(uv);
1d72bdf6 461#else
fcc8fcf6 462 if ((uv == 0xfe || uv == 0xff) &&
463 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 464 warning = UTF8_WARN_FE_FF;
ba210ebe 465 goto malformed;
a0ed51b3 466 }
1d72bdf6 467#endif
468
ba210ebe 469 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
470 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
471 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
472 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 473#ifdef EBCDIC
474 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
475 else { len = 7; uv &= 0x01; }
476#else
ba210ebe 477 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
478 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 479 else { len = 13; uv = 0; } /* whoa! */
480#endif
481
a0ed51b3 482 if (retlen)
483 *retlen = len;
9041c2e3 484
ba210ebe 485 expectlen = len;
486
fcc8fcf6 487 if ((curlen < expectlen) &&
488 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 489 warning = UTF8_WARN_SHORT;
ba210ebe 490 goto malformed;
491 }
492
493 len--;
a0ed51b3 494 s++;
ba210ebe 495 ouv = uv;
496
a0ed51b3 497 while (len--) {
421a8bf2 498 if (!UTF8_IS_CONTINUATION(*s) &&
499 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 500 s--;
501 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 502 goto malformed;
a0ed51b3 503 }
504 else
8850bf83 505 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 506 if (!(uv > ouv)) {
507 /* These cannot be allowed. */
508 if (uv == ouv) {
75dbc644 509 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 510 warning = UTF8_WARN_LONG;
511 goto malformed;
512 }
513 }
514 else { /* uv < ouv */
515 /* This cannot be allowed. */
516 warning = UTF8_WARN_OVERFLOW;
517 goto malformed;
518 }
ba210ebe 519 }
520 s++;
521 ouv = uv;
522 }
523
421a8bf2 524 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 525 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 526 warning = UTF8_WARN_SURROGATE;
ba210ebe 527 goto malformed;
eb160463 528 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 529 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 530 warning = UTF8_WARN_LONG;
ba210ebe 531 goto malformed;
421a8bf2 532 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 533 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 534 warning = UTF8_WARN_FFFF;
a9917092 535 goto malformed;
a0ed51b3 536 }
ba210ebe 537
a0ed51b3 538 return uv;
ba210ebe 539
540malformed:
541
fcc8fcf6 542 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 543 if (retlen)
cc366d4b 544 *retlen = -1;
ba210ebe 545 return 0;
546 }
547
a0dbb045 548 if (dowarn) {
44f8325f 549 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
a0dbb045 550
551 switch (warning) {
552 case 0: /* Intentionally empty. */ break;
553 case UTF8_WARN_EMPTY:
54667de8 554 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045 555 break;
556 case UTF8_WARN_CONTINUATION:
097fb8e2 557 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 558 break;
559 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 560 if (s == s0)
561 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
562 (UV)s[1], startbyte);
551405c4 563 else {
564 const int len = (int)(s-s0);
097fb8e2 565 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4 566 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
567 }
568
a0dbb045 569 break;
570 case UTF8_WARN_FE_FF:
571 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
572 break;
573 case UTF8_WARN_SHORT:
097fb8e2 574 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 575 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 576 expectlen = curlen; /* distance for caller to skip */
a0dbb045 577 break;
578 case UTF8_WARN_OVERFLOW:
097fb8e2 579 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
580 ouv, *s, startbyte);
a0dbb045 581 break;
582 case UTF8_WARN_SURROGATE:
583 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
584 break;
a0dbb045 585 case UTF8_WARN_LONG:
097fb8e2 586 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 587 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 588 break;
589 case UTF8_WARN_FFFF:
590 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
591 break;
592 default:
54667de8 593 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045 594 break;
595 }
596
597 if (warning) {
44f8325f 598 const char * const s = SvPVX_const(sv);
a0dbb045 599
600 if (PL_op)
9014280d 601 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 602 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 603 else
9014280d 604 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 605 }
606 }
607
ba210ebe 608 if (retlen)
28d3d195 609 *retlen = expectlen ? expectlen : len;
ba210ebe 610
28d3d195 611 return 0;
a0ed51b3 612}
613
8e84507e 614/*
7fc63493 615=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3 616
617Returns the native character value of the first character in the string C<s>
1e54db1a 618which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 619length, in bytes, of that character.
620
1e54db1a 621If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3 622returned and retlen is set, if possible, to -1.
623
624=cut
625*/
626
627UV
7fc63493 628Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 629{
89ebb4a3 630 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 631 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 632}
633
634/*
7fc63493 635=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3 636
637Returns the Unicode code point of the first character in the string C<s>
1e54db1a 638which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 639length, in bytes, of that character.
640
641This function should only be used when returned UV is considered
642an index into the Unicode semantic tables (e.g. swashes).
643
1e54db1a 644If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 645returned and retlen is set, if possible, to -1.
8e84507e 646
647=cut
648*/
649
650UV
7fc63493 651Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 652{
9041c2e3 653 /* Call the low level routine asking for checks */
89ebb4a3 654 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 655 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 656}
657
b76347f2 658/*
35a4481c 659=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2 660
661Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 662Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
663up past C<e>, croaks.
b76347f2 664
665=cut
666*/
667
668STRLEN
35a4481c 669Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 670{
671 STRLEN len = 0;
672
8850bf83 673 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
674 * the bitops (especially ~) can create illegal UTF-8.
675 * In other words: in Perl UTF-8 is not just for Unicode. */
676
a3b680e6 677 if (e < s)
678 goto warn_and_return;
b76347f2 679 while (s < e) {
4373e329 680 const U8 t = UTF8SKIP(s);
901b21bf 681 if (e - s < t) {
a3b680e6 682 warn_and_return:
901b21bf 683 if (ckWARN_d(WARN_UTF8)) {
684 if (PL_op)
685 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 686 "%s in %s", unees, OP_DESC(PL_op));
901b21bf 687 else
688 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
689 }
690 return len;
691 }
b76347f2 692 s += t;
693 len++;
694 }
695
696 return len;
697}
698
b06226ff 699/*
35a4481c 700=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 701
1e54db1a 702Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff 703and C<b>.
704
705WARNING: use only if you *know* that the pointers point inside the
706same UTF-8 buffer.
707
37607a96 708=cut
709*/
a0ed51b3 710
02eb7b47 711IV
35a4481c 712Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 713{
02eb7b47 714 IV off = 0;
715
8850bf83 716 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
717 * the bitops (especially ~) can create illegal UTF-8.
718 * In other words: in Perl UTF-8 is not just for Unicode. */
719
a0ed51b3 720 if (a < b) {
721 while (a < b) {
35a4481c 722 const U8 c = UTF8SKIP(a);
a3b680e6 723 if (b - a < c)
724 goto warn_and_return;
02eb7b47 725 a += c;
a0ed51b3 726 off--;
727 }
728 }
729 else {
730 while (b < a) {
4373e329 731 const U8 c = UTF8SKIP(b);
02eb7b47 732
901b21bf 733 if (a - b < c) {
a3b680e6 734 warn_and_return:
901b21bf 735 if (ckWARN_d(WARN_UTF8)) {
736 if (PL_op)
737 Perl_warner(aTHX_ packWARN(WARN_UTF8),
738 "%s in %s", unees, OP_DESC(PL_op));
739 else
740 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
741 }
742 return off;
743 }
02eb7b47 744 b += c;
a0ed51b3 745 off++;
746 }
747 }
02eb7b47 748
a0ed51b3 749 return off;
750}
751
b06226ff 752/*
37607a96 753=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 754
8850bf83 755Return the UTF-8 pointer C<s> displaced by C<off> characters, either
756forward or backward.
b06226ff 757
758WARNING: do not use the following unless you *know* C<off> is within
8850bf83 759the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
760on the first byte of character or just after the last byte of a character.
b06226ff 761
37607a96 762=cut
763*/
a0ed51b3 764
765U8 *
4373e329 766Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 767{
8850bf83 768 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
769 * the bitops (especially ~) can create illegal UTF-8.
770 * In other words: in Perl UTF-8 is not just for Unicode. */
771
a0ed51b3 772 if (off >= 0) {
773 while (off--)
774 s += UTF8SKIP(s);
775 }
776 else {
777 while (off++) {
778 s--;
8850bf83 779 while (UTF8_IS_CONTINUATION(*s))
780 s--;
a0ed51b3 781 }
782 }
4373e329 783 return (U8 *)s;
a0ed51b3 784}
785
6940069f 786/*
eebe1485 787=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 788
1e54db1a 789Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53 790Unlike C<bytes_to_utf8>, this over-writes the original string, and
791updates len to contain the new length.
67e989fb 792Returns zero on failure, setting C<len> to -1.
6940069f 793
794=cut
795*/
796
797U8 *
37607a96 798Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 799{
6940069f 800 U8 *send;
801 U8 *d;
dcad2880 802 U8 *save = s;
246fae53 803
1e54db1a 804 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880 805 for (send = s + *len; s < send; ) {
806 U8 c = *s++;
807
1d72bdf6 808 if (!UTF8_IS_INVARIANT(c) &&
809 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
810 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 811 *len = -1;
812 return 0;
813 }
246fae53 814 }
dcad2880 815
816 d = s = save;
6940069f 817 while (s < send) {
ed646e6e 818 STRLEN ulen;
9041c2e3 819 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 820 s += ulen;
6940069f 821 }
822 *d = '\0';
246fae53 823 *len = d - save;
6940069f 824 return save;
825}
826
827/*
e1ec3a88 828=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 829
1e54db1a 830Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 831Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 832the newly-created string, and updates C<len> to contain the new
833length. Returns the original string if no conversion occurs, C<len>
834is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8350 if C<s> is converted or contains all 7bit characters.
f9a63242 836
37607a96 837=cut
838*/
f9a63242 839
840U8 *
e1ec3a88 841Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 842{
f9a63242 843 U8 *d;
e1ec3a88 844 const U8 *start = s;
845 const U8 *send;
f9a63242 846 I32 count = 0;
847
848 if (!*is_utf8)
73d840c0 849 return (U8 *)start;
f9a63242 850
1e54db1a 851 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 852 for (send = s + *len; s < send;) {
e1ec3a88 853 U8 c = *s++;
1d72bdf6 854 if (!UTF8_IS_INVARIANT(c)) {
db42d148 855 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
856 (c = *s++) && UTF8_IS_CONTINUATION(c))
857 count++;
858 else
73d840c0 859 return (U8 *)start;
db42d148 860 }
f9a63242 861 }
862
863 *is_utf8 = 0;
864
a02a5408 865 Newxz(d, (*len) - count + 1, U8);
ef9edfd0 866 s = start; start = d;
f9a63242 867 while (s < send) {
868 U8 c = *s++;
c4d5f83a 869 if (!UTF8_IS_INVARIANT(c)) {
870 /* Then it is two-byte encoded */
871 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
872 c = ASCII_TO_NATIVE(c);
873 }
874 *d++ = c;
f9a63242 875 }
876 *d = '\0';
877 *len = d - start;
73d840c0 878 return (U8 *)start;
f9a63242 879}
880
881/*
35a4481c 882=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 883
1e54db1a 884Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e 885Returns a pointer to the newly-created string, and sets C<len> to
886reflect the new length.
6940069f 887
1e54db1a 888If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f 889see sv_recode_to_utf8().
890
497711e7 891=cut
6940069f 892*/
893
894U8*
35a4481c 895Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 896{
35a4481c 897 const U8 * const send = s + (*len);
6940069f 898 U8 *d;
899 U8 *dst;
6940069f 900
a02a5408 901 Newxz(d, (*len) * 2 + 1, U8);
6940069f 902 dst = d;
903
904 while (s < send) {
35a4481c 905 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 906 if (UNI_IS_INVARIANT(uv))
eb160463 907 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 908 else {
eb160463 909 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
910 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f 911 }
912 }
913 *d = '\0';
6662521e 914 *len = d-dst;
6940069f 915 return dst;
916}
917
a0ed51b3 918/*
dea0fc0b 919 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 920 *
921 * Destination must be pre-extended to 3/2 source. Do not use in-place.
922 * We optimize for native, for obvious reasons. */
923
924U8*
dea0fc0b 925Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 926{
dea0fc0b 927 U8* pend;
928 U8* dstart = d;
929
1de9afcd 930 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
931 d[0] = 0;
932 *newlen = 1;
933 return d;
934 }
935
dea0fc0b 936 if (bytelen & 1)
014ead4b 937 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b 938
939 pend = p + bytelen;
940
a0ed51b3 941 while (p < pend) {
dea0fc0b 942 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
943 p += 2;
a0ed51b3 944 if (uv < 0x80) {
eb160463 945 *d++ = (U8)uv;
a0ed51b3 946 continue;
947 }
948 if (uv < 0x800) {
eb160463 949 *d++ = (U8)(( uv >> 6) | 0xc0);
950 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 951 continue;
952 }
953 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e 954 UV low = (p[0] << 8) + p[1];
955 p += 2;
dea0fc0b 956 if (low < 0xdc00 || low >= 0xdfff)
957 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 958 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
959 }
960 if (uv < 0x10000) {
eb160463 961 *d++ = (U8)(( uv >> 12) | 0xe0);
962 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
963 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 964 continue;
965 }
966 else {
eb160463 967 *d++ = (U8)(( uv >> 18) | 0xf0);
968 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
969 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
970 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 971 continue;
972 }
973 }
dea0fc0b 974 *newlen = d - dstart;
a0ed51b3 975 return d;
976}
977
978/* Note: this one is slightly destructive of the source. */
979
980U8*
dea0fc0b 981Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 982{
983 U8* s = (U8*)p;
984 U8* send = s + bytelen;
985 while (s < send) {
986 U8 tmp = s[0];
987 s[0] = s[1];
988 s[1] = tmp;
989 s += 2;
990 }
dea0fc0b 991 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 992}
993
994/* for now these are all defined (inefficiently) in terms of the utf8 versions */
995
996bool
84afefe6 997Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 998{
89ebb4a3 999 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1000 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1001 return is_utf8_alnum(tmpbuf);
1002}
1003
1004bool
84afefe6 1005Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1006{
89ebb4a3 1007 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1008 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1009 return is_utf8_alnumc(tmpbuf);
1010}
1011
1012bool
84afefe6 1013Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1014{
89ebb4a3 1015 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1016 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1017 return is_utf8_idfirst(tmpbuf);
1018}
1019
1020bool
84afefe6 1021Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1022{
89ebb4a3 1023 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1024 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1025 return is_utf8_alpha(tmpbuf);
1026}
1027
1028bool
84afefe6 1029Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1030{
89ebb4a3 1031 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1032 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1033 return is_utf8_ascii(tmpbuf);
1034}
1035
1036bool
84afefe6 1037Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1038{
89ebb4a3 1039 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1040 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1041 return is_utf8_space(tmpbuf);
1042}
1043
1044bool
84afefe6 1045Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1046{
89ebb4a3 1047 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1048 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1049 return is_utf8_digit(tmpbuf);
1050}
1051
1052bool
84afefe6 1053Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1054{
89ebb4a3 1055 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1056 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1057 return is_utf8_upper(tmpbuf);
1058}
1059
1060bool
84afefe6 1061Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1062{
89ebb4a3 1063 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1064 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1065 return is_utf8_lower(tmpbuf);
1066}
1067
1068bool
84afefe6 1069Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1070{
89ebb4a3 1071 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1072 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1073 return is_utf8_cntrl(tmpbuf);
1074}
1075
1076bool
84afefe6 1077Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1078{
89ebb4a3 1079 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1080 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1081 return is_utf8_graph(tmpbuf);
1082}
1083
1084bool
84afefe6 1085Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1086{
89ebb4a3 1087 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1088 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1089 return is_utf8_print(tmpbuf);
1090}
1091
b8c5462f 1092bool
84afefe6 1093Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1094{
89ebb4a3 1095 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1096 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1097 return is_utf8_punct(tmpbuf);
1098}
1099
4d61ec05 1100bool
84afefe6 1101Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1102{
89ebb4a3 1103 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1104 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1105 return is_utf8_xdigit(tmpbuf);
1106}
1107
84afefe6 1108UV
1109Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1110{
0ebc6274 1111 uvchr_to_utf8(p, c);
1112 return to_utf8_upper(p, p, lenp);
a0ed51b3 1113}
1114
84afefe6 1115UV
1116Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1117{
0ebc6274 1118 uvchr_to_utf8(p, c);
1119 return to_utf8_title(p, p, lenp);
a0ed51b3 1120}
1121
84afefe6 1122UV
1123Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1124{
0ebc6274 1125 uvchr_to_utf8(p, c);
1126 return to_utf8_lower(p, p, lenp);
a0ed51b3 1127}
1128
84afefe6 1129UV
1130Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1131{
0ebc6274 1132 uvchr_to_utf8(p, c);
1133 return to_utf8_fold(p, p, lenp);
84afefe6 1134}
1135
a0ed51b3 1136/* for now these all assume no locale info available for Unicode > 255 */
1137
1138bool
84afefe6 1139Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1140{
1141 return is_uni_alnum(c); /* XXX no locale support yet */
1142}
1143
1144bool
84afefe6 1145Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1146{
1147 return is_uni_alnumc(c); /* XXX no locale support yet */
1148}
1149
1150bool
84afefe6 1151Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1152{
1153 return is_uni_idfirst(c); /* XXX no locale support yet */
1154}
1155
1156bool
84afefe6 1157Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1158{
1159 return is_uni_alpha(c); /* XXX no locale support yet */
1160}
1161
1162bool
84afefe6 1163Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1164{
1165 return is_uni_ascii(c); /* XXX no locale support yet */
1166}
1167
1168bool
84afefe6 1169Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1170{
1171 return is_uni_space(c); /* XXX no locale support yet */
1172}
1173
1174bool
84afefe6 1175Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1176{
1177 return is_uni_digit(c); /* XXX no locale support yet */
1178}
1179
1180bool
84afefe6 1181Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1182{
1183 return is_uni_upper(c); /* XXX no locale support yet */
1184}
1185
1186bool
84afefe6 1187Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1188{
1189 return is_uni_lower(c); /* XXX no locale support yet */
1190}
1191
1192bool
84afefe6 1193Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1194{
1195 return is_uni_cntrl(c); /* XXX no locale support yet */
1196}
1197
1198bool
84afefe6 1199Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1200{
1201 return is_uni_graph(c); /* XXX no locale support yet */
1202}
1203
1204bool
84afefe6 1205Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1206{
1207 return is_uni_print(c); /* XXX no locale support yet */
1208}
1209
b8c5462f 1210bool
84afefe6 1211Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1212{
1213 return is_uni_punct(c); /* XXX no locale support yet */
1214}
1215
4d61ec05 1216bool
84afefe6 1217Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1218{
1219 return is_uni_xdigit(c); /* XXX no locale support yet */
1220}
1221
b7ac61fa 1222U32
1223Perl_to_uni_upper_lc(pTHX_ U32 c)
1224{
ee099d14 1225 /* XXX returns only the first character -- do not use XXX */
1226 /* XXX no locale support yet */
1227 STRLEN len;
89ebb4a3 1228 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1229 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1230}
1231
1232U32
1233Perl_to_uni_title_lc(pTHX_ U32 c)
1234{
ee099d14 1235 /* XXX returns only the first character XXX -- do not use XXX */
1236 /* XXX no locale support yet */
1237 STRLEN len;
89ebb4a3 1238 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1239 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1240}
1241
1242U32
1243Perl_to_uni_lower_lc(pTHX_ U32 c)
1244{
ee099d14 1245 /* XXX returns only the first character -- do not use XXX */
1246 /* XXX no locale support yet */
1247 STRLEN len;
89ebb4a3 1248 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1249 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1250}
1251
a0ed51b3 1252bool
7fc63493 1253Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1254{
386d01d6 1255 if (!is_utf8_char(p))
1256 return FALSE;
a0ed51b3 1257 if (!PL_utf8_alnum)
289d4f09 1258 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1259 * descendant of isalnum(3), in other words, it doesn't
1260 * contain the '_'. --jhi */
1261 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1262 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1263/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1264#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1265 if (!PL_utf8_alnum)
1266 PL_utf8_alnum = swash_init("utf8", "",
1267 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1268 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1269#endif
1270}
1271
1272bool
7fc63493 1273Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1274{
386d01d6 1275 if (!is_utf8_char(p))
1276 return FALSE;
b8c5462f 1277 if (!PL_utf8_alnum)
1278 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1279 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1280/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1281#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1282 if (!PL_utf8_alnum)
1283 PL_utf8_alnum = swash_init("utf8", "",
1284 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1285 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1286#endif
1287}
1288
1289bool
7fc63493 1290Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1291{
82686b01 1292 if (*p == '_')
1293 return TRUE;
1294 if (!is_utf8_char(p))
1295 return FALSE;
1296 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1297 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1298 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01 1299}
1300
1301bool
7fc63493 1302Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1303{
1304 if (*p == '_')
1305 return TRUE;
1306 if (!is_utf8_char(p))
1307 return FALSE;
1308 if (!PL_utf8_idcont)
1309 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1310 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3 1311}
1312
1313bool
7fc63493 1314Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1315{
386d01d6 1316 if (!is_utf8_char(p))
1317 return FALSE;
a0ed51b3 1318 if (!PL_utf8_alpha)
e24b16f9 1319 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1320 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3 1321}
1322
1323bool
7fc63493 1324Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1325{
386d01d6 1326 if (!is_utf8_char(p))
1327 return FALSE;
b8c5462f 1328 if (!PL_utf8_ascii)
1329 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1330 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f 1331}
1332
1333bool
7fc63493 1334Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1335{
386d01d6 1336 if (!is_utf8_char(p))
1337 return FALSE;
a0ed51b3 1338 if (!PL_utf8_space)
3bec3564 1339 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1340 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3 1341}
1342
1343bool
7fc63493 1344Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1345{
386d01d6 1346 if (!is_utf8_char(p))
1347 return FALSE;
a0ed51b3 1348 if (!PL_utf8_digit)
e24b16f9 1349 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1350 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3 1351}
1352
1353bool
7fc63493 1354Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1355{
386d01d6 1356 if (!is_utf8_char(p))
1357 return FALSE;
a0ed51b3 1358 if (!PL_utf8_upper)
c65e4d19 1359 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1360 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3 1361}
1362
1363bool
7fc63493 1364Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1365{
386d01d6 1366 if (!is_utf8_char(p))
1367 return FALSE;
a0ed51b3 1368 if (!PL_utf8_lower)
c65e4d19 1369 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1370 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3 1371}
1372
1373bool
7fc63493 1374Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1375{
386d01d6 1376 if (!is_utf8_char(p))
1377 return FALSE;
b8c5462f 1378 if (!PL_utf8_cntrl)
1379 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1380 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f 1381}
1382
1383bool
7fc63493 1384Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1385{
386d01d6 1386 if (!is_utf8_char(p))
1387 return FALSE;
b8c5462f 1388 if (!PL_utf8_graph)
1389 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1390 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f 1391}
1392
1393bool
7fc63493 1394Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1395{
386d01d6 1396 if (!is_utf8_char(p))
1397 return FALSE;
a0ed51b3 1398 if (!PL_utf8_print)
e24b16f9 1399 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1400 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3 1401}
1402
1403bool
7fc63493 1404Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1405{
386d01d6 1406 if (!is_utf8_char(p))
1407 return FALSE;
b8c5462f 1408 if (!PL_utf8_punct)
1409 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1410 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f 1411}
1412
1413bool
7fc63493 1414Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1415{
386d01d6 1416 if (!is_utf8_char(p))
1417 return FALSE;
b8c5462f 1418 if (!PL_utf8_xdigit)
1419 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1420 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f 1421}
1422
1423bool
7fc63493 1424Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1425{
386d01d6 1426 if (!is_utf8_char(p))
1427 return FALSE;
a0ed51b3 1428 if (!PL_utf8_mark)
e24b16f9 1429 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1430 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3 1431}
1432
6b5c0936 1433/*
1434=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1435
1436The "p" contains the pointer to the UTF-8 string encoding
1437the character that is being converted.
1438
1439The "ustrp" is a pointer to the character buffer to put the
1440conversion result to. The "lenp" is a pointer to the length
1441of the result.
1442
0134edef 1443The "swashp" is a pointer to the swash to use.
6b5c0936 1444
0134edef 1445Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1446and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1447but not always, a multicharacter mapping), is tried first.
6b5c0936 1448
0134edef 1449The "special" is a string like "utf8::ToSpecLower", which means the
1450hash %utf8::ToSpecLower. The access to the hash is through
1451Perl_to_utf8_case().
6b5c0936 1452
0134edef 1453The "normal" is a string like "ToLower" which means the swash
1454%utf8::ToLower.
1455
1456=cut */
6b5c0936 1457
2104c8d9 1458UV
9a957fbc 1459Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1460 SV **swashp, const char *normal, const char *special)
a0ed51b3 1461{
89ebb4a3 1462 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1463 STRLEN len = 0;
a0ed51b3 1464
aec46f14 1465 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7 1466 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1467 * are necessary in EBCDIC, they are redundant no-ops
1468 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1469 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1470 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1471
1472 if (!*swashp) /* load on-demand */
1473 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1474
b08cf34e 1475 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1476 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1477 /* It might be "special" (sometimes, but not always,
2a37f04d 1478 * a multicharacter mapping) */
983ffd37 1479 HV *hv;
b08cf34e 1480 SV **svp;
1481
1482 if ((hv = get_hv(special, FALSE)) &&
1483 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1484 (*svp)) {
cfd0369c 1485 const char *s;
47654450 1486
cfd0369c 1487 s = SvPV_const(*svp, len);
47654450 1488 if (len == 1)
1489 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1490 else {
2f9475ad 1491#ifdef EBCDIC
1492 /* If we have EBCDIC we need to remap the characters
1493 * since any characters in the low 256 are Unicode
1494 * code points, not EBCDIC. */
7cda7a3d 1495 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1496
1497 d = tmpbuf;
b08cf34e 1498 if (SvUTF8(*svp)) {
2f9475ad 1499 STRLEN tlen = 0;
1500
1501 while (t < tend) {
1502 UV c = utf8_to_uvchr(t, &tlen);
1503 if (tlen > 0) {
1504 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1505 t += tlen;
1506 }
1507 else
1508 break;
1509 }
1510 }
1511 else {
36fec512 1512 while (t < tend) {
1513 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1514 t++;
1515 }
2f9475ad 1516 }
1517 len = d - tmpbuf;
1518 Copy(tmpbuf, ustrp, len, U8);
1519#else
d2dcd0fb 1520 Copy(s, ustrp, len, U8);
2f9475ad 1521#endif
29e98929 1522 }
983ffd37 1523 }
0134edef 1524 }
1525
1526 if (!len && *swashp) {
1527 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1528
1529 if (uv2) {
1530 /* It was "normal" (a single character mapping). */
1531 UV uv3 = UNI_TO_NATIVE(uv2);
1532
e9101d72 1533 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1534 }
1535 }
1feea2c7 1536
0134edef 1537 if (!len) /* Neither: just copy. */
1538 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1539
2a37f04d 1540 if (lenp)
1541 *lenp = len;
1542
0134edef 1543 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1544}
1545
d3e79532 1546/*
7fc63493 1547=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1548
1549Convert the UTF-8 encoded character at p to its uppercase version and
1550store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1551that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1552the uppercase version may be longer than the original character.
d3e79532 1553
1554The first character of the uppercased version is returned
1555(but note, as explained above, that there may be more.)
1556
1557=cut */
1558
2104c8d9 1559UV
7fc63493 1560Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1561{
983ffd37 1562 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1563 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1564}
a0ed51b3 1565
d3e79532 1566/*
7fc63493 1567=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1568
1569Convert the UTF-8 encoded character at p to its titlecase version and
1570store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1571that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1572titlecase version may be longer than the original character.
d3e79532 1573
1574The first character of the titlecased version is returned
1575(but note, as explained above, that there may be more.)
1576
1577=cut */
1578
983ffd37 1579UV
7fc63493 1580Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1581{
1582 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1583 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1584}
1585
d3e79532 1586/*
7fc63493 1587=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1588
1589Convert the UTF-8 encoded character at p to its lowercase version and
1590store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1591that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1592lowercase version may be longer than the original character.
d3e79532 1593
1594The first character of the lowercased version is returned
1595(but note, as explained above, that there may be more.)
1596
1597=cut */
1598
2104c8d9 1599UV
7fc63493 1600Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1601{
983ffd37 1602 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1603 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1604}
1605
d3e79532 1606/*
7fc63493 1607=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1608
1609Convert the UTF-8 encoded character at p to its foldcase version and
1610store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1611that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532 1612foldcase version may be longer than the original character (up to
1613three characters).
1614
1615The first character of the foldcased version is returned
1616(but note, as explained above, that there may be more.)
1617
1618=cut */
1619
b4e400f9 1620UV
7fc63493 1621Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1622{
1623 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1624 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1625}
1626
1627/* a "swash" is a swatch hash */
1628
1629SV*
7fc63493 1630Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1631{
27da23d5 1632 dVAR;
a0ed51b3 1633 SV* retval;
9a957fbc 1634 SV* const tokenbufsv = sv_newmortal();
8e84507e 1635 dSP;
7fc63493 1636 const size_t pkg_len = strlen(pkg);
1637 const size_t name_len = strlen(name);
aec46f14 1638 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1639 SV* errsv_save;
ce3b816e 1640
96ca9f55 1641 PUSHSTACKi(PERLSI_MAGIC);
1642 ENTER;
1643 SAVEI32(PL_hints);
1644 PL_hints = 0;
1645 save_re_context();
1b026014 1646 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1647 ENTER;
f8be5cf0 1648 errsv_save = newSVsv(ERRSV);
71bed85a 1649 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1650 Nullsv);
f8be5cf0 1651 if (!SvTRUE(ERRSV))
1652 sv_setsv(ERRSV, errsv_save);
1653 SvREFCNT_dec(errsv_save);
ce3b816e 1654 LEAVE;
1655 }
1656 SPAGAIN;
a0ed51b3 1657 PUSHMARK(SP);
1658 EXTEND(SP,5);
71bed85a 1659 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1660 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3 1661 PUSHs(listsv);
1662 PUSHs(sv_2mortal(newSViv(minbits)));
1663 PUSHs(sv_2mortal(newSViv(none)));
1664 PUTBACK;
923e4eb5 1665 if (IN_PERL_COMPILETIME) {
bf1fed83 1666 /* XXX ought to be handled by lex_start */
82686b01 1667 SAVEI32(PL_in_my);
2b4bd638 1668 PL_in_my = 0;
bf1fed83 1669 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1670 }
f8be5cf0 1671 errsv_save = newSVsv(ERRSV);
864dbfa3 1672 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1673 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1674 else
e24b16f9 1675 retval = &PL_sv_undef;
f8be5cf0 1676 if (!SvTRUE(ERRSV))
1677 sv_setsv(ERRSV, errsv_save);
1678 SvREFCNT_dec(errsv_save);
a0ed51b3 1679 LEAVE;
1680 POPSTACK;
923e4eb5 1681 if (IN_PERL_COMPILETIME) {
bf1fed83 1682 STRLEN len;
aec46f14 1683 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83 1684
1685 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1686 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1687 }
bc45ce41 1688 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1689 if (SvPOK(retval))
35c1215d 1690 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1691 retval);
cea2e8a9 1692 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1693 }
a0ed51b3 1694 return retval;
1695}
1696
035d37be 1697
1698/* This API is wrong for special case conversions since we may need to
1699 * return several Unicode characters for a single Unicode character
1700 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1701 * the lower-level routine, and it is similarly broken for returning
1702 * multiple values. --jhi */
a0ed51b3 1703UV
7fc63493 1704Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1705{
27da23d5 1706 dVAR;
aec46f14 1707 HV* const hv = (HV*)SvRV(sv);
3568d838 1708 U32 klen;
1709 U32 off;
a0ed51b3 1710 STRLEN slen;
7d85a32c 1711 STRLEN needents;
cfd0369c 1712 const U8 *tmps = NULL;
a0ed51b3 1713 U32 bit;
1714 SV *retval;
3568d838 1715 U8 tmputf8[2];
1716 UV c = NATIVE_TO_ASCII(*ptr);
1717
1718 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463 1719 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1720 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838 1721 ptr = tmputf8;
1722 }
1723 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1724 * then the "swatch" is a vec() for al the chars which start
1725 * with 0xAA..0xYY
1726 * So the key in the hash (klen) is length of encoded char -1
1727 */
1728 klen = UTF8SKIP(ptr) - 1;
1729 off = ptr[klen];
a0ed51b3 1730
7d85a32c 1731 if (klen == 0)
1732 {
1733 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1734 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1735 */
1736 needents = UTF_CONTINUATION_MARK;
1737 off = NATIVE_TO_UTF(ptr[klen]);
1738 }
1739 else
1740 {
1741 /* If char is encoded then swatch is for the prefix */
1742 needents = (1 << UTF_ACCUMULATION_SHIFT);
1743 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1744 }
1745
a0ed51b3 1746 /*
1747 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1748 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1749 * it's nothing to sniff at.) Pity we usually come through at least
1750 * two function calls to get here...
1751 *
1752 * NB: this code assumes that swatches are never modified, once generated!
1753 */
1754
3568d838 1755 if (hv == PL_last_swash_hv &&
a0ed51b3 1756 klen == PL_last_swash_klen &&
27da23d5 1757 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1758 {
1759 tmps = PL_last_swash_tmps;
1760 slen = PL_last_swash_slen;
1761 }
1762 else {
1763 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1764 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1765
1766 /* If not cached, generate it via utf8::SWASHGET */
cfd0369c 1767 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
a0ed51b3 1768 dSP;
2b9d42f0 1769 /* We use utf8n_to_uvuni() as we want an index into
1770 Unicode tables, not a native character number.
1771 */
aec46f14 1772 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae 1773 ckWARN(WARN_UTF8) ?
1774 0 : UTF8_ALLOW_ANY);
f8be5cf0 1775 SV *errsv_save;
a0ed51b3 1776 ENTER;
1777 SAVETMPS;
1778 save_re_context();
1779 PUSHSTACKi(PERLSI_MAGIC);
1780 PUSHMARK(SP);
1781 EXTEND(SP,3);
1782 PUSHs((SV*)sv);
ffbc6a93 1783 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1784 PUSHs(sv_2mortal(newSViv((klen) ?
1785 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1786 PUSHs(sv_2mortal(newSViv(needents)));
1787 PUTBACK;
f8be5cf0 1788 errsv_save = newSVsv(ERRSV);
864dbfa3 1789 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1790 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1791 else
e24b16f9 1792 retval = &PL_sv_undef;
f8be5cf0 1793 if (!SvTRUE(ERRSV))
1794 sv_setsv(ERRSV, errsv_save);
1795 SvREFCNT_dec(errsv_save);
a0ed51b3 1796 POPSTACK;
1797 FREETMPS;
1798 LEAVE;
923e4eb5 1799 if (IN_PERL_COMPILETIME)
eb160463 1800 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1801
e1ec3a88 1802 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1803
7d85a32c 1804 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1805 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1806 }
1807
1808 PL_last_swash_hv = hv;
1809 PL_last_swash_klen = klen;
cfd0369c 1810 /* FIXME change interpvar.h? */
1811 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3 1812 PL_last_swash_slen = slen;
1813 if (klen)
1814 Copy(ptr, PL_last_swash_key, klen, U8);
1815 }
1816
9faf8d75 1817 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1818 case 1:
1819 bit = 1 << (off & 7);
1820 off >>= 3;
1821 return (tmps[off] & bit) != 0;
1822 case 8:
1823 return tmps[off];
1824 case 16:
1825 off <<= 1;
1826 return (tmps[off] << 8) + tmps[off + 1] ;
1827 case 32:
1828 off <<= 2;
1829 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1830 }
cea2e8a9 1831 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1832 return 0;
1833}
2b9d42f0 1834
b851fbc1 1835U8 *
1836Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1837{
1838 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1839}
2b9d42f0 1840
1841/*
d2cc3551 1842=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1843
1844Build to the scalar dsv a displayable version of the string spv,
1845length len, the displayable version being at most pvlim bytes long
1846(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1847
9e55ce06 1848The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1849isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1850to display the \\[nrfta\\] as the backslashed versions (like '\n')
1851(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1852UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1853UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1854
d2cc3551 1855The pointer to the PV of the dsv is returned.
1856
1857=cut */
e6b2e755 1858char *
e1ec3a88 1859Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755 1860{
1861 int truncated = 0;
e1ec3a88 1862 const char *s, *e;
e6b2e755 1863
1864 sv_setpvn(dsv, "", 0);
e1ec3a88 1865 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1866 UV u;
a49f32c6 1867 /* This serves double duty as a flag and a character to print after
1868 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1869 */
1870 char ok = 0;
c728cb41 1871
e6b2e755 1872 if (pvlim && SvCUR(dsv) >= pvlim) {
1873 truncated++;
1874 break;
1875 }
1876 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1877 if (u < 256) {
a3b680e6 1878 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1879 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1880 switch (c) {
c728cb41 1881 case '\n':
a49f32c6 1882 ok = 'n'; break;
c728cb41 1883 case '\r':
a49f32c6 1884 ok = 'r'; break;
c728cb41 1885 case '\t':
a49f32c6 1886 ok = 't'; break;
c728cb41 1887 case '\f':
a49f32c6 1888 ok = 'f'; break;
c728cb41 1889 case '\a':
a49f32c6 1890 ok = 'a'; break;
c728cb41 1891 case '\\':
a49f32c6 1892 ok = '\\'; break;
c728cb41 1893 default: break;
1894 }
a49f32c6 1895 if (ok) {
1896 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1897 }
c728cb41 1898 }
00e86452 1899 /* isPRINT() is the locale-blind version. */
a49f32c6 1900 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1901 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1902 ok = 1;
0a2ef054 1903 }
c728cb41 1904 }
1905 if (!ok)
9e55ce06 1906 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1907 }
1908 if (truncated)
1909 sv_catpvn(dsv, "...", 3);
1910
1911 return SvPVX(dsv);
1912}
2b9d42f0 1913
d2cc3551 1914/*
1915=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1916
1917Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1918the displayable version being at most pvlim bytes long
d2cc3551 1919(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1920
1921The flags argument is as in pv_uni_display().
1922
d2cc3551 1923The pointer to the PV of the dsv is returned.
1924
1925=cut */
e6b2e755 1926char *
1927Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1928{
cfd0369c 1929 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1930 SvCUR(ssv), pvlim, flags);
701a277b 1931}
1932
d2cc3551 1933/*
d07ddd77 1934=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1935
1936Return true if the strings s1 and s2 differ case-insensitively, false
1937if not (if they are equal case-insensitively). If u1 is true, the
1938string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1939the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1940are false, the respective string is assumed to be in native 8-bit
1941encoding.
1942
1943If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1944in there (they will point at the beginning of the I<next> character).
1945If the pointers behind pe1 or pe2 are non-NULL, they are the end
1946pointers beyond which scanning will not continue under any
4cdaeff7 1947circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77 1948s2+l2 will be used as goal end pointers that will also stop the scan,
1949and which qualify towards defining a successful match: all the scans
1950that define an explicit length must reach their goal pointers for
1951a match to succeed).
d2cc3551 1952
1953For case-insensitiveness, the "casefolding" of Unicode is used
1954instead of upper/lowercasing both the characters, see
1955http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1956
1957=cut */
701a277b 1958I32
d07ddd77 1959Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1960{
e1ec3a88 1961 register const U8 *p1 = (const U8*)s1;
1962 register const U8 *p2 = (const U8*)s2;
1963 register const U8 *f1 = 0, *f2 = 0;
1964 register U8 *e1 = 0, *q1 = 0;
1965 register U8 *e2 = 0, *q2 = 0;
d07ddd77 1966 STRLEN n1 = 0, n2 = 0;
89ebb4a3 1967 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1968 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8 1969 U8 natbuf[1+1];
1970 STRLEN foldlen1, foldlen2;
d07ddd77 1971 bool match;
332ddc25 1972
d07ddd77 1973 if (pe1)
1974 e1 = *(U8**)pe1;
e1ec3a88 1975 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1976 f1 = (const U8*)s1 + l1;
d07ddd77 1977 if (pe2)
1978 e2 = *(U8**)pe2;
e1ec3a88 1979 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1980 f2 = (const U8*)s2 + l2;
d07ddd77 1981
1982 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1983 return 1; /* mismatch; possible infinite loop or false positive */
1984
a6872d42 1985 if (!u1 || !u2)
1986 natbuf[1] = 0; /* Need to terminate the buffer. */
1987
d07ddd77 1988 while ((e1 == 0 || p1 < e1) &&
1989 (f1 == 0 || p1 < f1) &&
1990 (e2 == 0 || p2 < e2) &&
1991 (f2 == 0 || p2 < f2)) {
1992 if (n1 == 0) {
d7f013c8 1993 if (u1)
1994 to_utf8_fold(p1, foldbuf1, &foldlen1);
1995 else {
809e8e66 1996 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8 1997 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1998 }
1999 q1 = foldbuf1;
d07ddd77 2000 n1 = foldlen1;
332ddc25 2001 }
d07ddd77 2002 if (n2 == 0) {
d7f013c8 2003 if (u2)
2004 to_utf8_fold(p2, foldbuf2, &foldlen2);
2005 else {
809e8e66 2006 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8 2007 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2008 }
2009 q2 = foldbuf2;
d07ddd77 2010 n2 = foldlen2;
332ddc25 2011 }
d07ddd77 2012 while (n1 && n2) {
2013 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2014 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2015 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2016 return 1; /* mismatch */
d07ddd77 2017 n1 -= UTF8SKIP(q1);
d7f013c8 2018 q1 += UTF8SKIP(q1);
d07ddd77 2019 n2 -= UTF8SKIP(q2);
d7f013c8 2020 q2 += UTF8SKIP(q2);
701a277b 2021 }
d07ddd77 2022 if (n1 == 0)
d7f013c8 2023 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2024 if (n2 == 0)
d7f013c8 2025 p2 += u2 ? UTF8SKIP(p2) : 1;
2026
d2cc3551 2027 }
5469e704 2028
d07ddd77 2029 /* A match is defined by all the scans that specified
2030 * an explicit length reaching their final goals. */
2031 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 2032
2033 if (match) {
d07ddd77 2034 if (pe1)
2035 *pe1 = (char*)p1;
2036 if (pe2)
2037 *pe2 = (char*)p2;
5469e704 2038 }
2039
2040 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2041}
701a277b 2042
a49f32c6 2043/*
2044 * Local variables:
2045 * c-indentation-style: bsd
2046 * c-basic-offset: 4
2047 * indent-tabs-mode: t
2048 * End:
2049 *
37442d52 2050 * ex: set ts=8 sts=4 sw=4 noet:
2051 */