If /dev/null is not readable, perl -e fails with
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
af3babe4 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3 5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3 26#include "perl.h"
27
27da23d5 28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7 31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29 34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7 37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1 47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1 53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485 57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1 68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3 87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6 90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6 95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6 98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463 102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 104 return d;
105 }
106 if (uv < 0x10000) {
eb160463 107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 110 return d;
111 }
112 if (uv < 0x200000) {
eb160463 113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463 120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463 128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3 138#endif
139 {
eb160463 140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463 151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
b851fbc1 169
170U8 *
171Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172{
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174}
9041c2e3 175
646ca15d 176/*
177
178Tests if some arbitrary number of bytes begins in a valid UTF-8
179character. Note that an INVARIANT (i.e. ASCII) character is a valid
180UTF-8 character. The actual number of bytes in the UTF-8 character
181will be returned if it is valid, otherwise 0.
182
183This is the "slow" version as opposed to the "fast" version which is
184the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
185difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
186or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187you should use the _slow(). In practice this means that the _slow()
188will be used very rarely, since the maximum Unicode code point (as of
189Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
190the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
191five bytes or more.
192
193=cut */
c053b435 194STATIC STRLEN
646ca15d 195S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
196{
197 U8 u = *s;
198 STRLEN slen;
199 UV uv, ouv;
200
201 if (UTF8_IS_INVARIANT(u))
202 return 1;
203
204 if (!UTF8_IS_START(u))
205 return 0;
206
207 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
208 return 0;
209
210 slen = len - 1;
211 s++;
77263263 212#ifdef EBCDIC
213 u = NATIVE_TO_UTF(u);
214#endif
646ca15d 215 u &= UTF_START_MASK(len);
216 uv = u;
217 ouv = uv;
218 while (slen--) {
219 if (!UTF8_IS_CONTINUATION(*s))
220 return 0;
221 uv = UTF8_ACCUMULATE(uv, *s);
222 if (uv < ouv)
223 return 0;
224 ouv = uv;
225 s++;
226 }
227
228 if ((STRLEN)UNISKIP(uv) < len)
229 return 0;
230
231 return len;
232}
9041c2e3 233
234/*
7fc63493 235=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 236
5da9da9e 237Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01 238character. Note that an INVARIANT (i.e. ASCII) character is a valid
239UTF-8 character. The actual number of bytes in the UTF-8 character
240will be returned if it is valid, otherwise 0.
9041c2e3 241
82686b01 242=cut */
067a85ef 243STRLEN
7fc63493 244Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 245{
44f8325f 246 const STRLEN len = UTF8SKIP(s);
3b0fc154 247#ifdef IS_UTF8_CHAR
768c67ee 248 if (IS_UTF8_CHAR_FAST(len))
3b0fc154 249 return IS_UTF8_CHAR(s, len) ? len : 0;
250#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 251 return is_utf8_char_slow(s, len);
386d01d6 252}
253
6662521e 254/*
7fc63493 255=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 256
c9ada85f 257Returns true if first C<len> bytes of the given string form a valid
1e54db1a 258UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
259not mean 'a string that contains code points above 0x7F encoded in UTF-8'
260because a valid ASCII string is a valid UTF-8 string.
6662521e 261
768c67ee 262See also is_utf8_string_loclen() and is_utf8_string_loc().
263
6662521e 264=cut
265*/
266
8e84507e 267bool
7fc63493 268Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 269{
7fc63493 270 const U8* x = s;
271 const U8* send;
067a85ef 272
44f8325f 273 if (!len)
e1ec3a88 274 len = strlen((const char *)s);
1aa99e6b 275 send = s + len;
276
6662521e 277 while (x < send) {
a3b680e6 278 STRLEN c;
1acdb0da 279 /* Inline the easy bits of is_utf8_char() here for speed... */
280 if (UTF8_IS_INVARIANT(*x))
281 c = 1;
282 else if (!UTF8_IS_START(*x))
768c67ee 283 goto out;
1acdb0da 284 else {
285 /* ... and call is_utf8_char() only if really needed. */
646ca15d 286#ifdef IS_UTF8_CHAR
287 c = UTF8SKIP(x);
768c67ee 288 if (IS_UTF8_CHAR_FAST(c)) {
289 if (!IS_UTF8_CHAR(x, c))
290 goto out;
291 } else if (!is_utf8_char_slow(x, c))
292 goto out;
646ca15d 293#else
294 c = is_utf8_char(x);
295#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 296 if (!c)
768c67ee 297 goto out;
1acdb0da 298 }
6662521e 299 x += c;
6662521e 300 }
768c67ee 301
302 out:
60006e79 303 if (x != send)
304 return FALSE;
067a85ef 305
306 return TRUE;
6662521e 307}
308
67e989fb 309/*
768c67ee 310=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 311
768c67ee 312Like is_ut8_string() but stores the location of the failure (in the
313case of "utf8ness failure") or the location s+len (in the case of
314"utf8ness success") in the C<ep>, and the number of UTF-8
315encoded characters in the C<el>.
316
317See also is_utf8_string_loc() and is_utf8_string().
81cd54e3 318
319=cut
320*/
321
322bool
768c67ee 323Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 324{
7fc63493 325 const U8* x = s;
326 const U8* send;
81cd54e3 327 STRLEN c;
328
44f8325f 329 if (!len)
e1ec3a88 330 len = strlen((const char *)s);
81cd54e3 331 send = s + len;
768c67ee 332 if (el)
333 *el = 0;
81cd54e3 334
335 while (x < send) {
336 /* Inline the easy bits of is_utf8_char() here for speed... */
337 if (UTF8_IS_INVARIANT(*x))
768c67ee 338 c = 1;
339 else if (!UTF8_IS_START(*x))
340 goto out;
81cd54e3 341 else {
768c67ee 342 /* ... and call is_utf8_char() only if really needed. */
343#ifdef IS_UTF8_CHAR
344 c = UTF8SKIP(x);
345 if (IS_UTF8_CHAR_FAST(c)) {
346 if (!IS_UTF8_CHAR(x, c))
347 c = 0;
348 } else
349 c = is_utf8_char_slow(x, c);
350#else
351 c = is_utf8_char(x);
352#endif /* #ifdef IS_UTF8_CHAR */
353 if (!c)
354 goto out;
81cd54e3 355 }
768c67ee 356 x += c;
357 if (el)
358 (*el)++;
81cd54e3 359 }
768c67ee 360
361 out:
362 if (ep)
363 *ep = x;
364 if (x != send)
81cd54e3 365 return FALSE;
81cd54e3 366
367 return TRUE;
368}
369
370/*
768c67ee 371=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
372
373Like is_ut8_string() but stores the location of the failure (in the
374case of "utf8ness failure") or the location s+len (in the case of
375"utf8ness success") in the C<ep>.
376
377See also is_utf8_string_loclen() and is_utf8_string().
378
379=cut
380*/
381
382bool
383Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep)
384{
385 return is_utf8_string_loclen(s, len, ep, 0);
386}
387
388/*
7fc63493 389=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 390
9041c2e3 391Bottom level UTF-8 decode routine.
392Returns the unicode code point value of the first character in the string C<s>
1e54db1a 393which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 394C<retlen> will be set to the length, in bytes, of that character.
67e989fb 395
1e54db1a 396If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880 397is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
398it is assumed that the caller will raise a warning, and this function
28d3d195 399will silently just set C<retlen> to C<-1> and return zero. If the
400C<flags> does not contain UTF8_CHECK_ONLY, warnings about
401malformations will be given, C<retlen> will be set to the expected
402length of the UTF-8 character in bytes, and zero will be returned.
403
404The C<flags> can also contain various flags to allow deviations from
405the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 406
9041c2e3 407Most code should use utf8_to_uvchr() rather than call this directly.
408
37607a96 409=cut
410*/
67e989fb 411
a0ed51b3 412UV
7fc63493 413Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 414{
7fc63493 415 const U8 *s0 = s;
9c5ffd7c 416 UV uv = *s, ouv = 0;
ba210ebe 417 STRLEN len = 1;
7fc63493 418 const bool dowarn = ckWARN_d(WARN_UTF8);
419 const UV startbyte = *s;
ba210ebe 420 STRLEN expectlen = 0;
a0dbb045 421 U32 warning = 0;
422
423/* This list is a superset of the UTF8_ALLOW_XXX. */
424
425#define UTF8_WARN_EMPTY 1
426#define UTF8_WARN_CONTINUATION 2
427#define UTF8_WARN_NON_CONTINUATION 3
428#define UTF8_WARN_FE_FF 4
429#define UTF8_WARN_SHORT 5
430#define UTF8_WARN_OVERFLOW 6
431#define UTF8_WARN_SURROGATE 7
c867b360 432#define UTF8_WARN_LONG 8
433#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 434
435 if (curlen == 0 &&
436 !(flags & UTF8_ALLOW_EMPTY)) {
437 warning = UTF8_WARN_EMPTY;
0c443dc2 438 goto malformed;
439 }
440
1d72bdf6 441 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 442 if (retlen)
443 *retlen = 1;
c4d5f83a 444 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 445 }
67e989fb 446
421a8bf2 447 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 448 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 449 warning = UTF8_WARN_CONTINUATION;
ba210ebe 450 goto malformed;
451 }
452
421a8bf2 453 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 454 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 455 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 456 goto malformed;
457 }
9041c2e3 458
1d72bdf6 459#ifdef EBCDIC
75383841 460 uv = NATIVE_TO_UTF(uv);
1d72bdf6 461#else
fcc8fcf6 462 if ((uv == 0xfe || uv == 0xff) &&
463 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 464 warning = UTF8_WARN_FE_FF;
ba210ebe 465 goto malformed;
a0ed51b3 466 }
1d72bdf6 467#endif
468
ba210ebe 469 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
470 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
471 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
472 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 473#ifdef EBCDIC
474 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
475 else { len = 7; uv &= 0x01; }
476#else
ba210ebe 477 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
478 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 479 else { len = 13; uv = 0; } /* whoa! */
480#endif
481
a0ed51b3 482 if (retlen)
483 *retlen = len;
9041c2e3 484
ba210ebe 485 expectlen = len;
486
fcc8fcf6 487 if ((curlen < expectlen) &&
488 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 489 warning = UTF8_WARN_SHORT;
ba210ebe 490 goto malformed;
491 }
492
493 len--;
a0ed51b3 494 s++;
ba210ebe 495 ouv = uv;
496
a0ed51b3 497 while (len--) {
421a8bf2 498 if (!UTF8_IS_CONTINUATION(*s) &&
499 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 500 s--;
501 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 502 goto malformed;
a0ed51b3 503 }
504 else
8850bf83 505 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 506 if (!(uv > ouv)) {
507 /* These cannot be allowed. */
508 if (uv == ouv) {
75dbc644 509 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 510 warning = UTF8_WARN_LONG;
511 goto malformed;
512 }
513 }
514 else { /* uv < ouv */
515 /* This cannot be allowed. */
516 warning = UTF8_WARN_OVERFLOW;
517 goto malformed;
518 }
ba210ebe 519 }
520 s++;
521 ouv = uv;
522 }
523
421a8bf2 524 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 525 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 526 warning = UTF8_WARN_SURROGATE;
ba210ebe 527 goto malformed;
eb160463 528 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 529 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 530 warning = UTF8_WARN_LONG;
ba210ebe 531 goto malformed;
421a8bf2 532 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 533 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 534 warning = UTF8_WARN_FFFF;
a9917092 535 goto malformed;
a0ed51b3 536 }
ba210ebe 537
a0ed51b3 538 return uv;
ba210ebe 539
540malformed:
541
fcc8fcf6 542 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 543 if (retlen)
cc366d4b 544 *retlen = -1;
ba210ebe 545 return 0;
546 }
547
a0dbb045 548 if (dowarn) {
44f8325f 549 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
a0dbb045 550
551 switch (warning) {
552 case 0: /* Intentionally empty. */ break;
553 case UTF8_WARN_EMPTY:
54667de8 554 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045 555 break;
556 case UTF8_WARN_CONTINUATION:
097fb8e2 557 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 558 break;
559 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 560 if (s == s0)
561 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
562 (UV)s[1], startbyte);
563 else
564 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
5d7488b2 565 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, (int)expectlen);
097fb8e2 566
a0dbb045 567 break;
568 case UTF8_WARN_FE_FF:
569 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
570 break;
571 case UTF8_WARN_SHORT:
097fb8e2 572 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 573 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 574 expectlen = curlen; /* distance for caller to skip */
a0dbb045 575 break;
576 case UTF8_WARN_OVERFLOW:
097fb8e2 577 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
578 ouv, *s, startbyte);
a0dbb045 579 break;
580 case UTF8_WARN_SURROGATE:
581 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
582 break;
a0dbb045 583 case UTF8_WARN_LONG:
097fb8e2 584 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 585 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 586 break;
587 case UTF8_WARN_FFFF:
588 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
589 break;
590 default:
54667de8 591 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045 592 break;
593 }
594
595 if (warning) {
44f8325f 596 const char * const s = SvPVX_const(sv);
a0dbb045 597
598 if (PL_op)
9014280d 599 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 600 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 601 else
9014280d 602 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 603 }
604 }
605
ba210ebe 606 if (retlen)
28d3d195 607 *retlen = expectlen ? expectlen : len;
ba210ebe 608
28d3d195 609 return 0;
a0ed51b3 610}
611
8e84507e 612/*
7fc63493 613=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3 614
615Returns the native character value of the first character in the string C<s>
1e54db1a 616which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 617length, in bytes, of that character.
618
1e54db1a 619If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3 620returned and retlen is set, if possible, to -1.
621
622=cut
623*/
624
625UV
7fc63493 626Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 627{
89ebb4a3 628 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 629 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 630}
631
632/*
7fc63493 633=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3 634
635Returns the Unicode code point of the first character in the string C<s>
1e54db1a 636which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 637length, in bytes, of that character.
638
639This function should only be used when returned UV is considered
640an index into the Unicode semantic tables (e.g. swashes).
641
1e54db1a 642If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 643returned and retlen is set, if possible, to -1.
8e84507e 644
645=cut
646*/
647
648UV
7fc63493 649Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 650{
9041c2e3 651 /* Call the low level routine asking for checks */
89ebb4a3 652 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 653 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 654}
655
b76347f2 656/*
35a4481c 657=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2 658
659Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 660Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
661up past C<e>, croaks.
b76347f2 662
663=cut
664*/
665
666STRLEN
35a4481c 667Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 668{
669 STRLEN len = 0;
670
8850bf83 671 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
672 * the bitops (especially ~) can create illegal UTF-8.
673 * In other words: in Perl UTF-8 is not just for Unicode. */
674
a3b680e6 675 if (e < s)
676 goto warn_and_return;
b76347f2 677 while (s < e) {
4373e329 678 const U8 t = UTF8SKIP(s);
901b21bf 679 if (e - s < t) {
a3b680e6 680 warn_and_return:
901b21bf 681 if (ckWARN_d(WARN_UTF8)) {
682 if (PL_op)
683 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 684 "%s in %s", unees, OP_DESC(PL_op));
901b21bf 685 else
686 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
687 }
688 return len;
689 }
b76347f2 690 s += t;
691 len++;
692 }
693
694 return len;
695}
696
b06226ff 697/*
35a4481c 698=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 699
1e54db1a 700Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff 701and C<b>.
702
703WARNING: use only if you *know* that the pointers point inside the
704same UTF-8 buffer.
705
37607a96 706=cut
707*/
a0ed51b3 708
02eb7b47 709IV
35a4481c 710Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 711{
02eb7b47 712 IV off = 0;
713
8850bf83 714 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
715 * the bitops (especially ~) can create illegal UTF-8.
716 * In other words: in Perl UTF-8 is not just for Unicode. */
717
a0ed51b3 718 if (a < b) {
719 while (a < b) {
35a4481c 720 const U8 c = UTF8SKIP(a);
a3b680e6 721 if (b - a < c)
722 goto warn_and_return;
02eb7b47 723 a += c;
a0ed51b3 724 off--;
725 }
726 }
727 else {
728 while (b < a) {
4373e329 729 const U8 c = UTF8SKIP(b);
02eb7b47 730
901b21bf 731 if (a - b < c) {
a3b680e6 732 warn_and_return:
901b21bf 733 if (ckWARN_d(WARN_UTF8)) {
734 if (PL_op)
735 Perl_warner(aTHX_ packWARN(WARN_UTF8),
736 "%s in %s", unees, OP_DESC(PL_op));
737 else
738 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
739 }
740 return off;
741 }
02eb7b47 742 b += c;
a0ed51b3 743 off++;
744 }
745 }
02eb7b47 746
a0ed51b3 747 return off;
748}
749
b06226ff 750/*
37607a96 751=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 752
8850bf83 753Return the UTF-8 pointer C<s> displaced by C<off> characters, either
754forward or backward.
b06226ff 755
756WARNING: do not use the following unless you *know* C<off> is within
8850bf83 757the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
758on the first byte of character or just after the last byte of a character.
b06226ff 759
37607a96 760=cut
761*/
a0ed51b3 762
763U8 *
4373e329 764Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 765{
8850bf83 766 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
767 * the bitops (especially ~) can create illegal UTF-8.
768 * In other words: in Perl UTF-8 is not just for Unicode. */
769
a0ed51b3 770 if (off >= 0) {
771 while (off--)
772 s += UTF8SKIP(s);
773 }
774 else {
775 while (off++) {
776 s--;
8850bf83 777 while (UTF8_IS_CONTINUATION(*s))
778 s--;
a0ed51b3 779 }
780 }
4373e329 781 return (U8 *)s;
a0ed51b3 782}
783
6940069f 784/*
eebe1485 785=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 786
1e54db1a 787Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53 788Unlike C<bytes_to_utf8>, this over-writes the original string, and
789updates len to contain the new length.
67e989fb 790Returns zero on failure, setting C<len> to -1.
6940069f 791
792=cut
793*/
794
795U8 *
37607a96 796Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 797{
6940069f 798 U8 *send;
799 U8 *d;
dcad2880 800 U8 *save = s;
246fae53 801
1e54db1a 802 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880 803 for (send = s + *len; s < send; ) {
804 U8 c = *s++;
805
1d72bdf6 806 if (!UTF8_IS_INVARIANT(c) &&
807 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
808 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 809 *len = -1;
810 return 0;
811 }
246fae53 812 }
dcad2880 813
814 d = s = save;
6940069f 815 while (s < send) {
ed646e6e 816 STRLEN ulen;
9041c2e3 817 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 818 s += ulen;
6940069f 819 }
820 *d = '\0';
246fae53 821 *len = d - save;
6940069f 822 return save;
823}
824
825/*
e1ec3a88 826=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 827
1e54db1a 828Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 829Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 830the newly-created string, and updates C<len> to contain the new
831length. Returns the original string if no conversion occurs, C<len>
832is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8330 if C<s> is converted or contains all 7bit characters.
f9a63242 834
37607a96 835=cut
836*/
f9a63242 837
838U8 *
e1ec3a88 839Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 840{
f9a63242 841 U8 *d;
e1ec3a88 842 const U8 *start = s;
843 const U8 *send;
f9a63242 844 I32 count = 0;
845
846 if (!*is_utf8)
73d840c0 847 return (U8 *)start;
f9a63242 848
1e54db1a 849 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 850 for (send = s + *len; s < send;) {
e1ec3a88 851 U8 c = *s++;
1d72bdf6 852 if (!UTF8_IS_INVARIANT(c)) {
db42d148 853 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
854 (c = *s++) && UTF8_IS_CONTINUATION(c))
855 count++;
856 else
73d840c0 857 return (U8 *)start;
db42d148 858 }
f9a63242 859 }
860
861 *is_utf8 = 0;
862
a02a5408 863 Newxz(d, (*len) - count + 1, U8);
ef9edfd0 864 s = start; start = d;
f9a63242 865 while (s < send) {
866 U8 c = *s++;
c4d5f83a 867 if (!UTF8_IS_INVARIANT(c)) {
868 /* Then it is two-byte encoded */
869 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
870 c = ASCII_TO_NATIVE(c);
871 }
872 *d++ = c;
f9a63242 873 }
874 *d = '\0';
875 *len = d - start;
73d840c0 876 return (U8 *)start;
f9a63242 877}
878
879/*
35a4481c 880=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 881
1e54db1a 882Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e 883Returns a pointer to the newly-created string, and sets C<len> to
884reflect the new length.
6940069f 885
1e54db1a 886If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f 887see sv_recode_to_utf8().
888
497711e7 889=cut
6940069f 890*/
891
892U8*
35a4481c 893Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 894{
35a4481c 895 const U8 * const send = s + (*len);
6940069f 896 U8 *d;
897 U8 *dst;
6940069f 898
a02a5408 899 Newxz(d, (*len) * 2 + 1, U8);
6940069f 900 dst = d;
901
902 while (s < send) {
35a4481c 903 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 904 if (UNI_IS_INVARIANT(uv))
eb160463 905 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 906 else {
eb160463 907 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
908 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f 909 }
910 }
911 *d = '\0';
6662521e 912 *len = d-dst;
6940069f 913 return dst;
914}
915
a0ed51b3 916/*
dea0fc0b 917 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 918 *
919 * Destination must be pre-extended to 3/2 source. Do not use in-place.
920 * We optimize for native, for obvious reasons. */
921
922U8*
dea0fc0b 923Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 924{
dea0fc0b 925 U8* pend;
926 U8* dstart = d;
927
1de9afcd 928 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
929 d[0] = 0;
930 *newlen = 1;
931 return d;
932 }
933
dea0fc0b 934 if (bytelen & 1)
014ead4b 935 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b 936
937 pend = p + bytelen;
938
a0ed51b3 939 while (p < pend) {
dea0fc0b 940 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
941 p += 2;
a0ed51b3 942 if (uv < 0x80) {
eb160463 943 *d++ = (U8)uv;
a0ed51b3 944 continue;
945 }
946 if (uv < 0x800) {
eb160463 947 *d++ = (U8)(( uv >> 6) | 0xc0);
948 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 949 continue;
950 }
951 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e 952 UV low = (p[0] << 8) + p[1];
953 p += 2;
dea0fc0b 954 if (low < 0xdc00 || low >= 0xdfff)
955 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 956 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
957 }
958 if (uv < 0x10000) {
eb160463 959 *d++ = (U8)(( uv >> 12) | 0xe0);
960 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
961 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 962 continue;
963 }
964 else {
eb160463 965 *d++ = (U8)(( uv >> 18) | 0xf0);
966 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
967 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
968 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 969 continue;
970 }
971 }
dea0fc0b 972 *newlen = d - dstart;
a0ed51b3 973 return d;
974}
975
976/* Note: this one is slightly destructive of the source. */
977
978U8*
dea0fc0b 979Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 980{
981 U8* s = (U8*)p;
982 U8* send = s + bytelen;
983 while (s < send) {
984 U8 tmp = s[0];
985 s[0] = s[1];
986 s[1] = tmp;
987 s += 2;
988 }
dea0fc0b 989 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 990}
991
992/* for now these are all defined (inefficiently) in terms of the utf8 versions */
993
994bool
84afefe6 995Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 996{
89ebb4a3 997 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 998 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 999 return is_utf8_alnum(tmpbuf);
1000}
1001
1002bool
84afefe6 1003Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1004{
89ebb4a3 1005 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1006 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1007 return is_utf8_alnumc(tmpbuf);
1008}
1009
1010bool
84afefe6 1011Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1012{
89ebb4a3 1013 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1014 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1015 return is_utf8_idfirst(tmpbuf);
1016}
1017
1018bool
84afefe6 1019Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1020{
89ebb4a3 1021 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1022 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1023 return is_utf8_alpha(tmpbuf);
1024}
1025
1026bool
84afefe6 1027Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1028{
89ebb4a3 1029 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1030 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1031 return is_utf8_ascii(tmpbuf);
1032}
1033
1034bool
84afefe6 1035Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1036{
89ebb4a3 1037 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1038 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1039 return is_utf8_space(tmpbuf);
1040}
1041
1042bool
84afefe6 1043Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1044{
89ebb4a3 1045 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1046 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1047 return is_utf8_digit(tmpbuf);
1048}
1049
1050bool
84afefe6 1051Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1052{
89ebb4a3 1053 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1054 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1055 return is_utf8_upper(tmpbuf);
1056}
1057
1058bool
84afefe6 1059Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1060{
89ebb4a3 1061 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1062 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1063 return is_utf8_lower(tmpbuf);
1064}
1065
1066bool
84afefe6 1067Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1068{
89ebb4a3 1069 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1070 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1071 return is_utf8_cntrl(tmpbuf);
1072}
1073
1074bool
84afefe6 1075Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1076{
89ebb4a3 1077 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1078 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1079 return is_utf8_graph(tmpbuf);
1080}
1081
1082bool
84afefe6 1083Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1084{
89ebb4a3 1085 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1086 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1087 return is_utf8_print(tmpbuf);
1088}
1089
b8c5462f 1090bool
84afefe6 1091Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1092{
89ebb4a3 1093 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1094 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1095 return is_utf8_punct(tmpbuf);
1096}
1097
4d61ec05 1098bool
84afefe6 1099Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1100{
89ebb4a3 1101 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1102 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1103 return is_utf8_xdigit(tmpbuf);
1104}
1105
84afefe6 1106UV
1107Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1108{
0ebc6274 1109 uvchr_to_utf8(p, c);
1110 return to_utf8_upper(p, p, lenp);
a0ed51b3 1111}
1112
84afefe6 1113UV
1114Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1115{
0ebc6274 1116 uvchr_to_utf8(p, c);
1117 return to_utf8_title(p, p, lenp);
a0ed51b3 1118}
1119
84afefe6 1120UV
1121Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1122{
0ebc6274 1123 uvchr_to_utf8(p, c);
1124 return to_utf8_lower(p, p, lenp);
a0ed51b3 1125}
1126
84afefe6 1127UV
1128Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1129{
0ebc6274 1130 uvchr_to_utf8(p, c);
1131 return to_utf8_fold(p, p, lenp);
84afefe6 1132}
1133
a0ed51b3 1134/* for now these all assume no locale info available for Unicode > 255 */
1135
1136bool
84afefe6 1137Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1138{
1139 return is_uni_alnum(c); /* XXX no locale support yet */
1140}
1141
1142bool
84afefe6 1143Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1144{
1145 return is_uni_alnumc(c); /* XXX no locale support yet */
1146}
1147
1148bool
84afefe6 1149Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1150{
1151 return is_uni_idfirst(c); /* XXX no locale support yet */
1152}
1153
1154bool
84afefe6 1155Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1156{
1157 return is_uni_alpha(c); /* XXX no locale support yet */
1158}
1159
1160bool
84afefe6 1161Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1162{
1163 return is_uni_ascii(c); /* XXX no locale support yet */
1164}
1165
1166bool
84afefe6 1167Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1168{
1169 return is_uni_space(c); /* XXX no locale support yet */
1170}
1171
1172bool
84afefe6 1173Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1174{
1175 return is_uni_digit(c); /* XXX no locale support yet */
1176}
1177
1178bool
84afefe6 1179Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1180{
1181 return is_uni_upper(c); /* XXX no locale support yet */
1182}
1183
1184bool
84afefe6 1185Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1186{
1187 return is_uni_lower(c); /* XXX no locale support yet */
1188}
1189
1190bool
84afefe6 1191Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1192{
1193 return is_uni_cntrl(c); /* XXX no locale support yet */
1194}
1195
1196bool
84afefe6 1197Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1198{
1199 return is_uni_graph(c); /* XXX no locale support yet */
1200}
1201
1202bool
84afefe6 1203Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1204{
1205 return is_uni_print(c); /* XXX no locale support yet */
1206}
1207
b8c5462f 1208bool
84afefe6 1209Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1210{
1211 return is_uni_punct(c); /* XXX no locale support yet */
1212}
1213
4d61ec05 1214bool
84afefe6 1215Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1216{
1217 return is_uni_xdigit(c); /* XXX no locale support yet */
1218}
1219
b7ac61fa 1220U32
1221Perl_to_uni_upper_lc(pTHX_ U32 c)
1222{
ee099d14 1223 /* XXX returns only the first character -- do not use XXX */
1224 /* XXX no locale support yet */
1225 STRLEN len;
89ebb4a3 1226 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1227 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1228}
1229
1230U32
1231Perl_to_uni_title_lc(pTHX_ U32 c)
1232{
ee099d14 1233 /* XXX returns only the first character XXX -- do not use XXX */
1234 /* XXX no locale support yet */
1235 STRLEN len;
89ebb4a3 1236 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1237 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1238}
1239
1240U32
1241Perl_to_uni_lower_lc(pTHX_ U32 c)
1242{
ee099d14 1243 /* XXX returns only the first character -- do not use XXX */
1244 /* XXX no locale support yet */
1245 STRLEN len;
89ebb4a3 1246 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1247 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1248}
1249
a0ed51b3 1250bool
7fc63493 1251Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1252{
386d01d6 1253 if (!is_utf8_char(p))
1254 return FALSE;
a0ed51b3 1255 if (!PL_utf8_alnum)
289d4f09 1256 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1257 * descendant of isalnum(3), in other words, it doesn't
1258 * contain the '_'. --jhi */
1259 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1260 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1261/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1262#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1263 if (!PL_utf8_alnum)
1264 PL_utf8_alnum = swash_init("utf8", "",
1265 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1266 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1267#endif
1268}
1269
1270bool
7fc63493 1271Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1272{
386d01d6 1273 if (!is_utf8_char(p))
1274 return FALSE;
b8c5462f 1275 if (!PL_utf8_alnum)
1276 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1277 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1278/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1279#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1280 if (!PL_utf8_alnum)
1281 PL_utf8_alnum = swash_init("utf8", "",
1282 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1283 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1284#endif
1285}
1286
1287bool
7fc63493 1288Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1289{
82686b01 1290 if (*p == '_')
1291 return TRUE;
1292 if (!is_utf8_char(p))
1293 return FALSE;
1294 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1295 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1296 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01 1297}
1298
1299bool
7fc63493 1300Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1301{
1302 if (*p == '_')
1303 return TRUE;
1304 if (!is_utf8_char(p))
1305 return FALSE;
1306 if (!PL_utf8_idcont)
1307 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1308 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3 1309}
1310
1311bool
7fc63493 1312Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1313{
386d01d6 1314 if (!is_utf8_char(p))
1315 return FALSE;
a0ed51b3 1316 if (!PL_utf8_alpha)
e24b16f9 1317 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1318 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3 1319}
1320
1321bool
7fc63493 1322Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1323{
386d01d6 1324 if (!is_utf8_char(p))
1325 return FALSE;
b8c5462f 1326 if (!PL_utf8_ascii)
1327 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1328 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f 1329}
1330
1331bool
7fc63493 1332Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1333{
386d01d6 1334 if (!is_utf8_char(p))
1335 return FALSE;
a0ed51b3 1336 if (!PL_utf8_space)
3bec3564 1337 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1338 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3 1339}
1340
1341bool
7fc63493 1342Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1343{
386d01d6 1344 if (!is_utf8_char(p))
1345 return FALSE;
a0ed51b3 1346 if (!PL_utf8_digit)
e24b16f9 1347 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1348 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3 1349}
1350
1351bool
7fc63493 1352Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1353{
386d01d6 1354 if (!is_utf8_char(p))
1355 return FALSE;
a0ed51b3 1356 if (!PL_utf8_upper)
c65e4d19 1357 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1358 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3 1359}
1360
1361bool
7fc63493 1362Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1363{
386d01d6 1364 if (!is_utf8_char(p))
1365 return FALSE;
a0ed51b3 1366 if (!PL_utf8_lower)
c65e4d19 1367 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1368 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3 1369}
1370
1371bool
7fc63493 1372Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1373{
386d01d6 1374 if (!is_utf8_char(p))
1375 return FALSE;
b8c5462f 1376 if (!PL_utf8_cntrl)
1377 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1378 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f 1379}
1380
1381bool
7fc63493 1382Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1383{
386d01d6 1384 if (!is_utf8_char(p))
1385 return FALSE;
b8c5462f 1386 if (!PL_utf8_graph)
1387 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1388 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f 1389}
1390
1391bool
7fc63493 1392Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1393{
386d01d6 1394 if (!is_utf8_char(p))
1395 return FALSE;
a0ed51b3 1396 if (!PL_utf8_print)
e24b16f9 1397 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1398 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3 1399}
1400
1401bool
7fc63493 1402Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1403{
386d01d6 1404 if (!is_utf8_char(p))
1405 return FALSE;
b8c5462f 1406 if (!PL_utf8_punct)
1407 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1408 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f 1409}
1410
1411bool
7fc63493 1412Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1413{
386d01d6 1414 if (!is_utf8_char(p))
1415 return FALSE;
b8c5462f 1416 if (!PL_utf8_xdigit)
1417 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1418 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f 1419}
1420
1421bool
7fc63493 1422Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1423{
386d01d6 1424 if (!is_utf8_char(p))
1425 return FALSE;
a0ed51b3 1426 if (!PL_utf8_mark)
e24b16f9 1427 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1428 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3 1429}
1430
6b5c0936 1431/*
1432=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1433
1434The "p" contains the pointer to the UTF-8 string encoding
1435the character that is being converted.
1436
1437The "ustrp" is a pointer to the character buffer to put the
1438conversion result to. The "lenp" is a pointer to the length
1439of the result.
1440
0134edef 1441The "swashp" is a pointer to the swash to use.
6b5c0936 1442
0134edef 1443Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1444and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1445but not always, a multicharacter mapping), is tried first.
6b5c0936 1446
0134edef 1447The "special" is a string like "utf8::ToSpecLower", which means the
1448hash %utf8::ToSpecLower. The access to the hash is through
1449Perl_to_utf8_case().
6b5c0936 1450
0134edef 1451The "normal" is a string like "ToLower" which means the swash
1452%utf8::ToLower.
1453
1454=cut */
6b5c0936 1455
2104c8d9 1456UV
9a957fbc 1457Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1458 SV **swashp, const char *normal, const char *special)
a0ed51b3 1459{
89ebb4a3 1460 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1461 STRLEN len = 0;
a0ed51b3 1462
aec46f14 1463 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7 1464 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1465 * are necessary in EBCDIC, they are redundant no-ops
1466 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1467 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1468 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1469
1470 if (!*swashp) /* load on-demand */
1471 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1472
b08cf34e 1473 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1474 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1475 /* It might be "special" (sometimes, but not always,
2a37f04d 1476 * a multicharacter mapping) */
983ffd37 1477 HV *hv;
b08cf34e 1478 SV **svp;
1479
1480 if ((hv = get_hv(special, FALSE)) &&
1481 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1482 (*svp)) {
cfd0369c 1483 const char *s;
47654450 1484
cfd0369c 1485 s = SvPV_const(*svp, len);
47654450 1486 if (len == 1)
1487 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1488 else {
2f9475ad 1489#ifdef EBCDIC
1490 /* If we have EBCDIC we need to remap the characters
1491 * since any characters in the low 256 are Unicode
1492 * code points, not EBCDIC. */
7cda7a3d 1493 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1494
1495 d = tmpbuf;
b08cf34e 1496 if (SvUTF8(*svp)) {
2f9475ad 1497 STRLEN tlen = 0;
1498
1499 while (t < tend) {
1500 UV c = utf8_to_uvchr(t, &tlen);
1501 if (tlen > 0) {
1502 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1503 t += tlen;
1504 }
1505 else
1506 break;
1507 }
1508 }
1509 else {
36fec512 1510 while (t < tend) {
1511 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1512 t++;
1513 }
2f9475ad 1514 }
1515 len = d - tmpbuf;
1516 Copy(tmpbuf, ustrp, len, U8);
1517#else
d2dcd0fb 1518 Copy(s, ustrp, len, U8);
2f9475ad 1519#endif
29e98929 1520 }
983ffd37 1521 }
0134edef 1522 }
1523
1524 if (!len && *swashp) {
1525 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1526
1527 if (uv2) {
1528 /* It was "normal" (a single character mapping). */
1529 UV uv3 = UNI_TO_NATIVE(uv2);
1530
e9101d72 1531 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1532 }
1533 }
1feea2c7 1534
0134edef 1535 if (!len) /* Neither: just copy. */
1536 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1537
2a37f04d 1538 if (lenp)
1539 *lenp = len;
1540
0134edef 1541 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1542}
1543
d3e79532 1544/*
7fc63493 1545=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1546
1547Convert the UTF-8 encoded character at p to its uppercase version and
1548store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1549that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1550the uppercase version may be longer than the original character.
d3e79532 1551
1552The first character of the uppercased version is returned
1553(but note, as explained above, that there may be more.)
1554
1555=cut */
1556
2104c8d9 1557UV
7fc63493 1558Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1559{
983ffd37 1560 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1561 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1562}
a0ed51b3 1563
d3e79532 1564/*
7fc63493 1565=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1566
1567Convert the UTF-8 encoded character at p to its titlecase version and
1568store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1569that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1570titlecase version may be longer than the original character.
d3e79532 1571
1572The first character of the titlecased version is returned
1573(but note, as explained above, that there may be more.)
1574
1575=cut */
1576
983ffd37 1577UV
7fc63493 1578Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1579{
1580 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1581 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1582}
1583
d3e79532 1584/*
7fc63493 1585=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1586
1587Convert the UTF-8 encoded character at p to its lowercase version and
1588store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1589that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1590lowercase version may be longer than the original character.
d3e79532 1591
1592The first character of the lowercased version is returned
1593(but note, as explained above, that there may be more.)
1594
1595=cut */
1596
2104c8d9 1597UV
7fc63493 1598Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1599{
983ffd37 1600 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1601 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1602}
1603
d3e79532 1604/*
7fc63493 1605=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1606
1607Convert the UTF-8 encoded character at p to its foldcase version and
1608store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1609that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532 1610foldcase version may be longer than the original character (up to
1611three characters).
1612
1613The first character of the foldcased version is returned
1614(but note, as explained above, that there may be more.)
1615
1616=cut */
1617
b4e400f9 1618UV
7fc63493 1619Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1620{
1621 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1622 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1623}
1624
1625/* a "swash" is a swatch hash */
1626
1627SV*
7fc63493 1628Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1629{
27da23d5 1630 dVAR;
a0ed51b3 1631 SV* retval;
9a957fbc 1632 SV* const tokenbufsv = sv_newmortal();
8e84507e 1633 dSP;
7fc63493 1634 const size_t pkg_len = strlen(pkg);
1635 const size_t name_len = strlen(name);
aec46f14 1636 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1637 SV* errsv_save;
ce3b816e 1638
96ca9f55 1639 PUSHSTACKi(PERLSI_MAGIC);
1640 ENTER;
1641 SAVEI32(PL_hints);
1642 PL_hints = 0;
1643 save_re_context();
1b026014 1644 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1645 ENTER;
f8be5cf0 1646 errsv_save = newSVsv(ERRSV);
71bed85a 1647 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1648 Nullsv);
f8be5cf0 1649 if (!SvTRUE(ERRSV))
1650 sv_setsv(ERRSV, errsv_save);
1651 SvREFCNT_dec(errsv_save);
ce3b816e 1652 LEAVE;
1653 }
1654 SPAGAIN;
a0ed51b3 1655 PUSHMARK(SP);
1656 EXTEND(SP,5);
71bed85a 1657 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1658 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3 1659 PUSHs(listsv);
1660 PUSHs(sv_2mortal(newSViv(minbits)));
1661 PUSHs(sv_2mortal(newSViv(none)));
1662 PUTBACK;
923e4eb5 1663 if (IN_PERL_COMPILETIME) {
bf1fed83 1664 /* XXX ought to be handled by lex_start */
82686b01 1665 SAVEI32(PL_in_my);
2b4bd638 1666 PL_in_my = 0;
bf1fed83 1667 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1668 }
f8be5cf0 1669 errsv_save = newSVsv(ERRSV);
864dbfa3 1670 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1671 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1672 else
e24b16f9 1673 retval = &PL_sv_undef;
f8be5cf0 1674 if (!SvTRUE(ERRSV))
1675 sv_setsv(ERRSV, errsv_save);
1676 SvREFCNT_dec(errsv_save);
a0ed51b3 1677 LEAVE;
1678 POPSTACK;
923e4eb5 1679 if (IN_PERL_COMPILETIME) {
bf1fed83 1680 STRLEN len;
aec46f14 1681 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83 1682
1683 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1684 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1685 }
bc45ce41 1686 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1687 if (SvPOK(retval))
35c1215d 1688 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1689 retval);
cea2e8a9 1690 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1691 }
a0ed51b3 1692 return retval;
1693}
1694
035d37be 1695
1696/* This API is wrong for special case conversions since we may need to
1697 * return several Unicode characters for a single Unicode character
1698 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1699 * the lower-level routine, and it is similarly broken for returning
1700 * multiple values. --jhi */
a0ed51b3 1701UV
7fc63493 1702Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1703{
27da23d5 1704 dVAR;
aec46f14 1705 HV* const hv = (HV*)SvRV(sv);
3568d838 1706 U32 klen;
1707 U32 off;
a0ed51b3 1708 STRLEN slen;
7d85a32c 1709 STRLEN needents;
cfd0369c 1710 const U8 *tmps = NULL;
a0ed51b3 1711 U32 bit;
1712 SV *retval;
3568d838 1713 U8 tmputf8[2];
1714 UV c = NATIVE_TO_ASCII(*ptr);
1715
1716 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463 1717 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1718 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838 1719 ptr = tmputf8;
1720 }
1721 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1722 * then the "swatch" is a vec() for al the chars which start
1723 * with 0xAA..0xYY
1724 * So the key in the hash (klen) is length of encoded char -1
1725 */
1726 klen = UTF8SKIP(ptr) - 1;
1727 off = ptr[klen];
a0ed51b3 1728
7d85a32c 1729 if (klen == 0)
1730 {
1731 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1732 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1733 */
1734 needents = UTF_CONTINUATION_MARK;
1735 off = NATIVE_TO_UTF(ptr[klen]);
1736 }
1737 else
1738 {
1739 /* If char is encoded then swatch is for the prefix */
1740 needents = (1 << UTF_ACCUMULATION_SHIFT);
1741 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1742 }
1743
a0ed51b3 1744 /*
1745 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1746 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1747 * it's nothing to sniff at.) Pity we usually come through at least
1748 * two function calls to get here...
1749 *
1750 * NB: this code assumes that swatches are never modified, once generated!
1751 */
1752
3568d838 1753 if (hv == PL_last_swash_hv &&
a0ed51b3 1754 klen == PL_last_swash_klen &&
27da23d5 1755 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1756 {
1757 tmps = PL_last_swash_tmps;
1758 slen = PL_last_swash_slen;
1759 }
1760 else {
1761 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1762 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1763
1764 /* If not cached, generate it via utf8::SWASHGET */
cfd0369c 1765 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
a0ed51b3 1766 dSP;
2b9d42f0 1767 /* We use utf8n_to_uvuni() as we want an index into
1768 Unicode tables, not a native character number.
1769 */
aec46f14 1770 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae 1771 ckWARN(WARN_UTF8) ?
1772 0 : UTF8_ALLOW_ANY);
f8be5cf0 1773 SV *errsv_save;
a0ed51b3 1774 ENTER;
1775 SAVETMPS;
1776 save_re_context();
1777 PUSHSTACKi(PERLSI_MAGIC);
1778 PUSHMARK(SP);
1779 EXTEND(SP,3);
1780 PUSHs((SV*)sv);
ffbc6a93 1781 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1782 PUSHs(sv_2mortal(newSViv((klen) ?
1783 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1784 PUSHs(sv_2mortal(newSViv(needents)));
1785 PUTBACK;
f8be5cf0 1786 errsv_save = newSVsv(ERRSV);
864dbfa3 1787 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1788 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1789 else
e24b16f9 1790 retval = &PL_sv_undef;
f8be5cf0 1791 if (!SvTRUE(ERRSV))
1792 sv_setsv(ERRSV, errsv_save);
1793 SvREFCNT_dec(errsv_save);
a0ed51b3 1794 POPSTACK;
1795 FREETMPS;
1796 LEAVE;
923e4eb5 1797 if (IN_PERL_COMPILETIME)
eb160463 1798 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1799
e1ec3a88 1800 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1801
7d85a32c 1802 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1803 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1804 }
1805
1806 PL_last_swash_hv = hv;
1807 PL_last_swash_klen = klen;
cfd0369c 1808 /* FIXME change interpvar.h? */
1809 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3 1810 PL_last_swash_slen = slen;
1811 if (klen)
1812 Copy(ptr, PL_last_swash_key, klen, U8);
1813 }
1814
9faf8d75 1815 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1816 case 1:
1817 bit = 1 << (off & 7);
1818 off >>= 3;
1819 return (tmps[off] & bit) != 0;
1820 case 8:
1821 return tmps[off];
1822 case 16:
1823 off <<= 1;
1824 return (tmps[off] << 8) + tmps[off + 1] ;
1825 case 32:
1826 off <<= 2;
1827 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1828 }
cea2e8a9 1829 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1830 return 0;
1831}
2b9d42f0 1832
1833
1834/*
37607a96 1835=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1836
1e54db1a 1837Adds the UTF-8 representation of the Native codepoint C<uv> to the end
89ebb4a3 1838of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2b9d42f0 1839bytes available. The return value is the pointer to the byte after the
1840end of the new character. In other words,
1841
1842 d = uvchr_to_utf8(d, uv);
1843
1844is the recommended wide native character-aware way of saying
1845
1846 *(d++) = uv;
1847
1848=cut
1849*/
1850
1851/* On ASCII machines this is normally a macro but we want a
1852 real function in case XS code wants it
1853*/
1854#undef Perl_uvchr_to_utf8
1855U8 *
1856Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1857{
b851fbc1 1858 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0 1859}
1860
b851fbc1 1861U8 *
1862Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1863{
1864 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1865}
2b9d42f0 1866
1867/*
37607a96 1868=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1869
1870Returns the native character value of the first character in the string C<s>
1e54db1a 1871which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0 1872length, in bytes, of that character.
1873
1874Allows length and flags to be passed to low level routine.
1875
1876=cut
1877*/
0a2ef054 1878/* On ASCII machines this is normally a macro but we want
1879 a real function in case XS code wants it
2b9d42f0 1880*/
1881#undef Perl_utf8n_to_uvchr
1882UV
7fc63493 1883Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1884{
aec46f14 1885 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2b9d42f0 1886 return UNI_TO_NATIVE(uv);
1887}
1888
d2cc3551 1889/*
1890=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1891
1892Build to the scalar dsv a displayable version of the string spv,
1893length len, the displayable version being at most pvlim bytes long
1894(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1895
9e55ce06 1896The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1897isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1898to display the \\[nrfta\\] as the backslashed versions (like '\n')
1899(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1900UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1901UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1902
d2cc3551 1903The pointer to the PV of the dsv is returned.
1904
1905=cut */
e6b2e755 1906char *
e1ec3a88 1907Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755 1908{
1909 int truncated = 0;
e1ec3a88 1910 const char *s, *e;
e6b2e755 1911
1912 sv_setpvn(dsv, "", 0);
e1ec3a88 1913 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1914 UV u;
a49f32c6 1915 /* This serves double duty as a flag and a character to print after
1916 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1917 */
1918 char ok = 0;
c728cb41 1919
e6b2e755 1920 if (pvlim && SvCUR(dsv) >= pvlim) {
1921 truncated++;
1922 break;
1923 }
1924 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1925 if (u < 256) {
a3b680e6 1926 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1927 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1928 switch (c) {
c728cb41 1929 case '\n':
a49f32c6 1930 ok = 'n'; break;
c728cb41 1931 case '\r':
a49f32c6 1932 ok = 'r'; break;
c728cb41 1933 case '\t':
a49f32c6 1934 ok = 't'; break;
c728cb41 1935 case '\f':
a49f32c6 1936 ok = 'f'; break;
c728cb41 1937 case '\a':
a49f32c6 1938 ok = 'a'; break;
c728cb41 1939 case '\\':
a49f32c6 1940 ok = '\\'; break;
c728cb41 1941 default: break;
1942 }
a49f32c6 1943 if (ok) {
1944 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1945 }
c728cb41 1946 }
00e86452 1947 /* isPRINT() is the locale-blind version. */
a49f32c6 1948 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1949 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1950 ok = 1;
0a2ef054 1951 }
c728cb41 1952 }
1953 if (!ok)
9e55ce06 1954 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1955 }
1956 if (truncated)
1957 sv_catpvn(dsv, "...", 3);
1958
1959 return SvPVX(dsv);
1960}
2b9d42f0 1961
d2cc3551 1962/*
1963=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1964
1965Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1966the displayable version being at most pvlim bytes long
d2cc3551 1967(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1968
1969The flags argument is as in pv_uni_display().
1970
d2cc3551 1971The pointer to the PV of the dsv is returned.
1972
1973=cut */
e6b2e755 1974char *
1975Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1976{
cfd0369c 1977 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1978 SvCUR(ssv), pvlim, flags);
701a277b 1979}
1980
d2cc3551 1981/*
d07ddd77 1982=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1983
1984Return true if the strings s1 and s2 differ case-insensitively, false
1985if not (if they are equal case-insensitively). If u1 is true, the
1986string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1987the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1988are false, the respective string is assumed to be in native 8-bit
1989encoding.
1990
1991If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1992in there (they will point at the beginning of the I<next> character).
1993If the pointers behind pe1 or pe2 are non-NULL, they are the end
1994pointers beyond which scanning will not continue under any
4cdaeff7 1995circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77 1996s2+l2 will be used as goal end pointers that will also stop the scan,
1997and which qualify towards defining a successful match: all the scans
1998that define an explicit length must reach their goal pointers for
1999a match to succeed).
d2cc3551 2000
2001For case-insensitiveness, the "casefolding" of Unicode is used
2002instead of upper/lowercasing both the characters, see
2003http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2004
2005=cut */
701a277b 2006I32
d07ddd77 2007Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2008{
e1ec3a88 2009 register const U8 *p1 = (const U8*)s1;
2010 register const U8 *p2 = (const U8*)s2;
2011 register const U8 *f1 = 0, *f2 = 0;
2012 register U8 *e1 = 0, *q1 = 0;
2013 register U8 *e2 = 0, *q2 = 0;
d07ddd77 2014 STRLEN n1 = 0, n2 = 0;
89ebb4a3 2015 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2016 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8 2017 U8 natbuf[1+1];
2018 STRLEN foldlen1, foldlen2;
d07ddd77 2019 bool match;
332ddc25 2020
d07ddd77 2021 if (pe1)
2022 e1 = *(U8**)pe1;
e1ec3a88 2023 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2024 f1 = (const U8*)s1 + l1;
d07ddd77 2025 if (pe2)
2026 e2 = *(U8**)pe2;
e1ec3a88 2027 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2028 f2 = (const U8*)s2 + l2;
d07ddd77 2029
2030 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2031 return 1; /* mismatch; possible infinite loop or false positive */
2032
a6872d42 2033 if (!u1 || !u2)
2034 natbuf[1] = 0; /* Need to terminate the buffer. */
2035
d07ddd77 2036 while ((e1 == 0 || p1 < e1) &&
2037 (f1 == 0 || p1 < f1) &&
2038 (e2 == 0 || p2 < e2) &&
2039 (f2 == 0 || p2 < f2)) {
2040 if (n1 == 0) {
d7f013c8 2041 if (u1)
2042 to_utf8_fold(p1, foldbuf1, &foldlen1);
2043 else {
809e8e66 2044 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8 2045 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2046 }
2047 q1 = foldbuf1;
d07ddd77 2048 n1 = foldlen1;
332ddc25 2049 }
d07ddd77 2050 if (n2 == 0) {
d7f013c8 2051 if (u2)
2052 to_utf8_fold(p2, foldbuf2, &foldlen2);
2053 else {
809e8e66 2054 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8 2055 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2056 }
2057 q2 = foldbuf2;
d07ddd77 2058 n2 = foldlen2;
332ddc25 2059 }
d07ddd77 2060 while (n1 && n2) {
2061 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2062 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2063 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2064 return 1; /* mismatch */
d07ddd77 2065 n1 -= UTF8SKIP(q1);
d7f013c8 2066 q1 += UTF8SKIP(q1);
d07ddd77 2067 n2 -= UTF8SKIP(q2);
d7f013c8 2068 q2 += UTF8SKIP(q2);
701a277b 2069 }
d07ddd77 2070 if (n1 == 0)
d7f013c8 2071 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2072 if (n2 == 0)
d7f013c8 2073 p2 += u2 ? UTF8SKIP(p2) : 1;
2074
d2cc3551 2075 }
5469e704 2076
d07ddd77 2077 /* A match is defined by all the scans that specified
2078 * an explicit length reaching their final goals. */
2079 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 2080
2081 if (match) {
d07ddd77 2082 if (pe1)
2083 *pe1 = (char*)p1;
2084 if (pe2)
2085 *pe2 = (char*)p2;
5469e704 2086 }
2087
2088 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2089}
701a277b 2090
a49f32c6 2091/*
2092 * Local variables:
2093 * c-indentation-style: bsd
2094 * c-basic-offset: 4
2095 * indent-tabs-mode: t
2096 * End:
2097 *
37442d52 2098 * ex: set ts=8 sts=4 sw=4 noet:
2099 */