Tru64: allow creating perl.pixie even with optimized Perl
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
af3babe4 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3 5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3 26#include "perl.h"
27
27da23d5 28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7 31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29 34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7 37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1 47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1 53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485 57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1 68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3 87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6 90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6 95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6 98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463 102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 104 return d;
105 }
106 if (uv < 0x10000) {
eb160463 107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 110 return d;
111 }
112 if (uv < 0x200000) {
eb160463 113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463 120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463 128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3 138#endif
139 {
eb160463 140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463 151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
b851fbc1 169
170U8 *
171Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172{
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174}
9041c2e3 175
646ca15d 176/*
177
178Tests if some arbitrary number of bytes begins in a valid UTF-8
179character. Note that an INVARIANT (i.e. ASCII) character is a valid
180UTF-8 character. The actual number of bytes in the UTF-8 character
181will be returned if it is valid, otherwise 0.
182
183This is the "slow" version as opposed to the "fast" version which is
184the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
185difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
186or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187you should use the _slow(). In practice this means that the _slow()
188will be used very rarely, since the maximum Unicode code point (as of
189Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
190the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
191five bytes or more.
192
193=cut */
194STRLEN
195S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
196{
197 U8 u = *s;
198 STRLEN slen;
199 UV uv, ouv;
200
201 if (UTF8_IS_INVARIANT(u))
202 return 1;
203
204 if (!UTF8_IS_START(u))
205 return 0;
206
207 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
208 return 0;
209
210 slen = len - 1;
211 s++;
212 u &= UTF_START_MASK(len);
213 uv = u;
214 ouv = uv;
215 while (slen--) {
216 if (!UTF8_IS_CONTINUATION(*s))
217 return 0;
218 uv = UTF8_ACCUMULATE(uv, *s);
219 if (uv < ouv)
220 return 0;
221 ouv = uv;
222 s++;
223 }
224
225 if ((STRLEN)UNISKIP(uv) < len)
226 return 0;
227
228 return len;
229}
9041c2e3 230
231/*
7fc63493 232=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 233
5da9da9e 234Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01 235character. Note that an INVARIANT (i.e. ASCII) character is a valid
236UTF-8 character. The actual number of bytes in the UTF-8 character
237will be returned if it is valid, otherwise 0.
9041c2e3 238
82686b01 239=cut */
067a85ef 240STRLEN
7fc63493 241Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 242{
3b0fc154 243 STRLEN len;
244#ifdef IS_UTF8_CHAR
9f07fdcd 245 len = UTF8SKIP(s);
3b0fc154 246 if (len <= 4)
247 return IS_UTF8_CHAR(s, len) ? len : 0;
248#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 249 return is_utf8_char_slow(s, len);
386d01d6 250}
251
6662521e 252/*
7fc63493 253=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 254
c9ada85f 255Returns true if first C<len> bytes of the given string form a valid
1e54db1a 256UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
257not mean 'a string that contains code points above 0x7F encoded in UTF-8'
258because a valid ASCII string is a valid UTF-8 string.
6662521e 259
260=cut
261*/
262
8e84507e 263bool
7fc63493 264Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 265{
7fc63493 266 const U8* x = s;
267 const U8* send;
067a85ef 268
61468b03 269 if (!len && s)
e1ec3a88 270 len = strlen((const char *)s);
1aa99e6b 271 send = s + len;
272
6662521e 273 while (x < send) {
a3b680e6 274 STRLEN c;
1acdb0da 275 /* Inline the easy bits of is_utf8_char() here for speed... */
276 if (UTF8_IS_INVARIANT(*x))
277 c = 1;
278 else if (!UTF8_IS_START(*x))
279 return FALSE;
280 else {
281 /* ... and call is_utf8_char() only if really needed. */
646ca15d 282#ifdef IS_UTF8_CHAR
283 c = UTF8SKIP(x);
284 if (c <= 4) {
285 if (!IS_UTF8_CHAR(x, c))
286 return FALSE;
287 } else {
2c0c5f92 288 if (!is_utf8_char_slow(x, c))
646ca15d 289 return FALSE;
290 }
291#else
292 c = is_utf8_char(x);
293#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 294 if (!c)
295 return FALSE;
296 }
6662521e 297 x += c;
6662521e 298 }
60006e79 299 if (x != send)
300 return FALSE;
067a85ef 301
302 return TRUE;
6662521e 303}
304
67e989fb 305/*
7fc63493 306=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **p
81cd54e3 307
308Like is_ut8_string but store the location of the failure in
309the last argument.
310
311=cut
312*/
313
314bool
7fc63493 315Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p)
81cd54e3 316{
7fc63493 317 const U8* x = s;
318 const U8* send;
81cd54e3 319 STRLEN c;
320
61468b03 321 if (!len && s)
e1ec3a88 322 len = strlen((const char *)s);
81cd54e3 323 send = s + len;
324
325 while (x < send) {
326 /* Inline the easy bits of is_utf8_char() here for speed... */
327 if (UTF8_IS_INVARIANT(*x))
328 c = 1;
329 else if (!UTF8_IS_START(*x)) {
330 if (p)
331 *p = x;
332 return FALSE;
333 }
334 else {
335 /* ... and call is_utf8_char() only if really needed. */
336 c = is_utf8_char(x);
337 if (!c) {
338 if (p)
339 *p = x;
340 return FALSE;
341 }
342 }
343 x += c;
344 }
345 if (x != send) {
346 if (p)
347 *p = x;
348 return FALSE;
349 }
350
351 return TRUE;
352}
353
354/*
7fc63493 355=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 356
9041c2e3 357Bottom level UTF-8 decode routine.
358Returns the unicode code point value of the first character in the string C<s>
1e54db1a 359which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 360C<retlen> will be set to the length, in bytes, of that character.
67e989fb 361
1e54db1a 362If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880 363is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
364it is assumed that the caller will raise a warning, and this function
28d3d195 365will silently just set C<retlen> to C<-1> and return zero. If the
366C<flags> does not contain UTF8_CHECK_ONLY, warnings about
367malformations will be given, C<retlen> will be set to the expected
368length of the UTF-8 character in bytes, and zero will be returned.
369
370The C<flags> can also contain various flags to allow deviations from
371the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 372
9041c2e3 373Most code should use utf8_to_uvchr() rather than call this directly.
374
37607a96 375=cut
376*/
67e989fb 377
a0ed51b3 378UV
7fc63493 379Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 380{
7fc63493 381 const U8 *s0 = s;
9c5ffd7c 382 UV uv = *s, ouv = 0;
ba210ebe 383 STRLEN len = 1;
7fc63493 384 const bool dowarn = ckWARN_d(WARN_UTF8);
385 const UV startbyte = *s;
ba210ebe 386 STRLEN expectlen = 0;
a0dbb045 387 U32 warning = 0;
388
389/* This list is a superset of the UTF8_ALLOW_XXX. */
390
391#define UTF8_WARN_EMPTY 1
392#define UTF8_WARN_CONTINUATION 2
393#define UTF8_WARN_NON_CONTINUATION 3
394#define UTF8_WARN_FE_FF 4
395#define UTF8_WARN_SHORT 5
396#define UTF8_WARN_OVERFLOW 6
397#define UTF8_WARN_SURROGATE 7
c867b360 398#define UTF8_WARN_LONG 8
399#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 400
401 if (curlen == 0 &&
402 !(flags & UTF8_ALLOW_EMPTY)) {
403 warning = UTF8_WARN_EMPTY;
0c443dc2 404 goto malformed;
405 }
406
1d72bdf6 407 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 408 if (retlen)
409 *retlen = 1;
c4d5f83a 410 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 411 }
67e989fb 412
421a8bf2 413 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 414 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 415 warning = UTF8_WARN_CONTINUATION;
ba210ebe 416 goto malformed;
417 }
418
421a8bf2 419 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 420 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 421 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 422 goto malformed;
423 }
9041c2e3 424
1d72bdf6 425#ifdef EBCDIC
75383841 426 uv = NATIVE_TO_UTF(uv);
1d72bdf6 427#else
fcc8fcf6 428 if ((uv == 0xfe || uv == 0xff) &&
429 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 430 warning = UTF8_WARN_FE_FF;
ba210ebe 431 goto malformed;
a0ed51b3 432 }
1d72bdf6 433#endif
434
ba210ebe 435 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
436 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
437 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
438 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 439#ifdef EBCDIC
440 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
441 else { len = 7; uv &= 0x01; }
442#else
ba210ebe 443 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
444 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 445 else { len = 13; uv = 0; } /* whoa! */
446#endif
447
a0ed51b3 448 if (retlen)
449 *retlen = len;
9041c2e3 450
ba210ebe 451 expectlen = len;
452
fcc8fcf6 453 if ((curlen < expectlen) &&
454 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 455 warning = UTF8_WARN_SHORT;
ba210ebe 456 goto malformed;
457 }
458
459 len--;
a0ed51b3 460 s++;
ba210ebe 461 ouv = uv;
462
a0ed51b3 463 while (len--) {
421a8bf2 464 if (!UTF8_IS_CONTINUATION(*s) &&
465 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 466 s--;
467 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 468 goto malformed;
a0ed51b3 469 }
470 else
8850bf83 471 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 472 if (!(uv > ouv)) {
473 /* These cannot be allowed. */
474 if (uv == ouv) {
75dbc644 475 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 476 warning = UTF8_WARN_LONG;
477 goto malformed;
478 }
479 }
480 else { /* uv < ouv */
481 /* This cannot be allowed. */
482 warning = UTF8_WARN_OVERFLOW;
483 goto malformed;
484 }
ba210ebe 485 }
486 s++;
487 ouv = uv;
488 }
489
421a8bf2 490 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 491 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 492 warning = UTF8_WARN_SURROGATE;
ba210ebe 493 goto malformed;
eb160463 494 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 495 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 496 warning = UTF8_WARN_LONG;
ba210ebe 497 goto malformed;
421a8bf2 498 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 499 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 500 warning = UTF8_WARN_FFFF;
a9917092 501 goto malformed;
a0ed51b3 502 }
ba210ebe 503
a0ed51b3 504 return uv;
ba210ebe 505
506malformed:
507
fcc8fcf6 508 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 509 if (retlen)
cc366d4b 510 *retlen = -1;
ba210ebe 511 return 0;
512 }
513
a0dbb045 514 if (dowarn) {
515 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
516
517 switch (warning) {
518 case 0: /* Intentionally empty. */ break;
519 case UTF8_WARN_EMPTY:
54667de8 520 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045 521 break;
522 case UTF8_WARN_CONTINUATION:
097fb8e2 523 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 524 break;
525 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 526 if (s == s0)
527 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
528 (UV)s[1], startbyte);
529 else
530 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
5d7488b2 531 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, (int)expectlen);
097fb8e2 532
a0dbb045 533 break;
534 case UTF8_WARN_FE_FF:
535 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
536 break;
537 case UTF8_WARN_SHORT:
097fb8e2 538 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 539 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 540 expectlen = curlen; /* distance for caller to skip */
a0dbb045 541 break;
542 case UTF8_WARN_OVERFLOW:
097fb8e2 543 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
544 ouv, *s, startbyte);
a0dbb045 545 break;
546 case UTF8_WARN_SURROGATE:
547 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
548 break;
a0dbb045 549 case UTF8_WARN_LONG:
097fb8e2 550 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 551 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 552 break;
553 case UTF8_WARN_FFFF:
554 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
555 break;
556 default:
54667de8 557 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045 558 break;
559 }
560
561 if (warning) {
504618e9 562 const char *s = SvPVX_const(sv);
a0dbb045 563
564 if (PL_op)
9014280d 565 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 566 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 567 else
9014280d 568 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 569 }
570 }
571
ba210ebe 572 if (retlen)
28d3d195 573 *retlen = expectlen ? expectlen : len;
ba210ebe 574
28d3d195 575 return 0;
a0ed51b3 576}
577
8e84507e 578/*
7fc63493 579=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3 580
581Returns the native character value of the first character in the string C<s>
1e54db1a 582which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 583length, in bytes, of that character.
584
1e54db1a 585If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3 586returned and retlen is set, if possible, to -1.
587
588=cut
589*/
590
591UV
7fc63493 592Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 593{
89ebb4a3 594 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 595 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 596}
597
598/*
7fc63493 599=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3 600
601Returns the Unicode code point of the first character in the string C<s>
1e54db1a 602which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 603length, in bytes, of that character.
604
605This function should only be used when returned UV is considered
606an index into the Unicode semantic tables (e.g. swashes).
607
1e54db1a 608If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 609returned and retlen is set, if possible, to -1.
8e84507e 610
611=cut
612*/
613
614UV
7fc63493 615Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 616{
9041c2e3 617 /* Call the low level routine asking for checks */
89ebb4a3 618 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 619 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 620}
621
b76347f2 622/*
35a4481c 623=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2 624
625Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 626Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
627up past C<e>, croaks.
b76347f2 628
629=cut
630*/
631
632STRLEN
35a4481c 633Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 634{
635 STRLEN len = 0;
636
8850bf83 637 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
638 * the bitops (especially ~) can create illegal UTF-8.
639 * In other words: in Perl UTF-8 is not just for Unicode. */
640
a3b680e6 641 if (e < s)
642 goto warn_and_return;
b76347f2 643 while (s < e) {
4373e329 644 const U8 t = UTF8SKIP(s);
901b21bf 645 if (e - s < t) {
a3b680e6 646 warn_and_return:
901b21bf 647 if (ckWARN_d(WARN_UTF8)) {
648 if (PL_op)
649 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 650 "%s in %s", unees, OP_DESC(PL_op));
901b21bf 651 else
652 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
653 }
654 return len;
655 }
b76347f2 656 s += t;
657 len++;
658 }
659
660 return len;
661}
662
b06226ff 663/*
35a4481c 664=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 665
1e54db1a 666Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff 667and C<b>.
668
669WARNING: use only if you *know* that the pointers point inside the
670same UTF-8 buffer.
671
37607a96 672=cut
673*/
a0ed51b3 674
02eb7b47 675IV
35a4481c 676Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 677{
02eb7b47 678 IV off = 0;
679
8850bf83 680 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
681 * the bitops (especially ~) can create illegal UTF-8.
682 * In other words: in Perl UTF-8 is not just for Unicode. */
683
a0ed51b3 684 if (a < b) {
685 while (a < b) {
35a4481c 686 const U8 c = UTF8SKIP(a);
a3b680e6 687 if (b - a < c)
688 goto warn_and_return;
02eb7b47 689 a += c;
a0ed51b3 690 off--;
691 }
692 }
693 else {
694 while (b < a) {
4373e329 695 const U8 c = UTF8SKIP(b);
02eb7b47 696
901b21bf 697 if (a - b < c) {
a3b680e6 698 warn_and_return:
901b21bf 699 if (ckWARN_d(WARN_UTF8)) {
700 if (PL_op)
701 Perl_warner(aTHX_ packWARN(WARN_UTF8),
702 "%s in %s", unees, OP_DESC(PL_op));
703 else
704 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
705 }
706 return off;
707 }
02eb7b47 708 b += c;
a0ed51b3 709 off++;
710 }
711 }
02eb7b47 712
a0ed51b3 713 return off;
714}
715
b06226ff 716/*
37607a96 717=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 718
8850bf83 719Return the UTF-8 pointer C<s> displaced by C<off> characters, either
720forward or backward.
b06226ff 721
722WARNING: do not use the following unless you *know* C<off> is within
8850bf83 723the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
724on the first byte of character or just after the last byte of a character.
b06226ff 725
37607a96 726=cut
727*/
a0ed51b3 728
729U8 *
4373e329 730Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 731{
8850bf83 732 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
733 * the bitops (especially ~) can create illegal UTF-8.
734 * In other words: in Perl UTF-8 is not just for Unicode. */
735
a0ed51b3 736 if (off >= 0) {
737 while (off--)
738 s += UTF8SKIP(s);
739 }
740 else {
741 while (off++) {
742 s--;
8850bf83 743 while (UTF8_IS_CONTINUATION(*s))
744 s--;
a0ed51b3 745 }
746 }
4373e329 747 return (U8 *)s;
a0ed51b3 748}
749
6940069f 750/*
eebe1485 751=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 752
1e54db1a 753Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53 754Unlike C<bytes_to_utf8>, this over-writes the original string, and
755updates len to contain the new length.
67e989fb 756Returns zero on failure, setting C<len> to -1.
6940069f 757
758=cut
759*/
760
761U8 *
37607a96 762Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 763{
6940069f 764 U8 *send;
765 U8 *d;
dcad2880 766 U8 *save = s;
246fae53 767
1e54db1a 768 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880 769 for (send = s + *len; s < send; ) {
770 U8 c = *s++;
771
1d72bdf6 772 if (!UTF8_IS_INVARIANT(c) &&
773 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
774 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 775 *len = -1;
776 return 0;
777 }
246fae53 778 }
dcad2880 779
780 d = s = save;
6940069f 781 while (s < send) {
ed646e6e 782 STRLEN ulen;
9041c2e3 783 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 784 s += ulen;
6940069f 785 }
786 *d = '\0';
246fae53 787 *len = d - save;
6940069f 788 return save;
789}
790
791/*
e1ec3a88 792=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 793
1e54db1a 794Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 795Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 796the newly-created string, and updates C<len> to contain the new
797length. Returns the original string if no conversion occurs, C<len>
798is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7990 if C<s> is converted or contains all 7bit characters.
f9a63242 800
37607a96 801=cut
802*/
f9a63242 803
804U8 *
e1ec3a88 805Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 806{
f9a63242 807 U8 *d;
e1ec3a88 808 const U8 *start = s;
809 const U8 *send;
f9a63242 810 I32 count = 0;
811
812 if (!*is_utf8)
73d840c0 813 return (U8 *)start;
f9a63242 814
1e54db1a 815 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 816 for (send = s + *len; s < send;) {
e1ec3a88 817 U8 c = *s++;
1d72bdf6 818 if (!UTF8_IS_INVARIANT(c)) {
db42d148 819 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
820 (c = *s++) && UTF8_IS_CONTINUATION(c))
821 count++;
822 else
73d840c0 823 return (U8 *)start;
db42d148 824 }
f9a63242 825 }
826
827 *is_utf8 = 0;
828
f9a63242 829 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 830 s = start; start = d;
f9a63242 831 while (s < send) {
832 U8 c = *s++;
c4d5f83a 833 if (!UTF8_IS_INVARIANT(c)) {
834 /* Then it is two-byte encoded */
835 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
836 c = ASCII_TO_NATIVE(c);
837 }
838 *d++ = c;
f9a63242 839 }
840 *d = '\0';
841 *len = d - start;
73d840c0 842 return (U8 *)start;
f9a63242 843}
844
845/*
35a4481c 846=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 847
1e54db1a 848Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e 849Returns a pointer to the newly-created string, and sets C<len> to
850reflect the new length.
6940069f 851
1e54db1a 852If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f 853see sv_recode_to_utf8().
854
497711e7 855=cut
6940069f 856*/
857
858U8*
35a4481c 859Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 860{
35a4481c 861 const U8 * const send = s + (*len);
6940069f 862 U8 *d;
863 U8 *dst;
6940069f 864
6662521e 865 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 866 dst = d;
867
868 while (s < send) {
35a4481c 869 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 870 if (UNI_IS_INVARIANT(uv))
eb160463 871 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 872 else {
eb160463 873 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
874 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f 875 }
876 }
877 *d = '\0';
6662521e 878 *len = d-dst;
6940069f 879 return dst;
880}
881
a0ed51b3 882/*
dea0fc0b 883 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 884 *
885 * Destination must be pre-extended to 3/2 source. Do not use in-place.
886 * We optimize for native, for obvious reasons. */
887
888U8*
dea0fc0b 889Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 890{
dea0fc0b 891 U8* pend;
892 U8* dstart = d;
893
1de9afcd 894 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
895 d[0] = 0;
896 *newlen = 1;
897 return d;
898 }
899
dea0fc0b 900 if (bytelen & 1)
014ead4b 901 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b 902
903 pend = p + bytelen;
904
a0ed51b3 905 while (p < pend) {
dea0fc0b 906 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
907 p += 2;
a0ed51b3 908 if (uv < 0x80) {
eb160463 909 *d++ = (U8)uv;
a0ed51b3 910 continue;
911 }
912 if (uv < 0x800) {
eb160463 913 *d++ = (U8)(( uv >> 6) | 0xc0);
914 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 915 continue;
916 }
917 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e 918 UV low = (p[0] << 8) + p[1];
919 p += 2;
dea0fc0b 920 if (low < 0xdc00 || low >= 0xdfff)
921 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 922 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
923 }
924 if (uv < 0x10000) {
eb160463 925 *d++ = (U8)(( uv >> 12) | 0xe0);
926 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
927 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 928 continue;
929 }
930 else {
eb160463 931 *d++ = (U8)(( uv >> 18) | 0xf0);
932 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
933 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
934 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 935 continue;
936 }
937 }
dea0fc0b 938 *newlen = d - dstart;
a0ed51b3 939 return d;
940}
941
942/* Note: this one is slightly destructive of the source. */
943
944U8*
dea0fc0b 945Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 946{
947 U8* s = (U8*)p;
948 U8* send = s + bytelen;
949 while (s < send) {
950 U8 tmp = s[0];
951 s[0] = s[1];
952 s[1] = tmp;
953 s += 2;
954 }
dea0fc0b 955 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 956}
957
958/* for now these are all defined (inefficiently) in terms of the utf8 versions */
959
960bool
84afefe6 961Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 962{
89ebb4a3 963 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 964 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 965 return is_utf8_alnum(tmpbuf);
966}
967
968bool
84afefe6 969Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 970{
89ebb4a3 971 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 972 uvchr_to_utf8(tmpbuf, c);
b8c5462f 973 return is_utf8_alnumc(tmpbuf);
974}
975
976bool
84afefe6 977Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 978{
89ebb4a3 979 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 980 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 981 return is_utf8_idfirst(tmpbuf);
982}
983
984bool
84afefe6 985Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 986{
89ebb4a3 987 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 988 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 989 return is_utf8_alpha(tmpbuf);
990}
991
992bool
84afefe6 993Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 994{
89ebb4a3 995 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 996 uvchr_to_utf8(tmpbuf, c);
4d61ec05 997 return is_utf8_ascii(tmpbuf);
998}
999
1000bool
84afefe6 1001Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1002{
89ebb4a3 1003 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1004 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1005 return is_utf8_space(tmpbuf);
1006}
1007
1008bool
84afefe6 1009Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1010{
89ebb4a3 1011 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1012 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1013 return is_utf8_digit(tmpbuf);
1014}
1015
1016bool
84afefe6 1017Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1018{
89ebb4a3 1019 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1020 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1021 return is_utf8_upper(tmpbuf);
1022}
1023
1024bool
84afefe6 1025Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1026{
89ebb4a3 1027 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1028 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1029 return is_utf8_lower(tmpbuf);
1030}
1031
1032bool
84afefe6 1033Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1034{
89ebb4a3 1035 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1036 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1037 return is_utf8_cntrl(tmpbuf);
1038}
1039
1040bool
84afefe6 1041Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1042{
89ebb4a3 1043 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1044 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1045 return is_utf8_graph(tmpbuf);
1046}
1047
1048bool
84afefe6 1049Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1050{
89ebb4a3 1051 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1052 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1053 return is_utf8_print(tmpbuf);
1054}
1055
b8c5462f 1056bool
84afefe6 1057Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1058{
89ebb4a3 1059 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1060 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1061 return is_utf8_punct(tmpbuf);
1062}
1063
4d61ec05 1064bool
84afefe6 1065Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1066{
89ebb4a3 1067 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1068 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1069 return is_utf8_xdigit(tmpbuf);
1070}
1071
84afefe6 1072UV
1073Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1074{
0ebc6274 1075 uvchr_to_utf8(p, c);
1076 return to_utf8_upper(p, p, lenp);
a0ed51b3 1077}
1078
84afefe6 1079UV
1080Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1081{
0ebc6274 1082 uvchr_to_utf8(p, c);
1083 return to_utf8_title(p, p, lenp);
a0ed51b3 1084}
1085
84afefe6 1086UV
1087Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1088{
0ebc6274 1089 uvchr_to_utf8(p, c);
1090 return to_utf8_lower(p, p, lenp);
a0ed51b3 1091}
1092
84afefe6 1093UV
1094Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1095{
0ebc6274 1096 uvchr_to_utf8(p, c);
1097 return to_utf8_fold(p, p, lenp);
84afefe6 1098}
1099
a0ed51b3 1100/* for now these all assume no locale info available for Unicode > 255 */
1101
1102bool
84afefe6 1103Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1104{
1105 return is_uni_alnum(c); /* XXX no locale support yet */
1106}
1107
1108bool
84afefe6 1109Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1110{
1111 return is_uni_alnumc(c); /* XXX no locale support yet */
1112}
1113
1114bool
84afefe6 1115Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1116{
1117 return is_uni_idfirst(c); /* XXX no locale support yet */
1118}
1119
1120bool
84afefe6 1121Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1122{
1123 return is_uni_alpha(c); /* XXX no locale support yet */
1124}
1125
1126bool
84afefe6 1127Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1128{
1129 return is_uni_ascii(c); /* XXX no locale support yet */
1130}
1131
1132bool
84afefe6 1133Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1134{
1135 return is_uni_space(c); /* XXX no locale support yet */
1136}
1137
1138bool
84afefe6 1139Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1140{
1141 return is_uni_digit(c); /* XXX no locale support yet */
1142}
1143
1144bool
84afefe6 1145Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1146{
1147 return is_uni_upper(c); /* XXX no locale support yet */
1148}
1149
1150bool
84afefe6 1151Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1152{
1153 return is_uni_lower(c); /* XXX no locale support yet */
1154}
1155
1156bool
84afefe6 1157Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1158{
1159 return is_uni_cntrl(c); /* XXX no locale support yet */
1160}
1161
1162bool
84afefe6 1163Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1164{
1165 return is_uni_graph(c); /* XXX no locale support yet */
1166}
1167
1168bool
84afefe6 1169Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1170{
1171 return is_uni_print(c); /* XXX no locale support yet */
1172}
1173
b8c5462f 1174bool
84afefe6 1175Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1176{
1177 return is_uni_punct(c); /* XXX no locale support yet */
1178}
1179
4d61ec05 1180bool
84afefe6 1181Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1182{
1183 return is_uni_xdigit(c); /* XXX no locale support yet */
1184}
1185
b7ac61fa 1186U32
1187Perl_to_uni_upper_lc(pTHX_ U32 c)
1188{
ee099d14 1189 /* XXX returns only the first character -- do not use XXX */
1190 /* XXX no locale support yet */
1191 STRLEN len;
89ebb4a3 1192 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1193 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1194}
1195
1196U32
1197Perl_to_uni_title_lc(pTHX_ U32 c)
1198{
ee099d14 1199 /* XXX returns only the first character XXX -- do not use XXX */
1200 /* XXX no locale support yet */
1201 STRLEN len;
89ebb4a3 1202 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1203 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1204}
1205
1206U32
1207Perl_to_uni_lower_lc(pTHX_ U32 c)
1208{
ee099d14 1209 /* XXX returns only the first character -- do not use XXX */
1210 /* XXX no locale support yet */
1211 STRLEN len;
89ebb4a3 1212 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1213 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1214}
1215
a0ed51b3 1216bool
7fc63493 1217Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1218{
386d01d6 1219 if (!is_utf8_char(p))
1220 return FALSE;
a0ed51b3 1221 if (!PL_utf8_alnum)
289d4f09 1222 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1223 * descendant of isalnum(3), in other words, it doesn't
1224 * contain the '_'. --jhi */
1225 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1226 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1227/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1228#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1229 if (!PL_utf8_alnum)
1230 PL_utf8_alnum = swash_init("utf8", "",
1231 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1232 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1233#endif
1234}
1235
1236bool
7fc63493 1237Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1238{
386d01d6 1239 if (!is_utf8_char(p))
1240 return FALSE;
b8c5462f 1241 if (!PL_utf8_alnum)
1242 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1243 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1244/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1245#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1246 if (!PL_utf8_alnum)
1247 PL_utf8_alnum = swash_init("utf8", "",
1248 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1249 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1250#endif
1251}
1252
1253bool
7fc63493 1254Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1255{
82686b01 1256 if (*p == '_')
1257 return TRUE;
1258 if (!is_utf8_char(p))
1259 return FALSE;
1260 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1261 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1262 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01 1263}
1264
1265bool
7fc63493 1266Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1267{
1268 if (*p == '_')
1269 return TRUE;
1270 if (!is_utf8_char(p))
1271 return FALSE;
1272 if (!PL_utf8_idcont)
1273 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1274 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3 1275}
1276
1277bool
7fc63493 1278Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1279{
386d01d6 1280 if (!is_utf8_char(p))
1281 return FALSE;
a0ed51b3 1282 if (!PL_utf8_alpha)
e24b16f9 1283 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1284 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3 1285}
1286
1287bool
7fc63493 1288Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1289{
386d01d6 1290 if (!is_utf8_char(p))
1291 return FALSE;
b8c5462f 1292 if (!PL_utf8_ascii)
1293 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1294 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f 1295}
1296
1297bool
7fc63493 1298Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1299{
386d01d6 1300 if (!is_utf8_char(p))
1301 return FALSE;
a0ed51b3 1302 if (!PL_utf8_space)
3bec3564 1303 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1304 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3 1305}
1306
1307bool
7fc63493 1308Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1309{
386d01d6 1310 if (!is_utf8_char(p))
1311 return FALSE;
a0ed51b3 1312 if (!PL_utf8_digit)
e24b16f9 1313 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1314 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3 1315}
1316
1317bool
7fc63493 1318Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1319{
386d01d6 1320 if (!is_utf8_char(p))
1321 return FALSE;
a0ed51b3 1322 if (!PL_utf8_upper)
c65e4d19 1323 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1324 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3 1325}
1326
1327bool
7fc63493 1328Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1329{
386d01d6 1330 if (!is_utf8_char(p))
1331 return FALSE;
a0ed51b3 1332 if (!PL_utf8_lower)
c65e4d19 1333 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1334 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3 1335}
1336
1337bool
7fc63493 1338Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1339{
386d01d6 1340 if (!is_utf8_char(p))
1341 return FALSE;
b8c5462f 1342 if (!PL_utf8_cntrl)
1343 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1344 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f 1345}
1346
1347bool
7fc63493 1348Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1349{
386d01d6 1350 if (!is_utf8_char(p))
1351 return FALSE;
b8c5462f 1352 if (!PL_utf8_graph)
1353 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1354 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f 1355}
1356
1357bool
7fc63493 1358Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1359{
386d01d6 1360 if (!is_utf8_char(p))
1361 return FALSE;
a0ed51b3 1362 if (!PL_utf8_print)
e24b16f9 1363 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1364 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3 1365}
1366
1367bool
7fc63493 1368Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1369{
386d01d6 1370 if (!is_utf8_char(p))
1371 return FALSE;
b8c5462f 1372 if (!PL_utf8_punct)
1373 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1374 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f 1375}
1376
1377bool
7fc63493 1378Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1379{
386d01d6 1380 if (!is_utf8_char(p))
1381 return FALSE;
b8c5462f 1382 if (!PL_utf8_xdigit)
1383 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1384 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f 1385}
1386
1387bool
7fc63493 1388Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1389{
386d01d6 1390 if (!is_utf8_char(p))
1391 return FALSE;
a0ed51b3 1392 if (!PL_utf8_mark)
e24b16f9 1393 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1394 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3 1395}
1396
6b5c0936 1397/*
1398=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1399
1400The "p" contains the pointer to the UTF-8 string encoding
1401the character that is being converted.
1402
1403The "ustrp" is a pointer to the character buffer to put the
1404conversion result to. The "lenp" is a pointer to the length
1405of the result.
1406
0134edef 1407The "swashp" is a pointer to the swash to use.
6b5c0936 1408
0134edef 1409Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1410and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1411but not always, a multicharacter mapping), is tried first.
6b5c0936 1412
0134edef 1413The "special" is a string like "utf8::ToSpecLower", which means the
1414hash %utf8::ToSpecLower. The access to the hash is through
1415Perl_to_utf8_case().
6b5c0936 1416
0134edef 1417The "normal" is a string like "ToLower" which means the swash
1418%utf8::ToLower.
1419
1420=cut */
6b5c0936 1421
2104c8d9 1422UV
e1ec3a88 1423Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special)
a0ed51b3 1424{
89ebb4a3 1425 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1426 STRLEN len = 0;
a0ed51b3 1427
7fc63493 1428 const UV uv0 = utf8_to_uvchr(p, 0);
1feea2c7 1429 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1430 * are necessary in EBCDIC, they are redundant no-ops
1431 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1432 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1433 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1434
1435 if (!*swashp) /* load on-demand */
1436 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1437
b08cf34e 1438 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1439 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1440 /* It might be "special" (sometimes, but not always,
2a37f04d 1441 * a multicharacter mapping) */
983ffd37 1442 HV *hv;
b08cf34e 1443 SV **svp;
1444
1445 if ((hv = get_hv(special, FALSE)) &&
1446 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1447 (*svp)) {
1448 char *s;
47654450 1449
b08cf34e 1450 s = SvPV(*svp, len);
47654450 1451 if (len == 1)
1452 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1453 else {
2f9475ad 1454#ifdef EBCDIC
1455 /* If we have EBCDIC we need to remap the characters
1456 * since any characters in the low 256 are Unicode
1457 * code points, not EBCDIC. */
7cda7a3d 1458 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1459
1460 d = tmpbuf;
b08cf34e 1461 if (SvUTF8(*svp)) {
2f9475ad 1462 STRLEN tlen = 0;
1463
1464 while (t < tend) {
1465 UV c = utf8_to_uvchr(t, &tlen);
1466 if (tlen > 0) {
1467 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1468 t += tlen;
1469 }
1470 else
1471 break;
1472 }
1473 }
1474 else {
36fec512 1475 while (t < tend) {
1476 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1477 t++;
1478 }
2f9475ad 1479 }
1480 len = d - tmpbuf;
1481 Copy(tmpbuf, ustrp, len, U8);
1482#else
d2dcd0fb 1483 Copy(s, ustrp, len, U8);
2f9475ad 1484#endif
29e98929 1485 }
983ffd37 1486 }
0134edef 1487 }
1488
1489 if (!len && *swashp) {
1490 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1491
1492 if (uv2) {
1493 /* It was "normal" (a single character mapping). */
1494 UV uv3 = UNI_TO_NATIVE(uv2);
1495
e9101d72 1496 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1497 }
1498 }
1feea2c7 1499
0134edef 1500 if (!len) /* Neither: just copy. */
1501 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1502
2a37f04d 1503 if (lenp)
1504 *lenp = len;
1505
0134edef 1506 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1507}
1508
d3e79532 1509/*
7fc63493 1510=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1511
1512Convert the UTF-8 encoded character at p to its uppercase version and
1513store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1514that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1515the uppercase version may be longer than the original character.
d3e79532 1516
1517The first character of the uppercased version is returned
1518(but note, as explained above, that there may be more.)
1519
1520=cut */
1521
2104c8d9 1522UV
7fc63493 1523Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1524{
983ffd37 1525 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1526 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1527}
a0ed51b3 1528
d3e79532 1529/*
7fc63493 1530=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1531
1532Convert the UTF-8 encoded character at p to its titlecase version and
1533store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1534that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1535titlecase version may be longer than the original character.
d3e79532 1536
1537The first character of the titlecased version is returned
1538(but note, as explained above, that there may be more.)
1539
1540=cut */
1541
983ffd37 1542UV
7fc63493 1543Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1544{
1545 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1546 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1547}
1548
d3e79532 1549/*
7fc63493 1550=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1551
1552Convert the UTF-8 encoded character at p to its lowercase version and
1553store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1554that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1555lowercase version may be longer than the original character.
d3e79532 1556
1557The first character of the lowercased version is returned
1558(but note, as explained above, that there may be more.)
1559
1560=cut */
1561
2104c8d9 1562UV
7fc63493 1563Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1564{
983ffd37 1565 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1566 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1567}
1568
d3e79532 1569/*
7fc63493 1570=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532 1571
1572Convert the UTF-8 encoded character at p to its foldcase version and
1573store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1574that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532 1575foldcase version may be longer than the original character (up to
1576three characters).
1577
1578The first character of the foldcased version is returned
1579(but note, as explained above, that there may be more.)
1580
1581=cut */
1582
b4e400f9 1583UV
7fc63493 1584Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1585{
1586 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1587 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1588}
1589
1590/* a "swash" is a swatch hash */
1591
1592SV*
7fc63493 1593Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1594{
27da23d5 1595 dVAR;
a0ed51b3 1596 SV* retval;
243b1711 1597 SV* tokenbufsv = sv_newmortal();
8e84507e 1598 dSP;
7fc63493 1599 const size_t pkg_len = strlen(pkg);
1600 const size_t name_len = strlen(name);
71bed85a 1601 HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1602 SV* errsv_save;
ce3b816e 1603
96ca9f55 1604 PUSHSTACKi(PERLSI_MAGIC);
1605 ENTER;
1606 SAVEI32(PL_hints);
1607 PL_hints = 0;
1608 save_re_context();
1b026014 1609 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1610 ENTER;
f8be5cf0 1611 errsv_save = newSVsv(ERRSV);
71bed85a 1612 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1613 Nullsv);
f8be5cf0 1614 if (!SvTRUE(ERRSV))
1615 sv_setsv(ERRSV, errsv_save);
1616 SvREFCNT_dec(errsv_save);
ce3b816e 1617 LEAVE;
1618 }
1619 SPAGAIN;
a0ed51b3 1620 PUSHMARK(SP);
1621 EXTEND(SP,5);
71bed85a 1622 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1623 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3 1624 PUSHs(listsv);
1625 PUSHs(sv_2mortal(newSViv(minbits)));
1626 PUSHs(sv_2mortal(newSViv(none)));
1627 PUTBACK;
923e4eb5 1628 if (IN_PERL_COMPILETIME) {
bf1fed83 1629 /* XXX ought to be handled by lex_start */
82686b01 1630 SAVEI32(PL_in_my);
2b4bd638 1631 PL_in_my = 0;
bf1fed83 1632 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1633 }
f8be5cf0 1634 errsv_save = newSVsv(ERRSV);
864dbfa3 1635 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1636 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1637 else
e24b16f9 1638 retval = &PL_sv_undef;
f8be5cf0 1639 if (!SvTRUE(ERRSV))
1640 sv_setsv(ERRSV, errsv_save);
1641 SvREFCNT_dec(errsv_save);
a0ed51b3 1642 LEAVE;
1643 POPSTACK;
923e4eb5 1644 if (IN_PERL_COMPILETIME) {
bf1fed83 1645 STRLEN len;
73d840c0 1646 const char* pv = SvPV(tokenbufsv, len);
bf1fed83 1647
1648 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1649 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1650 }
bc45ce41 1651 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1652 if (SvPOK(retval))
35c1215d 1653 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1654 retval);
cea2e8a9 1655 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1656 }
a0ed51b3 1657 return retval;
1658}
1659
035d37be 1660
1661/* This API is wrong for special case conversions since we may need to
1662 * return several Unicode characters for a single Unicode character
1663 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1664 * the lower-level routine, and it is similarly broken for returning
1665 * multiple values. --jhi */
a0ed51b3 1666UV
7fc63493 1667Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1668{
27da23d5 1669 dVAR;
a0ed51b3 1670 HV* hv = (HV*)SvRV(sv);
3568d838 1671 U32 klen;
1672 U32 off;
a0ed51b3 1673 STRLEN slen;
7d85a32c 1674 STRLEN needents;
4ea42e7f 1675 U8 *tmps = NULL;
a0ed51b3 1676 U32 bit;
1677 SV *retval;
3568d838 1678 U8 tmputf8[2];
1679 UV c = NATIVE_TO_ASCII(*ptr);
1680
1681 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463 1682 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1683 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838 1684 ptr = tmputf8;
1685 }
1686 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1687 * then the "swatch" is a vec() for al the chars which start
1688 * with 0xAA..0xYY
1689 * So the key in the hash (klen) is length of encoded char -1
1690 */
1691 klen = UTF8SKIP(ptr) - 1;
1692 off = ptr[klen];
a0ed51b3 1693
7d85a32c 1694 if (klen == 0)
1695 {
1696 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1697 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1698 */
1699 needents = UTF_CONTINUATION_MARK;
1700 off = NATIVE_TO_UTF(ptr[klen]);
1701 }
1702 else
1703 {
1704 /* If char is encoded then swatch is for the prefix */
1705 needents = (1 << UTF_ACCUMULATION_SHIFT);
1706 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1707 }
1708
a0ed51b3 1709 /*
1710 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1711 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1712 * it's nothing to sniff at.) Pity we usually come through at least
1713 * two function calls to get here...
1714 *
1715 * NB: this code assumes that swatches are never modified, once generated!
1716 */
1717
3568d838 1718 if (hv == PL_last_swash_hv &&
a0ed51b3 1719 klen == PL_last_swash_klen &&
27da23d5 1720 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1721 {
1722 tmps = PL_last_swash_tmps;
1723 slen = PL_last_swash_slen;
1724 }
1725 else {
1726 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1727 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1728
1729 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1730 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1731 dSP;
2b9d42f0 1732 /* We use utf8n_to_uvuni() as we want an index into
1733 Unicode tables, not a native character number.
1734 */
89ebb4a3 1735 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae 1736 ckWARN(WARN_UTF8) ?
1737 0 : UTF8_ALLOW_ANY);
f8be5cf0 1738 SV *errsv_save;
a0ed51b3 1739 ENTER;
1740 SAVETMPS;
1741 save_re_context();
1742 PUSHSTACKi(PERLSI_MAGIC);
1743 PUSHMARK(SP);
1744 EXTEND(SP,3);
1745 PUSHs((SV*)sv);
ffbc6a93 1746 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1747 PUSHs(sv_2mortal(newSViv((klen) ?
1748 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1749 PUSHs(sv_2mortal(newSViv(needents)));
1750 PUTBACK;
f8be5cf0 1751 errsv_save = newSVsv(ERRSV);
864dbfa3 1752 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1753 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1754 else
e24b16f9 1755 retval = &PL_sv_undef;
f8be5cf0 1756 if (!SvTRUE(ERRSV))
1757 sv_setsv(ERRSV, errsv_save);
1758 SvREFCNT_dec(errsv_save);
a0ed51b3 1759 POPSTACK;
1760 FREETMPS;
1761 LEAVE;
923e4eb5 1762 if (IN_PERL_COMPILETIME)
eb160463 1763 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1764
e1ec3a88 1765 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1766
7d85a32c 1767 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1768 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1769 }
1770
1771 PL_last_swash_hv = hv;
1772 PL_last_swash_klen = klen;
1773 PL_last_swash_tmps = tmps;
1774 PL_last_swash_slen = slen;
1775 if (klen)
1776 Copy(ptr, PL_last_swash_key, klen, U8);
1777 }
1778
9faf8d75 1779 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1780 case 1:
1781 bit = 1 << (off & 7);
1782 off >>= 3;
1783 return (tmps[off] & bit) != 0;
1784 case 8:
1785 return tmps[off];
1786 case 16:
1787 off <<= 1;
1788 return (tmps[off] << 8) + tmps[off + 1] ;
1789 case 32:
1790 off <<= 2;
1791 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1792 }
cea2e8a9 1793 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1794 return 0;
1795}
2b9d42f0 1796
1797
1798/*
37607a96 1799=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1800
1e54db1a 1801Adds the UTF-8 representation of the Native codepoint C<uv> to the end
89ebb4a3 1802of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2b9d42f0 1803bytes available. The return value is the pointer to the byte after the
1804end of the new character. In other words,
1805
1806 d = uvchr_to_utf8(d, uv);
1807
1808is the recommended wide native character-aware way of saying
1809
1810 *(d++) = uv;
1811
1812=cut
1813*/
1814
1815/* On ASCII machines this is normally a macro but we want a
1816 real function in case XS code wants it
1817*/
1818#undef Perl_uvchr_to_utf8
1819U8 *
1820Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1821{
b851fbc1 1822 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0 1823}
1824
b851fbc1 1825U8 *
1826Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1827{
1828 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1829}
2b9d42f0 1830
1831/*
37607a96 1832=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1833
1834Returns the native character value of the first character in the string C<s>
1e54db1a 1835which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0 1836length, in bytes, of that character.
1837
1838Allows length and flags to be passed to low level routine.
1839
1840=cut
1841*/
0a2ef054 1842/* On ASCII machines this is normally a macro but we want
1843 a real function in case XS code wants it
2b9d42f0 1844*/
1845#undef Perl_utf8n_to_uvchr
1846UV
7fc63493 1847Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1848{
1849 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1850 return UNI_TO_NATIVE(uv);
1851}
1852
d2cc3551 1853/*
1854=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1855
1856Build to the scalar dsv a displayable version of the string spv,
1857length len, the displayable version being at most pvlim bytes long
1858(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1859
9e55ce06 1860The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1861isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1862to display the \\[nrfta\\] as the backslashed versions (like '\n')
1863(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1864UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1865UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1866
d2cc3551 1867The pointer to the PV of the dsv is returned.
1868
1869=cut */
e6b2e755 1870char *
e1ec3a88 1871Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755 1872{
1873 int truncated = 0;
e1ec3a88 1874 const char *s, *e;
e6b2e755 1875
1876 sv_setpvn(dsv, "", 0);
e1ec3a88 1877 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1878 UV u;
a49f32c6 1879 /* This serves double duty as a flag and a character to print after
1880 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1881 */
1882 char ok = 0;
c728cb41 1883
e6b2e755 1884 if (pvlim && SvCUR(dsv) >= pvlim) {
1885 truncated++;
1886 break;
1887 }
1888 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1889 if (u < 256) {
a3b680e6 1890 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1891 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1892 switch (c) {
c728cb41 1893 case '\n':
a49f32c6 1894 ok = 'n'; break;
c728cb41 1895 case '\r':
a49f32c6 1896 ok = 'r'; break;
c728cb41 1897 case '\t':
a49f32c6 1898 ok = 't'; break;
c728cb41 1899 case '\f':
a49f32c6 1900 ok = 'f'; break;
c728cb41 1901 case '\a':
a49f32c6 1902 ok = 'a'; break;
c728cb41 1903 case '\\':
a49f32c6 1904 ok = '\\'; break;
c728cb41 1905 default: break;
1906 }
a49f32c6 1907 if (ok) {
1908 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1909 }
c728cb41 1910 }
00e86452 1911 /* isPRINT() is the locale-blind version. */
a49f32c6 1912 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1913 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1914 ok = 1;
0a2ef054 1915 }
c728cb41 1916 }
1917 if (!ok)
9e55ce06 1918 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1919 }
1920 if (truncated)
1921 sv_catpvn(dsv, "...", 3);
1922
1923 return SvPVX(dsv);
1924}
2b9d42f0 1925
d2cc3551 1926/*
1927=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1928
1929Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1930the displayable version being at most pvlim bytes long
d2cc3551 1931(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1932
1933The flags argument is as in pv_uni_display().
1934
d2cc3551 1935The pointer to the PV of the dsv is returned.
1936
1937=cut */
e6b2e755 1938char *
1939Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1940{
701a277b 1941 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1942 pvlim, flags);
1943}
1944
d2cc3551 1945/*
d07ddd77 1946=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1947
1948Return true if the strings s1 and s2 differ case-insensitively, false
1949if not (if they are equal case-insensitively). If u1 is true, the
1950string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1951the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1952are false, the respective string is assumed to be in native 8-bit
1953encoding.
1954
1955If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1956in there (they will point at the beginning of the I<next> character).
1957If the pointers behind pe1 or pe2 are non-NULL, they are the end
1958pointers beyond which scanning will not continue under any
4cdaeff7 1959circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77 1960s2+l2 will be used as goal end pointers that will also stop the scan,
1961and which qualify towards defining a successful match: all the scans
1962that define an explicit length must reach their goal pointers for
1963a match to succeed).
d2cc3551 1964
1965For case-insensitiveness, the "casefolding" of Unicode is used
1966instead of upper/lowercasing both the characters, see
1967http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1968
1969=cut */
701a277b 1970I32
d07ddd77 1971Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1972{
e1ec3a88 1973 register const U8 *p1 = (const U8*)s1;
1974 register const U8 *p2 = (const U8*)s2;
1975 register const U8 *f1 = 0, *f2 = 0;
1976 register U8 *e1 = 0, *q1 = 0;
1977 register U8 *e2 = 0, *q2 = 0;
d07ddd77 1978 STRLEN n1 = 0, n2 = 0;
89ebb4a3 1979 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1980 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8 1981 U8 natbuf[1+1];
1982 STRLEN foldlen1, foldlen2;
d07ddd77 1983 bool match;
332ddc25 1984
d07ddd77 1985 if (pe1)
1986 e1 = *(U8**)pe1;
e1ec3a88 1987 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1988 f1 = (const U8*)s1 + l1;
d07ddd77 1989 if (pe2)
1990 e2 = *(U8**)pe2;
e1ec3a88 1991 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1992 f2 = (const U8*)s2 + l2;
d07ddd77 1993
1994 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1995 return 1; /* mismatch; possible infinite loop or false positive */
1996
a6872d42 1997 if (!u1 || !u2)
1998 natbuf[1] = 0; /* Need to terminate the buffer. */
1999
d07ddd77 2000 while ((e1 == 0 || p1 < e1) &&
2001 (f1 == 0 || p1 < f1) &&
2002 (e2 == 0 || p2 < e2) &&
2003 (f2 == 0 || p2 < f2)) {
2004 if (n1 == 0) {
d7f013c8 2005 if (u1)
2006 to_utf8_fold(p1, foldbuf1, &foldlen1);
2007 else {
f5cee151 2008 natbuf[0] = *p1;
d7f013c8 2009 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2010 }
2011 q1 = foldbuf1;
d07ddd77 2012 n1 = foldlen1;
332ddc25 2013 }
d07ddd77 2014 if (n2 == 0) {
d7f013c8 2015 if (u2)
2016 to_utf8_fold(p2, foldbuf2, &foldlen2);
2017 else {
f5cee151 2018 natbuf[0] = *p2;
d7f013c8 2019 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2020 }
2021 q2 = foldbuf2;
d07ddd77 2022 n2 = foldlen2;
332ddc25 2023 }
d07ddd77 2024 while (n1 && n2) {
2025 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2026 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2027 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2028 return 1; /* mismatch */
d07ddd77 2029 n1 -= UTF8SKIP(q1);
d7f013c8 2030 q1 += UTF8SKIP(q1);
d07ddd77 2031 n2 -= UTF8SKIP(q2);
d7f013c8 2032 q2 += UTF8SKIP(q2);
701a277b 2033 }
d07ddd77 2034 if (n1 == 0)
d7f013c8 2035 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2036 if (n2 == 0)
d7f013c8 2037 p2 += u2 ? UTF8SKIP(p2) : 1;
2038
d2cc3551 2039 }
5469e704 2040
d07ddd77 2041 /* A match is defined by all the scans that specified
2042 * an explicit length reaching their final goals. */
2043 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 2044
2045 if (match) {
d07ddd77 2046 if (pe1)
2047 *pe1 = (char*)p1;
2048 if (pe2)
2049 *pe2 = (char*)p2;
5469e704 2050 }
2051
2052 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2053}
701a277b 2054
a49f32c6 2055/*
2056 * Local variables:
2057 * c-indentation-style: bsd
2058 * c-basic-offset: 4
2059 * indent-tabs-mode: t
2060 * End:
2061 *
37442d52 2062 * ex: set ts=8 sts=4 sw=4 noet:
2063 */