A fix for [perl #31692] : as PerlIO::scalar accesses directly the
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
b5f8cc5c 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
901b21bf 27static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
ccfc67b7 29/*
30=head1 Unicode Support
a0ed51b3 31
166f8a29 32This file contains various utility functions for manipulating UTF8-encoded
33strings. For the uninitiated, this is a method of representing arbitrary
61296642 34Unicode characters as a variable number of bytes, in such a way that
56da48f7 35characters in the ASCII range are unmodified, and a zero byte never appears
36within non-zero characters.
166f8a29 37
b851fbc1 38=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 39
1e54db1a 40Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
eebe1485 41of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
42bytes available. The return value is the pointer to the byte after the
9041c2e3 43end of the new character. In other words,
eebe1485 44
b851fbc1 45 d = uvuni_to_utf8_flags(d, uv, flags);
46
47or, in most cases,
48
9041c2e3 49 d = uvuni_to_utf8(d, uv);
eebe1485 50
b851fbc1 51(which is equivalent to)
52
53 d = uvuni_to_utf8_flags(d, uv, 0);
54
eebe1485 55is the recommended Unicode-aware way of saying
56
57 *(d++) = uv;
58
59=cut
60*/
61
dfe13c55 62U8 *
b851fbc1 63Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 64{
62961d2e 65 if (ckWARN(WARN_UTF8)) {
b851fbc1 66 if (UNICODE_IS_SURROGATE(uv) &&
67 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 68 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1 69 else if (
70 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
71 !(flags & UNICODE_ALLOW_FDD0))
72 ||
c867b360 73 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1 74 !(flags & UNICODE_ALLOW_FFFF))) &&
75 /* UNICODE_ALLOW_SUPER includes
2a20b9da 76 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1 77 ((uv <= PERL_UNICODE_MAX) ||
78 !(flags & UNICODE_ALLOW_SUPER))
79 )
9014280d 80 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800 81 "Unicode character 0x%04"UVxf" is illegal", uv);
82 }
c4d5f83a 83 if (UNI_IS_INVARIANT(uv)) {
eb160463 84 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3 85 return d;
86 }
2d331972 87#if defined(EBCDIC)
1d72bdf6 88 else {
89 STRLEN len = UNISKIP(uv);
90 U8 *p = d+len-1;
91 while (p > d) {
eb160463 92 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6 93 uv >>= UTF_ACCUMULATION_SHIFT;
94 }
eb160463 95 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6 96 return d+len;
97 }
98#else /* Non loop style */
a0ed51b3 99 if (uv < 0x800) {
eb160463 100 *d++ = (U8)(( uv >> 6) | 0xc0);
101 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 102 return d;
103 }
104 if (uv < 0x10000) {
eb160463 105 *d++ = (U8)(( uv >> 12) | 0xe0);
106 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
107 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 108 return d;
109 }
110 if (uv < 0x200000) {
eb160463 111 *d++ = (U8)(( uv >> 18) | 0xf0);
112 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
113 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
114 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 115 return d;
116 }
117 if (uv < 0x4000000) {
eb160463 118 *d++ = (U8)(( uv >> 24) | 0xf8);
119 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
120 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
121 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
122 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 123 return d;
124 }
125 if (uv < 0x80000000) {
eb160463 126 *d++ = (U8)(( uv >> 30) | 0xfc);
127 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
128 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
129 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
131 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 132 return d;
133 }
6b8eaf93 134#ifdef HAS_QUAD
d7578b48 135 if (uv < UTF8_QUAD_MAX)
a0ed51b3 136#endif
137 {
eb160463 138 *d++ = 0xfe; /* Can't match U+FEFF! */
139 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
140 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
141 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
144 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 145 return d;
146 }
6b8eaf93 147#ifdef HAS_QUAD
a0ed51b3 148 {
eb160463 149 *d++ = 0xff; /* Can't match U+FFFE! */
150 *d++ = 0x80; /* 6 Reserved bits */
151 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
152 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
161 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 162 return d;
163 }
164#endif
1d72bdf6 165#endif /* Loop style */
a0ed51b3 166}
b851fbc1 167
168U8 *
169Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
170{
171 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
172}
9041c2e3 173
174
175/*
eebe1485 176=for apidoc A|STRLEN|is_utf8_char|U8 *s
177
5da9da9e 178Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01 179character. Note that an INVARIANT (i.e. ASCII) character is a valid
180UTF-8 character. The actual number of bytes in the UTF-8 character
181will be returned if it is valid, otherwise 0.
9041c2e3 182
82686b01 183=cut */
067a85ef 184STRLEN
386d01d6 185Perl_is_utf8_char(pTHX_ U8 *s)
186{
187 U8 u = *s;
067a85ef 188 STRLEN slen, len;
189 UV uv, ouv;
386d01d6 190
1d72bdf6 191 if (UTF8_IS_INVARIANT(u))
386d01d6 192 return 1;
193
60006e79 194 if (!UTF8_IS_START(u))
386d01d6 195 return 0;
196
9f07fdcd 197 len = UTF8SKIP(s);
386d01d6 198
60006e79 199 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 200 return 0;
201
386d01d6 202 slen = len - 1;
203 s++;
6eb6869e 204 u &= UTF_START_MASK(len);
1d72bdf6 205 uv = u;
067a85ef 206 ouv = uv;
386d01d6 207 while (slen--) {
60006e79 208 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 209 return 0;
8850bf83 210 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 211 if (uv < ouv)
067a85ef 212 return 0;
213 ouv = uv;
386d01d6 214 s++;
215 }
067a85ef 216
eb160463 217 if ((STRLEN)UNISKIP(uv) < len)
067a85ef 218 return 0;
219
386d01d6 220 return len;
221}
222
6662521e 223/*
eebe1485 224=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 225
c9ada85f 226Returns true if first C<len> bytes of the given string form a valid
1e54db1a 227UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
228not mean 'a string that contains code points above 0x7F encoded in UTF-8'
229because a valid ASCII string is a valid UTF-8 string.
6662521e 230
231=cut
232*/
233
8e84507e 234bool
6662521e 235Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
236{
067a85ef 237 U8* x = s;
1aa99e6b 238 U8* send;
067a85ef 239 STRLEN c;
240
61468b03 241 if (!len && s)
6cd5fe39 242 len = strlen((char *)s);
1aa99e6b 243 send = s + len;
244
6662521e 245 while (x < send) {
1acdb0da 246 /* Inline the easy bits of is_utf8_char() here for speed... */
247 if (UTF8_IS_INVARIANT(*x))
248 c = 1;
249 else if (!UTF8_IS_START(*x))
250 return FALSE;
251 else {
252 /* ... and call is_utf8_char() only if really needed. */
253 c = is_utf8_char(x);
254 if (!c)
255 return FALSE;
256 }
6662521e 257 x += c;
6662521e 258 }
60006e79 259 if (x != send)
260 return FALSE;
067a85ef 261
262 return TRUE;
6662521e 263}
264
67e989fb 265/*
81cd54e3 266=for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
267
268Like is_ut8_string but store the location of the failure in
269the last argument.
270
271=cut
272*/
273
274bool
275Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
276{
277 U8* x = s;
278 U8* send;
279 STRLEN c;
280
61468b03 281 if (!len && s)
81cd54e3 282 len = strlen((char *)s);
283 send = s + len;
284
285 while (x < send) {
286 /* Inline the easy bits of is_utf8_char() here for speed... */
287 if (UTF8_IS_INVARIANT(*x))
288 c = 1;
289 else if (!UTF8_IS_START(*x)) {
290 if (p)
291 *p = x;
292 return FALSE;
293 }
294 else {
295 /* ... and call is_utf8_char() only if really needed. */
296 c = is_utf8_char(x);
297 if (!c) {
298 if (p)
299 *p = x;
300 return FALSE;
301 }
302 }
303 x += c;
304 }
305 if (x != send) {
306 if (p)
307 *p = x;
308 return FALSE;
309 }
310
311 return TRUE;
312}
313
314/*
9041c2e3 315=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 316
9041c2e3 317Bottom level UTF-8 decode routine.
318Returns the unicode code point value of the first character in the string C<s>
1e54db1a 319which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 320C<retlen> will be set to the length, in bytes, of that character.
67e989fb 321
1e54db1a 322If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880 323is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
324it is assumed that the caller will raise a warning, and this function
28d3d195 325will silently just set C<retlen> to C<-1> and return zero. If the
326C<flags> does not contain UTF8_CHECK_ONLY, warnings about
327malformations will be given, C<retlen> will be set to the expected
328length of the UTF-8 character in bytes, and zero will be returned.
329
330The C<flags> can also contain various flags to allow deviations from
331the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 332
9041c2e3 333Most code should use utf8_to_uvchr() rather than call this directly.
334
37607a96 335=cut
336*/
67e989fb 337
a0ed51b3 338UV
37607a96 339Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 340{
097fb8e2 341 U8 *s0 = s;
9c5ffd7c 342 UV uv = *s, ouv = 0;
ba210ebe 343 STRLEN len = 1;
344 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 345 UV startbyte = *s;
ba210ebe 346 STRLEN expectlen = 0;
a0dbb045 347 U32 warning = 0;
348
349/* This list is a superset of the UTF8_ALLOW_XXX. */
350
351#define UTF8_WARN_EMPTY 1
352#define UTF8_WARN_CONTINUATION 2
353#define UTF8_WARN_NON_CONTINUATION 3
354#define UTF8_WARN_FE_FF 4
355#define UTF8_WARN_SHORT 5
356#define UTF8_WARN_OVERFLOW 6
357#define UTF8_WARN_SURROGATE 7
c867b360 358#define UTF8_WARN_LONG 8
359#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045 360
361 if (curlen == 0 &&
362 !(flags & UTF8_ALLOW_EMPTY)) {
363 warning = UTF8_WARN_EMPTY;
0c443dc2 364 goto malformed;
365 }
366
1d72bdf6 367 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 368 if (retlen)
369 *retlen = 1;
c4d5f83a 370 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 371 }
67e989fb 372
421a8bf2 373 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 374 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 375 warning = UTF8_WARN_CONTINUATION;
ba210ebe 376 goto malformed;
377 }
378
421a8bf2 379 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 380 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 381 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 382 goto malformed;
383 }
9041c2e3 384
1d72bdf6 385#ifdef EBCDIC
75383841 386 uv = NATIVE_TO_UTF(uv);
1d72bdf6 387#else
fcc8fcf6 388 if ((uv == 0xfe || uv == 0xff) &&
389 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 390 warning = UTF8_WARN_FE_FF;
ba210ebe 391 goto malformed;
a0ed51b3 392 }
1d72bdf6 393#endif
394
ba210ebe 395 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
396 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
397 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
398 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 399#ifdef EBCDIC
400 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
401 else { len = 7; uv &= 0x01; }
402#else
ba210ebe 403 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
404 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 405 else { len = 13; uv = 0; } /* whoa! */
406#endif
407
a0ed51b3 408 if (retlen)
409 *retlen = len;
9041c2e3 410
ba210ebe 411 expectlen = len;
412
fcc8fcf6 413 if ((curlen < expectlen) &&
414 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 415 warning = UTF8_WARN_SHORT;
ba210ebe 416 goto malformed;
417 }
418
419 len--;
a0ed51b3 420 s++;
ba210ebe 421 ouv = uv;
422
a0ed51b3 423 while (len--) {
421a8bf2 424 if (!UTF8_IS_CONTINUATION(*s) &&
425 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 426 s--;
427 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 428 goto malformed;
a0ed51b3 429 }
430 else
8850bf83 431 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 432 if (!(uv > ouv)) {
433 /* These cannot be allowed. */
434 if (uv == ouv) {
435 if (!(flags & UTF8_ALLOW_LONG)) {
436 warning = UTF8_WARN_LONG;
437 goto malformed;
438 }
439 }
440 else { /* uv < ouv */
441 /* This cannot be allowed. */
442 warning = UTF8_WARN_OVERFLOW;
443 goto malformed;
444 }
ba210ebe 445 }
446 s++;
447 ouv = uv;
448 }
449
421a8bf2 450 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 451 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 452 warning = UTF8_WARN_SURROGATE;
ba210ebe 453 goto malformed;
eb160463 454 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 455 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 456 warning = UTF8_WARN_LONG;
ba210ebe 457 goto malformed;
421a8bf2 458 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 459 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 460 warning = UTF8_WARN_FFFF;
a9917092 461 goto malformed;
a0ed51b3 462 }
ba210ebe 463
a0ed51b3 464 return uv;
ba210ebe 465
466malformed:
467
fcc8fcf6 468 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 469 if (retlen)
cc366d4b 470 *retlen = -1;
ba210ebe 471 return 0;
472 }
473
a0dbb045 474 if (dowarn) {
475 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
476
477 switch (warning) {
478 case 0: /* Intentionally empty. */ break;
479 case UTF8_WARN_EMPTY:
480 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
481 break;
482 case UTF8_WARN_CONTINUATION:
097fb8e2 483 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045 484 break;
485 case UTF8_WARN_NON_CONTINUATION:
097fb8e2 486 if (s == s0)
487 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
488 (UV)s[1], startbyte);
489 else
490 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
491 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
492
a0dbb045 493 break;
494 case UTF8_WARN_FE_FF:
495 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
496 break;
497 case UTF8_WARN_SHORT:
097fb8e2 498 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
499 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 500 expectlen = curlen; /* distance for caller to skip */
a0dbb045 501 break;
502 case UTF8_WARN_OVERFLOW:
097fb8e2 503 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
504 ouv, *s, startbyte);
a0dbb045 505 break;
506 case UTF8_WARN_SURROGATE:
507 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
508 break;
a0dbb045 509 case UTF8_WARN_LONG:
097fb8e2 510 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
511 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045 512 break;
513 case UTF8_WARN_FFFF:
514 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
515 break;
516 default:
517 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
518 break;
519 }
520
521 if (warning) {
522 char *s = SvPVX(sv);
523
524 if (PL_op)
9014280d 525 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 526 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 527 else
9014280d 528 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045 529 }
530 }
531
ba210ebe 532 if (retlen)
28d3d195 533 *retlen = expectlen ? expectlen : len;
ba210ebe 534
28d3d195 535 return 0;
a0ed51b3 536}
537
8e84507e 538/*
37607a96 539=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3 540
541Returns the native character value of the first character in the string C<s>
1e54db1a 542which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 543length, in bytes, of that character.
544
1e54db1a 545If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3 546returned and retlen is set, if possible, to -1.
547
548=cut
549*/
550
551UV
37607a96 552Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 553{
872c91ae 554 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
555 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3 556}
557
558/*
37607a96 559=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3 560
561Returns the Unicode code point of the first character in the string C<s>
1e54db1a 562which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3 563length, in bytes, of that character.
564
565This function should only be used when returned UV is considered
566an index into the Unicode semantic tables (e.g. swashes).
567
1e54db1a 568If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 569returned and retlen is set, if possible, to -1.
8e84507e 570
571=cut
572*/
573
574UV
37607a96 575Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 576{
9041c2e3 577 /* Call the low level routine asking for checks */
872c91ae 578 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
579 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e 580}
581
b76347f2 582/*
37607a96 583=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2 584
585Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 586Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
587up past C<e>, croaks.
b76347f2 588
589=cut
590*/
591
592STRLEN
37607a96 593Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2 594{
595 STRLEN len = 0;
596
8850bf83 597 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
598 * the bitops (especially ~) can create illegal UTF-8.
599 * In other words: in Perl UTF-8 is not just for Unicode. */
600
901b21bf 601 if (e < s) {
602 if (ckWARN_d(WARN_UTF8)) {
603 if (PL_op)
604 Perl_warner(aTHX_ packWARN(WARN_UTF8),
605 "%s in %s", unees, OP_DESC(PL_op));
606 else
607 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
608 }
609 return 0;
610 }
b76347f2 611 while (s < e) {
02eb7b47 612 U8 t = UTF8SKIP(s);
b76347f2 613
901b21bf 614 if (e - s < t) {
615 if (ckWARN_d(WARN_UTF8)) {
616 if (PL_op)
617 Perl_warner(aTHX_ packWARN(WARN_UTF8),
618 unees, OP_DESC(PL_op));
619 else
620 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
621 }
622 return len;
623 }
b76347f2 624 s += t;
625 len++;
626 }
627
628 return len;
629}
630
b06226ff 631/*
eebe1485 632=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 633
1e54db1a 634Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff 635and C<b>.
636
637WARNING: use only if you *know* that the pointers point inside the
638same UTF-8 buffer.
639
37607a96 640=cut
641*/
a0ed51b3 642
02eb7b47 643IV
864dbfa3 644Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 645{
02eb7b47 646 IV off = 0;
647
8850bf83 648 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
649 * the bitops (especially ~) can create illegal UTF-8.
650 * In other words: in Perl UTF-8 is not just for Unicode. */
651
a0ed51b3 652 if (a < b) {
653 while (a < b) {
02eb7b47 654 U8 c = UTF8SKIP(a);
655
901b21bf 656 if (b - a < c) {
657 if (ckWARN_d(WARN_UTF8)) {
658 if (PL_op)
659 Perl_warner(aTHX_ packWARN(WARN_UTF8),
660 "%s in %s", unees, OP_DESC(PL_op));
661 else
662 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
663 }
664 return off;
665 }
02eb7b47 666 a += c;
a0ed51b3 667 off--;
668 }
669 }
670 else {
671 while (b < a) {
02eb7b47 672 U8 c = UTF8SKIP(b);
673
901b21bf 674 if (a - b < c) {
675 if (ckWARN_d(WARN_UTF8)) {
676 if (PL_op)
677 Perl_warner(aTHX_ packWARN(WARN_UTF8),
678 "%s in %s", unees, OP_DESC(PL_op));
679 else
680 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
681 }
682 return off;
683 }
02eb7b47 684 b += c;
a0ed51b3 685 off++;
686 }
687 }
02eb7b47 688
a0ed51b3 689 return off;
690}
691
b06226ff 692/*
37607a96 693=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 694
8850bf83 695Return the UTF-8 pointer C<s> displaced by C<off> characters, either
696forward or backward.
b06226ff 697
698WARNING: do not use the following unless you *know* C<off> is within
8850bf83 699the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
700on the first byte of character or just after the last byte of a character.
b06226ff 701
37607a96 702=cut
703*/
a0ed51b3 704
705U8 *
864dbfa3 706Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 707{
8850bf83 708 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
709 * the bitops (especially ~) can create illegal UTF-8.
710 * In other words: in Perl UTF-8 is not just for Unicode. */
711
a0ed51b3 712 if (off >= 0) {
713 while (off--)
714 s += UTF8SKIP(s);
715 }
716 else {
717 while (off++) {
718 s--;
8850bf83 719 while (UTF8_IS_CONTINUATION(*s))
720 s--;
a0ed51b3 721 }
722 }
723 return s;
724}
725
6940069f 726/*
eebe1485 727=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 728
1e54db1a 729Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53 730Unlike C<bytes_to_utf8>, this over-writes the original string, and
731updates len to contain the new length.
67e989fb 732Returns zero on failure, setting C<len> to -1.
6940069f 733
734=cut
735*/
736
737U8 *
37607a96 738Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 739{
6940069f 740 U8 *send;
741 U8 *d;
dcad2880 742 U8 *save = s;
246fae53 743
1e54db1a 744 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880 745 for (send = s + *len; s < send; ) {
746 U8 c = *s++;
747
1d72bdf6 748 if (!UTF8_IS_INVARIANT(c) &&
749 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
750 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 751 *len = -1;
752 return 0;
753 }
246fae53 754 }
dcad2880 755
756 d = s = save;
6940069f 757 while (s < send) {
ed646e6e 758 STRLEN ulen;
9041c2e3 759 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 760 s += ulen;
6940069f 761 }
762 *d = '\0';
246fae53 763 *len = d - save;
6940069f 764 return save;
765}
766
767/*
f9a63242 768=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
769
1e54db1a 770Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
f9a63242 771Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 772the newly-created string, and updates C<len> to contain the new
773length. Returns the original string if no conversion occurs, C<len>
774is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7750 if C<s> is converted or contains all 7bit characters.
f9a63242 776
37607a96 777=cut
778*/
f9a63242 779
780U8 *
37607a96 781Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 782{
f9a63242 783 U8 *d;
784 U8 *start = s;
db42d148 785 U8 *send;
f9a63242 786 I32 count = 0;
787
788 if (!*is_utf8)
789 return start;
790
1e54db1a 791 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 792 for (send = s + *len; s < send;) {
793 U8 c = *s++;
1d72bdf6 794 if (!UTF8_IS_INVARIANT(c)) {
db42d148 795 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
796 (c = *s++) && UTF8_IS_CONTINUATION(c))
797 count++;
798 else
f9a63242 799 return start;
db42d148 800 }
f9a63242 801 }
802
803 *is_utf8 = 0;
804
f9a63242 805 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 806 s = start; start = d;
f9a63242 807 while (s < send) {
808 U8 c = *s++;
c4d5f83a 809 if (!UTF8_IS_INVARIANT(c)) {
810 /* Then it is two-byte encoded */
811 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
812 c = ASCII_TO_NATIVE(c);
813 }
814 *d++ = c;
f9a63242 815 }
816 *d = '\0';
817 *len = d - start;
818 return start;
819}
820
821/*
eebe1485 822=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 823
1e54db1a 824Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e 825Returns a pointer to the newly-created string, and sets C<len> to
826reflect the new length.
6940069f 827
1e54db1a 828If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f 829see sv_recode_to_utf8().
830
497711e7 831=cut
6940069f 832*/
833
834U8*
37607a96 835Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 836{
6940069f 837 U8 *send;
838 U8 *d;
839 U8 *dst;
6662521e 840 send = s + (*len);
6940069f 841
6662521e 842 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 843 dst = d;
844
845 while (s < send) {
db42d148 846 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 847 if (UNI_IS_INVARIANT(uv))
eb160463 848 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 849 else {
eb160463 850 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
851 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f 852 }
853 }
854 *d = '\0';
6662521e 855 *len = d-dst;
6940069f 856 return dst;
857}
858
a0ed51b3 859/*
dea0fc0b 860 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 861 *
862 * Destination must be pre-extended to 3/2 source. Do not use in-place.
863 * We optimize for native, for obvious reasons. */
864
865U8*
dea0fc0b 866Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 867{
dea0fc0b 868 U8* pend;
869 U8* dstart = d;
870
871 if (bytelen & 1)
a7867d0a 872 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 873
874 pend = p + bytelen;
875
a0ed51b3 876 while (p < pend) {
dea0fc0b 877 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
878 p += 2;
a0ed51b3 879 if (uv < 0x80) {
eb160463 880 *d++ = (U8)uv;
a0ed51b3 881 continue;
882 }
883 if (uv < 0x800) {
eb160463 884 *d++ = (U8)(( uv >> 6) | 0xc0);
885 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 886 continue;
887 }
888 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e 889 UV low = (p[0] << 8) + p[1];
890 p += 2;
dea0fc0b 891 if (low < 0xdc00 || low >= 0xdfff)
892 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 893 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
894 }
895 if (uv < 0x10000) {
eb160463 896 *d++ = (U8)(( uv >> 12) | 0xe0);
897 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
898 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 899 continue;
900 }
901 else {
eb160463 902 *d++ = (U8)(( uv >> 18) | 0xf0);
903 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
904 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
905 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3 906 continue;
907 }
908 }
dea0fc0b 909 *newlen = d - dstart;
a0ed51b3 910 return d;
911}
912
913/* Note: this one is slightly destructive of the source. */
914
915U8*
dea0fc0b 916Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 917{
918 U8* s = (U8*)p;
919 U8* send = s + bytelen;
920 while (s < send) {
921 U8 tmp = s[0];
922 s[0] = s[1];
923 s[1] = tmp;
924 s += 2;
925 }
dea0fc0b 926 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 927}
928
929/* for now these are all defined (inefficiently) in terms of the utf8 versions */
930
931bool
84afefe6 932Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 933{
ad391ad9 934 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 935 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 936 return is_utf8_alnum(tmpbuf);
937}
938
939bool
84afefe6 940Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 941{
ad391ad9 942 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 943 uvchr_to_utf8(tmpbuf, c);
b8c5462f 944 return is_utf8_alnumc(tmpbuf);
945}
946
947bool
84afefe6 948Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 949{
ad391ad9 950 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 951 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 952 return is_utf8_idfirst(tmpbuf);
953}
954
955bool
84afefe6 956Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 957{
ad391ad9 958 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 959 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 960 return is_utf8_alpha(tmpbuf);
961}
962
963bool
84afefe6 964Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 965{
ad391ad9 966 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 967 uvchr_to_utf8(tmpbuf, c);
4d61ec05 968 return is_utf8_ascii(tmpbuf);
969}
970
971bool
84afefe6 972Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 973{
ad391ad9 974 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 975 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 976 return is_utf8_space(tmpbuf);
977}
978
979bool
84afefe6 980Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 981{
ad391ad9 982 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 983 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 984 return is_utf8_digit(tmpbuf);
985}
986
987bool
84afefe6 988Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 989{
ad391ad9 990 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 991 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 992 return is_utf8_upper(tmpbuf);
993}
994
995bool
84afefe6 996Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 997{
ad391ad9 998 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 999 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1000 return is_utf8_lower(tmpbuf);
1001}
1002
1003bool
84afefe6 1004Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1005{
ad391ad9 1006 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1007 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1008 return is_utf8_cntrl(tmpbuf);
1009}
1010
1011bool
84afefe6 1012Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1013{
ad391ad9 1014 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1015 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1016 return is_utf8_graph(tmpbuf);
1017}
1018
1019bool
84afefe6 1020Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1021{
ad391ad9 1022 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1023 uvchr_to_utf8(tmpbuf, c);
a0ed51b3 1024 return is_utf8_print(tmpbuf);
1025}
1026
b8c5462f 1027bool
84afefe6 1028Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1029{
ad391ad9 1030 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1031 uvchr_to_utf8(tmpbuf, c);
b8c5462f 1032 return is_utf8_punct(tmpbuf);
1033}
1034
4d61ec05 1035bool
84afefe6 1036Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1037{
e7ae6809 1038 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 1039 uvchr_to_utf8(tmpbuf, c);
4d61ec05 1040 return is_utf8_xdigit(tmpbuf);
1041}
1042
84afefe6 1043UV
1044Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1045{
0ebc6274 1046 uvchr_to_utf8(p, c);
1047 return to_utf8_upper(p, p, lenp);
a0ed51b3 1048}
1049
84afefe6 1050UV
1051Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1052{
0ebc6274 1053 uvchr_to_utf8(p, c);
1054 return to_utf8_title(p, p, lenp);
a0ed51b3 1055}
1056
84afefe6 1057UV
1058Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1059{
0ebc6274 1060 uvchr_to_utf8(p, c);
1061 return to_utf8_lower(p, p, lenp);
a0ed51b3 1062}
1063
84afefe6 1064UV
1065Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1066{
0ebc6274 1067 uvchr_to_utf8(p, c);
1068 return to_utf8_fold(p, p, lenp);
84afefe6 1069}
1070
a0ed51b3 1071/* for now these all assume no locale info available for Unicode > 255 */
1072
1073bool
84afefe6 1074Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1075{
1076 return is_uni_alnum(c); /* XXX no locale support yet */
1077}
1078
1079bool
84afefe6 1080Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 1081{
1082 return is_uni_alnumc(c); /* XXX no locale support yet */
1083}
1084
1085bool
84afefe6 1086Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1087{
1088 return is_uni_idfirst(c); /* XXX no locale support yet */
1089}
1090
1091bool
84afefe6 1092Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1093{
1094 return is_uni_alpha(c); /* XXX no locale support yet */
1095}
1096
1097bool
84afefe6 1098Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1099{
1100 return is_uni_ascii(c); /* XXX no locale support yet */
1101}
1102
1103bool
84afefe6 1104Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1105{
1106 return is_uni_space(c); /* XXX no locale support yet */
1107}
1108
1109bool
84afefe6 1110Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1111{
1112 return is_uni_digit(c); /* XXX no locale support yet */
1113}
1114
1115bool
84afefe6 1116Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1117{
1118 return is_uni_upper(c); /* XXX no locale support yet */
1119}
1120
1121bool
84afefe6 1122Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1123{
1124 return is_uni_lower(c); /* XXX no locale support yet */
1125}
1126
1127bool
84afefe6 1128Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1129{
1130 return is_uni_cntrl(c); /* XXX no locale support yet */
1131}
1132
1133bool
84afefe6 1134Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1135{
1136 return is_uni_graph(c); /* XXX no locale support yet */
1137}
1138
1139bool
84afefe6 1140Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1141{
1142 return is_uni_print(c); /* XXX no locale support yet */
1143}
1144
b8c5462f 1145bool
84afefe6 1146Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1147{
1148 return is_uni_punct(c); /* XXX no locale support yet */
1149}
1150
4d61ec05 1151bool
84afefe6 1152Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1153{
1154 return is_uni_xdigit(c); /* XXX no locale support yet */
1155}
1156
b7ac61fa 1157U32
1158Perl_to_uni_upper_lc(pTHX_ U32 c)
1159{
ee099d14 1160 /* XXX returns only the first character -- do not use XXX */
1161 /* XXX no locale support yet */
1162 STRLEN len;
1163 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1164 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa 1165}
1166
1167U32
1168Perl_to_uni_title_lc(pTHX_ U32 c)
1169{
ee099d14 1170 /* XXX returns only the first character XXX -- do not use XXX */
1171 /* XXX no locale support yet */
1172 STRLEN len;
1173 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1174 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa 1175}
1176
1177U32
1178Perl_to_uni_lower_lc(pTHX_ U32 c)
1179{
ee099d14 1180 /* XXX returns only the first character -- do not use XXX */
1181 /* XXX no locale support yet */
1182 STRLEN len;
1183 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1184 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa 1185}
1186
a0ed51b3 1187bool
864dbfa3 1188Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1189{
386d01d6 1190 if (!is_utf8_char(p))
1191 return FALSE;
a0ed51b3 1192 if (!PL_utf8_alnum)
289d4f09 1193 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1194 * descendant of isalnum(3), in other words, it doesn't
1195 * contain the '_'. --jhi */
1196 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1197 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1198/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1199#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1200 if (!PL_utf8_alnum)
1201 PL_utf8_alnum = swash_init("utf8", "",
1202 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1203 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3 1204#endif
1205}
1206
1207bool
b8c5462f 1208Perl_is_utf8_alnumc(pTHX_ U8 *p)
1209{
386d01d6 1210 if (!is_utf8_char(p))
1211 return FALSE;
b8c5462f 1212 if (!PL_utf8_alnum)
1213 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1214 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1215/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1216#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1217 if (!PL_utf8_alnum)
1218 PL_utf8_alnum = swash_init("utf8", "",
1219 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1220 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f 1221#endif
1222}
1223
1224bool
82686b01 1225Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
a0ed51b3 1226{
82686b01 1227 if (*p == '_')
1228 return TRUE;
1229 if (!is_utf8_char(p))
1230 return FALSE;
1231 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1232 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1233 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01 1234}
1235
1236bool
1237Perl_is_utf8_idcont(pTHX_ U8 *p)
1238{
1239 if (*p == '_')
1240 return TRUE;
1241 if (!is_utf8_char(p))
1242 return FALSE;
1243 if (!PL_utf8_idcont)
1244 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1245 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3 1246}
1247
1248bool
864dbfa3 1249Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1250{
386d01d6 1251 if (!is_utf8_char(p))
1252 return FALSE;
a0ed51b3 1253 if (!PL_utf8_alpha)
e24b16f9 1254 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1255 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3 1256}
1257
1258bool
b8c5462f 1259Perl_is_utf8_ascii(pTHX_ U8 *p)
1260{
386d01d6 1261 if (!is_utf8_char(p))
1262 return FALSE;
b8c5462f 1263 if (!PL_utf8_ascii)
1264 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1265 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f 1266}
1267
1268bool
864dbfa3 1269Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1270{
386d01d6 1271 if (!is_utf8_char(p))
1272 return FALSE;
a0ed51b3 1273 if (!PL_utf8_space)
3bec3564 1274 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1275 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3 1276}
1277
1278bool
864dbfa3 1279Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1280{
386d01d6 1281 if (!is_utf8_char(p))
1282 return FALSE;
a0ed51b3 1283 if (!PL_utf8_digit)
e24b16f9 1284 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1285 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3 1286}
1287
1288bool
864dbfa3 1289Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1290{
386d01d6 1291 if (!is_utf8_char(p))
1292 return FALSE;
a0ed51b3 1293 if (!PL_utf8_upper)
c65e4d19 1294 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1295 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3 1296}
1297
1298bool
864dbfa3 1299Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1300{
386d01d6 1301 if (!is_utf8_char(p))
1302 return FALSE;
a0ed51b3 1303 if (!PL_utf8_lower)
c65e4d19 1304 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1305 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3 1306}
1307
1308bool
b8c5462f 1309Perl_is_utf8_cntrl(pTHX_ U8 *p)
1310{
386d01d6 1311 if (!is_utf8_char(p))
1312 return FALSE;
b8c5462f 1313 if (!PL_utf8_cntrl)
1314 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1315 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f 1316}
1317
1318bool
1319Perl_is_utf8_graph(pTHX_ U8 *p)
1320{
386d01d6 1321 if (!is_utf8_char(p))
1322 return FALSE;
b8c5462f 1323 if (!PL_utf8_graph)
1324 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1325 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f 1326}
1327
1328bool
864dbfa3 1329Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1330{
386d01d6 1331 if (!is_utf8_char(p))
1332 return FALSE;
a0ed51b3 1333 if (!PL_utf8_print)
e24b16f9 1334 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1335 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3 1336}
1337
1338bool
b8c5462f 1339Perl_is_utf8_punct(pTHX_ U8 *p)
1340{
386d01d6 1341 if (!is_utf8_char(p))
1342 return FALSE;
b8c5462f 1343 if (!PL_utf8_punct)
1344 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1345 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f 1346}
1347
1348bool
1349Perl_is_utf8_xdigit(pTHX_ U8 *p)
1350{
386d01d6 1351 if (!is_utf8_char(p))
1352 return FALSE;
b8c5462f 1353 if (!PL_utf8_xdigit)
1354 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1355 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f 1356}
1357
1358bool
864dbfa3 1359Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1360{
386d01d6 1361 if (!is_utf8_char(p))
1362 return FALSE;
a0ed51b3 1363 if (!PL_utf8_mark)
e24b16f9 1364 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1365 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3 1366}
1367
6b5c0936 1368/*
1369=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1370
1371The "p" contains the pointer to the UTF-8 string encoding
1372the character that is being converted.
1373
1374The "ustrp" is a pointer to the character buffer to put the
1375conversion result to. The "lenp" is a pointer to the length
1376of the result.
1377
0134edef 1378The "swashp" is a pointer to the swash to use.
6b5c0936 1379
0134edef 1380Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1381and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1382but not always, a multicharacter mapping), is tried first.
6b5c0936 1383
0134edef 1384The "special" is a string like "utf8::ToSpecLower", which means the
1385hash %utf8::ToSpecLower. The access to the hash is through
1386Perl_to_utf8_case().
6b5c0936 1387
0134edef 1388The "normal" is a string like "ToLower" which means the swash
1389%utf8::ToLower.
1390
1391=cut */
6b5c0936 1392
2104c8d9 1393UV
a6872d42 1394Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1395{
0134edef 1396 UV uv0, uv1;
2f9475ad 1397 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
0134edef 1398 STRLEN len = 0;
a0ed51b3 1399
1feea2c7 1400 uv0 = utf8_to_uvchr(p, 0);
1401 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1402 * are necessary in EBCDIC, they are redundant no-ops
1403 * in ASCII-ish platforms, and hopefully optimized away. */
1404 uv1 = NATIVE_TO_UNI(uv0);
1405 uvuni_to_utf8(tmpbuf, uv1);
0134edef 1406
1407 if (!*swashp) /* load on-demand */
1408 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1409
b08cf34e 1410 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1411 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1412 /* It might be "special" (sometimes, but not always,
2a37f04d 1413 * a multicharacter mapping) */
983ffd37 1414 HV *hv;
b08cf34e 1415 SV **svp;
1416
1417 if ((hv = get_hv(special, FALSE)) &&
1418 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1419 (*svp)) {
1420 char *s;
47654450 1421
b08cf34e 1422 s = SvPV(*svp, len);
47654450 1423 if (len == 1)
1424 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1425 else {
2f9475ad 1426#ifdef EBCDIC
1427 /* If we have EBCDIC we need to remap the characters
1428 * since any characters in the low 256 are Unicode
1429 * code points, not EBCDIC. */
7cda7a3d 1430 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad 1431
1432 d = tmpbuf;
b08cf34e 1433 if (SvUTF8(*svp)) {
2f9475ad 1434 STRLEN tlen = 0;
1435
1436 while (t < tend) {
1437 UV c = utf8_to_uvchr(t, &tlen);
1438 if (tlen > 0) {
1439 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1440 t += tlen;
1441 }
1442 else
1443 break;
1444 }
1445 }
1446 else {
36fec512 1447 while (t < tend) {
1448 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1449 t++;
1450 }
2f9475ad 1451 }
1452 len = d - tmpbuf;
1453 Copy(tmpbuf, ustrp, len, U8);
1454#else
d2dcd0fb 1455 Copy(s, ustrp, len, U8);
2f9475ad 1456#endif
29e98929 1457 }
983ffd37 1458 }
0134edef 1459 }
1460
1461 if (!len && *swashp) {
1462 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1463
1464 if (uv2) {
1465 /* It was "normal" (a single character mapping). */
1466 UV uv3 = UNI_TO_NATIVE(uv2);
1467
e9101d72 1468 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d 1469 }
1470 }
1feea2c7 1471
0134edef 1472 if (!len) /* Neither: just copy. */
1473 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1474
2a37f04d 1475 if (lenp)
1476 *lenp = len;
1477
0134edef 1478 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3 1479}
1480
d3e79532 1481/*
1482=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1483
1484Convert the UTF-8 encoded character at p to its uppercase version and
1485store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1486that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1487uppercase version may be longer than the original character (up to two
1488characters).
1489
1490The first character of the uppercased version is returned
1491(but note, as explained above, that there may be more.)
1492
1493=cut */
1494
2104c8d9 1495UV
983ffd37 1496Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1497{
983ffd37 1498 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1499 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1500}
a0ed51b3 1501
d3e79532 1502/*
1503=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1504
1505Convert the UTF-8 encoded character at p to its titlecase version and
1506store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1507that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1508titlecase version may be longer than the original character (up to two
1509characters).
1510
1511The first character of the titlecased version is returned
1512(but note, as explained above, that there may be more.)
1513
1514=cut */
1515
983ffd37 1516UV
1517Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1518{
1519 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1520 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1521}
1522
d3e79532 1523/*
1524=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1525
1526Convert the UTF-8 encoded character at p to its lowercase version and
1527store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1528that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1529lowercase version may be longer than the original character (up to two
1530characters).
1531
1532The first character of the lowercased version is returned
1533(but note, as explained above, that there may be more.)
1534
1535=cut */
1536
2104c8d9 1537UV
a2a2844f 1538Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1539{
983ffd37 1540 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1541 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1542}
1543
d3e79532 1544/*
1545=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1546
1547Convert the UTF-8 encoded character at p to its foldcase version and
1548store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1549that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1550foldcase version may be longer than the original character (up to
1551three characters).
1552
1553The first character of the foldcased version is returned
1554(but note, as explained above, that there may be more.)
1555
1556=cut */
1557
b4e400f9 1558UV
1559Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1560{
1561 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1562 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1563}
1564
1565/* a "swash" is a swatch hash */
1566
1567SV*
864dbfa3 1568Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1569{
1570 SV* retval;
bf1fed83 1571 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1572 dSP;
71bed85a 1573 size_t pkg_len = strlen(pkg);
1574 size_t name_len = strlen(name);
1575 HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1576 SV* errsv_save;
ce3b816e 1577
1b026014 1578 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1579 ENTER;
f8be5cf0 1580 errsv_save = newSVsv(ERRSV);
71bed85a 1581 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1582 Nullsv);
f8be5cf0 1583 if (!SvTRUE(ERRSV))
1584 sv_setsv(ERRSV, errsv_save);
1585 SvREFCNT_dec(errsv_save);
ce3b816e 1586 LEAVE;
1587 }
1588 SPAGAIN;
a0ed51b3 1589 PUSHSTACKi(PERLSI_MAGIC);
1590 PUSHMARK(SP);
1591 EXTEND(SP,5);
71bed85a 1592 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1593 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3 1594 PUSHs(listsv);
1595 PUSHs(sv_2mortal(newSViv(minbits)));
1596 PUSHs(sv_2mortal(newSViv(none)));
1597 PUTBACK;
1598 ENTER;
1599 SAVEI32(PL_hints);
1600 PL_hints = 0;
1601 save_re_context();
923e4eb5 1602 if (IN_PERL_COMPILETIME) {
bf1fed83 1603 /* XXX ought to be handled by lex_start */
82686b01 1604 SAVEI32(PL_in_my);
2b4bd638 1605 PL_in_my = 0;
bf1fed83 1606 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1607 }
f8be5cf0 1608 errsv_save = newSVsv(ERRSV);
864dbfa3 1609 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1610 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1611 else
e24b16f9 1612 retval = &PL_sv_undef;
f8be5cf0 1613 if (!SvTRUE(ERRSV))
1614 sv_setsv(ERRSV, errsv_save);
1615 SvREFCNT_dec(errsv_save);
a0ed51b3 1616 LEAVE;
1617 POPSTACK;
923e4eb5 1618 if (IN_PERL_COMPILETIME) {
bf1fed83 1619 STRLEN len;
1620 char* pv = SvPV(tokenbufsv, len);
1621
1622 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1623 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1624 }
bc45ce41 1625 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1626 if (SvPOK(retval))
35c1215d 1627 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1628 retval);
cea2e8a9 1629 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1630 }
a0ed51b3 1631 return retval;
1632}
1633
035d37be 1634
1635/* This API is wrong for special case conversions since we may need to
1636 * return several Unicode characters for a single Unicode character
1637 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1638 * the lower-level routine, and it is similarly broken for returning
1639 * multiple values. --jhi */
a0ed51b3 1640UV
3568d838 1641Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3 1642{
1643 HV* hv = (HV*)SvRV(sv);
3568d838 1644 U32 klen;
1645 U32 off;
a0ed51b3 1646 STRLEN slen;
7d85a32c 1647 STRLEN needents;
4ea42e7f 1648 U8 *tmps = NULL;
a0ed51b3 1649 U32 bit;
1650 SV *retval;
3568d838 1651 U8 tmputf8[2];
1652 UV c = NATIVE_TO_ASCII(*ptr);
1653
1654 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463 1655 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1656 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838 1657 ptr = tmputf8;
1658 }
1659 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1660 * then the "swatch" is a vec() for al the chars which start
1661 * with 0xAA..0xYY
1662 * So the key in the hash (klen) is length of encoded char -1
1663 */
1664 klen = UTF8SKIP(ptr) - 1;
1665 off = ptr[klen];
a0ed51b3 1666
7d85a32c 1667 if (klen == 0)
1668 {
1669 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1670 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1671 */
1672 needents = UTF_CONTINUATION_MARK;
1673 off = NATIVE_TO_UTF(ptr[klen]);
1674 }
1675 else
1676 {
1677 /* If char is encoded then swatch is for the prefix */
1678 needents = (1 << UTF_ACCUMULATION_SHIFT);
1679 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1680 }
1681
a0ed51b3 1682 /*
1683 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1684 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1685 * it's nothing to sniff at.) Pity we usually come through at least
1686 * two function calls to get here...
1687 *
1688 * NB: this code assumes that swatches are never modified, once generated!
1689 */
1690
3568d838 1691 if (hv == PL_last_swash_hv &&
a0ed51b3 1692 klen == PL_last_swash_klen &&
3568d838 1693 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1694 {
1695 tmps = PL_last_swash_tmps;
1696 slen = PL_last_swash_slen;
1697 }
1698 else {
1699 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1700 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1701
1702 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1703 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1704 dSP;
2b9d42f0 1705 /* We use utf8n_to_uvuni() as we want an index into
1706 Unicode tables, not a native character number.
1707 */
872c91ae 1708 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1709 ckWARN(WARN_UTF8) ?
1710 0 : UTF8_ALLOW_ANY);
f8be5cf0 1711 SV *errsv_save;
a0ed51b3 1712 ENTER;
1713 SAVETMPS;
1714 save_re_context();
1715 PUSHSTACKi(PERLSI_MAGIC);
1716 PUSHMARK(SP);
1717 EXTEND(SP,3);
1718 PUSHs((SV*)sv);
ffbc6a93 1719 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1720 PUSHs(sv_2mortal(newSViv((klen) ?
1721 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1722 PUSHs(sv_2mortal(newSViv(needents)));
1723 PUTBACK;
f8be5cf0 1724 errsv_save = newSVsv(ERRSV);
864dbfa3 1725 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1726 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1727 else
e24b16f9 1728 retval = &PL_sv_undef;
f8be5cf0 1729 if (!SvTRUE(ERRSV))
1730 sv_setsv(ERRSV, errsv_save);
1731 SvREFCNT_dec(errsv_save);
a0ed51b3 1732 POPSTACK;
1733 FREETMPS;
1734 LEAVE;
923e4eb5 1735 if (IN_PERL_COMPILETIME)
eb160463 1736 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1737
dfe13c55 1738 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1739
7d85a32c 1740 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1741 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1742 }
1743
1744 PL_last_swash_hv = hv;
1745 PL_last_swash_klen = klen;
1746 PL_last_swash_tmps = tmps;
1747 PL_last_swash_slen = slen;
1748 if (klen)
1749 Copy(ptr, PL_last_swash_key, klen, U8);
1750 }
1751
9faf8d75 1752 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1753 case 1:
1754 bit = 1 << (off & 7);
1755 off >>= 3;
1756 return (tmps[off] & bit) != 0;
1757 case 8:
1758 return tmps[off];
1759 case 16:
1760 off <<= 1;
1761 return (tmps[off] << 8) + tmps[off + 1] ;
1762 case 32:
1763 off <<= 2;
1764 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1765 }
cea2e8a9 1766 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1767 return 0;
1768}
2b9d42f0 1769
1770
1771/*
37607a96 1772=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1773
1e54db1a 1774Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2b9d42f0 1775of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1776bytes available. The return value is the pointer to the byte after the
1777end of the new character. In other words,
1778
1779 d = uvchr_to_utf8(d, uv);
1780
1781is the recommended wide native character-aware way of saying
1782
1783 *(d++) = uv;
1784
1785=cut
1786*/
1787
1788/* On ASCII machines this is normally a macro but we want a
1789 real function in case XS code wants it
1790*/
1791#undef Perl_uvchr_to_utf8
1792U8 *
1793Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1794{
b851fbc1 1795 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0 1796}
1797
b851fbc1 1798U8 *
1799Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1800{
1801 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1802}
2b9d42f0 1803
1804/*
37607a96 1805=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1806
1807Returns the native character value of the first character in the string C<s>
1e54db1a 1808which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0 1809length, in bytes, of that character.
1810
1811Allows length and flags to be passed to low level routine.
1812
1813=cut
1814*/
0a2ef054 1815/* On ASCII machines this is normally a macro but we want
1816 a real function in case XS code wants it
2b9d42f0 1817*/
1818#undef Perl_utf8n_to_uvchr
1819UV
37607a96 1820Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1821{
1822 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1823 return UNI_TO_NATIVE(uv);
1824}
1825
d2cc3551 1826/*
1827=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1828
1829Build to the scalar dsv a displayable version of the string spv,
1830length len, the displayable version being at most pvlim bytes long
1831(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1832
9e55ce06 1833The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1834isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054 1835to display the \\[nrfta\\] as the backslashed versions (like '\n')
1836(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1837UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1838UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1839
d2cc3551 1840The pointer to the PV of the dsv is returned.
1841
1842=cut */
e6b2e755 1843char *
1844Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1845{
1846 int truncated = 0;
1847 char *s, *e;
1848
1849 sv_setpvn(dsv, "", 0);
1850 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1851 UV u;
c728cb41 1852 bool ok = FALSE;
1853
e6b2e755 1854 if (pvlim && SvCUR(dsv) >= pvlim) {
1855 truncated++;
1856 break;
1857 }
1858 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1859 if (u < 256) {
c728cb41 1860 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1861 switch (u & 0xFF) {
1862 case '\n':
1863 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1864 case '\r':
1865 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1866 case '\t':
1867 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1868 case '\f':
1869 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1870 case '\a':
1871 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1872 case '\\':
d79a7a3d 1873 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
c728cb41 1874 default: break;
1875 }
1876 }
00e86452 1877 /* isPRINT() is the locale-blind version. */
1878 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
2c4547fe 1879 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
0a2ef054 1880 ok = TRUE;
1881 }
c728cb41 1882 }
1883 if (!ok)
9e55ce06 1884 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755 1885 }
1886 if (truncated)
1887 sv_catpvn(dsv, "...", 3);
1888
1889 return SvPVX(dsv);
1890}
2b9d42f0 1891
d2cc3551 1892/*
1893=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1894
1895Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1896the displayable version being at most pvlim bytes long
d2cc3551 1897(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1898
1899The flags argument is as in pv_uni_display().
1900
d2cc3551 1901The pointer to the PV of the dsv is returned.
1902
1903=cut */
e6b2e755 1904char *
1905Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1906{
701a277b 1907 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1908 pvlim, flags);
1909}
1910
d2cc3551 1911/*
d07ddd77 1912=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551 1913
1914Return true if the strings s1 and s2 differ case-insensitively, false
1915if not (if they are equal case-insensitively). If u1 is true, the
1916string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77 1917the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1918are false, the respective string is assumed to be in native 8-bit
1919encoding.
1920
1921If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1922in there (they will point at the beginning of the I<next> character).
1923If the pointers behind pe1 or pe2 are non-NULL, they are the end
1924pointers beyond which scanning will not continue under any
1925circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1926s2+l2 will be used as goal end pointers that will also stop the scan,
1927and which qualify towards defining a successful match: all the scans
1928that define an explicit length must reach their goal pointers for
1929a match to succeed).
d2cc3551 1930
1931For case-insensitiveness, the "casefolding" of Unicode is used
1932instead of upper/lowercasing both the characters, see
1933http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1934
1935=cut */
701a277b 1936I32
d07ddd77 1937Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1938{
5469e704 1939 register U8 *p1 = (U8*)s1;
1940 register U8 *p2 = (U8*)s2;
d07ddd77 1941 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1942 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1943 STRLEN n1 = 0, n2 = 0;
ffce6cc2 1944 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1945 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
d7f013c8 1946 U8 natbuf[1+1];
1947 STRLEN foldlen1, foldlen2;
d07ddd77 1948 bool match;
332ddc25 1949
d07ddd77 1950 if (pe1)
1951 e1 = *(U8**)pe1;
eb160463 1952 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
d07ddd77 1953 f1 = (U8*)s1 + l1;
1954 if (pe2)
1955 e2 = *(U8**)pe2;
eb160463 1956 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
d07ddd77 1957 f2 = (U8*)s2 + l2;
1958
1959 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1960 return 1; /* mismatch; possible infinite loop or false positive */
1961
a6872d42 1962 if (!u1 || !u2)
1963 natbuf[1] = 0; /* Need to terminate the buffer. */
1964
d07ddd77 1965 while ((e1 == 0 || p1 < e1) &&
1966 (f1 == 0 || p1 < f1) &&
1967 (e2 == 0 || p2 < e2) &&
1968 (f2 == 0 || p2 < f2)) {
1969 if (n1 == 0) {
d7f013c8 1970 if (u1)
1971 to_utf8_fold(p1, foldbuf1, &foldlen1);
1972 else {
f5cee151 1973 natbuf[0] = *p1;
d7f013c8 1974 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1975 }
1976 q1 = foldbuf1;
d07ddd77 1977 n1 = foldlen1;
332ddc25 1978 }
d07ddd77 1979 if (n2 == 0) {
d7f013c8 1980 if (u2)
1981 to_utf8_fold(p2, foldbuf2, &foldlen2);
1982 else {
f5cee151 1983 natbuf[0] = *p2;
d7f013c8 1984 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1985 }
1986 q2 = foldbuf2;
d07ddd77 1987 n2 = foldlen2;
332ddc25 1988 }
d07ddd77 1989 while (n1 && n2) {
1990 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1991 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1992 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1993 return 1; /* mismatch */
d07ddd77 1994 n1 -= UTF8SKIP(q1);
d7f013c8 1995 q1 += UTF8SKIP(q1);
d07ddd77 1996 n2 -= UTF8SKIP(q2);
d7f013c8 1997 q2 += UTF8SKIP(q2);
701a277b 1998 }
d07ddd77 1999 if (n1 == 0)
d7f013c8 2000 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2001 if (n2 == 0)
d7f013c8 2002 p2 += u2 ? UTF8SKIP(p2) : 1;
2003
d2cc3551 2004 }
5469e704 2005
d07ddd77 2006 /* A match is defined by all the scans that specified
2007 * an explicit length reaching their final goals. */
2008 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704 2009
2010 if (match) {
d07ddd77 2011 if (pe1)
2012 *pe1 = (char*)p1;
2013 if (pe2)
2014 *pe2 = (char*)p2;
5469e704 2015 }
2016
2017 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2018}
701a277b 2019