Re: [PATCH] ...while $var = glob(...)
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
37607a96 30=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
eebe1485 31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485 38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
507b9800 49 if (ckWARN_d(WARN_UTF8)) {
50 if (UNICODE_IS_SURROGATE(uv))
51 Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
52 else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
53 (uv == 0xFFFE || uv == 0xFFFF))
54 Perl_warner(aTHX_ WARN_UTF8,
55 "Unicode character 0x%04"UVxf" is illegal", uv);
56 }
c4d5f83a 57 if (UNI_IS_INVARIANT(uv)) {
58 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3 59 return d;
60 }
2d331972 61#if defined(EBCDIC)
1d72bdf6 62 else {
63 STRLEN len = UNISKIP(uv);
64 U8 *p = d+len-1;
65 while (p > d) {
66 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
67 uv >>= UTF_ACCUMULATION_SHIFT;
68 }
69 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
70 return d+len;
71 }
72#else /* Non loop style */
a0ed51b3 73 if (uv < 0x800) {
74 *d++ = (( uv >> 6) | 0xc0);
75 *d++ = (( uv & 0x3f) | 0x80);
76 return d;
77 }
78 if (uv < 0x10000) {
79 *d++ = (( uv >> 12) | 0xe0);
80 *d++ = (((uv >> 6) & 0x3f) | 0x80);
81 *d++ = (( uv & 0x3f) | 0x80);
82 return d;
83 }
84 if (uv < 0x200000) {
85 *d++ = (( uv >> 18) | 0xf0);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
89 return d;
90 }
91 if (uv < 0x4000000) {
92 *d++ = (( uv >> 24) | 0xf8);
93 *d++ = (((uv >> 18) & 0x3f) | 0x80);
94 *d++ = (((uv >> 12) & 0x3f) | 0x80);
95 *d++ = (((uv >> 6) & 0x3f) | 0x80);
96 *d++ = (( uv & 0x3f) | 0x80);
97 return d;
98 }
99 if (uv < 0x80000000) {
100 *d++ = (( uv >> 30) | 0xfc);
101 *d++ = (((uv >> 24) & 0x3f) | 0x80);
102 *d++ = (((uv >> 18) & 0x3f) | 0x80);
103 *d++ = (((uv >> 12) & 0x3f) | 0x80);
104 *d++ = (((uv >> 6) & 0x3f) | 0x80);
105 *d++ = (( uv & 0x3f) | 0x80);
106 return d;
107 }
6b8eaf93 108#ifdef HAS_QUAD
d7578b48 109 if (uv < UTF8_QUAD_MAX)
a0ed51b3 110#endif
111 {
112 *d++ = 0xfe; /* Can't match U+FEFF! */
113 *d++ = (((uv >> 30) & 0x3f) | 0x80);
114 *d++ = (((uv >> 24) & 0x3f) | 0x80);
115 *d++ = (((uv >> 18) & 0x3f) | 0x80);
116 *d++ = (((uv >> 12) & 0x3f) | 0x80);
117 *d++ = (((uv >> 6) & 0x3f) | 0x80);
118 *d++ = (( uv & 0x3f) | 0x80);
119 return d;
120 }
6b8eaf93 121#ifdef HAS_QUAD
a0ed51b3 122 {
123 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b 124 *d++ = 0x80; /* 6 Reserved bits */
125 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
126 *d++ = (((uv >> 54) & 0x3f) | 0x80);
127 *d++ = (((uv >> 48) & 0x3f) | 0x80);
128 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3 129 *d++ = (((uv >> 36) & 0x3f) | 0x80);
130 *d++ = (((uv >> 30) & 0x3f) | 0x80);
131 *d++ = (((uv >> 24) & 0x3f) | 0x80);
132 *d++ = (((uv >> 18) & 0x3f) | 0x80);
133 *d++ = (((uv >> 12) & 0x3f) | 0x80);
134 *d++ = (((uv >> 6) & 0x3f) | 0x80);
135 *d++ = (( uv & 0x3f) | 0x80);
136 return d;
137 }
138#endif
1d72bdf6 139#endif /* Loop style */
a0ed51b3 140}
141
9041c2e3 142
143
144/*
eebe1485 145=for apidoc A|STRLEN|is_utf8_char|U8 *s
146
5da9da9e 147Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 148character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e 149The actual number of bytes in the UTF-8 character will be returned if
150it is valid, otherwise 0.
9041c2e3 151
37607a96 152=cut
153*/
067a85ef 154STRLEN
386d01d6 155Perl_is_utf8_char(pTHX_ U8 *s)
156{
157 U8 u = *s;
067a85ef 158 STRLEN slen, len;
159 UV uv, ouv;
386d01d6 160
1d72bdf6 161 if (UTF8_IS_INVARIANT(u))
386d01d6 162 return 1;
163
60006e79 164 if (!UTF8_IS_START(u))
386d01d6 165 return 0;
166
9f07fdcd 167 len = UTF8SKIP(s);
386d01d6 168
60006e79 169 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 170 return 0;
171
386d01d6 172 slen = len - 1;
173 s++;
6eb6869e 174 u &= UTF_START_MASK(len);
1d72bdf6 175 uv = u;
067a85ef 176 ouv = uv;
386d01d6 177 while (slen--) {
60006e79 178 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 179 return 0;
8850bf83 180 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 181 if (uv < ouv)
067a85ef 182 return 0;
183 ouv = uv;
386d01d6 184 s++;
185 }
067a85ef 186
5bbb0b5a 187 if (UNISKIP(uv) < len)
067a85ef 188 return 0;
189
386d01d6 190 return len;
191}
192
6662521e 193/*
eebe1485 194=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 195
5da9da9e 196Returns true if first C<len> bytes of the given string form a valid UTF8
197string, false otherwise. Note that 'a valid UTF8 string' does not mean
198'a string that contains UTF8' because a valid ASCII string is a valid
199UTF8 string.
6662521e 200
201=cut
202*/
203
8e84507e 204bool
6662521e 205Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
206{
067a85ef 207 U8* x = s;
1aa99e6b 208 U8* send;
067a85ef 209 STRLEN c;
210
1aa99e6b 211 if (!len)
6cd5fe39 212 len = strlen((char *)s);
1aa99e6b 213 send = s + len;
214
6662521e 215 while (x < send) {
216 c = is_utf8_char(x);
067a85ef 217 if (!c)
218 return FALSE;
6662521e 219 x += c;
6662521e 220 }
60006e79 221 if (x != send)
222 return FALSE;
067a85ef 223
224 return TRUE;
6662521e 225}
226
67e989fb 227/*
9041c2e3 228=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 229
9041c2e3 230Bottom level UTF-8 decode routine.
231Returns the unicode code point value of the first character in the string C<s>
ba210ebe 232which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 233C<retlen> will be set to the length, in bytes, of that character.
67e989fb 234
235If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880 236is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
237it is assumed that the caller will raise a warning, and this function
28d3d195 238will silently just set C<retlen> to C<-1> and return zero. If the
239C<flags> does not contain UTF8_CHECK_ONLY, warnings about
240malformations will be given, C<retlen> will be set to the expected
241length of the UTF-8 character in bytes, and zero will be returned.
242
243The C<flags> can also contain various flags to allow deviations from
244the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 245
9041c2e3 246Most code should use utf8_to_uvchr() rather than call this directly.
247
37607a96 248=cut
249*/
67e989fb 250
a0ed51b3 251UV
37607a96 252Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 253{
9c5ffd7c 254 UV uv = *s, ouv = 0;
ba210ebe 255 STRLEN len = 1;
256 bool dowarn = ckWARN_d(WARN_UTF8);
257 STRLEN expectlen = 0;
a0dbb045 258 U32 warning = 0;
259
260/* This list is a superset of the UTF8_ALLOW_XXX. */
261
262#define UTF8_WARN_EMPTY 1
263#define UTF8_WARN_CONTINUATION 2
264#define UTF8_WARN_NON_CONTINUATION 3
265#define UTF8_WARN_FE_FF 4
266#define UTF8_WARN_SHORT 5
267#define UTF8_WARN_OVERFLOW 6
268#define UTF8_WARN_SURROGATE 7
269#define UTF8_WARN_BOM 8
270#define UTF8_WARN_LONG 9
271#define UTF8_WARN_FFFF 10
272
273 if (curlen == 0 &&
274 !(flags & UTF8_ALLOW_EMPTY)) {
275 warning = UTF8_WARN_EMPTY;
0c443dc2 276 goto malformed;
277 }
278
1d72bdf6 279 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 280 if (retlen)
281 *retlen = 1;
c4d5f83a 282 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 283 }
67e989fb 284
421a8bf2 285 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 286 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 287 warning = UTF8_WARN_CONTINUATION;
ba210ebe 288 goto malformed;
289 }
290
421a8bf2 291 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 292 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 293 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 294 goto malformed;
295 }
9041c2e3 296
1d72bdf6 297#ifdef EBCDIC
75383841 298 uv = NATIVE_TO_UTF(uv);
1d72bdf6 299#else
fcc8fcf6 300 if ((uv == 0xfe || uv == 0xff) &&
301 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 302 warning = UTF8_WARN_FE_FF;
ba210ebe 303 goto malformed;
a0ed51b3 304 }
1d72bdf6 305#endif
306
ba210ebe 307 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
308 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
309 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
310 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 311#ifdef EBCDIC
312 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
313 else { len = 7; uv &= 0x01; }
314#else
ba210ebe 315 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
316 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 317 else { len = 13; uv = 0; } /* whoa! */
318#endif
319
a0ed51b3 320 if (retlen)
321 *retlen = len;
9041c2e3 322
ba210ebe 323 expectlen = len;
324
fcc8fcf6 325 if ((curlen < expectlen) &&
326 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 327 warning = UTF8_WARN_SHORT;
ba210ebe 328 goto malformed;
329 }
330
331 len--;
a0ed51b3 332 s++;
ba210ebe 333 ouv = uv;
334
a0ed51b3 335 while (len--) {
421a8bf2 336 if (!UTF8_IS_CONTINUATION(*s) &&
337 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 338 s--;
339 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 340 goto malformed;
a0ed51b3 341 }
342 else
8850bf83 343 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 344 if (!(uv > ouv)) {
345 /* These cannot be allowed. */
346 if (uv == ouv) {
347 if (!(flags & UTF8_ALLOW_LONG)) {
348 warning = UTF8_WARN_LONG;
349 goto malformed;
350 }
351 }
352 else { /* uv < ouv */
353 /* This cannot be allowed. */
354 warning = UTF8_WARN_OVERFLOW;
355 goto malformed;
356 }
ba210ebe 357 }
358 s++;
359 ouv = uv;
360 }
361
421a8bf2 362 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 363 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 364 warning = UTF8_WARN_SURROGATE;
ba210ebe 365 goto malformed;
421a8bf2 366 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 367 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 368 warning = UTF8_WARN_BOM;
ba210ebe 369 goto malformed;
fcc8fcf6 370 } else if ((expectlen > UNISKIP(uv)) &&
371 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 372 warning = UTF8_WARN_LONG;
ba210ebe 373 goto malformed;
421a8bf2 374 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 375 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 376 warning = UTF8_WARN_FFFF;
a9917092 377 goto malformed;
a0ed51b3 378 }
ba210ebe 379
a0ed51b3 380 return uv;
ba210ebe 381
382malformed:
383
fcc8fcf6 384 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 385 if (retlen)
cc366d4b 386 *retlen = -1;
ba210ebe 387 return 0;
388 }
389
a0dbb045 390 if (dowarn) {
391 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
392
393 switch (warning) {
394 case 0: /* Intentionally empty. */ break;
395 case UTF8_WARN_EMPTY:
396 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
397 break;
398 case UTF8_WARN_CONTINUATION:
399 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
400 break;
401 case UTF8_WARN_NON_CONTINUATION:
402 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
403 (UV)s[1], uv);
404 break;
405 case UTF8_WARN_FE_FF:
406 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
407 break;
408 case UTF8_WARN_SHORT:
409 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
410 curlen, curlen == 1 ? "" : "s", expectlen);
b31f83c2 411 expectlen = curlen; /* distance for caller to skip */
a0dbb045 412 break;
413 case UTF8_WARN_OVERFLOW:
414 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
415 ouv, *s);
416 break;
417 case UTF8_WARN_SURROGATE:
418 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
419 break;
420 case UTF8_WARN_BOM:
421 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
422 break;
423 case UTF8_WARN_LONG:
424 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
425 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
426 break;
427 case UTF8_WARN_FFFF:
428 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
429 break;
430 default:
431 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
432 break;
433 }
434
435 if (warning) {
436 char *s = SvPVX(sv);
437
438 if (PL_op)
439 Perl_warner(aTHX_ WARN_UTF8,
53e06cf0 440 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 441 else
442 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
443 }
444 }
445
ba210ebe 446 if (retlen)
28d3d195 447 *retlen = expectlen ? expectlen : len;
ba210ebe 448
28d3d195 449 return 0;
a0ed51b3 450}
451
8e84507e 452/*
37607a96 453=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3 454
455Returns the native character value of the first character in the string C<s>
456which is assumed to be in UTF8 encoding; C<retlen> will be set to the
457length, in bytes, of that character.
458
459If C<s> does not point to a well-formed UTF8 character, zero is
460returned and retlen is set, if possible, to -1.
461
462=cut
463*/
464
465UV
37607a96 466Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 467{
468 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
469}
470
471/*
37607a96 472=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3 473
474Returns the Unicode code point of the first character in the string C<s>
475which is assumed to be in UTF8 encoding; C<retlen> will be set to the
476length, in bytes, of that character.
477
478This function should only be used when returned UV is considered
479an index into the Unicode semantic tables (e.g. swashes).
480
ba210ebe 481If C<s> does not point to a well-formed UTF8 character, zero is
482returned and retlen is set, if possible, to -1.
8e84507e 483
484=cut
485*/
486
487UV
37607a96 488Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 489{
9041c2e3 490 /* Call the low level routine asking for checks */
491 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e 492}
493
b76347f2 494/*
37607a96 495=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2 496
497Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 498Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
499up past C<e>, croaks.
b76347f2 500
501=cut
502*/
503
504STRLEN
37607a96 505Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2 506{
507 STRLEN len = 0;
508
8850bf83 509 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
510 * the bitops (especially ~) can create illegal UTF-8.
511 * In other words: in Perl UTF-8 is not just for Unicode. */
512
b76347f2 513 if (e < s)
02eb7b47 514 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 515 while (s < e) {
02eb7b47 516 U8 t = UTF8SKIP(s);
b76347f2 517
518 if (e - s < t)
0064a8a9 519 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2 520 s += t;
521 len++;
522 }
523
524 return len;
525}
526
b06226ff 527/*
eebe1485 528=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 529
530Returns the number of UTF8 characters between the UTF-8 pointers C<a>
531and C<b>.
532
533WARNING: use only if you *know* that the pointers point inside the
534same UTF-8 buffer.
535
37607a96 536=cut
537*/
a0ed51b3 538
02eb7b47 539IV
864dbfa3 540Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 541{
02eb7b47 542 IV off = 0;
543
8850bf83 544 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
545 * the bitops (especially ~) can create illegal UTF-8.
546 * In other words: in Perl UTF-8 is not just for Unicode. */
547
a0ed51b3 548 if (a < b) {
549 while (a < b) {
02eb7b47 550 U8 c = UTF8SKIP(a);
551
552 if (b - a < c)
553 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
554 a += c;
a0ed51b3 555 off--;
556 }
557 }
558 else {
559 while (b < a) {
02eb7b47 560 U8 c = UTF8SKIP(b);
561
562 if (a - b < c)
563 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
564 b += c;
a0ed51b3 565 off++;
566 }
567 }
02eb7b47 568
a0ed51b3 569 return off;
570}
571
b06226ff 572/*
37607a96 573=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 574
8850bf83 575Return the UTF-8 pointer C<s> displaced by C<off> characters, either
576forward or backward.
b06226ff 577
578WARNING: do not use the following unless you *know* C<off> is within
8850bf83 579the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
580on the first byte of character or just after the last byte of a character.
b06226ff 581
37607a96 582=cut
583*/
a0ed51b3 584
585U8 *
864dbfa3 586Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 587{
8850bf83 588 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
589 * the bitops (especially ~) can create illegal UTF-8.
590 * In other words: in Perl UTF-8 is not just for Unicode. */
591
a0ed51b3 592 if (off >= 0) {
593 while (off--)
594 s += UTF8SKIP(s);
595 }
596 else {
597 while (off++) {
598 s--;
8850bf83 599 while (UTF8_IS_CONTINUATION(*s))
600 s--;
a0ed51b3 601 }
602 }
603 return s;
604}
605
6940069f 606/*
eebe1485 607=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 608
246fae53 609Converts a string C<s> of length C<len> from UTF8 into byte encoding.
610Unlike C<bytes_to_utf8>, this over-writes the original string, and
611updates len to contain the new length.
67e989fb 612Returns zero on failure, setting C<len> to -1.
6940069f 613
614=cut
615*/
616
617U8 *
37607a96 618Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 619{
6940069f 620 U8 *send;
621 U8 *d;
dcad2880 622 U8 *save = s;
246fae53 623
624 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880 625 for (send = s + *len; s < send; ) {
626 U8 c = *s++;
627
1d72bdf6 628 if (!UTF8_IS_INVARIANT(c) &&
629 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
630 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 631 *len = -1;
632 return 0;
633 }
246fae53 634 }
dcad2880 635
636 d = s = save;
6940069f 637 while (s < send) {
ed646e6e 638 STRLEN ulen;
9041c2e3 639 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 640 s += ulen;
6940069f 641 }
642 *d = '\0';
246fae53 643 *len = d - save;
6940069f 644 return save;
645}
646
647/*
f9a63242 648=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
649
650Converts a string C<s> of length C<len> from UTF8 into byte encoding.
651Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 652the newly-created string, and updates C<len> to contain the new
653length. Returns the original string if no conversion occurs, C<len>
654is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6550 if C<s> is converted or contains all 7bit characters.
f9a63242 656
37607a96 657=cut
658*/
f9a63242 659
660U8 *
37607a96 661Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 662{
f9a63242 663 U8 *d;
664 U8 *start = s;
db42d148 665 U8 *send;
f9a63242 666 I32 count = 0;
667
668 if (!*is_utf8)
669 return start;
670
ef9edfd0 671 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242 672 for (send = s + *len; s < send;) {
673 U8 c = *s++;
1d72bdf6 674 if (!UTF8_IS_INVARIANT(c)) {
db42d148 675 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
676 (c = *s++) && UTF8_IS_CONTINUATION(c))
677 count++;
678 else
f9a63242 679 return start;
db42d148 680 }
f9a63242 681 }
682
683 *is_utf8 = 0;
684
f9a63242 685 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 686 s = start; start = d;
f9a63242 687 while (s < send) {
688 U8 c = *s++;
c4d5f83a 689 if (!UTF8_IS_INVARIANT(c)) {
690 /* Then it is two-byte encoded */
691 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
692 c = ASCII_TO_NATIVE(c);
693 }
694 *d++ = c;
f9a63242 695 }
696 *d = '\0';
697 *len = d - start;
698 return start;
699}
700
701/*
eebe1485 702=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 703
704Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e 705Returns a pointer to the newly-created string, and sets C<len> to
706reflect the new length.
6940069f 707
497711e7 708=cut
6940069f 709*/
710
711U8*
37607a96 712Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 713{
6940069f 714 U8 *send;
715 U8 *d;
716 U8 *dst;
6662521e 717 send = s + (*len);
6940069f 718
6662521e 719 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 720 dst = d;
721
722 while (s < send) {
db42d148 723 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 724 if (UNI_IS_INVARIANT(uv))
725 *d++ = UTF_TO_NATIVE(uv);
6940069f 726 else {
90f44359 727 *d++ = UTF8_EIGHT_BIT_HI(uv);
728 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f 729 }
730 }
731 *d = '\0';
6662521e 732 *len = d-dst;
6940069f 733 return dst;
734}
735
a0ed51b3 736/*
dea0fc0b 737 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 738 *
739 * Destination must be pre-extended to 3/2 source. Do not use in-place.
740 * We optimize for native, for obvious reasons. */
741
742U8*
dea0fc0b 743Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 744{
dea0fc0b 745 U8* pend;
746 U8* dstart = d;
747
748 if (bytelen & 1)
a7867d0a 749 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 750
751 pend = p + bytelen;
752
a0ed51b3 753 while (p < pend) {
dea0fc0b 754 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
755 p += 2;
a0ed51b3 756 if (uv < 0x80) {
757 *d++ = uv;
758 continue;
759 }
760 if (uv < 0x800) {
761 *d++ = (( uv >> 6) | 0xc0);
762 *d++ = (( uv & 0x3f) | 0x80);
763 continue;
764 }
765 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b 766 UV low = *p++;
767 if (low < 0xdc00 || low >= 0xdfff)
768 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 769 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
770 }
771 if (uv < 0x10000) {
772 *d++ = (( uv >> 12) | 0xe0);
773 *d++ = (((uv >> 6) & 0x3f) | 0x80);
774 *d++ = (( uv & 0x3f) | 0x80);
775 continue;
776 }
777 else {
778 *d++ = (( uv >> 18) | 0xf0);
779 *d++ = (((uv >> 12) & 0x3f) | 0x80);
780 *d++ = (((uv >> 6) & 0x3f) | 0x80);
781 *d++ = (( uv & 0x3f) | 0x80);
782 continue;
783 }
784 }
dea0fc0b 785 *newlen = d - dstart;
a0ed51b3 786 return d;
787}
788
789/* Note: this one is slightly destructive of the source. */
790
791U8*
dea0fc0b 792Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 793{
794 U8* s = (U8*)p;
795 U8* send = s + bytelen;
796 while (s < send) {
797 U8 tmp = s[0];
798 s[0] = s[1];
799 s[1] = tmp;
800 s += 2;
801 }
dea0fc0b 802 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 803}
804
805/* for now these are all defined (inefficiently) in terms of the utf8 versions */
806
807bool
84afefe6 808Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 809{
ad391ad9 810 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 811 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 812 return is_utf8_alnum(tmpbuf);
813}
814
815bool
84afefe6 816Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 817{
ad391ad9 818 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 819 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 820 return is_utf8_alnumc(tmpbuf);
821}
822
823bool
84afefe6 824Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 825{
ad391ad9 826 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 827 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 828 return is_utf8_idfirst(tmpbuf);
829}
830
831bool
84afefe6 832Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 833{
ad391ad9 834 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 835 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 836 return is_utf8_alpha(tmpbuf);
837}
838
839bool
84afefe6 840Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 841{
ad391ad9 842 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 843 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05 844 return is_utf8_ascii(tmpbuf);
845}
846
847bool
84afefe6 848Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 849{
ad391ad9 850 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 851 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 852 return is_utf8_space(tmpbuf);
853}
854
855bool
84afefe6 856Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 857{
ad391ad9 858 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 859 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 860 return is_utf8_digit(tmpbuf);
861}
862
863bool
84afefe6 864Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 865{
ad391ad9 866 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 867 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 868 return is_utf8_upper(tmpbuf);
869}
870
871bool
84afefe6 872Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 873{
ad391ad9 874 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 875 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 876 return is_utf8_lower(tmpbuf);
877}
878
879bool
84afefe6 880Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 881{
ad391ad9 882 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 883 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 884 return is_utf8_cntrl(tmpbuf);
885}
886
887bool
84afefe6 888Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 889{
ad391ad9 890 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 891 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 892 return is_utf8_graph(tmpbuf);
893}
894
895bool
84afefe6 896Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 897{
ad391ad9 898 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 899 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 900 return is_utf8_print(tmpbuf);
901}
902
b8c5462f 903bool
84afefe6 904Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 905{
ad391ad9 906 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 907 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 908 return is_utf8_punct(tmpbuf);
909}
910
4d61ec05 911bool
84afefe6 912Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 913{
e7ae6809 914 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 915 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05 916 return is_utf8_xdigit(tmpbuf);
917}
918
84afefe6 919UV
920Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 921{
e7ae6809 922 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 923 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 924 return to_utf8_upper(tmpbuf, p, lenp);
a0ed51b3 925}
926
84afefe6 927UV
928Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 929{
e7ae6809 930 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 931 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 932 return to_utf8_title(tmpbuf, p, lenp);
a0ed51b3 933}
934
84afefe6 935UV
936Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 937{
d90824e8 938 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 939 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 940 return to_utf8_lower(tmpbuf, p, lenp);
a0ed51b3 941}
942
84afefe6 943UV
944Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
945{
d90824e8 946 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
84afefe6 947 uvchr_to_utf8(tmpbuf, (UV)c);
948 return to_utf8_fold(tmpbuf, p, lenp);
949}
950
a0ed51b3 951/* for now these all assume no locale info available for Unicode > 255 */
952
953bool
84afefe6 954Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 955{
956 return is_uni_alnum(c); /* XXX no locale support yet */
957}
958
959bool
84afefe6 960Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 961{
962 return is_uni_alnumc(c); /* XXX no locale support yet */
963}
964
965bool
84afefe6 966Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 967{
968 return is_uni_idfirst(c); /* XXX no locale support yet */
969}
970
971bool
84afefe6 972Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 973{
974 return is_uni_alpha(c); /* XXX no locale support yet */
975}
976
977bool
84afefe6 978Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 979{
980 return is_uni_ascii(c); /* XXX no locale support yet */
981}
982
983bool
84afefe6 984Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 985{
986 return is_uni_space(c); /* XXX no locale support yet */
987}
988
989bool
84afefe6 990Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 991{
992 return is_uni_digit(c); /* XXX no locale support yet */
993}
994
995bool
84afefe6 996Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 997{
998 return is_uni_upper(c); /* XXX no locale support yet */
999}
1000
1001bool
84afefe6 1002Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1003{
1004 return is_uni_lower(c); /* XXX no locale support yet */
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1009{
1010 return is_uni_cntrl(c); /* XXX no locale support yet */
1011}
1012
1013bool
84afefe6 1014Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1015{
1016 return is_uni_graph(c); /* XXX no locale support yet */
1017}
1018
1019bool
84afefe6 1020Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1021{
1022 return is_uni_print(c); /* XXX no locale support yet */
1023}
1024
b8c5462f 1025bool
84afefe6 1026Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1027{
1028 return is_uni_punct(c); /* XXX no locale support yet */
1029}
1030
4d61ec05 1031bool
84afefe6 1032Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1033{
1034 return is_uni_xdigit(c); /* XXX no locale support yet */
1035}
1036
a0ed51b3 1037bool
864dbfa3 1038Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1039{
386d01d6 1040 if (!is_utf8_char(p))
1041 return FALSE;
a0ed51b3 1042 if (!PL_utf8_alnum)
289d4f09 1043 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1044 * descendant of isalnum(3), in other words, it doesn't
1045 * contain the '_'. --jhi */
1046 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1047 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1048/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1049#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1050 if (!PL_utf8_alnum)
1051 PL_utf8_alnum = swash_init("utf8", "",
1052 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1053 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1054#endif
1055}
1056
1057bool
b8c5462f 1058Perl_is_utf8_alnumc(pTHX_ U8 *p)
1059{
386d01d6 1060 if (!is_utf8_char(p))
1061 return FALSE;
b8c5462f 1062 if (!PL_utf8_alnum)
1063 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1064 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1065/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1066#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1067 if (!PL_utf8_alnum)
1068 PL_utf8_alnum = swash_init("utf8", "",
1069 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1070 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1071#endif
1072}
1073
1074bool
864dbfa3 1075Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3 1076{
1077 return *p == '_' || is_utf8_alpha(p);
1078}
1079
1080bool
864dbfa3 1081Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1082{
386d01d6 1083 if (!is_utf8_char(p))
1084 return FALSE;
a0ed51b3 1085 if (!PL_utf8_alpha)
e24b16f9 1086 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1087 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3 1088}
1089
1090bool
b8c5462f 1091Perl_is_utf8_ascii(pTHX_ U8 *p)
1092{
386d01d6 1093 if (!is_utf8_char(p))
1094 return FALSE;
b8c5462f 1095 if (!PL_utf8_ascii)
1096 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1097 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f 1098}
1099
1100bool
864dbfa3 1101Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1102{
386d01d6 1103 if (!is_utf8_char(p))
1104 return FALSE;
a0ed51b3 1105 if (!PL_utf8_space)
3bec3564 1106 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1107 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3 1108}
1109
1110bool
864dbfa3 1111Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1112{
386d01d6 1113 if (!is_utf8_char(p))
1114 return FALSE;
a0ed51b3 1115 if (!PL_utf8_digit)
e24b16f9 1116 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1117 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3 1118}
1119
1120bool
864dbfa3 1121Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1122{
386d01d6 1123 if (!is_utf8_char(p))
1124 return FALSE;
a0ed51b3 1125 if (!PL_utf8_upper)
e24b16f9 1126 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1127 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3 1128}
1129
1130bool
864dbfa3 1131Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1132{
386d01d6 1133 if (!is_utf8_char(p))
1134 return FALSE;
a0ed51b3 1135 if (!PL_utf8_lower)
e24b16f9 1136 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1137 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3 1138}
1139
1140bool
b8c5462f 1141Perl_is_utf8_cntrl(pTHX_ U8 *p)
1142{
386d01d6 1143 if (!is_utf8_char(p))
1144 return FALSE;
b8c5462f 1145 if (!PL_utf8_cntrl)
1146 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1147 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f 1148}
1149
1150bool
1151Perl_is_utf8_graph(pTHX_ U8 *p)
1152{
386d01d6 1153 if (!is_utf8_char(p))
1154 return FALSE;
b8c5462f 1155 if (!PL_utf8_graph)
1156 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1157 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f 1158}
1159
1160bool
864dbfa3 1161Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1162{
386d01d6 1163 if (!is_utf8_char(p))
1164 return FALSE;
a0ed51b3 1165 if (!PL_utf8_print)
e24b16f9 1166 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1167 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3 1168}
1169
1170bool
b8c5462f 1171Perl_is_utf8_punct(pTHX_ U8 *p)
1172{
386d01d6 1173 if (!is_utf8_char(p))
1174 return FALSE;
b8c5462f 1175 if (!PL_utf8_punct)
1176 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1177 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f 1178}
1179
1180bool
1181Perl_is_utf8_xdigit(pTHX_ U8 *p)
1182{
386d01d6 1183 if (!is_utf8_char(p))
1184 return FALSE;
b8c5462f 1185 if (!PL_utf8_xdigit)
1186 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1187 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f 1188}
1189
1190bool
864dbfa3 1191Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1192{
386d01d6 1193 if (!is_utf8_char(p))
1194 return FALSE;
a0ed51b3 1195 if (!PL_utf8_mark)
e24b16f9 1196 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1197 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3 1198}
1199
6b5c0936 1200/*
1201=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1202
1203The "p" contains the pointer to the UTF-8 string encoding
1204the character that is being converted.
1205
1206The "ustrp" is a pointer to the character buffer to put the
1207conversion result to. The "lenp" is a pointer to the length
1208of the result.
1209
1210The "swash" is a pointer to the swash to use.
1211
1212The "normal" is a string like "ToLower" which means the swash
1213$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1214and loaded by SWASHGET, using lib/utf8_heavy.pl.
1215
1216The "special" is a string like "utf8::ToSpecLower", which means
d1be9408 1217the hash %utf8::ToSpecLower, which is stored in the same file,
6b5c0936 1218lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1219to the hash is by Perl_to_utf8_case().
1220
1221=cut
1222 */
1223
2104c8d9 1224UV
983ffd37 1225Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
a0ed51b3 1226{
1227 UV uv;
1228
983ffd37 1229 if (!*swashp)
1230 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1231 uv = swash_fetch(*swashp, p, TRUE);
1232 if (uv)
1233 uv = UNI_TO_NATIVE(uv);
1234 else {
1235 HV *hv;
1236 SV *keysv;
1237 HE *he;
1238
1239 uv = utf8_to_uvchr(p, 0);
1240
1241 if ((hv = get_hv(special, FALSE)) &&
6d47b937 1242 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
983ffd37 1243 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1244 SV *val = HeVAL(he);
1245 char *s = SvPV(val, *lenp);
1246 U8 c = *(U8*)s;
1247 if (*lenp > 1 || UNI_IS_INVARIANT(c))
1248 Copy(s, ustrp, *lenp, U8);
1249 else {
1250 /* something in the 0x80..0xFF range */
1251 ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1252 ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1253 *lenp = 2;
1254 }
1255 return 0;
1256 }
1257 }
a58d912c 1258 if (lenp)
d90824e8 1259 *lenp = UNISKIP(uv);
a2a2844f 1260 uvuni_to_utf8(ustrp, uv);
1261 return uv;
a0ed51b3 1262}
1263
2104c8d9 1264UV
983ffd37 1265Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1266{
983ffd37 1267 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1268 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1269}
a0ed51b3 1270
983ffd37 1271UV
1272Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1273{
1274 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1275 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1276}
1277
2104c8d9 1278UV
a2a2844f 1279Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1280{
983ffd37 1281 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1282 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1283}
1284
1285UV
1286Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1287{
1288 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1289 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1290}
1291
1292/* a "swash" is a swatch hash */
1293
1294SV*
864dbfa3 1295Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1296{
1297 SV* retval;
bf1fed83 1298 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1299 dSP;
1b026014 1300 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1301 SV* errsv_save;
ce3b816e 1302
1b026014 1303 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1304 ENTER;
f8be5cf0 1305 errsv_save = newSVsv(ERRSV);
ce3b816e 1306 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0 1307 if (!SvTRUE(ERRSV))
1308 sv_setsv(ERRSV, errsv_save);
1309 SvREFCNT_dec(errsv_save);
ce3b816e 1310 LEAVE;
1311 }
1312 SPAGAIN;
a0ed51b3 1313 PUSHSTACKi(PERLSI_MAGIC);
1314 PUSHMARK(SP);
1315 EXTEND(SP,5);
1316 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1317 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1318 PUSHs(listsv);
1319 PUSHs(sv_2mortal(newSViv(minbits)));
1320 PUSHs(sv_2mortal(newSViv(none)));
1321 PUTBACK;
1322 ENTER;
1323 SAVEI32(PL_hints);
1324 PL_hints = 0;
1325 save_re_context();
bf1fed83 1326 if (PL_curcop == &PL_compiling)
1327 /* XXX ought to be handled by lex_start */
1328 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1329 errsv_save = newSVsv(ERRSV);
864dbfa3 1330 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1331 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1332 else
e24b16f9 1333 retval = &PL_sv_undef;
f8be5cf0 1334 if (!SvTRUE(ERRSV))
1335 sv_setsv(ERRSV, errsv_save);
1336 SvREFCNT_dec(errsv_save);
a0ed51b3 1337 LEAVE;
1338 POPSTACK;
e24b16f9 1339 if (PL_curcop == &PL_compiling) {
bf1fed83 1340 STRLEN len;
1341 char* pv = SvPV(tokenbufsv, len);
1342
1343 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1344 PL_curcop->op_private = PL_hints;
a0ed51b3 1345 }
1346 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1347 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3 1348 return retval;
1349}
1350
035d37be 1351
1352/* This API is wrong for special case conversions since we may need to
1353 * return several Unicode characters for a single Unicode character
1354 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1355 * the lower-level routine, and it is similarly broken for returning
1356 * multiple values. --jhi */
a0ed51b3 1357UV
3568d838 1358Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3 1359{
1360 HV* hv = (HV*)SvRV(sv);
3568d838 1361 U32 klen;
1362 U32 off;
a0ed51b3 1363 STRLEN slen;
7d85a32c 1364 STRLEN needents;
4ea42e7f 1365 U8 *tmps = NULL;
a0ed51b3 1366 U32 bit;
1367 SV *retval;
3568d838 1368 U8 tmputf8[2];
1369 UV c = NATIVE_TO_ASCII(*ptr);
1370
1371 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1372 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1373 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1374 ptr = tmputf8;
1375 }
1376 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1377 * then the "swatch" is a vec() for al the chars which start
1378 * with 0xAA..0xYY
1379 * So the key in the hash (klen) is length of encoded char -1
1380 */
1381 klen = UTF8SKIP(ptr) - 1;
1382 off = ptr[klen];
a0ed51b3 1383
7d85a32c 1384 if (klen == 0)
1385 {
1386 /* If char in invariant then swatch is for all the invariant chars
1387 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1388 */
1389 needents = UTF_CONTINUATION_MARK;
1390 off = NATIVE_TO_UTF(ptr[klen]);
1391 }
1392 else
1393 {
1394 /* If char is encoded then swatch is for the prefix */
1395 needents = (1 << UTF_ACCUMULATION_SHIFT);
1396 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1397 }
1398
a0ed51b3 1399 /*
1400 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1401 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1402 * it's nothing to sniff at.) Pity we usually come through at least
1403 * two function calls to get here...
1404 *
1405 * NB: this code assumes that swatches are never modified, once generated!
1406 */
1407
3568d838 1408 if (hv == PL_last_swash_hv &&
a0ed51b3 1409 klen == PL_last_swash_klen &&
3568d838 1410 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1411 {
1412 tmps = PL_last_swash_tmps;
1413 slen = PL_last_swash_slen;
1414 }
1415 else {
1416 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1417 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1418
1419 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1420 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1421 dSP;
2b9d42f0 1422 /* We use utf8n_to_uvuni() as we want an index into
1423 Unicode tables, not a native character number.
1424 */
1425 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
f8be5cf0 1426 SV *errsv_save;
a0ed51b3 1427 ENTER;
1428 SAVETMPS;
1429 save_re_context();
1430 PUSHSTACKi(PERLSI_MAGIC);
1431 PUSHMARK(SP);
1432 EXTEND(SP,3);
1433 PUSHs((SV*)sv);
ffbc6a93 1434 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1435 PUSHs(sv_2mortal(newSViv((klen) ?
1436 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1437 PUSHs(sv_2mortal(newSViv(needents)));
1438 PUTBACK;
f8be5cf0 1439 errsv_save = newSVsv(ERRSV);
864dbfa3 1440 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1441 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1442 else
e24b16f9 1443 retval = &PL_sv_undef;
f8be5cf0 1444 if (!SvTRUE(ERRSV))
1445 sv_setsv(ERRSV, errsv_save);
1446 SvREFCNT_dec(errsv_save);
a0ed51b3 1447 POPSTACK;
1448 FREETMPS;
1449 LEAVE;
e24b16f9 1450 if (PL_curcop == &PL_compiling)
1451 PL_curcop->op_private = PL_hints;
a0ed51b3 1452
dfe13c55 1453 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1454
7d85a32c 1455 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1456 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1457 }
1458
1459 PL_last_swash_hv = hv;
1460 PL_last_swash_klen = klen;
1461 PL_last_swash_tmps = tmps;
1462 PL_last_swash_slen = slen;
1463 if (klen)
1464 Copy(ptr, PL_last_swash_key, klen, U8);
1465 }
1466
9faf8d75 1467 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1468 case 1:
1469 bit = 1 << (off & 7);
1470 off >>= 3;
1471 return (tmps[off] & bit) != 0;
1472 case 8:
1473 return tmps[off];
1474 case 16:
1475 off <<= 1;
1476 return (tmps[off] << 8) + tmps[off + 1] ;
1477 case 32:
1478 off <<= 2;
1479 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1480 }
cea2e8a9 1481 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1482 return 0;
1483}
2b9d42f0 1484
1485
1486/*
37607a96 1487=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1488
1489Adds the UTF8 representation of the Native codepoint C<uv> to the end
1490of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1491bytes available. The return value is the pointer to the byte after the
1492end of the new character. In other words,
1493
1494 d = uvchr_to_utf8(d, uv);
1495
1496is the recommended wide native character-aware way of saying
1497
1498 *(d++) = uv;
1499
1500=cut
1501*/
1502
1503/* On ASCII machines this is normally a macro but we want a
1504 real function in case XS code wants it
1505*/
1506#undef Perl_uvchr_to_utf8
1507U8 *
1508Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1509{
1510 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1511}
1512
1513
1514/*
37607a96 1515=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1516
1517Returns the native character value of the first character in the string C<s>
1518which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1519length, in bytes, of that character.
1520
1521Allows length and flags to be passed to low level routine.
1522
1523=cut
1524*/
1525/* On ASCII machines this is normally a macro but we want a
1526 real function in case XS code wants it
1527*/
1528#undef Perl_utf8n_to_uvchr
1529UV
37607a96 1530Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1531{
1532 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1533 return UNI_TO_NATIVE(uv);
1534}
1535
d2cc3551 1536/*
1537=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1538
1539Build to the scalar dsv a displayable version of the string spv,
1540length len, the displayable version being at most pvlim bytes long
1541(if longer, the rest is truncated and "..." will be appended).
1542The flags argument is currently unused but available for future extensions.
1543The pointer to the PV of the dsv is returned.
1544
1545=cut */
e6b2e755 1546char *
1547Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1548{
1549 int truncated = 0;
1550 char *s, *e;
1551
1552 sv_setpvn(dsv, "", 0);
1553 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1554 UV u;
1555 if (pvlim && SvCUR(dsv) >= pvlim) {
1556 truncated++;
1557 break;
1558 }
1559 u = utf8_to_uvchr((U8*)s, 0);
1560 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1561 }
1562 if (truncated)
1563 sv_catpvn(dsv, "...", 3);
1564
1565 return SvPVX(dsv);
1566}
2b9d42f0 1567
d2cc3551 1568/*
1569=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1570
1571Build to the scalar dsv a displayable version of the scalar sv,
1572he displayable version being at most pvlim bytes long
1573(if longer, the rest is truncated and "..." will be appended).
1574The flags argument is currently unused but available for future extensions.
1575The pointer to the PV of the dsv is returned.
1576
1577=cut */
e6b2e755 1578char *
1579Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1580{
701a277b 1581 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1582 pvlim, flags);
1583}
1584
d2cc3551 1585/*
1586=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1587
1588Return true if the strings s1 and s2 differ case-insensitively, false
1589if not (if they are equal case-insensitively). If u1 is true, the
1590string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
332ddc25 1591the string s2 is assumed to be in UTF-8-encoded Unicode.
d2cc3551 1592
1593For case-insensitiveness, the "casefolding" of Unicode is used
1594instead of upper/lowercasing both the characters, see
1595http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1596
1597=cut */
701a277b 1598I32
332ddc25 1599Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1600{
1601 register U8 *a = (U8*)s1;
1602 register U8 *b = (U8*)s2;
1603 register U8 *ae = b + len1;
1604 register U8 *be = b + len2;
1605 STRLEN la, lb;
1606 UV ca, cb;
1607 STRLEN ulen1, ulen2;
e7ae6809 1608 U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1609 U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
332ddc25 1610
1611 while (a < ae && b < be) {
1612 if (u1) {
1613 if (a + UTF8SKIP(a) > ae)
1614 break;
1615 ca = utf8_to_uvchr((U8*)a, &la);
1616 } else {
1617 ca = *a;
1618 la = 1;
1619 }
1620 if (u2) {
1621 if (b + UTF8SKIP(b) > be)
1622 break;
1623 cb = utf8_to_uvchr((U8*)b, &lb);
1624 } else {
1625 cb = *b;
1626 lb = 1;
1627 }
1628 if (ca != cb) {
701a277b 1629 if (u1)
332ddc25 1630 to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1631 else
1632 ulen1 = 1;
701a277b 1633 if (u2)
332ddc25 1634 to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1635 else
1636 ulen2 = 1;
1637 if (ulen1 != ulen2
1638 || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1639 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1640 return 1; /* mismatch */
701a277b 1641 }
332ddc25 1642 a += la;
1643 b += lb;
d2cc3551 1644 }
332ddc25 1645 return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1646}
701a277b 1647