more POSIX tests, and more autoloading
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
37607a96 30=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
eebe1485 31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485 38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
3d401ffb 49 if (UNICODE_IS_SURROGATE(uv))
50 Perl_croak(aTHX_ "UTF-16 surrogate 0x%04"UVxf, uv);
51 else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
52 (uv == 0xFFFE || uv == 0xFFFF))
53 Perl_croak(aTHX_ "Unicode character 0x%04"UVxf" is illegal", uv);
c4d5f83a 54 if (UNI_IS_INVARIANT(uv)) {
55 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3 56 return d;
57 }
2d331972 58#if defined(EBCDIC)
1d72bdf6 59 else {
60 STRLEN len = UNISKIP(uv);
61 U8 *p = d+len-1;
62 while (p > d) {
63 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
64 uv >>= UTF_ACCUMULATION_SHIFT;
65 }
66 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
67 return d+len;
68 }
69#else /* Non loop style */
a0ed51b3 70 if (uv < 0x800) {
71 *d++ = (( uv >> 6) | 0xc0);
72 *d++ = (( uv & 0x3f) | 0x80);
73 return d;
74 }
75 if (uv < 0x10000) {
76 *d++ = (( uv >> 12) | 0xe0);
77 *d++ = (((uv >> 6) & 0x3f) | 0x80);
78 *d++ = (( uv & 0x3f) | 0x80);
79 return d;
80 }
81 if (uv < 0x200000) {
82 *d++ = (( uv >> 18) | 0xf0);
83 *d++ = (((uv >> 12) & 0x3f) | 0x80);
84 *d++ = (((uv >> 6) & 0x3f) | 0x80);
85 *d++ = (( uv & 0x3f) | 0x80);
86 return d;
87 }
88 if (uv < 0x4000000) {
89 *d++ = (( uv >> 24) | 0xf8);
90 *d++ = (((uv >> 18) & 0x3f) | 0x80);
91 *d++ = (((uv >> 12) & 0x3f) | 0x80);
92 *d++ = (((uv >> 6) & 0x3f) | 0x80);
93 *d++ = (( uv & 0x3f) | 0x80);
94 return d;
95 }
96 if (uv < 0x80000000) {
97 *d++ = (( uv >> 30) | 0xfc);
98 *d++ = (((uv >> 24) & 0x3f) | 0x80);
99 *d++ = (((uv >> 18) & 0x3f) | 0x80);
100 *d++ = (((uv >> 12) & 0x3f) | 0x80);
101 *d++ = (((uv >> 6) & 0x3f) | 0x80);
102 *d++ = (( uv & 0x3f) | 0x80);
103 return d;
104 }
6b8eaf93 105#ifdef HAS_QUAD
d7578b48 106 if (uv < UTF8_QUAD_MAX)
a0ed51b3 107#endif
108 {
109 *d++ = 0xfe; /* Can't match U+FEFF! */
110 *d++ = (((uv >> 30) & 0x3f) | 0x80);
111 *d++ = (((uv >> 24) & 0x3f) | 0x80);
112 *d++ = (((uv >> 18) & 0x3f) | 0x80);
113 *d++ = (((uv >> 12) & 0x3f) | 0x80);
114 *d++ = (((uv >> 6) & 0x3f) | 0x80);
115 *d++ = (( uv & 0x3f) | 0x80);
116 return d;
117 }
6b8eaf93 118#ifdef HAS_QUAD
a0ed51b3 119 {
120 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b 121 *d++ = 0x80; /* 6 Reserved bits */
122 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
123 *d++ = (((uv >> 54) & 0x3f) | 0x80);
124 *d++ = (((uv >> 48) & 0x3f) | 0x80);
125 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3 126 *d++ = (((uv >> 36) & 0x3f) | 0x80);
127 *d++ = (((uv >> 30) & 0x3f) | 0x80);
128 *d++ = (((uv >> 24) & 0x3f) | 0x80);
129 *d++ = (((uv >> 18) & 0x3f) | 0x80);
130 *d++ = (((uv >> 12) & 0x3f) | 0x80);
131 *d++ = (((uv >> 6) & 0x3f) | 0x80);
132 *d++ = (( uv & 0x3f) | 0x80);
133 return d;
134 }
135#endif
1d72bdf6 136#endif /* Loop style */
a0ed51b3 137}
138
9041c2e3 139
140
141/*
eebe1485 142=for apidoc A|STRLEN|is_utf8_char|U8 *s
143
5da9da9e 144Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 145character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e 146The actual number of bytes in the UTF-8 character will be returned if
147it is valid, otherwise 0.
9041c2e3 148
37607a96 149=cut
150*/
067a85ef 151STRLEN
386d01d6 152Perl_is_utf8_char(pTHX_ U8 *s)
153{
154 U8 u = *s;
067a85ef 155 STRLEN slen, len;
156 UV uv, ouv;
386d01d6 157
1d72bdf6 158 if (UTF8_IS_INVARIANT(u))
386d01d6 159 return 1;
160
60006e79 161 if (!UTF8_IS_START(u))
386d01d6 162 return 0;
163
9f07fdcd 164 len = UTF8SKIP(s);
386d01d6 165
60006e79 166 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 167 return 0;
168
386d01d6 169 slen = len - 1;
170 s++;
6eb6869e 171 u &= UTF_START_MASK(len);
1d72bdf6 172 uv = u;
067a85ef 173 ouv = uv;
386d01d6 174 while (slen--) {
60006e79 175 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 176 return 0;
8850bf83 177 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 178 if (uv < ouv)
067a85ef 179 return 0;
180 ouv = uv;
386d01d6 181 s++;
182 }
067a85ef 183
5bbb0b5a 184 if (UNISKIP(uv) < len)
067a85ef 185 return 0;
186
386d01d6 187 return len;
188}
189
6662521e 190/*
eebe1485 191=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 192
5da9da9e 193Returns true if first C<len> bytes of the given string form a valid UTF8
194string, false otherwise. Note that 'a valid UTF8 string' does not mean
195'a string that contains UTF8' because a valid ASCII string is a valid
196UTF8 string.
6662521e 197
198=cut
199*/
200
8e84507e 201bool
6662521e 202Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
203{
067a85ef 204 U8* x = s;
1aa99e6b 205 U8* send;
067a85ef 206 STRLEN c;
207
1aa99e6b 208 if (!len)
6cd5fe39 209 len = strlen((char *)s);
1aa99e6b 210 send = s + len;
211
6662521e 212 while (x < send) {
213 c = is_utf8_char(x);
067a85ef 214 if (!c)
215 return FALSE;
6662521e 216 x += c;
6662521e 217 }
60006e79 218 if (x != send)
219 return FALSE;
067a85ef 220
221 return TRUE;
6662521e 222}
223
67e989fb 224/*
9041c2e3 225=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 226
9041c2e3 227Bottom level UTF-8 decode routine.
228Returns the unicode code point value of the first character in the string C<s>
ba210ebe 229which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 230C<retlen> will be set to the length, in bytes, of that character.
67e989fb 231
232If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880 233is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
234it is assumed that the caller will raise a warning, and this function
28d3d195 235will silently just set C<retlen> to C<-1> and return zero. If the
236C<flags> does not contain UTF8_CHECK_ONLY, warnings about
237malformations will be given, C<retlen> will be set to the expected
238length of the UTF-8 character in bytes, and zero will be returned.
239
240The C<flags> can also contain various flags to allow deviations from
241the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 242
9041c2e3 243Most code should use utf8_to_uvchr() rather than call this directly.
244
37607a96 245=cut
246*/
67e989fb 247
a0ed51b3 248UV
37607a96 249Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 250{
9c5ffd7c 251 UV uv = *s, ouv = 0;
ba210ebe 252 STRLEN len = 1;
253 bool dowarn = ckWARN_d(WARN_UTF8);
254 STRLEN expectlen = 0;
a0dbb045 255 U32 warning = 0;
256
257/* This list is a superset of the UTF8_ALLOW_XXX. */
258
259#define UTF8_WARN_EMPTY 1
260#define UTF8_WARN_CONTINUATION 2
261#define UTF8_WARN_NON_CONTINUATION 3
262#define UTF8_WARN_FE_FF 4
263#define UTF8_WARN_SHORT 5
264#define UTF8_WARN_OVERFLOW 6
265#define UTF8_WARN_SURROGATE 7
266#define UTF8_WARN_BOM 8
267#define UTF8_WARN_LONG 9
268#define UTF8_WARN_FFFF 10
269
270 if (curlen == 0 &&
271 !(flags & UTF8_ALLOW_EMPTY)) {
272 warning = UTF8_WARN_EMPTY;
0c443dc2 273 goto malformed;
274 }
275
1d72bdf6 276 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 277 if (retlen)
278 *retlen = 1;
c4d5f83a 279 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 280 }
67e989fb 281
421a8bf2 282 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 283 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 284 warning = UTF8_WARN_CONTINUATION;
ba210ebe 285 goto malformed;
286 }
287
421a8bf2 288 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 289 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 290 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 291 goto malformed;
292 }
9041c2e3 293
1d72bdf6 294#ifdef EBCDIC
75383841 295 uv = NATIVE_TO_UTF(uv);
1d72bdf6 296#else
fcc8fcf6 297 if ((uv == 0xfe || uv == 0xff) &&
298 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 299 warning = UTF8_WARN_FE_FF;
ba210ebe 300 goto malformed;
a0ed51b3 301 }
1d72bdf6 302#endif
303
ba210ebe 304 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
305 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
306 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
307 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 308#ifdef EBCDIC
309 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
310 else { len = 7; uv &= 0x01; }
311#else
ba210ebe 312 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
313 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 314 else { len = 13; uv = 0; } /* whoa! */
315#endif
316
a0ed51b3 317 if (retlen)
318 *retlen = len;
9041c2e3 319
ba210ebe 320 expectlen = len;
321
fcc8fcf6 322 if ((curlen < expectlen) &&
323 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 324 warning = UTF8_WARN_SHORT;
ba210ebe 325 goto malformed;
326 }
327
328 len--;
a0ed51b3 329 s++;
ba210ebe 330 ouv = uv;
331
a0ed51b3 332 while (len--) {
421a8bf2 333 if (!UTF8_IS_CONTINUATION(*s) &&
334 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 335 s--;
336 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 337 goto malformed;
a0ed51b3 338 }
339 else
8850bf83 340 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 341 if (!(uv > ouv)) {
342 /* These cannot be allowed. */
343 if (uv == ouv) {
344 if (!(flags & UTF8_ALLOW_LONG)) {
345 warning = UTF8_WARN_LONG;
346 goto malformed;
347 }
348 }
349 else { /* uv < ouv */
350 /* This cannot be allowed. */
351 warning = UTF8_WARN_OVERFLOW;
352 goto malformed;
353 }
ba210ebe 354 }
355 s++;
356 ouv = uv;
357 }
358
421a8bf2 359 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 360 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 361 warning = UTF8_WARN_SURROGATE;
ba210ebe 362 goto malformed;
421a8bf2 363 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 364 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 365 warning = UTF8_WARN_BOM;
ba210ebe 366 goto malformed;
fcc8fcf6 367 } else if ((expectlen > UNISKIP(uv)) &&
368 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 369 warning = UTF8_WARN_LONG;
ba210ebe 370 goto malformed;
421a8bf2 371 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 372 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 373 warning = UTF8_WARN_FFFF;
a9917092 374 goto malformed;
a0ed51b3 375 }
ba210ebe 376
a0ed51b3 377 return uv;
ba210ebe 378
379malformed:
380
fcc8fcf6 381 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 382 if (retlen)
cc366d4b 383 *retlen = -1;
ba210ebe 384 return 0;
385 }
386
a0dbb045 387 if (dowarn) {
388 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
389
390 switch (warning) {
391 case 0: /* Intentionally empty. */ break;
392 case UTF8_WARN_EMPTY:
393 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
394 break;
395 case UTF8_WARN_CONTINUATION:
396 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
397 break;
398 case UTF8_WARN_NON_CONTINUATION:
399 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
400 (UV)s[1], uv);
401 break;
402 case UTF8_WARN_FE_FF:
403 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
404 break;
405 case UTF8_WARN_SHORT:
406 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
407 curlen, curlen == 1 ? "" : "s", expectlen);
b31f83c2 408 expectlen = curlen; /* distance for caller to skip */
a0dbb045 409 break;
410 case UTF8_WARN_OVERFLOW:
411 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
412 ouv, *s);
413 break;
414 case UTF8_WARN_SURROGATE:
415 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
416 break;
417 case UTF8_WARN_BOM:
418 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
419 break;
420 case UTF8_WARN_LONG:
421 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
422 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
423 break;
424 case UTF8_WARN_FFFF:
425 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
426 break;
427 default:
428 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
429 break;
430 }
431
432 if (warning) {
433 char *s = SvPVX(sv);
434
435 if (PL_op)
436 Perl_warner(aTHX_ WARN_UTF8,
53e06cf0 437 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 438 else
439 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
440 }
441 }
442
ba210ebe 443 if (retlen)
28d3d195 444 *retlen = expectlen ? expectlen : len;
ba210ebe 445
28d3d195 446 return 0;
a0ed51b3 447}
448
8e84507e 449/*
37607a96 450=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3 451
452Returns the native character value of the first character in the string C<s>
453which is assumed to be in UTF8 encoding; C<retlen> will be set to the
454length, in bytes, of that character.
455
456If C<s> does not point to a well-formed UTF8 character, zero is
457returned and retlen is set, if possible, to -1.
458
459=cut
460*/
461
462UV
37607a96 463Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 464{
465 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
466}
467
468/*
37607a96 469=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3 470
471Returns the Unicode code point of the first character in the string C<s>
472which is assumed to be in UTF8 encoding; C<retlen> will be set to the
473length, in bytes, of that character.
474
475This function should only be used when returned UV is considered
476an index into the Unicode semantic tables (e.g. swashes).
477
ba210ebe 478If C<s> does not point to a well-formed UTF8 character, zero is
479returned and retlen is set, if possible, to -1.
8e84507e 480
481=cut
482*/
483
484UV
37607a96 485Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 486{
9041c2e3 487 /* Call the low level routine asking for checks */
488 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e 489}
490
b76347f2 491/*
37607a96 492=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2 493
494Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 495Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
496up past C<e>, croaks.
b76347f2 497
498=cut
499*/
500
501STRLEN
37607a96 502Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2 503{
504 STRLEN len = 0;
505
8850bf83 506 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
507 * the bitops (especially ~) can create illegal UTF-8.
508 * In other words: in Perl UTF-8 is not just for Unicode. */
509
b76347f2 510 if (e < s)
02eb7b47 511 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 512 while (s < e) {
02eb7b47 513 U8 t = UTF8SKIP(s);
b76347f2 514
515 if (e - s < t)
0064a8a9 516 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2 517 s += t;
518 len++;
519 }
520
521 return len;
522}
523
b06226ff 524/*
eebe1485 525=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 526
527Returns the number of UTF8 characters between the UTF-8 pointers C<a>
528and C<b>.
529
530WARNING: use only if you *know* that the pointers point inside the
531same UTF-8 buffer.
532
37607a96 533=cut
534*/
a0ed51b3 535
02eb7b47 536IV
864dbfa3 537Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 538{
02eb7b47 539 IV off = 0;
540
8850bf83 541 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
542 * the bitops (especially ~) can create illegal UTF-8.
543 * In other words: in Perl UTF-8 is not just for Unicode. */
544
a0ed51b3 545 if (a < b) {
546 while (a < b) {
02eb7b47 547 U8 c = UTF8SKIP(a);
548
549 if (b - a < c)
550 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
551 a += c;
a0ed51b3 552 off--;
553 }
554 }
555 else {
556 while (b < a) {
02eb7b47 557 U8 c = UTF8SKIP(b);
558
559 if (a - b < c)
560 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
561 b += c;
a0ed51b3 562 off++;
563 }
564 }
02eb7b47 565
a0ed51b3 566 return off;
567}
568
b06226ff 569/*
37607a96 570=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 571
8850bf83 572Return the UTF-8 pointer C<s> displaced by C<off> characters, either
573forward or backward.
b06226ff 574
575WARNING: do not use the following unless you *know* C<off> is within
8850bf83 576the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
577on the first byte of character or just after the last byte of a character.
b06226ff 578
37607a96 579=cut
580*/
a0ed51b3 581
582U8 *
864dbfa3 583Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 584{
8850bf83 585 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
586 * the bitops (especially ~) can create illegal UTF-8.
587 * In other words: in Perl UTF-8 is not just for Unicode. */
588
a0ed51b3 589 if (off >= 0) {
590 while (off--)
591 s += UTF8SKIP(s);
592 }
593 else {
594 while (off++) {
595 s--;
8850bf83 596 while (UTF8_IS_CONTINUATION(*s))
597 s--;
a0ed51b3 598 }
599 }
600 return s;
601}
602
6940069f 603/*
eebe1485 604=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 605
246fae53 606Converts a string C<s> of length C<len> from UTF8 into byte encoding.
607Unlike C<bytes_to_utf8>, this over-writes the original string, and
608updates len to contain the new length.
67e989fb 609Returns zero on failure, setting C<len> to -1.
6940069f 610
611=cut
612*/
613
614U8 *
37607a96 615Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 616{
6940069f 617 U8 *send;
618 U8 *d;
dcad2880 619 U8 *save = s;
246fae53 620
621 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880 622 for (send = s + *len; s < send; ) {
623 U8 c = *s++;
624
1d72bdf6 625 if (!UTF8_IS_INVARIANT(c) &&
626 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
627 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 628 *len = -1;
629 return 0;
630 }
246fae53 631 }
dcad2880 632
633 d = s = save;
6940069f 634 while (s < send) {
ed646e6e 635 STRLEN ulen;
9041c2e3 636 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 637 s += ulen;
6940069f 638 }
639 *d = '\0';
246fae53 640 *len = d - save;
6940069f 641 return save;
642}
643
644/*
f9a63242 645=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
646
647Converts a string C<s> of length C<len> from UTF8 into byte encoding.
648Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 649the newly-created string, and updates C<len> to contain the new
650length. Returns the original string if no conversion occurs, C<len>
651is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6520 if C<s> is converted or contains all 7bit characters.
f9a63242 653
37607a96 654=cut
655*/
f9a63242 656
657U8 *
37607a96 658Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 659{
f9a63242 660 U8 *d;
661 U8 *start = s;
db42d148 662 U8 *send;
f9a63242 663 I32 count = 0;
664
665 if (!*is_utf8)
666 return start;
667
ef9edfd0 668 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242 669 for (send = s + *len; s < send;) {
670 U8 c = *s++;
1d72bdf6 671 if (!UTF8_IS_INVARIANT(c)) {
db42d148 672 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
673 (c = *s++) && UTF8_IS_CONTINUATION(c))
674 count++;
675 else
f9a63242 676 return start;
db42d148 677 }
f9a63242 678 }
679
680 *is_utf8 = 0;
681
f9a63242 682 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 683 s = start; start = d;
f9a63242 684 while (s < send) {
685 U8 c = *s++;
c4d5f83a 686 if (!UTF8_IS_INVARIANT(c)) {
687 /* Then it is two-byte encoded */
688 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
689 c = ASCII_TO_NATIVE(c);
690 }
691 *d++ = c;
f9a63242 692 }
693 *d = '\0';
694 *len = d - start;
695 return start;
696}
697
698/*
eebe1485 699=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 700
701Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e 702Returns a pointer to the newly-created string, and sets C<len> to
703reflect the new length.
6940069f 704
497711e7 705=cut
6940069f 706*/
707
708U8*
37607a96 709Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 710{
6940069f 711 U8 *send;
712 U8 *d;
713 U8 *dst;
6662521e 714 send = s + (*len);
6940069f 715
6662521e 716 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 717 dst = d;
718
719 while (s < send) {
db42d148 720 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 721 if (UNI_IS_INVARIANT(uv))
722 *d++ = UTF_TO_NATIVE(uv);
6940069f 723 else {
90f44359 724 *d++ = UTF8_EIGHT_BIT_HI(uv);
725 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f 726 }
727 }
728 *d = '\0';
6662521e 729 *len = d-dst;
6940069f 730 return dst;
731}
732
a0ed51b3 733/*
dea0fc0b 734 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 735 *
736 * Destination must be pre-extended to 3/2 source. Do not use in-place.
737 * We optimize for native, for obvious reasons. */
738
739U8*
dea0fc0b 740Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 741{
dea0fc0b 742 U8* pend;
743 U8* dstart = d;
744
745 if (bytelen & 1)
a7867d0a 746 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 747
748 pend = p + bytelen;
749
a0ed51b3 750 while (p < pend) {
dea0fc0b 751 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
752 p += 2;
a0ed51b3 753 if (uv < 0x80) {
754 *d++ = uv;
755 continue;
756 }
757 if (uv < 0x800) {
758 *d++ = (( uv >> 6) | 0xc0);
759 *d++ = (( uv & 0x3f) | 0x80);
760 continue;
761 }
762 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b 763 UV low = *p++;
764 if (low < 0xdc00 || low >= 0xdfff)
765 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 766 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
767 }
768 if (uv < 0x10000) {
769 *d++ = (( uv >> 12) | 0xe0);
770 *d++ = (((uv >> 6) & 0x3f) | 0x80);
771 *d++ = (( uv & 0x3f) | 0x80);
772 continue;
773 }
774 else {
775 *d++ = (( uv >> 18) | 0xf0);
776 *d++ = (((uv >> 12) & 0x3f) | 0x80);
777 *d++ = (((uv >> 6) & 0x3f) | 0x80);
778 *d++ = (( uv & 0x3f) | 0x80);
779 continue;
780 }
781 }
dea0fc0b 782 *newlen = d - dstart;
a0ed51b3 783 return d;
784}
785
786/* Note: this one is slightly destructive of the source. */
787
788U8*
dea0fc0b 789Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 790{
791 U8* s = (U8*)p;
792 U8* send = s + bytelen;
793 while (s < send) {
794 U8 tmp = s[0];
795 s[0] = s[1];
796 s[1] = tmp;
797 s += 2;
798 }
dea0fc0b 799 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 800}
801
802/* for now these are all defined (inefficiently) in terms of the utf8 versions */
803
804bool
84afefe6 805Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 806{
ad391ad9 807 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 808 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 809 return is_utf8_alnum(tmpbuf);
810}
811
812bool
84afefe6 813Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 814{
ad391ad9 815 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 816 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 817 return is_utf8_alnumc(tmpbuf);
818}
819
820bool
84afefe6 821Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 822{
ad391ad9 823 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 824 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 825 return is_utf8_idfirst(tmpbuf);
826}
827
828bool
84afefe6 829Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 830{
ad391ad9 831 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 832 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 833 return is_utf8_alpha(tmpbuf);
834}
835
836bool
84afefe6 837Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 838{
ad391ad9 839 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 840 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05 841 return is_utf8_ascii(tmpbuf);
842}
843
844bool
84afefe6 845Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 846{
ad391ad9 847 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 848 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 849 return is_utf8_space(tmpbuf);
850}
851
852bool
84afefe6 853Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 854{
ad391ad9 855 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 856 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 857 return is_utf8_digit(tmpbuf);
858}
859
860bool
84afefe6 861Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 862{
ad391ad9 863 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 864 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 865 return is_utf8_upper(tmpbuf);
866}
867
868bool
84afefe6 869Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 870{
ad391ad9 871 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 872 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 873 return is_utf8_lower(tmpbuf);
874}
875
876bool
84afefe6 877Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 878{
ad391ad9 879 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 880 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 881 return is_utf8_cntrl(tmpbuf);
882}
883
884bool
84afefe6 885Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 886{
ad391ad9 887 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 888 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 889 return is_utf8_graph(tmpbuf);
890}
891
892bool
84afefe6 893Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 894{
ad391ad9 895 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 896 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 897 return is_utf8_print(tmpbuf);
898}
899
b8c5462f 900bool
84afefe6 901Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 902{
ad391ad9 903 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 904 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 905 return is_utf8_punct(tmpbuf);
906}
907
4d61ec05 908bool
84afefe6 909Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 910{
e7ae6809 911 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 912 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05 913 return is_utf8_xdigit(tmpbuf);
914}
915
84afefe6 916UV
917Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 918{
e7ae6809 919 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 920 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 921 return to_utf8_upper(tmpbuf, p, lenp);
a0ed51b3 922}
923
84afefe6 924UV
925Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 926{
e7ae6809 927 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 928 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 929 return to_utf8_title(tmpbuf, p, lenp);
a0ed51b3 930}
931
84afefe6 932UV
933Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 934{
d90824e8 935 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 936 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 937 return to_utf8_lower(tmpbuf, p, lenp);
a0ed51b3 938}
939
84afefe6 940UV
941Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
942{
d90824e8 943 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
84afefe6 944 uvchr_to_utf8(tmpbuf, (UV)c);
945 return to_utf8_fold(tmpbuf, p, lenp);
946}
947
a0ed51b3 948/* for now these all assume no locale info available for Unicode > 255 */
949
950bool
84afefe6 951Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 952{
953 return is_uni_alnum(c); /* XXX no locale support yet */
954}
955
956bool
84afefe6 957Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 958{
959 return is_uni_alnumc(c); /* XXX no locale support yet */
960}
961
962bool
84afefe6 963Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 964{
965 return is_uni_idfirst(c); /* XXX no locale support yet */
966}
967
968bool
84afefe6 969Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 970{
971 return is_uni_alpha(c); /* XXX no locale support yet */
972}
973
974bool
84afefe6 975Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 976{
977 return is_uni_ascii(c); /* XXX no locale support yet */
978}
979
980bool
84afefe6 981Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 982{
983 return is_uni_space(c); /* XXX no locale support yet */
984}
985
986bool
84afefe6 987Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 988{
989 return is_uni_digit(c); /* XXX no locale support yet */
990}
991
992bool
84afefe6 993Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 994{
995 return is_uni_upper(c); /* XXX no locale support yet */
996}
997
998bool
84afefe6 999Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1000{
1001 return is_uni_lower(c); /* XXX no locale support yet */
1002}
1003
1004bool
84afefe6 1005Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1006{
1007 return is_uni_cntrl(c); /* XXX no locale support yet */
1008}
1009
1010bool
84afefe6 1011Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1012{
1013 return is_uni_graph(c); /* XXX no locale support yet */
1014}
1015
1016bool
84afefe6 1017Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1018{
1019 return is_uni_print(c); /* XXX no locale support yet */
1020}
1021
b8c5462f 1022bool
84afefe6 1023Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1024{
1025 return is_uni_punct(c); /* XXX no locale support yet */
1026}
1027
4d61ec05 1028bool
84afefe6 1029Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1030{
1031 return is_uni_xdigit(c); /* XXX no locale support yet */
1032}
1033
a0ed51b3 1034bool
864dbfa3 1035Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1036{
386d01d6 1037 if (!is_utf8_char(p))
1038 return FALSE;
a0ed51b3 1039 if (!PL_utf8_alnum)
289d4f09 1040 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1041 * descendant of isalnum(3), in other words, it doesn't
1042 * contain the '_'. --jhi */
1043 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1044 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1045/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1046#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1047 if (!PL_utf8_alnum)
1048 PL_utf8_alnum = swash_init("utf8", "",
1049 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1050 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1051#endif
1052}
1053
1054bool
b8c5462f 1055Perl_is_utf8_alnumc(pTHX_ U8 *p)
1056{
386d01d6 1057 if (!is_utf8_char(p))
1058 return FALSE;
b8c5462f 1059 if (!PL_utf8_alnum)
1060 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1061 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1062/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1063#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1064 if (!PL_utf8_alnum)
1065 PL_utf8_alnum = swash_init("utf8", "",
1066 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1067 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1068#endif
1069}
1070
1071bool
864dbfa3 1072Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3 1073{
1074 return *p == '_' || is_utf8_alpha(p);
1075}
1076
1077bool
864dbfa3 1078Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1079{
386d01d6 1080 if (!is_utf8_char(p))
1081 return FALSE;
a0ed51b3 1082 if (!PL_utf8_alpha)
e24b16f9 1083 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1084 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3 1085}
1086
1087bool
b8c5462f 1088Perl_is_utf8_ascii(pTHX_ U8 *p)
1089{
386d01d6 1090 if (!is_utf8_char(p))
1091 return FALSE;
b8c5462f 1092 if (!PL_utf8_ascii)
1093 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1094 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f 1095}
1096
1097bool
864dbfa3 1098Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1099{
386d01d6 1100 if (!is_utf8_char(p))
1101 return FALSE;
a0ed51b3 1102 if (!PL_utf8_space)
3bec3564 1103 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1104 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3 1105}
1106
1107bool
864dbfa3 1108Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1109{
386d01d6 1110 if (!is_utf8_char(p))
1111 return FALSE;
a0ed51b3 1112 if (!PL_utf8_digit)
e24b16f9 1113 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1114 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3 1115}
1116
1117bool
864dbfa3 1118Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1119{
386d01d6 1120 if (!is_utf8_char(p))
1121 return FALSE;
a0ed51b3 1122 if (!PL_utf8_upper)
e24b16f9 1123 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1124 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3 1125}
1126
1127bool
864dbfa3 1128Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1129{
386d01d6 1130 if (!is_utf8_char(p))
1131 return FALSE;
a0ed51b3 1132 if (!PL_utf8_lower)
e24b16f9 1133 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1134 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3 1135}
1136
1137bool
b8c5462f 1138Perl_is_utf8_cntrl(pTHX_ U8 *p)
1139{
386d01d6 1140 if (!is_utf8_char(p))
1141 return FALSE;
b8c5462f 1142 if (!PL_utf8_cntrl)
1143 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1144 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f 1145}
1146
1147bool
1148Perl_is_utf8_graph(pTHX_ U8 *p)
1149{
386d01d6 1150 if (!is_utf8_char(p))
1151 return FALSE;
b8c5462f 1152 if (!PL_utf8_graph)
1153 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1154 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f 1155}
1156
1157bool
864dbfa3 1158Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1159{
386d01d6 1160 if (!is_utf8_char(p))
1161 return FALSE;
a0ed51b3 1162 if (!PL_utf8_print)
e24b16f9 1163 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1164 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3 1165}
1166
1167bool
b8c5462f 1168Perl_is_utf8_punct(pTHX_ U8 *p)
1169{
386d01d6 1170 if (!is_utf8_char(p))
1171 return FALSE;
b8c5462f 1172 if (!PL_utf8_punct)
1173 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1174 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f 1175}
1176
1177bool
1178Perl_is_utf8_xdigit(pTHX_ U8 *p)
1179{
386d01d6 1180 if (!is_utf8_char(p))
1181 return FALSE;
b8c5462f 1182 if (!PL_utf8_xdigit)
1183 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1184 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f 1185}
1186
1187bool
864dbfa3 1188Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1189{
386d01d6 1190 if (!is_utf8_char(p))
1191 return FALSE;
a0ed51b3 1192 if (!PL_utf8_mark)
e24b16f9 1193 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1194 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3 1195}
1196
6b5c0936 1197/*
1198=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1199
1200The "p" contains the pointer to the UTF-8 string encoding
1201the character that is being converted.
1202
1203The "ustrp" is a pointer to the character buffer to put the
1204conversion result to. The "lenp" is a pointer to the length
1205of the result.
1206
1207The "swash" is a pointer to the swash to use.
1208
1209The "normal" is a string like "ToLower" which means the swash
1210$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1211and loaded by SWASHGET, using lib/utf8_heavy.pl.
1212
1213The "special" is a string like "utf8::ToSpecLower", which means
d1be9408 1214the hash %utf8::ToSpecLower, which is stored in the same file,
6b5c0936 1215lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1216to the hash is by Perl_to_utf8_case().
1217
1218=cut
1219 */
1220
2104c8d9 1221UV
983ffd37 1222Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
a0ed51b3 1223{
1224 UV uv;
1225
983ffd37 1226 if (!*swashp)
1227 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1228 uv = swash_fetch(*swashp, p, TRUE);
1229 if (uv)
1230 uv = UNI_TO_NATIVE(uv);
1231 else {
1232 HV *hv;
1233 SV *keysv;
1234 HE *he;
1235
1236 uv = utf8_to_uvchr(p, 0);
1237
1238 if ((hv = get_hv(special, FALSE)) &&
6d47b937 1239 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
983ffd37 1240 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1241 SV *val = HeVAL(he);
1242 char *s = SvPV(val, *lenp);
1243 U8 c = *(U8*)s;
1244 if (*lenp > 1 || UNI_IS_INVARIANT(c))
1245 Copy(s, ustrp, *lenp, U8);
1246 else {
1247 /* something in the 0x80..0xFF range */
1248 ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1249 ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1250 *lenp = 2;
1251 }
1252 return 0;
1253 }
1254 }
a58d912c 1255 if (lenp)
d90824e8 1256 *lenp = UNISKIP(uv);
a2a2844f 1257 uvuni_to_utf8(ustrp, uv);
1258 return uv;
a0ed51b3 1259}
1260
2104c8d9 1261UV
983ffd37 1262Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1263{
983ffd37 1264 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1265 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1266}
a0ed51b3 1267
983ffd37 1268UV
1269Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1270{
1271 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1272 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1273}
1274
2104c8d9 1275UV
a2a2844f 1276Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1277{
983ffd37 1278 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1279 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1280}
1281
1282UV
1283Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1284{
1285 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1286 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1287}
1288
1289/* a "swash" is a swatch hash */
1290
1291SV*
864dbfa3 1292Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1293{
1294 SV* retval;
bf1fed83 1295 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1296 dSP;
1b026014 1297 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1298 SV* errsv_save;
ce3b816e 1299
1b026014 1300 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1301 ENTER;
f8be5cf0 1302 errsv_save = newSVsv(ERRSV);
ce3b816e 1303 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0 1304 if (!SvTRUE(ERRSV))
1305 sv_setsv(ERRSV, errsv_save);
1306 SvREFCNT_dec(errsv_save);
ce3b816e 1307 LEAVE;
1308 }
1309 SPAGAIN;
a0ed51b3 1310 PUSHSTACKi(PERLSI_MAGIC);
1311 PUSHMARK(SP);
1312 EXTEND(SP,5);
1313 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1314 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1315 PUSHs(listsv);
1316 PUSHs(sv_2mortal(newSViv(minbits)));
1317 PUSHs(sv_2mortal(newSViv(none)));
1318 PUTBACK;
1319 ENTER;
1320 SAVEI32(PL_hints);
1321 PL_hints = 0;
1322 save_re_context();
bf1fed83 1323 if (PL_curcop == &PL_compiling)
1324 /* XXX ought to be handled by lex_start */
1325 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1326 errsv_save = newSVsv(ERRSV);
864dbfa3 1327 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1328 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1329 else
e24b16f9 1330 retval = &PL_sv_undef;
f8be5cf0 1331 if (!SvTRUE(ERRSV))
1332 sv_setsv(ERRSV, errsv_save);
1333 SvREFCNT_dec(errsv_save);
a0ed51b3 1334 LEAVE;
1335 POPSTACK;
e24b16f9 1336 if (PL_curcop == &PL_compiling) {
bf1fed83 1337 STRLEN len;
1338 char* pv = SvPV(tokenbufsv, len);
1339
1340 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1341 PL_curcop->op_private = PL_hints;
a0ed51b3 1342 }
1343 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1344 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3 1345 return retval;
1346}
1347
035d37be 1348
1349/* This API is wrong for special case conversions since we may need to
1350 * return several Unicode characters for a single Unicode character
1351 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1352 * the lower-level routine, and it is similarly broken for returning
1353 * multiple values. --jhi */
a0ed51b3 1354UV
3568d838 1355Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3 1356{
1357 HV* hv = (HV*)SvRV(sv);
3568d838 1358 U32 klen;
1359 U32 off;
a0ed51b3 1360 STRLEN slen;
7d85a32c 1361 STRLEN needents;
4ea42e7f 1362 U8 *tmps = NULL;
a0ed51b3 1363 U32 bit;
1364 SV *retval;
3568d838 1365 U8 tmputf8[2];
1366 UV c = NATIVE_TO_ASCII(*ptr);
1367
1368 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1369 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1370 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1371 ptr = tmputf8;
1372 }
1373 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1374 * then the "swatch" is a vec() for al the chars which start
1375 * with 0xAA..0xYY
1376 * So the key in the hash (klen) is length of encoded char -1
1377 */
1378 klen = UTF8SKIP(ptr) - 1;
1379 off = ptr[klen];
a0ed51b3 1380
7d85a32c 1381 if (klen == 0)
1382 {
1383 /* If char in invariant then swatch is for all the invariant chars
1384 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1385 */
1386 needents = UTF_CONTINUATION_MARK;
1387 off = NATIVE_TO_UTF(ptr[klen]);
1388 }
1389 else
1390 {
1391 /* If char is encoded then swatch is for the prefix */
1392 needents = (1 << UTF_ACCUMULATION_SHIFT);
1393 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1394 }
1395
a0ed51b3 1396 /*
1397 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1398 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1399 * it's nothing to sniff at.) Pity we usually come through at least
1400 * two function calls to get here...
1401 *
1402 * NB: this code assumes that swatches are never modified, once generated!
1403 */
1404
3568d838 1405 if (hv == PL_last_swash_hv &&
a0ed51b3 1406 klen == PL_last_swash_klen &&
3568d838 1407 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1408 {
1409 tmps = PL_last_swash_tmps;
1410 slen = PL_last_swash_slen;
1411 }
1412 else {
1413 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1414 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1415
1416 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1417 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1418 dSP;
2b9d42f0 1419 /* We use utf8n_to_uvuni() as we want an index into
1420 Unicode tables, not a native character number.
1421 */
1422 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
f8be5cf0 1423 SV *errsv_save;
a0ed51b3 1424 ENTER;
1425 SAVETMPS;
1426 save_re_context();
1427 PUSHSTACKi(PERLSI_MAGIC);
1428 PUSHMARK(SP);
1429 EXTEND(SP,3);
1430 PUSHs((SV*)sv);
ffbc6a93 1431 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1432 PUSHs(sv_2mortal(newSViv((klen) ?
1433 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1434 PUSHs(sv_2mortal(newSViv(needents)));
1435 PUTBACK;
f8be5cf0 1436 errsv_save = newSVsv(ERRSV);
864dbfa3 1437 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1438 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1439 else
e24b16f9 1440 retval = &PL_sv_undef;
f8be5cf0 1441 if (!SvTRUE(ERRSV))
1442 sv_setsv(ERRSV, errsv_save);
1443 SvREFCNT_dec(errsv_save);
a0ed51b3 1444 POPSTACK;
1445 FREETMPS;
1446 LEAVE;
e24b16f9 1447 if (PL_curcop == &PL_compiling)
1448 PL_curcop->op_private = PL_hints;
a0ed51b3 1449
dfe13c55 1450 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1451
7d85a32c 1452 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1453 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1454 }
1455
1456 PL_last_swash_hv = hv;
1457 PL_last_swash_klen = klen;
1458 PL_last_swash_tmps = tmps;
1459 PL_last_swash_slen = slen;
1460 if (klen)
1461 Copy(ptr, PL_last_swash_key, klen, U8);
1462 }
1463
9faf8d75 1464 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1465 case 1:
1466 bit = 1 << (off & 7);
1467 off >>= 3;
1468 return (tmps[off] & bit) != 0;
1469 case 8:
1470 return tmps[off];
1471 case 16:
1472 off <<= 1;
1473 return (tmps[off] << 8) + tmps[off + 1] ;
1474 case 32:
1475 off <<= 2;
1476 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1477 }
cea2e8a9 1478 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1479 return 0;
1480}
2b9d42f0 1481
1482
1483/*
37607a96 1484=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1485
1486Adds the UTF8 representation of the Native codepoint C<uv> to the end
1487of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1488bytes available. The return value is the pointer to the byte after the
1489end of the new character. In other words,
1490
1491 d = uvchr_to_utf8(d, uv);
1492
1493is the recommended wide native character-aware way of saying
1494
1495 *(d++) = uv;
1496
1497=cut
1498*/
1499
1500/* On ASCII machines this is normally a macro but we want a
1501 real function in case XS code wants it
1502*/
1503#undef Perl_uvchr_to_utf8
1504U8 *
1505Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1506{
1507 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1508}
1509
1510
1511/*
37607a96 1512=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1513
1514Returns the native character value of the first character in the string C<s>
1515which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1516length, in bytes, of that character.
1517
1518Allows length and flags to be passed to low level routine.
1519
1520=cut
1521*/
1522/* On ASCII machines this is normally a macro but we want a
1523 real function in case XS code wants it
1524*/
1525#undef Perl_utf8n_to_uvchr
1526UV
37607a96 1527Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1528{
1529 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1530 return UNI_TO_NATIVE(uv);
1531}
1532
d2cc3551 1533/*
1534=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1535
1536Build to the scalar dsv a displayable version of the string spv,
1537length len, the displayable version being at most pvlim bytes long
1538(if longer, the rest is truncated and "..." will be appended).
1539The flags argument is currently unused but available for future extensions.
1540The pointer to the PV of the dsv is returned.
1541
1542=cut */
e6b2e755 1543char *
1544Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1545{
1546 int truncated = 0;
1547 char *s, *e;
1548
1549 sv_setpvn(dsv, "", 0);
1550 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1551 UV u;
1552 if (pvlim && SvCUR(dsv) >= pvlim) {
1553 truncated++;
1554 break;
1555 }
1556 u = utf8_to_uvchr((U8*)s, 0);
1557 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1558 }
1559 if (truncated)
1560 sv_catpvn(dsv, "...", 3);
1561
1562 return SvPVX(dsv);
1563}
2b9d42f0 1564
d2cc3551 1565/*
1566=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1567
1568Build to the scalar dsv a displayable version of the scalar sv,
1569he displayable version being at most pvlim bytes long
1570(if longer, the rest is truncated and "..." will be appended).
1571The flags argument is currently unused but available for future extensions.
1572The pointer to the PV of the dsv is returned.
1573
1574=cut */
e6b2e755 1575char *
1576Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1577{
701a277b 1578 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1579 pvlim, flags);
1580}
1581
d2cc3551 1582/*
1583=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1584
1585Return true if the strings s1 and s2 differ case-insensitively, false
1586if not (if they are equal case-insensitively). If u1 is true, the
1587string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
332ddc25 1588the string s2 is assumed to be in UTF-8-encoded Unicode.
d2cc3551 1589
1590For case-insensitiveness, the "casefolding" of Unicode is used
1591instead of upper/lowercasing both the characters, see
1592http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1593
1594=cut */
701a277b 1595I32
332ddc25 1596Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1597{
1598 register U8 *a = (U8*)s1;
1599 register U8 *b = (U8*)s2;
1600 register U8 *ae = b + len1;
1601 register U8 *be = b + len2;
1602 STRLEN la, lb;
1603 UV ca, cb;
1604 STRLEN ulen1, ulen2;
e7ae6809 1605 U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1606 U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
332ddc25 1607
1608 while (a < ae && b < be) {
1609 if (u1) {
1610 if (a + UTF8SKIP(a) > ae)
1611 break;
1612 ca = utf8_to_uvchr((U8*)a, &la);
1613 } else {
1614 ca = *a;
1615 la = 1;
1616 }
1617 if (u2) {
1618 if (b + UTF8SKIP(b) > be)
1619 break;
1620 cb = utf8_to_uvchr((U8*)b, &lb);
1621 } else {
1622 cb = *b;
1623 lb = 1;
1624 }
1625 if (ca != cb) {
701a277b 1626 if (u1)
332ddc25 1627 to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1628 else
1629 ulen1 = 1;
701a277b 1630 if (u2)
332ddc25 1631 to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1632 else
1633 ulen2 = 1;
1634 if (ulen1 != ulen2
1635 || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1636 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1637 return 1; /* mismatch */
701a277b 1638 }
332ddc25 1639 a += la;
1640 b += lb;
d2cc3551 1641 }
332ddc25 1642 return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1643}
701a277b 1644