RESENT - [PATCH] utf8_heavy.pl
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
37607a96 30=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
eebe1485 31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485 38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
c4d5f83a 49 if (UNI_IS_INVARIANT(uv)) {
50 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3 51 return d;
52 }
2d331972 53#if defined(EBCDIC)
1d72bdf6 54 else {
55 STRLEN len = UNISKIP(uv);
56 U8 *p = d+len-1;
57 while (p > d) {
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
60 }
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
62 return d+len;
63 }
64#else /* Non loop style */
a0ed51b3 65 if (uv < 0x800) {
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
68 return d;
69 }
70 if (uv < 0x10000) {
9466bab6 71 if (UNICODE_IS_SURROGATE(uv))
72 Perl_croak(aTHX_ "UTF-16 surrogate 0x%04"UVxf, uv);
a0ed51b3 73 *d++ = (( uv >> 12) | 0xe0);
74 *d++ = (((uv >> 6) & 0x3f) | 0x80);
75 *d++ = (( uv & 0x3f) | 0x80);
76 return d;
77 }
78 if (uv < 0x200000) {
79 *d++ = (( uv >> 18) | 0xf0);
80 *d++ = (((uv >> 12) & 0x3f) | 0x80);
81 *d++ = (((uv >> 6) & 0x3f) | 0x80);
82 *d++ = (( uv & 0x3f) | 0x80);
83 return d;
84 }
85 if (uv < 0x4000000) {
86 *d++ = (( uv >> 24) | 0xf8);
87 *d++ = (((uv >> 18) & 0x3f) | 0x80);
88 *d++ = (((uv >> 12) & 0x3f) | 0x80);
89 *d++ = (((uv >> 6) & 0x3f) | 0x80);
90 *d++ = (( uv & 0x3f) | 0x80);
91 return d;
92 }
93 if (uv < 0x80000000) {
94 *d++ = (( uv >> 30) | 0xfc);
95 *d++ = (((uv >> 24) & 0x3f) | 0x80);
96 *d++ = (((uv >> 18) & 0x3f) | 0x80);
97 *d++ = (((uv >> 12) & 0x3f) | 0x80);
98 *d++ = (((uv >> 6) & 0x3f) | 0x80);
99 *d++ = (( uv & 0x3f) | 0x80);
100 return d;
101 }
6b8eaf93 102#ifdef HAS_QUAD
d7578b48 103 if (uv < UTF8_QUAD_MAX)
a0ed51b3 104#endif
105 {
106 *d++ = 0xfe; /* Can't match U+FEFF! */
107 *d++ = (((uv >> 30) & 0x3f) | 0x80);
108 *d++ = (((uv >> 24) & 0x3f) | 0x80);
109 *d++ = (((uv >> 18) & 0x3f) | 0x80);
110 *d++ = (((uv >> 12) & 0x3f) | 0x80);
111 *d++ = (((uv >> 6) & 0x3f) | 0x80);
112 *d++ = (( uv & 0x3f) | 0x80);
113 return d;
114 }
6b8eaf93 115#ifdef HAS_QUAD
a0ed51b3 116 {
117 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b 118 *d++ = 0x80; /* 6 Reserved bits */
119 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
120 *d++ = (((uv >> 54) & 0x3f) | 0x80);
121 *d++ = (((uv >> 48) & 0x3f) | 0x80);
122 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3 123 *d++ = (((uv >> 36) & 0x3f) | 0x80);
124 *d++ = (((uv >> 30) & 0x3f) | 0x80);
125 *d++ = (((uv >> 24) & 0x3f) | 0x80);
126 *d++ = (((uv >> 18) & 0x3f) | 0x80);
127 *d++ = (((uv >> 12) & 0x3f) | 0x80);
128 *d++ = (((uv >> 6) & 0x3f) | 0x80);
129 *d++ = (( uv & 0x3f) | 0x80);
130 return d;
131 }
132#endif
1d72bdf6 133#endif /* Loop style */
a0ed51b3 134}
135
9041c2e3 136
137
138/*
eebe1485 139=for apidoc A|STRLEN|is_utf8_char|U8 *s
140
5da9da9e 141Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 142character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e 143The actual number of bytes in the UTF-8 character will be returned if
144it is valid, otherwise 0.
9041c2e3 145
37607a96 146=cut
147*/
067a85ef 148STRLEN
386d01d6 149Perl_is_utf8_char(pTHX_ U8 *s)
150{
151 U8 u = *s;
067a85ef 152 STRLEN slen, len;
153 UV uv, ouv;
386d01d6 154
1d72bdf6 155 if (UTF8_IS_INVARIANT(u))
386d01d6 156 return 1;
157
60006e79 158 if (!UTF8_IS_START(u))
386d01d6 159 return 0;
160
9f07fdcd 161 len = UTF8SKIP(s);
386d01d6 162
60006e79 163 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 164 return 0;
165
386d01d6 166 slen = len - 1;
167 s++;
6eb6869e 168 u &= UTF_START_MASK(len);
1d72bdf6 169 uv = u;
067a85ef 170 ouv = uv;
386d01d6 171 while (slen--) {
60006e79 172 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 173 return 0;
8850bf83 174 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 175 if (uv < ouv)
067a85ef 176 return 0;
177 ouv = uv;
386d01d6 178 s++;
179 }
067a85ef 180
5bbb0b5a 181 if (UNISKIP(uv) < len)
067a85ef 182 return 0;
183
386d01d6 184 return len;
185}
186
6662521e 187/*
eebe1485 188=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 189
5da9da9e 190Returns true if first C<len> bytes of the given string form a valid UTF8
191string, false otherwise. Note that 'a valid UTF8 string' does not mean
192'a string that contains UTF8' because a valid ASCII string is a valid
193UTF8 string.
6662521e 194
195=cut
196*/
197
8e84507e 198bool
6662521e 199Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
200{
067a85ef 201 U8* x = s;
1aa99e6b 202 U8* send;
067a85ef 203 STRLEN c;
204
1aa99e6b 205 if (!len)
6cd5fe39 206 len = strlen((char *)s);
1aa99e6b 207 send = s + len;
208
6662521e 209 while (x < send) {
210 c = is_utf8_char(x);
067a85ef 211 if (!c)
212 return FALSE;
6662521e 213 x += c;
6662521e 214 }
60006e79 215 if (x != send)
216 return FALSE;
067a85ef 217
218 return TRUE;
6662521e 219}
220
67e989fb 221/*
9041c2e3 222=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 223
9041c2e3 224Bottom level UTF-8 decode routine.
225Returns the unicode code point value of the first character in the string C<s>
ba210ebe 226which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 227C<retlen> will be set to the length, in bytes, of that character.
67e989fb 228
229If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880 230is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
231it is assumed that the caller will raise a warning, and this function
28d3d195 232will silently just set C<retlen> to C<-1> and return zero. If the
233C<flags> does not contain UTF8_CHECK_ONLY, warnings about
234malformations will be given, C<retlen> will be set to the expected
235length of the UTF-8 character in bytes, and zero will be returned.
236
237The C<flags> can also contain various flags to allow deviations from
238the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 239
9041c2e3 240Most code should use utf8_to_uvchr() rather than call this directly.
241
37607a96 242=cut
243*/
67e989fb 244
a0ed51b3 245UV
37607a96 246Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 247{
9c5ffd7c 248 UV uv = *s, ouv = 0;
ba210ebe 249 STRLEN len = 1;
250 bool dowarn = ckWARN_d(WARN_UTF8);
251 STRLEN expectlen = 0;
a0dbb045 252 U32 warning = 0;
253
254/* This list is a superset of the UTF8_ALLOW_XXX. */
255
256#define UTF8_WARN_EMPTY 1
257#define UTF8_WARN_CONTINUATION 2
258#define UTF8_WARN_NON_CONTINUATION 3
259#define UTF8_WARN_FE_FF 4
260#define UTF8_WARN_SHORT 5
261#define UTF8_WARN_OVERFLOW 6
262#define UTF8_WARN_SURROGATE 7
263#define UTF8_WARN_BOM 8
264#define UTF8_WARN_LONG 9
265#define UTF8_WARN_FFFF 10
266
267 if (curlen == 0 &&
268 !(flags & UTF8_ALLOW_EMPTY)) {
269 warning = UTF8_WARN_EMPTY;
0c443dc2 270 goto malformed;
271 }
272
1d72bdf6 273 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 274 if (retlen)
275 *retlen = 1;
c4d5f83a 276 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 277 }
67e989fb 278
421a8bf2 279 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 280 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 281 warning = UTF8_WARN_CONTINUATION;
ba210ebe 282 goto malformed;
283 }
284
421a8bf2 285 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 286 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 287 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 288 goto malformed;
289 }
9041c2e3 290
1d72bdf6 291#ifdef EBCDIC
75383841 292 uv = NATIVE_TO_UTF(uv);
1d72bdf6 293#else
fcc8fcf6 294 if ((uv == 0xfe || uv == 0xff) &&
295 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 296 warning = UTF8_WARN_FE_FF;
ba210ebe 297 goto malformed;
a0ed51b3 298 }
1d72bdf6 299#endif
300
ba210ebe 301 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
302 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
303 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
304 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 305#ifdef EBCDIC
306 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
307 else { len = 7; uv &= 0x01; }
308#else
ba210ebe 309 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
310 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 311 else { len = 13; uv = 0; } /* whoa! */
312#endif
313
a0ed51b3 314 if (retlen)
315 *retlen = len;
9041c2e3 316
ba210ebe 317 expectlen = len;
318
fcc8fcf6 319 if ((curlen < expectlen) &&
320 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 321 warning = UTF8_WARN_SHORT;
ba210ebe 322 goto malformed;
323 }
324
325 len--;
a0ed51b3 326 s++;
ba210ebe 327 ouv = uv;
328
a0ed51b3 329 while (len--) {
421a8bf2 330 if (!UTF8_IS_CONTINUATION(*s) &&
331 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 332 s--;
333 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 334 goto malformed;
a0ed51b3 335 }
336 else
8850bf83 337 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 338 if (!(uv > ouv)) {
339 /* These cannot be allowed. */
340 if (uv == ouv) {
341 if (!(flags & UTF8_ALLOW_LONG)) {
342 warning = UTF8_WARN_LONG;
343 goto malformed;
344 }
345 }
346 else { /* uv < ouv */
347 /* This cannot be allowed. */
348 warning = UTF8_WARN_OVERFLOW;
349 goto malformed;
350 }
ba210ebe 351 }
352 s++;
353 ouv = uv;
354 }
355
421a8bf2 356 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 357 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 358 warning = UTF8_WARN_SURROGATE;
ba210ebe 359 goto malformed;
421a8bf2 360 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 361 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 362 warning = UTF8_WARN_BOM;
ba210ebe 363 goto malformed;
fcc8fcf6 364 } else if ((expectlen > UNISKIP(uv)) &&
365 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 366 warning = UTF8_WARN_LONG;
ba210ebe 367 goto malformed;
421a8bf2 368 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 369 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 370 warning = UTF8_WARN_FFFF;
a9917092 371 goto malformed;
a0ed51b3 372 }
ba210ebe 373
a0ed51b3 374 return uv;
ba210ebe 375
376malformed:
377
fcc8fcf6 378 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 379 if (retlen)
cc366d4b 380 *retlen = -1;
ba210ebe 381 return 0;
382 }
383
a0dbb045 384 if (dowarn) {
385 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
386
387 switch (warning) {
388 case 0: /* Intentionally empty. */ break;
389 case UTF8_WARN_EMPTY:
390 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
391 break;
392 case UTF8_WARN_CONTINUATION:
393 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
394 break;
395 case UTF8_WARN_NON_CONTINUATION:
396 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
397 (UV)s[1], uv);
398 break;
399 case UTF8_WARN_FE_FF:
400 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
401 break;
402 case UTF8_WARN_SHORT:
403 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
404 curlen, curlen == 1 ? "" : "s", expectlen);
b31f83c2 405 expectlen = curlen; /* distance for caller to skip */
a0dbb045 406 break;
407 case UTF8_WARN_OVERFLOW:
408 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
409 ouv, *s);
410 break;
411 case UTF8_WARN_SURROGATE:
412 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
413 break;
414 case UTF8_WARN_BOM:
415 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
416 break;
417 case UTF8_WARN_LONG:
418 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
419 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
420 break;
421 case UTF8_WARN_FFFF:
422 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
423 break;
424 default:
425 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
426 break;
427 }
428
429 if (warning) {
430 char *s = SvPVX(sv);
431
432 if (PL_op)
433 Perl_warner(aTHX_ WARN_UTF8,
53e06cf0 434 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 435 else
436 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
437 }
438 }
439
ba210ebe 440 if (retlen)
28d3d195 441 *retlen = expectlen ? expectlen : len;
ba210ebe 442
28d3d195 443 return 0;
a0ed51b3 444}
445
8e84507e 446/*
37607a96 447=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3 448
449Returns the native character value of the first character in the string C<s>
450which is assumed to be in UTF8 encoding; C<retlen> will be set to the
451length, in bytes, of that character.
452
453If C<s> does not point to a well-formed UTF8 character, zero is
454returned and retlen is set, if possible, to -1.
455
456=cut
457*/
458
459UV
37607a96 460Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 461{
462 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
463}
464
465/*
37607a96 466=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3 467
468Returns the Unicode code point of the first character in the string C<s>
469which is assumed to be in UTF8 encoding; C<retlen> will be set to the
470length, in bytes, of that character.
471
472This function should only be used when returned UV is considered
473an index into the Unicode semantic tables (e.g. swashes).
474
ba210ebe 475If C<s> does not point to a well-formed UTF8 character, zero is
476returned and retlen is set, if possible, to -1.
8e84507e 477
478=cut
479*/
480
481UV
37607a96 482Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 483{
9041c2e3 484 /* Call the low level routine asking for checks */
485 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e 486}
487
b76347f2 488/*
37607a96 489=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2 490
491Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 492Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
493up past C<e>, croaks.
b76347f2 494
495=cut
496*/
497
498STRLEN
37607a96 499Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2 500{
501 STRLEN len = 0;
502
8850bf83 503 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
504 * the bitops (especially ~) can create illegal UTF-8.
505 * In other words: in Perl UTF-8 is not just for Unicode. */
506
b76347f2 507 if (e < s)
02eb7b47 508 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 509 while (s < e) {
02eb7b47 510 U8 t = UTF8SKIP(s);
b76347f2 511
512 if (e - s < t)
0064a8a9 513 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2 514 s += t;
515 len++;
516 }
517
518 return len;
519}
520
b06226ff 521/*
eebe1485 522=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 523
524Returns the number of UTF8 characters between the UTF-8 pointers C<a>
525and C<b>.
526
527WARNING: use only if you *know* that the pointers point inside the
528same UTF-8 buffer.
529
37607a96 530=cut
531*/
a0ed51b3 532
02eb7b47 533IV
864dbfa3 534Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 535{
02eb7b47 536 IV off = 0;
537
8850bf83 538 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
539 * the bitops (especially ~) can create illegal UTF-8.
540 * In other words: in Perl UTF-8 is not just for Unicode. */
541
a0ed51b3 542 if (a < b) {
543 while (a < b) {
02eb7b47 544 U8 c = UTF8SKIP(a);
545
546 if (b - a < c)
547 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
548 a += c;
a0ed51b3 549 off--;
550 }
551 }
552 else {
553 while (b < a) {
02eb7b47 554 U8 c = UTF8SKIP(b);
555
556 if (a - b < c)
557 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
558 b += c;
a0ed51b3 559 off++;
560 }
561 }
02eb7b47 562
a0ed51b3 563 return off;
564}
565
b06226ff 566/*
37607a96 567=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 568
8850bf83 569Return the UTF-8 pointer C<s> displaced by C<off> characters, either
570forward or backward.
b06226ff 571
572WARNING: do not use the following unless you *know* C<off> is within
8850bf83 573the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
574on the first byte of character or just after the last byte of a character.
b06226ff 575
37607a96 576=cut
577*/
a0ed51b3 578
579U8 *
864dbfa3 580Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 581{
8850bf83 582 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
583 * the bitops (especially ~) can create illegal UTF-8.
584 * In other words: in Perl UTF-8 is not just for Unicode. */
585
a0ed51b3 586 if (off >= 0) {
587 while (off--)
588 s += UTF8SKIP(s);
589 }
590 else {
591 while (off++) {
592 s--;
8850bf83 593 while (UTF8_IS_CONTINUATION(*s))
594 s--;
a0ed51b3 595 }
596 }
597 return s;
598}
599
6940069f 600/*
eebe1485 601=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 602
246fae53 603Converts a string C<s> of length C<len> from UTF8 into byte encoding.
604Unlike C<bytes_to_utf8>, this over-writes the original string, and
605updates len to contain the new length.
67e989fb 606Returns zero on failure, setting C<len> to -1.
6940069f 607
608=cut
609*/
610
611U8 *
37607a96 612Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 613{
6940069f 614 U8 *send;
615 U8 *d;
dcad2880 616 U8 *save = s;
246fae53 617
618 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880 619 for (send = s + *len; s < send; ) {
620 U8 c = *s++;
621
1d72bdf6 622 if (!UTF8_IS_INVARIANT(c) &&
623 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
624 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 625 *len = -1;
626 return 0;
627 }
246fae53 628 }
dcad2880 629
630 d = s = save;
6940069f 631 while (s < send) {
ed646e6e 632 STRLEN ulen;
9041c2e3 633 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 634 s += ulen;
6940069f 635 }
636 *d = '\0';
246fae53 637 *len = d - save;
6940069f 638 return save;
639}
640
641/*
f9a63242 642=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
643
644Converts a string C<s> of length C<len> from UTF8 into byte encoding.
645Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 646the newly-created string, and updates C<len> to contain the new
647length. Returns the original string if no conversion occurs, C<len>
648is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6490 if C<s> is converted or contains all 7bit characters.
f9a63242 650
37607a96 651=cut
652*/
f9a63242 653
654U8 *
37607a96 655Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 656{
f9a63242 657 U8 *d;
658 U8 *start = s;
db42d148 659 U8 *send;
f9a63242 660 I32 count = 0;
661
662 if (!*is_utf8)
663 return start;
664
ef9edfd0 665 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242 666 for (send = s + *len; s < send;) {
667 U8 c = *s++;
1d72bdf6 668 if (!UTF8_IS_INVARIANT(c)) {
db42d148 669 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
670 (c = *s++) && UTF8_IS_CONTINUATION(c))
671 count++;
672 else
f9a63242 673 return start;
db42d148 674 }
f9a63242 675 }
676
677 *is_utf8 = 0;
678
f9a63242 679 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 680 s = start; start = d;
f9a63242 681 while (s < send) {
682 U8 c = *s++;
c4d5f83a 683 if (!UTF8_IS_INVARIANT(c)) {
684 /* Then it is two-byte encoded */
685 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
686 c = ASCII_TO_NATIVE(c);
687 }
688 *d++ = c;
f9a63242 689 }
690 *d = '\0';
691 *len = d - start;
692 return start;
693}
694
695/*
eebe1485 696=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 697
698Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e 699Returns a pointer to the newly-created string, and sets C<len> to
700reflect the new length.
6940069f 701
497711e7 702=cut
6940069f 703*/
704
705U8*
37607a96 706Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 707{
6940069f 708 U8 *send;
709 U8 *d;
710 U8 *dst;
6662521e 711 send = s + (*len);
6940069f 712
6662521e 713 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 714 dst = d;
715
716 while (s < send) {
db42d148 717 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 718 if (UNI_IS_INVARIANT(uv))
719 *d++ = UTF_TO_NATIVE(uv);
6940069f 720 else {
90f44359 721 *d++ = UTF8_EIGHT_BIT_HI(uv);
722 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f 723 }
724 }
725 *d = '\0';
6662521e 726 *len = d-dst;
6940069f 727 return dst;
728}
729
a0ed51b3 730/*
dea0fc0b 731 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 732 *
733 * Destination must be pre-extended to 3/2 source. Do not use in-place.
734 * We optimize for native, for obvious reasons. */
735
736U8*
dea0fc0b 737Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 738{
dea0fc0b 739 U8* pend;
740 U8* dstart = d;
741
742 if (bytelen & 1)
a7867d0a 743 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 744
745 pend = p + bytelen;
746
a0ed51b3 747 while (p < pend) {
dea0fc0b 748 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
749 p += 2;
a0ed51b3 750 if (uv < 0x80) {
751 *d++ = uv;
752 continue;
753 }
754 if (uv < 0x800) {
755 *d++ = (( uv >> 6) | 0xc0);
756 *d++ = (( uv & 0x3f) | 0x80);
757 continue;
758 }
759 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b 760 UV low = *p++;
761 if (low < 0xdc00 || low >= 0xdfff)
762 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 763 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
764 }
765 if (uv < 0x10000) {
766 *d++ = (( uv >> 12) | 0xe0);
767 *d++ = (((uv >> 6) & 0x3f) | 0x80);
768 *d++ = (( uv & 0x3f) | 0x80);
769 continue;
770 }
771 else {
772 *d++ = (( uv >> 18) | 0xf0);
773 *d++ = (((uv >> 12) & 0x3f) | 0x80);
774 *d++ = (((uv >> 6) & 0x3f) | 0x80);
775 *d++ = (( uv & 0x3f) | 0x80);
776 continue;
777 }
778 }
dea0fc0b 779 *newlen = d - dstart;
a0ed51b3 780 return d;
781}
782
783/* Note: this one is slightly destructive of the source. */
784
785U8*
dea0fc0b 786Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 787{
788 U8* s = (U8*)p;
789 U8* send = s + bytelen;
790 while (s < send) {
791 U8 tmp = s[0];
792 s[0] = s[1];
793 s[1] = tmp;
794 s += 2;
795 }
dea0fc0b 796 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 797}
798
799/* for now these are all defined (inefficiently) in terms of the utf8 versions */
800
801bool
84afefe6 802Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 803{
ad391ad9 804 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 805 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 806 return is_utf8_alnum(tmpbuf);
807}
808
809bool
84afefe6 810Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 811{
ad391ad9 812 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 813 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 814 return is_utf8_alnumc(tmpbuf);
815}
816
817bool
84afefe6 818Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 819{
ad391ad9 820 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 821 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 822 return is_utf8_idfirst(tmpbuf);
823}
824
825bool
84afefe6 826Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 827{
ad391ad9 828 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 829 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 830 return is_utf8_alpha(tmpbuf);
831}
832
833bool
84afefe6 834Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 835{
ad391ad9 836 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 837 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05 838 return is_utf8_ascii(tmpbuf);
839}
840
841bool
84afefe6 842Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 843{
ad391ad9 844 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 845 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 846 return is_utf8_space(tmpbuf);
847}
848
849bool
84afefe6 850Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 851{
ad391ad9 852 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 853 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 854 return is_utf8_digit(tmpbuf);
855}
856
857bool
84afefe6 858Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 859{
ad391ad9 860 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 861 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 862 return is_utf8_upper(tmpbuf);
863}
864
865bool
84afefe6 866Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 867{
ad391ad9 868 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 869 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 870 return is_utf8_lower(tmpbuf);
871}
872
873bool
84afefe6 874Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 875{
ad391ad9 876 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 877 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 878 return is_utf8_cntrl(tmpbuf);
879}
880
881bool
84afefe6 882Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 883{
ad391ad9 884 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 885 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 886 return is_utf8_graph(tmpbuf);
887}
888
889bool
84afefe6 890Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 891{
ad391ad9 892 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 893 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 894 return is_utf8_print(tmpbuf);
895}
896
b8c5462f 897bool
84afefe6 898Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 899{
ad391ad9 900 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 901 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 902 return is_utf8_punct(tmpbuf);
903}
904
4d61ec05 905bool
84afefe6 906Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 907{
e7ae6809 908 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 909 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05 910 return is_utf8_xdigit(tmpbuf);
911}
912
84afefe6 913UV
914Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 915{
e7ae6809 916 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 917 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 918 return to_utf8_upper(tmpbuf, p, lenp);
a0ed51b3 919}
920
84afefe6 921UV
922Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 923{
e7ae6809 924 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 925 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 926 return to_utf8_title(tmpbuf, p, lenp);
a0ed51b3 927}
928
84afefe6 929UV
930Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 931{
d90824e8 932 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 933 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 934 return to_utf8_lower(tmpbuf, p, lenp);
a0ed51b3 935}
936
84afefe6 937UV
938Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
939{
d90824e8 940 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
84afefe6 941 uvchr_to_utf8(tmpbuf, (UV)c);
942 return to_utf8_fold(tmpbuf, p, lenp);
943}
944
a0ed51b3 945/* for now these all assume no locale info available for Unicode > 255 */
946
947bool
84afefe6 948Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 949{
950 return is_uni_alnum(c); /* XXX no locale support yet */
951}
952
953bool
84afefe6 954Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f 955{
956 return is_uni_alnumc(c); /* XXX no locale support yet */
957}
958
959bool
84afefe6 960Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 961{
962 return is_uni_idfirst(c); /* XXX no locale support yet */
963}
964
965bool
84afefe6 966Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 967{
968 return is_uni_alpha(c); /* XXX no locale support yet */
969}
970
971bool
84afefe6 972Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 973{
974 return is_uni_ascii(c); /* XXX no locale support yet */
975}
976
977bool
84afefe6 978Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 979{
980 return is_uni_space(c); /* XXX no locale support yet */
981}
982
983bool
84afefe6 984Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 985{
986 return is_uni_digit(c); /* XXX no locale support yet */
987}
988
989bool
84afefe6 990Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 991{
992 return is_uni_upper(c); /* XXX no locale support yet */
993}
994
995bool
84afefe6 996Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 997{
998 return is_uni_lower(c); /* XXX no locale support yet */
999}
1000
1001bool
84afefe6 1002Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1003{
1004 return is_uni_cntrl(c); /* XXX no locale support yet */
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1009{
1010 return is_uni_graph(c); /* XXX no locale support yet */
1011}
1012
1013bool
84afefe6 1014Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1015{
1016 return is_uni_print(c); /* XXX no locale support yet */
1017}
1018
b8c5462f 1019bool
84afefe6 1020Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1021{
1022 return is_uni_punct(c); /* XXX no locale support yet */
1023}
1024
4d61ec05 1025bool
84afefe6 1026Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1027{
1028 return is_uni_xdigit(c); /* XXX no locale support yet */
1029}
1030
a0ed51b3 1031bool
864dbfa3 1032Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1033{
386d01d6 1034 if (!is_utf8_char(p))
1035 return FALSE;
a0ed51b3 1036 if (!PL_utf8_alnum)
289d4f09 1037 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1038 * descendant of isalnum(3), in other words, it doesn't
1039 * contain the '_'. --jhi */
1040 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1041 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1042/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1043#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1044 if (!PL_utf8_alnum)
1045 PL_utf8_alnum = swash_init("utf8", "",
1046 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1047 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3 1048#endif
1049}
1050
1051bool
b8c5462f 1052Perl_is_utf8_alnumc(pTHX_ U8 *p)
1053{
386d01d6 1054 if (!is_utf8_char(p))
1055 return FALSE;
b8c5462f 1056 if (!PL_utf8_alnum)
1057 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1058 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1059/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1060#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1061 if (!PL_utf8_alnum)
1062 PL_utf8_alnum = swash_init("utf8", "",
1063 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1064 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f 1065#endif
1066}
1067
1068bool
864dbfa3 1069Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3 1070{
1071 return *p == '_' || is_utf8_alpha(p);
1072}
1073
1074bool
864dbfa3 1075Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1076{
386d01d6 1077 if (!is_utf8_char(p))
1078 return FALSE;
a0ed51b3 1079 if (!PL_utf8_alpha)
e24b16f9 1080 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1081 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3 1082}
1083
1084bool
b8c5462f 1085Perl_is_utf8_ascii(pTHX_ U8 *p)
1086{
386d01d6 1087 if (!is_utf8_char(p))
1088 return FALSE;
b8c5462f 1089 if (!PL_utf8_ascii)
1090 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1091 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f 1092}
1093
1094bool
864dbfa3 1095Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1096{
386d01d6 1097 if (!is_utf8_char(p))
1098 return FALSE;
a0ed51b3 1099 if (!PL_utf8_space)
3bec3564 1100 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1101 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3 1102}
1103
1104bool
864dbfa3 1105Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1106{
386d01d6 1107 if (!is_utf8_char(p))
1108 return FALSE;
a0ed51b3 1109 if (!PL_utf8_digit)
e24b16f9 1110 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1111 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3 1112}
1113
1114bool
864dbfa3 1115Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1116{
386d01d6 1117 if (!is_utf8_char(p))
1118 return FALSE;
a0ed51b3 1119 if (!PL_utf8_upper)
e24b16f9 1120 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1121 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3 1122}
1123
1124bool
864dbfa3 1125Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1126{
386d01d6 1127 if (!is_utf8_char(p))
1128 return FALSE;
a0ed51b3 1129 if (!PL_utf8_lower)
e24b16f9 1130 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1131 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3 1132}
1133
1134bool
b8c5462f 1135Perl_is_utf8_cntrl(pTHX_ U8 *p)
1136{
386d01d6 1137 if (!is_utf8_char(p))
1138 return FALSE;
b8c5462f 1139 if (!PL_utf8_cntrl)
1140 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1141 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f 1142}
1143
1144bool
1145Perl_is_utf8_graph(pTHX_ U8 *p)
1146{
386d01d6 1147 if (!is_utf8_char(p))
1148 return FALSE;
b8c5462f 1149 if (!PL_utf8_graph)
1150 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1151 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f 1152}
1153
1154bool
864dbfa3 1155Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1156{
386d01d6 1157 if (!is_utf8_char(p))
1158 return FALSE;
a0ed51b3 1159 if (!PL_utf8_print)
e24b16f9 1160 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1161 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3 1162}
1163
1164bool
b8c5462f 1165Perl_is_utf8_punct(pTHX_ U8 *p)
1166{
386d01d6 1167 if (!is_utf8_char(p))
1168 return FALSE;
b8c5462f 1169 if (!PL_utf8_punct)
1170 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1171 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f 1172}
1173
1174bool
1175Perl_is_utf8_xdigit(pTHX_ U8 *p)
1176{
386d01d6 1177 if (!is_utf8_char(p))
1178 return FALSE;
b8c5462f 1179 if (!PL_utf8_xdigit)
1180 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1181 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f 1182}
1183
1184bool
864dbfa3 1185Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1186{
386d01d6 1187 if (!is_utf8_char(p))
1188 return FALSE;
a0ed51b3 1189 if (!PL_utf8_mark)
e24b16f9 1190 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1191 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3 1192}
1193
6b5c0936 1194/*
1195=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1196
1197The "p" contains the pointer to the UTF-8 string encoding
1198the character that is being converted.
1199
1200The "ustrp" is a pointer to the character buffer to put the
1201conversion result to. The "lenp" is a pointer to the length
1202of the result.
1203
1204The "swash" is a pointer to the swash to use.
1205
1206The "normal" is a string like "ToLower" which means the swash
1207$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1208and loaded by SWASHGET, using lib/utf8_heavy.pl.
1209
1210The "special" is a string like "utf8::ToSpecLower", which means
d1be9408 1211the hash %utf8::ToSpecLower, which is stored in the same file,
6b5c0936 1212lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1213to the hash is by Perl_to_utf8_case().
1214
1215=cut
1216 */
1217
2104c8d9 1218UV
983ffd37 1219Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
a0ed51b3 1220{
1221 UV uv;
1222
983ffd37 1223 if (!*swashp)
1224 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1225 uv = swash_fetch(*swashp, p, TRUE);
1226 if (uv)
1227 uv = UNI_TO_NATIVE(uv);
1228 else {
1229 HV *hv;
1230 SV *keysv;
1231 HE *he;
1232
1233 uv = utf8_to_uvchr(p, 0);
1234
1235 if ((hv = get_hv(special, FALSE)) &&
6d47b937 1236 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
983ffd37 1237 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1238 SV *val = HeVAL(he);
1239 char *s = SvPV(val, *lenp);
1240 U8 c = *(U8*)s;
1241 if (*lenp > 1 || UNI_IS_INVARIANT(c))
1242 Copy(s, ustrp, *lenp, U8);
1243 else {
1244 /* something in the 0x80..0xFF range */
1245 ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1246 ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1247 *lenp = 2;
1248 }
1249 return 0;
1250 }
1251 }
a58d912c 1252 if (lenp)
d90824e8 1253 *lenp = UNISKIP(uv);
a2a2844f 1254 uvuni_to_utf8(ustrp, uv);
1255 return uv;
a0ed51b3 1256}
1257
2104c8d9 1258UV
983ffd37 1259Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1260{
983ffd37 1261 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1262 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1263}
a0ed51b3 1264
983ffd37 1265UV
1266Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1267{
1268 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1269 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3 1270}
1271
2104c8d9 1272UV
a2a2844f 1273Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1274{
983ffd37 1275 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1276 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1277}
1278
1279UV
1280Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1281{
1282 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1283 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3 1284}
1285
1286/* a "swash" is a swatch hash */
1287
1288SV*
864dbfa3 1289Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1290{
1291 SV* retval;
bf1fed83 1292 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1293 dSP;
1b026014 1294 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1295 SV* errsv_save;
ce3b816e 1296
1b026014 1297 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1298 ENTER;
f8be5cf0 1299 errsv_save = newSVsv(ERRSV);
ce3b816e 1300 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0 1301 if (!SvTRUE(ERRSV))
1302 sv_setsv(ERRSV, errsv_save);
1303 SvREFCNT_dec(errsv_save);
ce3b816e 1304 LEAVE;
1305 }
1306 SPAGAIN;
a0ed51b3 1307 PUSHSTACKi(PERLSI_MAGIC);
1308 PUSHMARK(SP);
1309 EXTEND(SP,5);
1310 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1311 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1312 PUSHs(listsv);
1313 PUSHs(sv_2mortal(newSViv(minbits)));
1314 PUSHs(sv_2mortal(newSViv(none)));
1315 PUTBACK;
1316 ENTER;
1317 SAVEI32(PL_hints);
1318 PL_hints = 0;
1319 save_re_context();
bf1fed83 1320 if (PL_curcop == &PL_compiling)
1321 /* XXX ought to be handled by lex_start */
1322 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1323 errsv_save = newSVsv(ERRSV);
864dbfa3 1324 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1325 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1326 else
e24b16f9 1327 retval = &PL_sv_undef;
f8be5cf0 1328 if (!SvTRUE(ERRSV))
1329 sv_setsv(ERRSV, errsv_save);
1330 SvREFCNT_dec(errsv_save);
a0ed51b3 1331 LEAVE;
1332 POPSTACK;
e24b16f9 1333 if (PL_curcop == &PL_compiling) {
bf1fed83 1334 STRLEN len;
1335 char* pv = SvPV(tokenbufsv, len);
1336
1337 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1338 PL_curcop->op_private = PL_hints;
a0ed51b3 1339 }
1340 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1341 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3 1342 return retval;
1343}
1344
035d37be 1345
1346/* This API is wrong for special case conversions since we may need to
1347 * return several Unicode characters for a single Unicode character
1348 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1349 * the lower-level routine, and it is similarly broken for returning
1350 * multiple values. --jhi */
a0ed51b3 1351UV
3568d838 1352Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3 1353{
1354 HV* hv = (HV*)SvRV(sv);
3568d838 1355 U32 klen;
1356 U32 off;
a0ed51b3 1357 STRLEN slen;
7d85a32c 1358 STRLEN needents;
4ea42e7f 1359 U8 *tmps = NULL;
a0ed51b3 1360 U32 bit;
1361 SV *retval;
3568d838 1362 U8 tmputf8[2];
1363 UV c = NATIVE_TO_ASCII(*ptr);
1364
1365 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1366 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1367 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1368 ptr = tmputf8;
1369 }
1370 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1371 * then the "swatch" is a vec() for al the chars which start
1372 * with 0xAA..0xYY
1373 * So the key in the hash (klen) is length of encoded char -1
1374 */
1375 klen = UTF8SKIP(ptr) - 1;
1376 off = ptr[klen];
a0ed51b3 1377
7d85a32c 1378 if (klen == 0)
1379 {
1380 /* If char in invariant then swatch is for all the invariant chars
1381 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1382 */
1383 needents = UTF_CONTINUATION_MARK;
1384 off = NATIVE_TO_UTF(ptr[klen]);
1385 }
1386 else
1387 {
1388 /* If char is encoded then swatch is for the prefix */
1389 needents = (1 << UTF_ACCUMULATION_SHIFT);
1390 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1391 }
1392
a0ed51b3 1393 /*
1394 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1395 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1396 * it's nothing to sniff at.) Pity we usually come through at least
1397 * two function calls to get here...
1398 *
1399 * NB: this code assumes that swatches are never modified, once generated!
1400 */
1401
3568d838 1402 if (hv == PL_last_swash_hv &&
a0ed51b3 1403 klen == PL_last_swash_klen &&
3568d838 1404 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3 1405 {
1406 tmps = PL_last_swash_tmps;
1407 slen = PL_last_swash_slen;
1408 }
1409 else {
1410 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1411 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1412
1413 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1414 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1415 dSP;
2b9d42f0 1416 /* We use utf8n_to_uvuni() as we want an index into
1417 Unicode tables, not a native character number.
1418 */
1419 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
f8be5cf0 1420 SV *errsv_save;
a0ed51b3 1421 ENTER;
1422 SAVETMPS;
1423 save_re_context();
1424 PUSHSTACKi(PERLSI_MAGIC);
1425 PUSHMARK(SP);
1426 EXTEND(SP,3);
1427 PUSHs((SV*)sv);
ffbc6a93 1428 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838 1429 PUSHs(sv_2mortal(newSViv((klen) ?
1430 (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1431 PUSHs(sv_2mortal(newSViv(needents)));
1432 PUTBACK;
f8be5cf0 1433 errsv_save = newSVsv(ERRSV);
864dbfa3 1434 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1435 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1436 else
e24b16f9 1437 retval = &PL_sv_undef;
f8be5cf0 1438 if (!SvTRUE(ERRSV))
1439 sv_setsv(ERRSV, errsv_save);
1440 SvREFCNT_dec(errsv_save);
a0ed51b3 1441 POPSTACK;
1442 FREETMPS;
1443 LEAVE;
e24b16f9 1444 if (PL_curcop == &PL_compiling)
1445 PL_curcop->op_private = PL_hints;
a0ed51b3 1446
dfe13c55 1447 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1448
7d85a32c 1449 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1450 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1451 }
1452
1453 PL_last_swash_hv = hv;
1454 PL_last_swash_klen = klen;
1455 PL_last_swash_tmps = tmps;
1456 PL_last_swash_slen = slen;
1457 if (klen)
1458 Copy(ptr, PL_last_swash_key, klen, U8);
1459 }
1460
9faf8d75 1461 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1462 case 1:
1463 bit = 1 << (off & 7);
1464 off >>= 3;
1465 return (tmps[off] & bit) != 0;
1466 case 8:
1467 return tmps[off];
1468 case 16:
1469 off <<= 1;
1470 return (tmps[off] << 8) + tmps[off + 1] ;
1471 case 32:
1472 off <<= 2;
1473 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1474 }
cea2e8a9 1475 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1476 return 0;
1477}
2b9d42f0 1478
1479
1480/*
37607a96 1481=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1482
1483Adds the UTF8 representation of the Native codepoint C<uv> to the end
1484of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1485bytes available. The return value is the pointer to the byte after the
1486end of the new character. In other words,
1487
1488 d = uvchr_to_utf8(d, uv);
1489
1490is the recommended wide native character-aware way of saying
1491
1492 *(d++) = uv;
1493
1494=cut
1495*/
1496
1497/* On ASCII machines this is normally a macro but we want a
1498 real function in case XS code wants it
1499*/
1500#undef Perl_uvchr_to_utf8
1501U8 *
1502Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1503{
1504 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1505}
1506
1507
1508/*
37607a96 1509=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1510
1511Returns the native character value of the first character in the string C<s>
1512which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1513length, in bytes, of that character.
1514
1515Allows length and flags to be passed to low level routine.
1516
1517=cut
1518*/
1519/* On ASCII machines this is normally a macro but we want a
1520 real function in case XS code wants it
1521*/
1522#undef Perl_utf8n_to_uvchr
1523UV
37607a96 1524Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1525{
1526 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1527 return UNI_TO_NATIVE(uv);
1528}
1529
d2cc3551 1530/*
1531=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1532
1533Build to the scalar dsv a displayable version of the string spv,
1534length len, the displayable version being at most pvlim bytes long
1535(if longer, the rest is truncated and "..." will be appended).
1536The flags argument is currently unused but available for future extensions.
1537The pointer to the PV of the dsv is returned.
1538
1539=cut */
e6b2e755 1540char *
1541Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1542{
1543 int truncated = 0;
1544 char *s, *e;
1545
1546 sv_setpvn(dsv, "", 0);
1547 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1548 UV u;
1549 if (pvlim && SvCUR(dsv) >= pvlim) {
1550 truncated++;
1551 break;
1552 }
1553 u = utf8_to_uvchr((U8*)s, 0);
1554 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1555 }
1556 if (truncated)
1557 sv_catpvn(dsv, "...", 3);
1558
1559 return SvPVX(dsv);
1560}
2b9d42f0 1561
d2cc3551 1562/*
1563=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1564
1565Build to the scalar dsv a displayable version of the scalar sv,
1566he displayable version being at most pvlim bytes long
1567(if longer, the rest is truncated and "..." will be appended).
1568The flags argument is currently unused but available for future extensions.
1569The pointer to the PV of the dsv is returned.
1570
1571=cut */
e6b2e755 1572char *
1573Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1574{
701a277b 1575 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1576 pvlim, flags);
1577}
1578
d2cc3551 1579/*
1580=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1581
1582Return true if the strings s1 and s2 differ case-insensitively, false
1583if not (if they are equal case-insensitively). If u1 is true, the
1584string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
332ddc25 1585the string s2 is assumed to be in UTF-8-encoded Unicode.
d2cc3551 1586
1587For case-insensitiveness, the "casefolding" of Unicode is used
1588instead of upper/lowercasing both the characters, see
1589http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1590
1591=cut */
701a277b 1592I32
332ddc25 1593Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1594{
1595 register U8 *a = (U8*)s1;
1596 register U8 *b = (U8*)s2;
1597 register U8 *ae = b + len1;
1598 register U8 *be = b + len2;
1599 STRLEN la, lb;
1600 UV ca, cb;
1601 STRLEN ulen1, ulen2;
e7ae6809 1602 U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1603 U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
332ddc25 1604
1605 while (a < ae && b < be) {
1606 if (u1) {
1607 if (a + UTF8SKIP(a) > ae)
1608 break;
1609 ca = utf8_to_uvchr((U8*)a, &la);
1610 } else {
1611 ca = *a;
1612 la = 1;
1613 }
1614 if (u2) {
1615 if (b + UTF8SKIP(b) > be)
1616 break;
1617 cb = utf8_to_uvchr((U8*)b, &lb);
1618 } else {
1619 cb = *b;
1620 lb = 1;
1621 }
1622 if (ca != cb) {
701a277b 1623 if (u1)
332ddc25 1624 to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1625 else
1626 ulen1 = 1;
701a277b 1627 if (u2)
332ddc25 1628 to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1629 else
1630 ulen2 = 1;
1631 if (ulen1 != ulen2
1632 || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1633 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1634 return 1; /* mismatch */
701a277b 1635 }
332ddc25 1636 a += la;
1637 b += lb;
d2cc3551 1638 }
332ddc25 1639 return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1640}
701a277b 1641