Declare hints only if needed.
[p5sagit/p5-mst-13.2.git] / utf8.c
CommitLineData
a0ed51b3 1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3 4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3 25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
37607a96 30=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
eebe1485 31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485 38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
c4d5f83a 49 if (UNI_IS_INVARIANT(uv)) {
50 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3 51 return d;
52 }
1d72bdf6 53#if defined(EBCDIC) || 1 /* always for testing */
54 else {
55 STRLEN len = UNISKIP(uv);
56 U8 *p = d+len-1;
57 while (p > d) {
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
60 }
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
62 return d+len;
63 }
64#else /* Non loop style */
a0ed51b3 65 if (uv < 0x800) {
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
68 return d;
69 }
70 if (uv < 0x10000) {
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
74 return d;
75 }
76 if (uv < 0x200000) {
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
81 return d;
82 }
83 if (uv < 0x4000000) {
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
89 return d;
90 }
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
98 return d;
99 }
6b8eaf93 100#ifdef HAS_QUAD
d7578b48 101 if (uv < UTF8_QUAD_MAX)
a0ed51b3 102#endif
103 {
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
111 return d;
112 }
6b8eaf93 113#ifdef HAS_QUAD
a0ed51b3 114 {
115 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b 116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3 121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
128 return d;
129 }
130#endif
1d72bdf6 131#endif /* Loop style */
a0ed51b3 132}
133
9041c2e3 134
135
136/*
eebe1485 137=for apidoc A|STRLEN|is_utf8_char|U8 *s
138
5da9da9e 139Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 140character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e 141The actual number of bytes in the UTF-8 character will be returned if
142it is valid, otherwise 0.
9041c2e3 143
37607a96 144=cut
145*/
067a85ef 146STRLEN
386d01d6 147Perl_is_utf8_char(pTHX_ U8 *s)
148{
149 U8 u = *s;
067a85ef 150 STRLEN slen, len;
151 UV uv, ouv;
386d01d6 152
1d72bdf6 153 if (UTF8_IS_INVARIANT(u))
386d01d6 154 return 1;
155
60006e79 156 if (!UTF8_IS_START(u))
386d01d6 157 return 0;
158
9f07fdcd 159 len = UTF8SKIP(s);
386d01d6 160
60006e79 161 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef 162 return 0;
163
386d01d6 164 slen = len - 1;
165 s++;
1d72bdf6 166 /* The initial value is dubious */
167 uv = u;
067a85ef 168 ouv = uv;
386d01d6 169 while (slen--) {
60006e79 170 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 171 return 0;
8850bf83 172 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 173#if 0
174 /* Depending on the compiler the wrap of the value takig pladve
175 * between 5 and 6 bytes of UTF-8 encoding either works or not.
176 * See similar spot in utf8_to_uvuni(). --jhi */
177 if (uv < ouv)
067a85ef 178 return 0;
209a9bc1 179#endif
067a85ef 180 ouv = uv;
386d01d6 181 s++;
182 }
067a85ef 183
5bbb0b5a 184 if (UNISKIP(uv) < len)
067a85ef 185 return 0;
186
386d01d6 187 return len;
188}
189
6662521e 190/*
eebe1485 191=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 192
5da9da9e 193Returns true if first C<len> bytes of the given string form a valid UTF8
194string, false otherwise. Note that 'a valid UTF8 string' does not mean
195'a string that contains UTF8' because a valid ASCII string is a valid
196UTF8 string.
6662521e 197
198=cut
199*/
200
8e84507e 201bool
6662521e 202Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
203{
067a85ef 204 U8* x = s;
1aa99e6b 205 U8* send;
067a85ef 206 STRLEN c;
207
1aa99e6b 208 if (!len)
6cd5fe39 209 len = strlen((char *)s);
1aa99e6b 210 send = s + len;
211
6662521e 212 while (x < send) {
213 c = is_utf8_char(x);
067a85ef 214 if (!c)
215 return FALSE;
6662521e 216 x += c;
6662521e 217 }
60006e79 218 if (x != send)
219 return FALSE;
067a85ef 220
221 return TRUE;
6662521e 222}
223
67e989fb 224/*
9041c2e3 225=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 226
9041c2e3 227Bottom level UTF-8 decode routine.
228Returns the unicode code point value of the first character in the string C<s>
ba210ebe 229which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 230C<retlen> will be set to the length, in bytes, of that character.
67e989fb 231
232If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880 233is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
234it is assumed that the caller will raise a warning, and this function
28d3d195 235will silently just set C<retlen> to C<-1> and return zero. If the
236C<flags> does not contain UTF8_CHECK_ONLY, warnings about
237malformations will be given, C<retlen> will be set to the expected
238length of the UTF-8 character in bytes, and zero will be returned.
239
240The C<flags> can also contain various flags to allow deviations from
241the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 242
9041c2e3 243Most code should use utf8_to_uvchr() rather than call this directly.
244
37607a96 245=cut
246*/
67e989fb 247
a0ed51b3 248UV
37607a96 249Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 250{
ba210ebe 251 UV uv = *s, ouv;
252 STRLEN len = 1;
253 bool dowarn = ckWARN_d(WARN_UTF8);
254 STRLEN expectlen = 0;
a0dbb045 255 U32 warning = 0;
256
257/* This list is a superset of the UTF8_ALLOW_XXX. */
258
259#define UTF8_WARN_EMPTY 1
260#define UTF8_WARN_CONTINUATION 2
261#define UTF8_WARN_NON_CONTINUATION 3
262#define UTF8_WARN_FE_FF 4
263#define UTF8_WARN_SHORT 5
264#define UTF8_WARN_OVERFLOW 6
265#define UTF8_WARN_SURROGATE 7
266#define UTF8_WARN_BOM 8
267#define UTF8_WARN_LONG 9
268#define UTF8_WARN_FFFF 10
269
270 if (curlen == 0 &&
271 !(flags & UTF8_ALLOW_EMPTY)) {
272 warning = UTF8_WARN_EMPTY;
0c443dc2 273 goto malformed;
274 }
275
1d72bdf6 276 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3 277 if (retlen)
278 *retlen = 1;
c4d5f83a 279 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 280 }
67e989fb 281
421a8bf2 282 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 283 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 284 warning = UTF8_WARN_CONTINUATION;
ba210ebe 285 goto malformed;
286 }
287
421a8bf2 288 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 289 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 290 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 291 goto malformed;
292 }
9041c2e3 293
1d72bdf6 294#ifdef EBCDIC
75383841 295 uv = NATIVE_TO_UTF(uv);
1d72bdf6 296#else
fcc8fcf6 297 if ((uv == 0xfe || uv == 0xff) &&
298 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 299 warning = UTF8_WARN_FE_FF;
ba210ebe 300 goto malformed;
a0ed51b3 301 }
1d72bdf6 302#endif
303
ba210ebe 304 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
305 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
306 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
307 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6 308#ifdef EBCDIC
309 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
310 else { len = 7; uv &= 0x01; }
311#else
ba210ebe 312 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
313 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6 314 else { len = 13; uv = 0; } /* whoa! */
315#endif
316
a0ed51b3 317 if (retlen)
318 *retlen = len;
9041c2e3 319
ba210ebe 320 expectlen = len;
321
fcc8fcf6 322 if ((curlen < expectlen) &&
323 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 324 warning = UTF8_WARN_SHORT;
ba210ebe 325 goto malformed;
326 }
327
328 len--;
a0ed51b3 329 s++;
ba210ebe 330 ouv = uv;
331
a0ed51b3 332 while (len--) {
421a8bf2 333 if (!UTF8_IS_CONTINUATION(*s) &&
334 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 335 s--;
336 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 337 goto malformed;
a0ed51b3 338 }
339 else
8850bf83 340 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045 341 if (!(uv > ouv)) {
342 /* These cannot be allowed. */
343 if (uv == ouv) {
344 if (!(flags & UTF8_ALLOW_LONG)) {
345 warning = UTF8_WARN_LONG;
346 goto malformed;
347 }
348 }
349 else { /* uv < ouv */
209a9bc1 350#if 0
351 /* Depending on the compiler the wrap of the value takig pladve
352 * between 5 and 6 bytes of UTF-8 encoding either works or not.
353 * See similar spot in is_utf8_char(). --jhi */
a0dbb045 354 /* This cannot be allowed. */
355 warning = UTF8_WARN_OVERFLOW;
356 goto malformed;
209a9bc1 357#endif
a0dbb045 358 }
ba210ebe 359 }
360 s++;
361 ouv = uv;
362 }
363
421a8bf2 364 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 365 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 366 warning = UTF8_WARN_SURROGATE;
ba210ebe 367 goto malformed;
421a8bf2 368 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 369 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 370 warning = UTF8_WARN_BOM;
ba210ebe 371 goto malformed;
fcc8fcf6 372 } else if ((expectlen > UNISKIP(uv)) &&
373 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 374 warning = UTF8_WARN_LONG;
ba210ebe 375 goto malformed;
421a8bf2 376 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 377 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 378 warning = UTF8_WARN_FFFF;
a9917092 379 goto malformed;
a0ed51b3 380 }
ba210ebe 381
a0ed51b3 382 return uv;
ba210ebe 383
384malformed:
385
fcc8fcf6 386 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 387 if (retlen)
cc366d4b 388 *retlen = -1;
ba210ebe 389 return 0;
390 }
391
a0dbb045 392 if (dowarn) {
393 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
394
395 switch (warning) {
396 case 0: /* Intentionally empty. */ break;
397 case UTF8_WARN_EMPTY:
398 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
399 break;
400 case UTF8_WARN_CONTINUATION:
401 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
402 break;
403 case UTF8_WARN_NON_CONTINUATION:
404 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
405 (UV)s[1], uv);
406 break;
407 case UTF8_WARN_FE_FF:
408 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
409 break;
410 case UTF8_WARN_SHORT:
411 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
412 curlen, curlen == 1 ? "" : "s", expectlen);
413 break;
414 case UTF8_WARN_OVERFLOW:
415 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
416 ouv, *s);
417 break;
418 case UTF8_WARN_SURROGATE:
419 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
420 break;
421 case UTF8_WARN_BOM:
422 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
423 break;
424 case UTF8_WARN_LONG:
425 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
426 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
427 break;
428 case UTF8_WARN_FFFF:
429 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
430 break;
431 default:
432 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
433 break;
434 }
435
436 if (warning) {
437 char *s = SvPVX(sv);
438
439 if (PL_op)
440 Perl_warner(aTHX_ WARN_UTF8,
441 "%s in %s", s, PL_op_desc[PL_op->op_type]);
442 else
443 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
444 }
445 }
446
ba210ebe 447 if (retlen)
28d3d195 448 *retlen = expectlen ? expectlen : len;
ba210ebe 449
28d3d195 450 return 0;
a0ed51b3 451}
452
8e84507e 453/*
37607a96 454=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3 455
456Returns the native character value of the first character in the string C<s>
457which is assumed to be in UTF8 encoding; C<retlen> will be set to the
458length, in bytes, of that character.
459
460If C<s> does not point to a well-formed UTF8 character, zero is
461returned and retlen is set, if possible, to -1.
462
463=cut
464*/
465
466UV
37607a96 467Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 468{
469 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
470}
471
472/*
37607a96 473=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3 474
475Returns the Unicode code point of the first character in the string C<s>
476which is assumed to be in UTF8 encoding; C<retlen> will be set to the
477length, in bytes, of that character.
478
479This function should only be used when returned UV is considered
480an index into the Unicode semantic tables (e.g. swashes).
481
ba210ebe 482If C<s> does not point to a well-formed UTF8 character, zero is
483returned and retlen is set, if possible, to -1.
8e84507e 484
485=cut
486*/
487
488UV
37607a96 489Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 490{
9041c2e3 491 /* Call the low level routine asking for checks */
492 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e 493}
494
b76347f2 495/*
37607a96 496=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2 497
498Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47 499Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
500up past C<e>, croaks.
b76347f2 501
502=cut
503*/
504
505STRLEN
37607a96 506Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2 507{
508 STRLEN len = 0;
509
8850bf83 510 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
511 * the bitops (especially ~) can create illegal UTF-8.
512 * In other words: in Perl UTF-8 is not just for Unicode. */
513
b76347f2 514 if (e < s)
02eb7b47 515 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 516 while (s < e) {
02eb7b47 517 U8 t = UTF8SKIP(s);
b76347f2 518
519 if (e - s < t)
c4d5f83a 520 Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
b76347f2 521 s += t;
522 len++;
523 }
524
525 return len;
526}
527
b06226ff 528/*
eebe1485 529=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 530
531Returns the number of UTF8 characters between the UTF-8 pointers C<a>
532and C<b>.
533
534WARNING: use only if you *know* that the pointers point inside the
535same UTF-8 buffer.
536
37607a96 537=cut
538*/
a0ed51b3 539
02eb7b47 540IV
864dbfa3 541Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 542{
02eb7b47 543 IV off = 0;
544
8850bf83 545 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
546 * the bitops (especially ~) can create illegal UTF-8.
547 * In other words: in Perl UTF-8 is not just for Unicode. */
548
a0ed51b3 549 if (a < b) {
550 while (a < b) {
02eb7b47 551 U8 c = UTF8SKIP(a);
552
553 if (b - a < c)
554 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
555 a += c;
a0ed51b3 556 off--;
557 }
558 }
559 else {
560 while (b < a) {
02eb7b47 561 U8 c = UTF8SKIP(b);
562
563 if (a - b < c)
564 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
565 b += c;
a0ed51b3 566 off++;
567 }
568 }
02eb7b47 569
a0ed51b3 570 return off;
571}
572
b06226ff 573/*
37607a96 574=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 575
8850bf83 576Return the UTF-8 pointer C<s> displaced by C<off> characters, either
577forward or backward.
b06226ff 578
579WARNING: do not use the following unless you *know* C<off> is within
8850bf83 580the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
581on the first byte of character or just after the last byte of a character.
b06226ff 582
37607a96 583=cut
584*/
a0ed51b3 585
586U8 *
864dbfa3 587Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 588{
8850bf83 589 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
590 * the bitops (especially ~) can create illegal UTF-8.
591 * In other words: in Perl UTF-8 is not just for Unicode. */
592
a0ed51b3 593 if (off >= 0) {
594 while (off--)
595 s += UTF8SKIP(s);
596 }
597 else {
598 while (off++) {
599 s--;
8850bf83 600 while (UTF8_IS_CONTINUATION(*s))
601 s--;
a0ed51b3 602 }
603 }
604 return s;
605}
606
6940069f 607/*
eebe1485 608=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 609
246fae53 610Converts a string C<s> of length C<len> from UTF8 into byte encoding.
611Unlike C<bytes_to_utf8>, this over-writes the original string, and
612updates len to contain the new length.
67e989fb 613Returns zero on failure, setting C<len> to -1.
6940069f 614
615=cut
616*/
617
618U8 *
37607a96 619Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 620{
6940069f 621 U8 *send;
622 U8 *d;
dcad2880 623 U8 *save = s;
246fae53 624
625 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880 626 for (send = s + *len; s < send; ) {
627 U8 c = *s++;
628
1d72bdf6 629 if (!UTF8_IS_INVARIANT(c) &&
630 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
631 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880 632 *len = -1;
633 return 0;
634 }
246fae53 635 }
dcad2880 636
637 d = s = save;
6940069f 638 while (s < send) {
ed646e6e 639 STRLEN ulen;
9041c2e3 640 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 641 s += ulen;
6940069f 642 }
643 *d = '\0';
246fae53 644 *len = d - save;
6940069f 645 return save;
646}
647
648/*
f9a63242 649=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
650
651Converts a string C<s> of length C<len> from UTF8 into byte encoding.
652Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0 653the newly-created string, and updates C<len> to contain the new
654length. Returns the original string if no conversion occurs, C<len>
655is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6560 if C<s> is converted or contains all 7bit characters.
f9a63242 657
37607a96 658=cut
659*/
f9a63242 660
661U8 *
37607a96 662Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 663{
f9a63242 664 U8 *d;
665 U8 *start = s;
db42d148 666 U8 *send;
f9a63242 667 I32 count = 0;
668
669 if (!*is_utf8)
670 return start;
671
ef9edfd0 672 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242 673 for (send = s + *len; s < send;) {
674 U8 c = *s++;
1d72bdf6 675 if (!UTF8_IS_INVARIANT(c)) {
db42d148 676 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
677 (c = *s++) && UTF8_IS_CONTINUATION(c))
678 count++;
679 else
f9a63242 680 return start;
db42d148 681 }
f9a63242 682 }
683
684 *is_utf8 = 0;
685
f9a63242 686 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 687 s = start; start = d;
f9a63242 688 while (s < send) {
689 U8 c = *s++;
c4d5f83a 690 if (!UTF8_IS_INVARIANT(c)) {
691 /* Then it is two-byte encoded */
692 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
693 c = ASCII_TO_NATIVE(c);
694 }
695 *d++ = c;
f9a63242 696 }
697 *d = '\0';
698 *len = d - start;
699 return start;
700}
701
702/*
eebe1485 703=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 704
705Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e 706Returns a pointer to the newly-created string, and sets C<len> to
707reflect the new length.
6940069f 708
497711e7 709=cut
6940069f 710*/
711
712U8*
37607a96 713Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 714{
6940069f 715 U8 *send;
716 U8 *d;
717 U8 *dst;
6662521e 718 send = s + (*len);
6940069f 719
6662521e 720 Newz(801, d, (*len) * 2 + 1, U8);
6940069f 721 dst = d;
722
723 while (s < send) {
db42d148 724 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 725 if (UNI_IS_INVARIANT(uv))
726 *d++ = UTF_TO_NATIVE(uv);
6940069f 727 else {
90f44359 728 *d++ = UTF8_EIGHT_BIT_HI(uv);
729 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f 730 }
731 }
732 *d = '\0';
6662521e 733 *len = d-dst;
6940069f 734 return dst;
735}
736
a0ed51b3 737/*
dea0fc0b 738 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3 739 *
740 * Destination must be pre-extended to 3/2 source. Do not use in-place.
741 * We optimize for native, for obvious reasons. */
742
743U8*
dea0fc0b 744Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 745{
dea0fc0b 746 U8* pend;
747 U8* dstart = d;
748
749 if (bytelen & 1)
a7867d0a 750 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b 751
752 pend = p + bytelen;
753
a0ed51b3 754 while (p < pend) {
dea0fc0b 755 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
756 p += 2;
a0ed51b3 757 if (uv < 0x80) {
758 *d++ = uv;
759 continue;
760 }
761 if (uv < 0x800) {
762 *d++ = (( uv >> 6) | 0xc0);
763 *d++ = (( uv & 0x3f) | 0x80);
764 continue;
765 }
766 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b 767 UV low = *p++;
768 if (low < 0xdc00 || low >= 0xdfff)
769 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 770 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
771 }
772 if (uv < 0x10000) {
773 *d++ = (( uv >> 12) | 0xe0);
774 *d++ = (((uv >> 6) & 0x3f) | 0x80);
775 *d++ = (( uv & 0x3f) | 0x80);
776 continue;
777 }
778 else {
779 *d++ = (( uv >> 18) | 0xf0);
780 *d++ = (((uv >> 12) & 0x3f) | 0x80);
781 *d++ = (((uv >> 6) & 0x3f) | 0x80);
782 *d++ = (( uv & 0x3f) | 0x80);
783 continue;
784 }
785 }
dea0fc0b 786 *newlen = d - dstart;
a0ed51b3 787 return d;
788}
789
790/* Note: this one is slightly destructive of the source. */
791
792U8*
dea0fc0b 793Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 794{
795 U8* s = (U8*)p;
796 U8* send = s + bytelen;
797 while (s < send) {
798 U8 tmp = s[0];
799 s[0] = s[1];
800 s[1] = tmp;
801 s += 2;
802 }
dea0fc0b 803 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3 804}
805
806/* for now these are all defined (inefficiently) in terms of the utf8 versions */
807
808bool
864dbfa3 809Perl_is_uni_alnum(pTHX_ U32 c)
a0ed51b3 810{
ad391ad9 811 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 812 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 813 return is_utf8_alnum(tmpbuf);
814}
815
816bool
b8c5462f 817Perl_is_uni_alnumc(pTHX_ U32 c)
818{
ad391ad9 819 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 820 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 821 return is_utf8_alnumc(tmpbuf);
822}
823
824bool
864dbfa3 825Perl_is_uni_idfirst(pTHX_ U32 c)
a0ed51b3 826{
ad391ad9 827 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 828 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 829 return is_utf8_idfirst(tmpbuf);
830}
831
832bool
864dbfa3 833Perl_is_uni_alpha(pTHX_ U32 c)
a0ed51b3 834{
ad391ad9 835 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 836 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 837 return is_utf8_alpha(tmpbuf);
838}
839
840bool
4d61ec05 841Perl_is_uni_ascii(pTHX_ U32 c)
842{
ad391ad9 843 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 844 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05 845 return is_utf8_ascii(tmpbuf);
846}
847
848bool
864dbfa3 849Perl_is_uni_space(pTHX_ U32 c)
a0ed51b3 850{
ad391ad9 851 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 852 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 853 return is_utf8_space(tmpbuf);
854}
855
856bool
864dbfa3 857Perl_is_uni_digit(pTHX_ U32 c)
a0ed51b3 858{
ad391ad9 859 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 860 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 861 return is_utf8_digit(tmpbuf);
862}
863
864bool
864dbfa3 865Perl_is_uni_upper(pTHX_ U32 c)
a0ed51b3 866{
ad391ad9 867 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 868 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 869 return is_utf8_upper(tmpbuf);
870}
871
872bool
864dbfa3 873Perl_is_uni_lower(pTHX_ U32 c)
a0ed51b3 874{
ad391ad9 875 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 876 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 877 return is_utf8_lower(tmpbuf);
878}
879
880bool
b8c5462f 881Perl_is_uni_cntrl(pTHX_ U32 c)
882{
ad391ad9 883 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 884 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 885 return is_utf8_cntrl(tmpbuf);
886}
887
888bool
889Perl_is_uni_graph(pTHX_ U32 c)
890{
ad391ad9 891 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 892 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 893 return is_utf8_graph(tmpbuf);
894}
895
896bool
864dbfa3 897Perl_is_uni_print(pTHX_ U32 c)
a0ed51b3 898{
ad391ad9 899 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 900 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 901 return is_utf8_print(tmpbuf);
902}
903
b8c5462f 904bool
f248d071 905Perl_is_uni_punct(pTHX_ U32 c)
b8c5462f 906{
ad391ad9 907 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 908 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f 909 return is_utf8_punct(tmpbuf);
910}
911
4d61ec05 912bool
913Perl_is_uni_xdigit(pTHX_ U32 c)
914{
ad391ad9 915 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 916 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05 917 return is_utf8_xdigit(tmpbuf);
918}
919
a0ed51b3 920U32
864dbfa3 921Perl_to_uni_upper(pTHX_ U32 c)
a0ed51b3 922{
ad391ad9 923 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 924 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 925 return to_utf8_upper(tmpbuf);
926}
927
928U32
864dbfa3 929Perl_to_uni_title(pTHX_ U32 c)
a0ed51b3 930{
ad391ad9 931 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 932 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 933 return to_utf8_title(tmpbuf);
934}
935
936U32
864dbfa3 937Perl_to_uni_lower(pTHX_ U32 c)
a0ed51b3 938{
ad391ad9 939 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 940 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3 941 return to_utf8_lower(tmpbuf);
942}
943
944/* for now these all assume no locale info available for Unicode > 255 */
945
946bool
864dbfa3 947Perl_is_uni_alnum_lc(pTHX_ U32 c)
a0ed51b3 948{
949 return is_uni_alnum(c); /* XXX no locale support yet */
950}
951
952bool
b8c5462f 953Perl_is_uni_alnumc_lc(pTHX_ U32 c)
954{
955 return is_uni_alnumc(c); /* XXX no locale support yet */
956}
957
958bool
864dbfa3 959Perl_is_uni_idfirst_lc(pTHX_ U32 c)
a0ed51b3 960{
961 return is_uni_idfirst(c); /* XXX no locale support yet */
962}
963
964bool
864dbfa3 965Perl_is_uni_alpha_lc(pTHX_ U32 c)
a0ed51b3 966{
967 return is_uni_alpha(c); /* XXX no locale support yet */
968}
969
970bool
4d61ec05 971Perl_is_uni_ascii_lc(pTHX_ U32 c)
972{
973 return is_uni_ascii(c); /* XXX no locale support yet */
974}
975
976bool
864dbfa3 977Perl_is_uni_space_lc(pTHX_ U32 c)
a0ed51b3 978{
979 return is_uni_space(c); /* XXX no locale support yet */
980}
981
982bool
864dbfa3 983Perl_is_uni_digit_lc(pTHX_ U32 c)
a0ed51b3 984{
985 return is_uni_digit(c); /* XXX no locale support yet */
986}
987
988bool
864dbfa3 989Perl_is_uni_upper_lc(pTHX_ U32 c)
a0ed51b3 990{
991 return is_uni_upper(c); /* XXX no locale support yet */
992}
993
994bool
864dbfa3 995Perl_is_uni_lower_lc(pTHX_ U32 c)
a0ed51b3 996{
997 return is_uni_lower(c); /* XXX no locale support yet */
998}
999
1000bool
b8c5462f 1001Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1002{
1003 return is_uni_cntrl(c); /* XXX no locale support yet */
1004}
1005
1006bool
1007Perl_is_uni_graph_lc(pTHX_ U32 c)
1008{
1009 return is_uni_graph(c); /* XXX no locale support yet */
1010}
1011
1012bool
864dbfa3 1013Perl_is_uni_print_lc(pTHX_ U32 c)
a0ed51b3 1014{
1015 return is_uni_print(c); /* XXX no locale support yet */
1016}
1017
b8c5462f 1018bool
1019Perl_is_uni_punct_lc(pTHX_ U32 c)
1020{
1021 return is_uni_punct(c); /* XXX no locale support yet */
1022}
1023
4d61ec05 1024bool
1025Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1026{
1027 return is_uni_xdigit(c); /* XXX no locale support yet */
1028}
1029
a0ed51b3 1030U32
864dbfa3 1031Perl_to_uni_upper_lc(pTHX_ U32 c)
a0ed51b3 1032{
1033 return to_uni_upper(c); /* XXX no locale support yet */
1034}
1035
1036U32
864dbfa3 1037Perl_to_uni_title_lc(pTHX_ U32 c)
a0ed51b3 1038{
1039 return to_uni_title(c); /* XXX no locale support yet */
1040}
1041
1042U32
864dbfa3 1043Perl_to_uni_lower_lc(pTHX_ U32 c)
a0ed51b3 1044{
1045 return to_uni_lower(c); /* XXX no locale support yet */
1046}
1047
a0ed51b3 1048bool
864dbfa3 1049Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1050{
386d01d6 1051 if (!is_utf8_char(p))
1052 return FALSE;
a0ed51b3 1053 if (!PL_utf8_alnum)
289d4f09 1054 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1055 * descendant of isalnum(3), in other words, it doesn't
1056 * contain the '_'. --jhi */
1057 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
a0ed51b3 1058 return swash_fetch(PL_utf8_alnum, p);
1059/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1060#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1061 if (!PL_utf8_alnum)
1062 PL_utf8_alnum = swash_init("utf8", "",
1063 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1064 return swash_fetch(PL_utf8_alnum, p);
1065#endif
1066}
1067
1068bool
b8c5462f 1069Perl_is_utf8_alnumc(pTHX_ U8 *p)
1070{
386d01d6 1071 if (!is_utf8_char(p))
1072 return FALSE;
b8c5462f 1073 if (!PL_utf8_alnum)
1074 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1075 return swash_fetch(PL_utf8_alnum, p);
1076/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1077#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1078 if (!PL_utf8_alnum)
1079 PL_utf8_alnum = swash_init("utf8", "",
1080 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1081 return swash_fetch(PL_utf8_alnum, p);
1082#endif
1083}
1084
1085bool
864dbfa3 1086Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3 1087{
1088 return *p == '_' || is_utf8_alpha(p);
1089}
1090
1091bool
864dbfa3 1092Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1093{
386d01d6 1094 if (!is_utf8_char(p))
1095 return FALSE;
a0ed51b3 1096 if (!PL_utf8_alpha)
e24b16f9 1097 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
a0ed51b3 1098 return swash_fetch(PL_utf8_alpha, p);
1099}
1100
1101bool
b8c5462f 1102Perl_is_utf8_ascii(pTHX_ U8 *p)
1103{
386d01d6 1104 if (!is_utf8_char(p))
1105 return FALSE;
b8c5462f 1106 if (!PL_utf8_ascii)
1107 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1108 return swash_fetch(PL_utf8_ascii, p);
1109}
1110
1111bool
864dbfa3 1112Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1113{
386d01d6 1114 if (!is_utf8_char(p))
1115 return FALSE;
a0ed51b3 1116 if (!PL_utf8_space)
3bec3564 1117 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
a0ed51b3 1118 return swash_fetch(PL_utf8_space, p);
1119}
1120
1121bool
864dbfa3 1122Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1123{
386d01d6 1124 if (!is_utf8_char(p))
1125 return FALSE;
a0ed51b3 1126 if (!PL_utf8_digit)
e24b16f9 1127 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
a0ed51b3 1128 return swash_fetch(PL_utf8_digit, p);
1129}
1130
1131bool
864dbfa3 1132Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1133{
386d01d6 1134 if (!is_utf8_char(p))
1135 return FALSE;
a0ed51b3 1136 if (!PL_utf8_upper)
e24b16f9 1137 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
a0ed51b3 1138 return swash_fetch(PL_utf8_upper, p);
1139}
1140
1141bool
864dbfa3 1142Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1143{
386d01d6 1144 if (!is_utf8_char(p))
1145 return FALSE;
a0ed51b3 1146 if (!PL_utf8_lower)
e24b16f9 1147 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
a0ed51b3 1148 return swash_fetch(PL_utf8_lower, p);
1149}
1150
1151bool
b8c5462f 1152Perl_is_utf8_cntrl(pTHX_ U8 *p)
1153{
386d01d6 1154 if (!is_utf8_char(p))
1155 return FALSE;
b8c5462f 1156 if (!PL_utf8_cntrl)
1157 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1158 return swash_fetch(PL_utf8_cntrl, p);
1159}
1160
1161bool
1162Perl_is_utf8_graph(pTHX_ U8 *p)
1163{
386d01d6 1164 if (!is_utf8_char(p))
1165 return FALSE;
b8c5462f 1166 if (!PL_utf8_graph)
1167 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1168 return swash_fetch(PL_utf8_graph, p);
1169}
1170
1171bool
864dbfa3 1172Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1173{
386d01d6 1174 if (!is_utf8_char(p))
1175 return FALSE;
a0ed51b3 1176 if (!PL_utf8_print)
e24b16f9 1177 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
a0ed51b3 1178 return swash_fetch(PL_utf8_print, p);
1179}
1180
1181bool
b8c5462f 1182Perl_is_utf8_punct(pTHX_ U8 *p)
1183{
386d01d6 1184 if (!is_utf8_char(p))
1185 return FALSE;
b8c5462f 1186 if (!PL_utf8_punct)
1187 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1188 return swash_fetch(PL_utf8_punct, p);
1189}
1190
1191bool
1192Perl_is_utf8_xdigit(pTHX_ U8 *p)
1193{
386d01d6 1194 if (!is_utf8_char(p))
1195 return FALSE;
b8c5462f 1196 if (!PL_utf8_xdigit)
1197 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1198 return swash_fetch(PL_utf8_xdigit, p);
1199}
1200
1201bool
864dbfa3 1202Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1203{
386d01d6 1204 if (!is_utf8_char(p))
1205 return FALSE;
a0ed51b3 1206 if (!PL_utf8_mark)
e24b16f9 1207 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
a0ed51b3 1208 return swash_fetch(PL_utf8_mark, p);
1209}
1210
2104c8d9 1211UV
864dbfa3 1212Perl_to_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1213{
1214 UV uv;
1215
1216 if (!PL_utf8_toupper)
e24b16f9 1217 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
a0ed51b3 1218 uv = swash_fetch(PL_utf8_toupper, p);
9041c2e3 1219 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3 1220}
1221
2104c8d9 1222UV
864dbfa3 1223Perl_to_utf8_title(pTHX_ U8 *p)
a0ed51b3 1224{
1225 UV uv;
1226
1227 if (!PL_utf8_totitle)
e24b16f9 1228 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
a0ed51b3 1229 uv = swash_fetch(PL_utf8_totitle, p);
9041c2e3 1230 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3 1231}
1232
2104c8d9 1233UV
864dbfa3 1234Perl_to_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1235{
1236 UV uv;
1237
1238 if (!PL_utf8_tolower)
e24b16f9 1239 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
a0ed51b3 1240 uv = swash_fetch(PL_utf8_tolower, p);
9041c2e3 1241 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3 1242}
1243
1244/* a "swash" is a swatch hash */
1245
1246SV*
864dbfa3 1247Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1248{
1249 SV* retval;
bf1fed83 1250 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1251 dSP;
1b026014 1252 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
ce3b816e 1253
1b026014 1254 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1255 ENTER;
1256 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1257 LEAVE;
1258 }
1259 SPAGAIN;
a0ed51b3 1260 PUSHSTACKi(PERLSI_MAGIC);
1261 PUSHMARK(SP);
1262 EXTEND(SP,5);
1263 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1264 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1265 PUSHs(listsv);
1266 PUSHs(sv_2mortal(newSViv(minbits)));
1267 PUSHs(sv_2mortal(newSViv(none)));
1268 PUTBACK;
1269 ENTER;
1270 SAVEI32(PL_hints);
1271 PL_hints = 0;
1272 save_re_context();
bf1fed83 1273 if (PL_curcop == &PL_compiling)
1274 /* XXX ought to be handled by lex_start */
1275 sv_setpv(tokenbufsv, PL_tokenbuf);
864dbfa3 1276 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1277 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1278 else
e24b16f9 1279 retval = &PL_sv_undef;
a0ed51b3 1280 LEAVE;
1281 POPSTACK;
e24b16f9 1282 if (PL_curcop == &PL_compiling) {
bf1fed83 1283 STRLEN len;
1284 char* pv = SvPV(tokenbufsv, len);
1285
1286 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1287 PL_curcop->op_private = PL_hints;
a0ed51b3 1288 }
1289 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1290 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3 1291 return retval;
1292}
1293
1294UV
864dbfa3 1295Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
a0ed51b3 1296{
1297 HV* hv = (HV*)SvRV(sv);
7d85a32c 1298 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1299 then the "swatch" is a vec() for al the chars which start
1300 with 0xAA..0xYY
1301 So the key in the hash is length of encoded char -1
1302 */
a0ed51b3 1303 U32 klen = UTF8SKIP(ptr) - 1;
7d85a32c 1304 U32 off = ptr[klen];
a0ed51b3 1305 STRLEN slen;
7d85a32c 1306 STRLEN needents;
dfe13c55 1307 U8 *tmps;
a0ed51b3 1308 U32 bit;
1309 SV *retval;
1310
7d85a32c 1311 if (klen == 0)
1312 {
1313 /* If char in invariant then swatch is for all the invariant chars
1314 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1315 */
1316 needents = UTF_CONTINUATION_MARK;
1317 off = NATIVE_TO_UTF(ptr[klen]);
1318 }
1319 else
1320 {
1321 /* If char is encoded then swatch is for the prefix */
1322 needents = (1 << UTF_ACCUMULATION_SHIFT);
1323 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1324 }
1325
a0ed51b3 1326 /*
1327 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1328 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1329 * it's nothing to sniff at.) Pity we usually come through at least
1330 * two function calls to get here...
1331 *
1332 * NB: this code assumes that swatches are never modified, once generated!
1333 */
1334
1335 if (hv == PL_last_swash_hv &&
1336 klen == PL_last_swash_klen &&
12ae5dfc 1337 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
a0ed51b3 1338 {
1339 tmps = PL_last_swash_tmps;
1340 slen = PL_last_swash_slen;
1341 }
1342 else {
1343 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1344 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3 1345
1346 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1347 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1348 dSP;
2b9d42f0 1349 /* We use utf8n_to_uvuni() as we want an index into
1350 Unicode tables, not a native character number.
1351 */
1352 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
a0ed51b3 1353 ENTER;
1354 SAVETMPS;
1355 save_re_context();
1356 PUSHSTACKi(PERLSI_MAGIC);
1357 PUSHMARK(SP);
1358 EXTEND(SP,3);
1359 PUSHs((SV*)sv);
ffbc6a93 1360 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1361 PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
a0ed51b3 1362 PUSHs(sv_2mortal(newSViv(needents)));
1363 PUTBACK;
864dbfa3 1364 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1365 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1366 else
e24b16f9 1367 retval = &PL_sv_undef;
a0ed51b3 1368 POPSTACK;
1369 FREETMPS;
1370 LEAVE;
e24b16f9 1371 if (PL_curcop == &PL_compiling)
1372 PL_curcop->op_private = PL_hints;
a0ed51b3 1373
dfe13c55 1374 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1375
7d85a32c 1376 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1377 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3 1378 }
1379
1380 PL_last_swash_hv = hv;
1381 PL_last_swash_klen = klen;
1382 PL_last_swash_tmps = tmps;
1383 PL_last_swash_slen = slen;
1384 if (klen)
1385 Copy(ptr, PL_last_swash_key, klen, U8);
1386 }
1387
9faf8d75 1388 switch ((int)((slen << 3) / needents)) {
a0ed51b3 1389 case 1:
1390 bit = 1 << (off & 7);
1391 off >>= 3;
1392 return (tmps[off] & bit) != 0;
1393 case 8:
1394 return tmps[off];
1395 case 16:
1396 off <<= 1;
1397 return (tmps[off] << 8) + tmps[off + 1] ;
1398 case 32:
1399 off <<= 2;
1400 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1401 }
cea2e8a9 1402 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3 1403 return 0;
1404}
2b9d42f0 1405
1406
1407/*
37607a96 1408=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1409
1410Adds the UTF8 representation of the Native codepoint C<uv> to the end
1411of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1412bytes available. The return value is the pointer to the byte after the
1413end of the new character. In other words,
1414
1415 d = uvchr_to_utf8(d, uv);
1416
1417is the recommended wide native character-aware way of saying
1418
1419 *(d++) = uv;
1420
1421=cut
1422*/
1423
1424/* On ASCII machines this is normally a macro but we want a
1425 real function in case XS code wants it
1426*/
1427#undef Perl_uvchr_to_utf8
1428U8 *
1429Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1430{
1431 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1432}
1433
1434
1435/*
37607a96 1436=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0 1437
1438Returns the native character value of the first character in the string C<s>
1439which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1440length, in bytes, of that character.
1441
1442Allows length and flags to be passed to low level routine.
1443
1444=cut
1445*/
1446/* On ASCII machines this is normally a macro but we want a
1447 real function in case XS code wants it
1448*/
1449#undef Perl_utf8n_to_uvchr
1450UV
37607a96 1451Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1452{
1453 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1454 return UNI_TO_NATIVE(uv);
1455}
1456
1457