3 * Copyright (c) 1998-2001, Larry Wall
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
24 #define PERL_IN_UTF8_C
30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34 bytes available. The return value is the pointer to the byte after the
35 end of the new character. In other words,
37 d = uvuni_to_utf8(d, uv);
39 is the recommended Unicode-aware way of saying
47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
49 if (ckWARN_d(WARN_UTF8)) {
50 if (UNICODE_IS_SURROGATE(uv))
51 Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
52 else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
53 (uv == 0xFFFE || uv == 0xFFFF))
54 Perl_warner(aTHX_ WARN_UTF8,
55 "Unicode character 0x%04"UVxf" is illegal", uv);
57 if (UNI_IS_INVARIANT(uv)) {
58 *d++ = UTF_TO_NATIVE(uv);
63 STRLEN len = UNISKIP(uv);
66 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
67 uv >>= UTF_ACCUMULATION_SHIFT;
69 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
72 #else /* Non loop style */
74 *d++ = (( uv >> 6) | 0xc0);
75 *d++ = (( uv & 0x3f) | 0x80);
79 *d++ = (( uv >> 12) | 0xe0);
80 *d++ = (((uv >> 6) & 0x3f) | 0x80);
81 *d++ = (( uv & 0x3f) | 0x80);
85 *d++ = (( uv >> 18) | 0xf0);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
92 *d++ = (( uv >> 24) | 0xf8);
93 *d++ = (((uv >> 18) & 0x3f) | 0x80);
94 *d++ = (((uv >> 12) & 0x3f) | 0x80);
95 *d++ = (((uv >> 6) & 0x3f) | 0x80);
96 *d++ = (( uv & 0x3f) | 0x80);
99 if (uv < 0x80000000) {
100 *d++ = (( uv >> 30) | 0xfc);
101 *d++ = (((uv >> 24) & 0x3f) | 0x80);
102 *d++ = (((uv >> 18) & 0x3f) | 0x80);
103 *d++ = (((uv >> 12) & 0x3f) | 0x80);
104 *d++ = (((uv >> 6) & 0x3f) | 0x80);
105 *d++ = (( uv & 0x3f) | 0x80);
109 if (uv < UTF8_QUAD_MAX)
112 *d++ = 0xfe; /* Can't match U+FEFF! */
113 *d++ = (((uv >> 30) & 0x3f) | 0x80);
114 *d++ = (((uv >> 24) & 0x3f) | 0x80);
115 *d++ = (((uv >> 18) & 0x3f) | 0x80);
116 *d++ = (((uv >> 12) & 0x3f) | 0x80);
117 *d++ = (((uv >> 6) & 0x3f) | 0x80);
118 *d++ = (( uv & 0x3f) | 0x80);
123 *d++ = 0xff; /* Can't match U+FFFE! */
124 *d++ = 0x80; /* 6 Reserved bits */
125 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
126 *d++ = (((uv >> 54) & 0x3f) | 0x80);
127 *d++ = (((uv >> 48) & 0x3f) | 0x80);
128 *d++ = (((uv >> 42) & 0x3f) | 0x80);
129 *d++ = (((uv >> 36) & 0x3f) | 0x80);
130 *d++ = (((uv >> 30) & 0x3f) | 0x80);
131 *d++ = (((uv >> 24) & 0x3f) | 0x80);
132 *d++ = (((uv >> 18) & 0x3f) | 0x80);
133 *d++ = (((uv >> 12) & 0x3f) | 0x80);
134 *d++ = (((uv >> 6) & 0x3f) | 0x80);
135 *d++ = (( uv & 0x3f) | 0x80);
139 #endif /* Loop style */
145 =for apidoc A|STRLEN|is_utf8_char|U8 *s
147 Tests if some arbitrary number of bytes begins in a valid UTF-8
148 character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
149 The actual number of bytes in the UTF-8 character will be returned if
150 it is valid, otherwise 0.
155 Perl_is_utf8_char(pTHX_ U8 *s)
161 if (UTF8_IS_INVARIANT(u))
164 if (!UTF8_IS_START(u))
169 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
174 u &= UTF_START_MASK(len);
178 if (!UTF8_IS_CONTINUATION(*s))
180 uv = UTF8_ACCUMULATE(uv, *s);
187 if (UNISKIP(uv) < len)
194 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
196 Returns true if first C<len> bytes of the given string form a valid UTF8
197 string, false otherwise. Note that 'a valid UTF8 string' does not mean
198 'a string that contains UTF8' because a valid ASCII string is a valid
205 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
212 len = strlen((char *)s);
228 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
230 Bottom level UTF-8 decode routine.
231 Returns the unicode code point value of the first character in the string C<s>
232 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
233 C<retlen> will be set to the length, in bytes, of that character.
235 If C<s> does not point to a well-formed UTF8 character, the behaviour
236 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
237 it is assumed that the caller will raise a warning, and this function
238 will silently just set C<retlen> to C<-1> and return zero. If the
239 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
240 malformations will be given, C<retlen> will be set to the expected
241 length of the UTF-8 character in bytes, and zero will be returned.
243 The C<flags> can also contain various flags to allow deviations from
244 the strict UTF-8 encoding (see F<utf8.h>).
246 Most code should use utf8_to_uvchr() rather than call this directly.
252 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
256 bool dowarn = ckWARN_d(WARN_UTF8);
257 STRLEN expectlen = 0;
260 /* This list is a superset of the UTF8_ALLOW_XXX. */
262 #define UTF8_WARN_EMPTY 1
263 #define UTF8_WARN_CONTINUATION 2
264 #define UTF8_WARN_NON_CONTINUATION 3
265 #define UTF8_WARN_FE_FF 4
266 #define UTF8_WARN_SHORT 5
267 #define UTF8_WARN_OVERFLOW 6
268 #define UTF8_WARN_SURROGATE 7
269 #define UTF8_WARN_BOM 8
270 #define UTF8_WARN_LONG 9
271 #define UTF8_WARN_FFFF 10
274 !(flags & UTF8_ALLOW_EMPTY)) {
275 warning = UTF8_WARN_EMPTY;
279 if (UTF8_IS_INVARIANT(uv)) {
282 return (UV) (NATIVE_TO_UTF(*s));
285 if (UTF8_IS_CONTINUATION(uv) &&
286 !(flags & UTF8_ALLOW_CONTINUATION)) {
287 warning = UTF8_WARN_CONTINUATION;
291 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
292 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
293 warning = UTF8_WARN_NON_CONTINUATION;
298 uv = NATIVE_TO_UTF(uv);
300 if ((uv == 0xfe || uv == 0xff) &&
301 !(flags & UTF8_ALLOW_FE_FF)) {
302 warning = UTF8_WARN_FE_FF;
307 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
308 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
309 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
310 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
312 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
313 else { len = 7; uv &= 0x01; }
315 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
316 else if (!(uv & 0x01)) { len = 7; uv = 0; }
317 else { len = 13; uv = 0; } /* whoa! */
325 if ((curlen < expectlen) &&
326 !(flags & UTF8_ALLOW_SHORT)) {
327 warning = UTF8_WARN_SHORT;
336 if (!UTF8_IS_CONTINUATION(*s) &&
337 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
339 warning = UTF8_WARN_NON_CONTINUATION;
343 uv = UTF8_ACCUMULATE(uv, *s);
345 /* These cannot be allowed. */
347 if (!(flags & UTF8_ALLOW_LONG)) {
348 warning = UTF8_WARN_LONG;
352 else { /* uv < ouv */
353 /* This cannot be allowed. */
354 warning = UTF8_WARN_OVERFLOW;
362 if (UNICODE_IS_SURROGATE(uv) &&
363 !(flags & UTF8_ALLOW_SURROGATE)) {
364 warning = UTF8_WARN_SURROGATE;
366 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
367 !(flags & UTF8_ALLOW_BOM)) {
368 warning = UTF8_WARN_BOM;
370 } else if ((expectlen > UNISKIP(uv)) &&
371 !(flags & UTF8_ALLOW_LONG)) {
372 warning = UTF8_WARN_LONG;
374 } else if (UNICODE_IS_ILLEGAL(uv) &&
375 !(flags & UTF8_ALLOW_FFFF)) {
376 warning = UTF8_WARN_FFFF;
384 if (flags & UTF8_CHECK_ONLY) {
391 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
394 case 0: /* Intentionally empty. */ break;
395 case UTF8_WARN_EMPTY:
396 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
398 case UTF8_WARN_CONTINUATION:
399 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
401 case UTF8_WARN_NON_CONTINUATION:
402 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
405 case UTF8_WARN_FE_FF:
406 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
408 case UTF8_WARN_SHORT:
409 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
410 curlen, curlen == 1 ? "" : "s", expectlen);
411 expectlen = curlen; /* distance for caller to skip */
413 case UTF8_WARN_OVERFLOW:
414 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
417 case UTF8_WARN_SURROGATE:
418 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
421 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
424 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
425 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
428 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
431 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
439 Perl_warner(aTHX_ WARN_UTF8,
440 "%s in %s", s, OP_DESC(PL_op));
442 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
447 *retlen = expectlen ? expectlen : len;
453 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
455 Returns the native character value of the first character in the string C<s>
456 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
457 length, in bytes, of that character.
459 If C<s> does not point to a well-formed UTF8 character, zero is
460 returned and retlen is set, if possible, to -1.
466 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
468 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
472 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
474 Returns the Unicode code point of the first character in the string C<s>
475 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
476 length, in bytes, of that character.
478 This function should only be used when returned UV is considered
479 an index into the Unicode semantic tables (e.g. swashes).
481 If C<s> does not point to a well-formed UTF8 character, zero is
482 returned and retlen is set, if possible, to -1.
488 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
490 /* Call the low level routine asking for checks */
491 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
495 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
497 Return the length of the UTF-8 char encoded string C<s> in characters.
498 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
499 up past C<e>, croaks.
505 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
509 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
510 * the bitops (especially ~) can create illegal UTF-8.
511 * In other words: in Perl UTF-8 is not just for Unicode. */
514 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
519 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
528 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
530 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
533 WARNING: use only if you *know* that the pointers point inside the
540 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
544 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
545 * the bitops (especially ~) can create illegal UTF-8.
546 * In other words: in Perl UTF-8 is not just for Unicode. */
553 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
563 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
573 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
575 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
578 WARNING: do not use the following unless you *know* C<off> is within
579 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
580 on the first byte of character or just after the last byte of a character.
586 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
588 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
589 * the bitops (especially ~) can create illegal UTF-8.
590 * In other words: in Perl UTF-8 is not just for Unicode. */
599 while (UTF8_IS_CONTINUATION(*s))
607 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
609 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
610 Unlike C<bytes_to_utf8>, this over-writes the original string, and
611 updates len to contain the new length.
612 Returns zero on failure, setting C<len> to -1.
618 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
624 /* ensure valid UTF8 and chars < 256 before updating string */
625 for (send = s + *len; s < send; ) {
628 if (!UTF8_IS_INVARIANT(c) &&
629 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
630 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
639 *d++ = (U8)utf8_to_uvchr(s, &ulen);
648 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
650 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
651 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
652 the newly-created string, and updates C<len> to contain the new
653 length. Returns the original string if no conversion occurs, C<len>
654 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
655 0 if C<s> is converted or contains all 7bit characters.
661 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
671 /* ensure valid UTF8 and chars < 256 before converting string */
672 for (send = s + *len; s < send;) {
674 if (!UTF8_IS_INVARIANT(c)) {
675 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
676 (c = *s++) && UTF8_IS_CONTINUATION(c))
685 Newz(801, d, (*len) - count + 1, U8);
686 s = start; start = d;
689 if (!UTF8_IS_INVARIANT(c)) {
690 /* Then it is two-byte encoded */
691 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
692 c = ASCII_TO_NATIVE(c);
702 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
704 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
705 Returns a pointer to the newly-created string, and sets C<len> to
706 reflect the new length.
712 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
719 Newz(801, d, (*len) * 2 + 1, U8);
723 UV uv = NATIVE_TO_ASCII(*s++);
724 if (UNI_IS_INVARIANT(uv))
725 *d++ = UTF_TO_NATIVE(uv);
727 *d++ = UTF8_EIGHT_BIT_HI(uv);
728 *d++ = UTF8_EIGHT_BIT_LO(uv);
737 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
739 * Destination must be pre-extended to 3/2 source. Do not use in-place.
740 * We optimize for native, for obvious reasons. */
743 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
749 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
754 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
761 *d++ = (( uv >> 6) | 0xc0);
762 *d++ = (( uv & 0x3f) | 0x80);
765 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
767 if (low < 0xdc00 || low >= 0xdfff)
768 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
769 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
772 *d++ = (( uv >> 12) | 0xe0);
773 *d++ = (((uv >> 6) & 0x3f) | 0x80);
774 *d++ = (( uv & 0x3f) | 0x80);
778 *d++ = (( uv >> 18) | 0xf0);
779 *d++ = (((uv >> 12) & 0x3f) | 0x80);
780 *d++ = (((uv >> 6) & 0x3f) | 0x80);
781 *d++ = (( uv & 0x3f) | 0x80);
785 *newlen = d - dstart;
789 /* Note: this one is slightly destructive of the source. */
792 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
795 U8* send = s + bytelen;
802 return utf16_to_utf8(p, d, bytelen, newlen);
805 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
808 Perl_is_uni_alnum(pTHX_ UV c)
810 U8 tmpbuf[UTF8_MAXLEN+1];
811 uvchr_to_utf8(tmpbuf, (UV)c);
812 return is_utf8_alnum(tmpbuf);
816 Perl_is_uni_alnumc(pTHX_ UV c)
818 U8 tmpbuf[UTF8_MAXLEN+1];
819 uvchr_to_utf8(tmpbuf, (UV)c);
820 return is_utf8_alnumc(tmpbuf);
824 Perl_is_uni_idfirst(pTHX_ UV c)
826 U8 tmpbuf[UTF8_MAXLEN+1];
827 uvchr_to_utf8(tmpbuf, (UV)c);
828 return is_utf8_idfirst(tmpbuf);
832 Perl_is_uni_alpha(pTHX_ UV c)
834 U8 tmpbuf[UTF8_MAXLEN+1];
835 uvchr_to_utf8(tmpbuf, (UV)c);
836 return is_utf8_alpha(tmpbuf);
840 Perl_is_uni_ascii(pTHX_ UV c)
842 U8 tmpbuf[UTF8_MAXLEN+1];
843 uvchr_to_utf8(tmpbuf, (UV)c);
844 return is_utf8_ascii(tmpbuf);
848 Perl_is_uni_space(pTHX_ UV c)
850 U8 tmpbuf[UTF8_MAXLEN+1];
851 uvchr_to_utf8(tmpbuf, (UV)c);
852 return is_utf8_space(tmpbuf);
856 Perl_is_uni_digit(pTHX_ UV c)
858 U8 tmpbuf[UTF8_MAXLEN+1];
859 uvchr_to_utf8(tmpbuf, (UV)c);
860 return is_utf8_digit(tmpbuf);
864 Perl_is_uni_upper(pTHX_ UV c)
866 U8 tmpbuf[UTF8_MAXLEN+1];
867 uvchr_to_utf8(tmpbuf, (UV)c);
868 return is_utf8_upper(tmpbuf);
872 Perl_is_uni_lower(pTHX_ UV c)
874 U8 tmpbuf[UTF8_MAXLEN+1];
875 uvchr_to_utf8(tmpbuf, (UV)c);
876 return is_utf8_lower(tmpbuf);
880 Perl_is_uni_cntrl(pTHX_ UV c)
882 U8 tmpbuf[UTF8_MAXLEN+1];
883 uvchr_to_utf8(tmpbuf, (UV)c);
884 return is_utf8_cntrl(tmpbuf);
888 Perl_is_uni_graph(pTHX_ UV c)
890 U8 tmpbuf[UTF8_MAXLEN+1];
891 uvchr_to_utf8(tmpbuf, (UV)c);
892 return is_utf8_graph(tmpbuf);
896 Perl_is_uni_print(pTHX_ UV c)
898 U8 tmpbuf[UTF8_MAXLEN+1];
899 uvchr_to_utf8(tmpbuf, (UV)c);
900 return is_utf8_print(tmpbuf);
904 Perl_is_uni_punct(pTHX_ UV c)
906 U8 tmpbuf[UTF8_MAXLEN+1];
907 uvchr_to_utf8(tmpbuf, (UV)c);
908 return is_utf8_punct(tmpbuf);
912 Perl_is_uni_xdigit(pTHX_ UV c)
914 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
915 uvchr_to_utf8(tmpbuf, (UV)c);
916 return is_utf8_xdigit(tmpbuf);
920 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
922 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
923 uvchr_to_utf8(tmpbuf, (UV)c);
924 return to_utf8_upper(tmpbuf, p, lenp);
928 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
930 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
931 uvchr_to_utf8(tmpbuf, (UV)c);
932 return to_utf8_title(tmpbuf, p, lenp);
936 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
938 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
939 uvchr_to_utf8(tmpbuf, (UV)c);
940 return to_utf8_lower(tmpbuf, p, lenp);
944 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
946 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
947 uvchr_to_utf8(tmpbuf, (UV)c);
948 return to_utf8_fold(tmpbuf, p, lenp);
951 /* for now these all assume no locale info available for Unicode > 255 */
954 Perl_is_uni_alnum_lc(pTHX_ UV c)
956 return is_uni_alnum(c); /* XXX no locale support yet */
960 Perl_is_uni_alnumc_lc(pTHX_ UV c)
962 return is_uni_alnumc(c); /* XXX no locale support yet */
966 Perl_is_uni_idfirst_lc(pTHX_ UV c)
968 return is_uni_idfirst(c); /* XXX no locale support yet */
972 Perl_is_uni_alpha_lc(pTHX_ UV c)
974 return is_uni_alpha(c); /* XXX no locale support yet */
978 Perl_is_uni_ascii_lc(pTHX_ UV c)
980 return is_uni_ascii(c); /* XXX no locale support yet */
984 Perl_is_uni_space_lc(pTHX_ UV c)
986 return is_uni_space(c); /* XXX no locale support yet */
990 Perl_is_uni_digit_lc(pTHX_ UV c)
992 return is_uni_digit(c); /* XXX no locale support yet */
996 Perl_is_uni_upper_lc(pTHX_ UV c)
998 return is_uni_upper(c); /* XXX no locale support yet */
1002 Perl_is_uni_lower_lc(pTHX_ UV c)
1004 return is_uni_lower(c); /* XXX no locale support yet */
1008 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1010 return is_uni_cntrl(c); /* XXX no locale support yet */
1014 Perl_is_uni_graph_lc(pTHX_ UV c)
1016 return is_uni_graph(c); /* XXX no locale support yet */
1020 Perl_is_uni_print_lc(pTHX_ UV c)
1022 return is_uni_print(c); /* XXX no locale support yet */
1026 Perl_is_uni_punct_lc(pTHX_ UV c)
1028 return is_uni_punct(c); /* XXX no locale support yet */
1032 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1034 return is_uni_xdigit(c); /* XXX no locale support yet */
1038 Perl_is_utf8_alnum(pTHX_ U8 *p)
1040 if (!is_utf8_char(p))
1043 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1044 * descendant of isalnum(3), in other words, it doesn't
1045 * contain the '_'. --jhi */
1046 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1047 return swash_fetch(PL_utf8_alnum, p, TRUE);
1048 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1049 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1051 PL_utf8_alnum = swash_init("utf8", "",
1052 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1053 return swash_fetch(PL_utf8_alnum, p, TRUE);
1058 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1060 if (!is_utf8_char(p))
1063 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1064 return swash_fetch(PL_utf8_alnum, p, TRUE);
1065 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1066 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1068 PL_utf8_alnum = swash_init("utf8", "",
1069 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1070 return swash_fetch(PL_utf8_alnum, p, TRUE);
1075 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1077 return *p == '_' || is_utf8_alpha(p);
1081 Perl_is_utf8_alpha(pTHX_ U8 *p)
1083 if (!is_utf8_char(p))
1086 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1087 return swash_fetch(PL_utf8_alpha, p, TRUE);
1091 Perl_is_utf8_ascii(pTHX_ U8 *p)
1093 if (!is_utf8_char(p))
1096 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1097 return swash_fetch(PL_utf8_ascii, p, TRUE);
1101 Perl_is_utf8_space(pTHX_ U8 *p)
1103 if (!is_utf8_char(p))
1106 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1107 return swash_fetch(PL_utf8_space, p, TRUE);
1111 Perl_is_utf8_digit(pTHX_ U8 *p)
1113 if (!is_utf8_char(p))
1116 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1117 return swash_fetch(PL_utf8_digit, p, TRUE);
1121 Perl_is_utf8_upper(pTHX_ U8 *p)
1123 if (!is_utf8_char(p))
1126 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1127 return swash_fetch(PL_utf8_upper, p, TRUE);
1131 Perl_is_utf8_lower(pTHX_ U8 *p)
1133 if (!is_utf8_char(p))
1136 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1137 return swash_fetch(PL_utf8_lower, p, TRUE);
1141 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1143 if (!is_utf8_char(p))
1146 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1147 return swash_fetch(PL_utf8_cntrl, p, TRUE);
1151 Perl_is_utf8_graph(pTHX_ U8 *p)
1153 if (!is_utf8_char(p))
1156 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1157 return swash_fetch(PL_utf8_graph, p, TRUE);
1161 Perl_is_utf8_print(pTHX_ U8 *p)
1163 if (!is_utf8_char(p))
1166 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1167 return swash_fetch(PL_utf8_print, p, TRUE);
1171 Perl_is_utf8_punct(pTHX_ U8 *p)
1173 if (!is_utf8_char(p))
1176 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1177 return swash_fetch(PL_utf8_punct, p, TRUE);
1181 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1183 if (!is_utf8_char(p))
1185 if (!PL_utf8_xdigit)
1186 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1187 return swash_fetch(PL_utf8_xdigit, p, TRUE);
1191 Perl_is_utf8_mark(pTHX_ U8 *p)
1193 if (!is_utf8_char(p))
1196 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1197 return swash_fetch(PL_utf8_mark, p, TRUE);
1201 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1203 The "p" contains the pointer to the UTF-8 string encoding
1204 the character that is being converted.
1206 The "ustrp" is a pointer to the character buffer to put the
1207 conversion result to. The "lenp" is a pointer to the length
1210 The "swash" is a pointer to the swash to use.
1212 The "normal" is a string like "ToLower" which means the swash
1213 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1214 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1216 The "special" is a string like "utf8::ToSpecLower", which means
1217 the hash %utf8::ToSpecLower, which is stored in the same file,
1218 lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1219 to the hash is by Perl_to_utf8_case().
1225 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1230 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1231 uv = swash_fetch(*swashp, p, TRUE);
1233 uv = UNI_TO_NATIVE(uv);
1239 uv = utf8_to_uvchr(p, 0);
1241 if ((hv = get_hv(special, FALSE)) &&
1242 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1243 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1244 SV *val = HeVAL(he);
1245 char *s = SvPV(val, *lenp);
1247 if (*lenp > 1 || UNI_IS_INVARIANT(c))
1248 Copy(s, ustrp, *lenp, U8);
1250 /* something in the 0x80..0xFF range */
1251 ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1252 ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1259 *lenp = UNISKIP(uv);
1260 uvuni_to_utf8(ustrp, uv);
1265 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1267 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1268 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1272 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1274 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1275 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1279 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1281 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1282 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1286 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1288 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1289 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1292 /* a "swash" is a swatch hash */
1295 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1298 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1300 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1303 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1305 errsv_save = newSVsv(ERRSV);
1306 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1308 sv_setsv(ERRSV, errsv_save);
1309 SvREFCNT_dec(errsv_save);
1313 PUSHSTACKi(PERLSI_MAGIC);
1316 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1317 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1319 PUSHs(sv_2mortal(newSViv(minbits)));
1320 PUSHs(sv_2mortal(newSViv(none)));
1326 if (PL_curcop == &PL_compiling)
1327 /* XXX ought to be handled by lex_start */
1328 sv_setpv(tokenbufsv, PL_tokenbuf);
1329 errsv_save = newSVsv(ERRSV);
1330 if (call_method("SWASHNEW", G_SCALAR))
1331 retval = newSVsv(*PL_stack_sp--);
1333 retval = &PL_sv_undef;
1335 sv_setsv(ERRSV, errsv_save);
1336 SvREFCNT_dec(errsv_save);
1339 if (PL_curcop == &PL_compiling) {
1341 char* pv = SvPV(tokenbufsv, len);
1343 Copy(pv, PL_tokenbuf, len+1, char);
1344 PL_curcop->op_private = PL_hints;
1346 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1347 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1352 /* This API is wrong for special case conversions since we may need to
1353 * return several Unicode characters for a single Unicode character
1354 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1355 * the lower-level routine, and it is similarly broken for returning
1356 * multiple values. --jhi */
1358 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1360 HV* hv = (HV*)SvRV(sv);
1369 UV c = NATIVE_TO_ASCII(*ptr);
1371 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1372 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1373 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1376 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1377 * then the "swatch" is a vec() for al the chars which start
1379 * So the key in the hash (klen) is length of encoded char -1
1381 klen = UTF8SKIP(ptr) - 1;
1386 /* If char in invariant then swatch is for all the invariant chars
1387 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1389 needents = UTF_CONTINUATION_MARK;
1390 off = NATIVE_TO_UTF(ptr[klen]);
1394 /* If char is encoded then swatch is for the prefix */
1395 needents = (1 << UTF_ACCUMULATION_SHIFT);
1396 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1400 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1401 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1402 * it's nothing to sniff at.) Pity we usually come through at least
1403 * two function calls to get here...
1405 * NB: this code assumes that swatches are never modified, once generated!
1408 if (hv == PL_last_swash_hv &&
1409 klen == PL_last_swash_klen &&
1410 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1412 tmps = PL_last_swash_tmps;
1413 slen = PL_last_swash_slen;
1416 /* Try our second-level swatch cache, kept in a hash. */
1417 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1419 /* If not cached, generate it via utf8::SWASHGET */
1420 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1422 /* We use utf8n_to_uvuni() as we want an index into
1423 Unicode tables, not a native character number.
1425 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1430 PUSHSTACKi(PERLSI_MAGIC);
1434 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1435 PUSHs(sv_2mortal(newSViv((klen) ?
1436 (code_point & ~(needents - 1)) : 0)));
1437 PUSHs(sv_2mortal(newSViv(needents)));
1439 errsv_save = newSVsv(ERRSV);
1440 if (call_method("SWASHGET", G_SCALAR))
1441 retval = newSVsv(*PL_stack_sp--);
1443 retval = &PL_sv_undef;
1445 sv_setsv(ERRSV, errsv_save);
1446 SvREFCNT_dec(errsv_save);
1450 if (PL_curcop == &PL_compiling)
1451 PL_curcop->op_private = PL_hints;
1453 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1455 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1456 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1459 PL_last_swash_hv = hv;
1460 PL_last_swash_klen = klen;
1461 PL_last_swash_tmps = tmps;
1462 PL_last_swash_slen = slen;
1464 Copy(ptr, PL_last_swash_key, klen, U8);
1467 switch ((int)((slen << 3) / needents)) {
1469 bit = 1 << (off & 7);
1471 return (tmps[off] & bit) != 0;
1476 return (tmps[off] << 8) + tmps[off + 1] ;
1479 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1481 Perl_croak(aTHX_ "panic: swash_fetch");
1487 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1489 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1490 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1491 bytes available. The return value is the pointer to the byte after the
1492 end of the new character. In other words,
1494 d = uvchr_to_utf8(d, uv);
1496 is the recommended wide native character-aware way of saying
1503 /* On ASCII machines this is normally a macro but we want a
1504 real function in case XS code wants it
1506 #undef Perl_uvchr_to_utf8
1508 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1510 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1515 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1517 Returns the native character value of the first character in the string C<s>
1518 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1519 length, in bytes, of that character.
1521 Allows length and flags to be passed to low level routine.
1525 /* On ASCII machines this is normally a macro but we want a
1526 real function in case XS code wants it
1528 #undef Perl_utf8n_to_uvchr
1530 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1532 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1533 return UNI_TO_NATIVE(uv);
1537 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1539 Build to the scalar dsv a displayable version of the string spv,
1540 length len, the displayable version being at most pvlim bytes long
1541 (if longer, the rest is truncated and "..." will be appended).
1542 The flags argument is currently unused but available for future extensions.
1543 The pointer to the PV of the dsv is returned.
1547 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1552 sv_setpvn(dsv, "", 0);
1553 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1555 if (pvlim && SvCUR(dsv) >= pvlim) {
1559 u = utf8_to_uvchr((U8*)s, 0);
1560 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1563 sv_catpvn(dsv, "...", 3);
1569 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1571 Build to the scalar dsv a displayable version of the scalar sv,
1572 he displayable version being at most pvlim bytes long
1573 (if longer, the rest is truncated and "..." will be appended).
1574 The flags argument is currently unused but available for future extensions.
1575 The pointer to the PV of the dsv is returned.
1579 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1581 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1586 =for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1588 Return true if the strings s1 and s2 differ case-insensitively, false
1589 if not (if they are equal case-insensitively). If u1 is true, the
1590 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1591 the string s2 is assumed to be in UTF-8-encoded Unicode.
1593 For case-insensitiveness, the "casefolding" of Unicode is used
1594 instead of upper/lowercasing both the characters, see
1595 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1599 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1601 register U8 *a = (U8*)s1;
1602 register U8 *b = (U8*)s2;
1603 register U8 *ae = b + len1;
1604 register U8 *be = b + len2;
1607 STRLEN ulen1, ulen2;
1608 U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1609 U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
1611 while (a < ae && b < be) {
1613 if (a + UTF8SKIP(a) > ae)
1615 ca = utf8_to_uvchr((U8*)a, &la);
1621 if (b + UTF8SKIP(b) > be)
1623 cb = utf8_to_uvchr((U8*)b, &lb);
1630 to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1634 to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1638 || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1639 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1640 return 1; /* mismatch */
1645 return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */