3 * Copyright (c) 1998-2002, Larry Wall
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
24 #define PERL_IN_UTF8_C
27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
30 =head1 Unicode Support
32 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
34 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
35 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36 bytes available. The return value is the pointer to the byte after the
37 end of the new character. In other words,
39 d = uvuni_to_utf8_flags(d, uv, flags);
43 d = uvuni_to_utf8(d, uv);
45 (which is equivalent to)
47 d = uvuni_to_utf8_flags(d, uv, 0);
49 is the recommended Unicode-aware way of saying
57 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
59 if (ckWARN(WARN_UTF8)) {
60 if (UNICODE_IS_SURROGATE(uv) &&
61 !(flags & UNICODE_ALLOW_SURROGATE))
62 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
64 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65 !(flags & UNICODE_ALLOW_FDD0))
67 ((uv & 0xFFFF) == 0xFFFE &&
68 !(flags & UNICODE_ALLOW_FFFE))
70 ((uv & 0xFFFF) == 0xFFFF &&
71 !(flags & UNICODE_ALLOW_FFFF))) &&
72 /* UNICODE_ALLOW_SUPER includes
73 * FFFEs and FFFFs beyond 0x10FFFF. */
74 ((uv <= PERL_UNICODE_MAX) ||
75 !(flags & UNICODE_ALLOW_SUPER))
77 Perl_warner(aTHX_ packWARN(WARN_UTF8),
78 "Unicode character 0x%04"UVxf" is illegal", uv);
80 if (UNI_IS_INVARIANT(uv)) {
81 *d++ = UTF_TO_NATIVE(uv);
86 STRLEN len = UNISKIP(uv);
89 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
90 uv >>= UTF_ACCUMULATION_SHIFT;
92 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
95 #else /* Non loop style */
97 *d++ = (( uv >> 6) | 0xc0);
98 *d++ = (( uv & 0x3f) | 0x80);
102 *d++ = (( uv >> 12) | 0xe0);
103 *d++ = (((uv >> 6) & 0x3f) | 0x80);
104 *d++ = (( uv & 0x3f) | 0x80);
108 *d++ = (( uv >> 18) | 0xf0);
109 *d++ = (((uv >> 12) & 0x3f) | 0x80);
110 *d++ = (((uv >> 6) & 0x3f) | 0x80);
111 *d++ = (( uv & 0x3f) | 0x80);
114 if (uv < 0x4000000) {
115 *d++ = (( uv >> 24) | 0xf8);
116 *d++ = (((uv >> 18) & 0x3f) | 0x80);
117 *d++ = (((uv >> 12) & 0x3f) | 0x80);
118 *d++ = (((uv >> 6) & 0x3f) | 0x80);
119 *d++ = (( uv & 0x3f) | 0x80);
122 if (uv < 0x80000000) {
123 *d++ = (( uv >> 30) | 0xfc);
124 *d++ = (((uv >> 24) & 0x3f) | 0x80);
125 *d++ = (((uv >> 18) & 0x3f) | 0x80);
126 *d++ = (((uv >> 12) & 0x3f) | 0x80);
127 *d++ = (((uv >> 6) & 0x3f) | 0x80);
128 *d++ = (( uv & 0x3f) | 0x80);
132 if (uv < UTF8_QUAD_MAX)
135 *d++ = 0xfe; /* Can't match U+FEFF! */
136 *d++ = (((uv >> 30) & 0x3f) | 0x80);
137 *d++ = (((uv >> 24) & 0x3f) | 0x80);
138 *d++ = (((uv >> 18) & 0x3f) | 0x80);
139 *d++ = (((uv >> 12) & 0x3f) | 0x80);
140 *d++ = (((uv >> 6) & 0x3f) | 0x80);
141 *d++ = (( uv & 0x3f) | 0x80);
146 *d++ = 0xff; /* Can't match U+FFFE! */
147 *d++ = 0x80; /* 6 Reserved bits */
148 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
149 *d++ = (((uv >> 54) & 0x3f) | 0x80);
150 *d++ = (((uv >> 48) & 0x3f) | 0x80);
151 *d++ = (((uv >> 42) & 0x3f) | 0x80);
152 *d++ = (((uv >> 36) & 0x3f) | 0x80);
153 *d++ = (((uv >> 30) & 0x3f) | 0x80);
154 *d++ = (((uv >> 24) & 0x3f) | 0x80);
155 *d++ = (((uv >> 18) & 0x3f) | 0x80);
156 *d++ = (((uv >> 12) & 0x3f) | 0x80);
157 *d++ = (((uv >> 6) & 0x3f) | 0x80);
158 *d++ = (( uv & 0x3f) | 0x80);
162 #endif /* Loop style */
166 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
168 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
173 =for apidoc A|STRLEN|is_utf8_char|U8 *s
175 Tests if some arbitrary number of bytes begins in a valid UTF-8
176 character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
177 The actual number of bytes in the UTF-8 character will be returned if
178 it is valid, otherwise 0.
183 Perl_is_utf8_char(pTHX_ U8 *s)
189 if (UTF8_IS_INVARIANT(u))
192 if (!UTF8_IS_START(u))
197 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 u &= UTF_START_MASK(len);
206 if (!UTF8_IS_CONTINUATION(*s))
208 uv = UTF8_ACCUMULATE(uv, *s);
215 if (UNISKIP(uv) < len)
222 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
224 Returns true if first C<len> bytes of the given string form a valid UTF8
225 string, false otherwise. Note that 'a valid UTF8 string' does not mean
226 'a string that contains UTF8' because a valid ASCII string is a valid
233 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
240 len = strlen((char *)s);
256 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
258 Bottom level UTF-8 decode routine.
259 Returns the unicode code point value of the first character in the string C<s>
260 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
261 C<retlen> will be set to the length, in bytes, of that character.
263 If C<s> does not point to a well-formed UTF8 character, the behaviour
264 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
265 it is assumed that the caller will raise a warning, and this function
266 will silently just set C<retlen> to C<-1> and return zero. If the
267 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
268 malformations will be given, C<retlen> will be set to the expected
269 length of the UTF-8 character in bytes, and zero will be returned.
271 The C<flags> can also contain various flags to allow deviations from
272 the strict UTF-8 encoding (see F<utf8.h>).
274 Most code should use utf8_to_uvchr() rather than call this directly.
280 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
285 bool dowarn = ckWARN_d(WARN_UTF8);
287 STRLEN expectlen = 0;
290 /* This list is a superset of the UTF8_ALLOW_XXX. */
292 #define UTF8_WARN_EMPTY 1
293 #define UTF8_WARN_CONTINUATION 2
294 #define UTF8_WARN_NON_CONTINUATION 3
295 #define UTF8_WARN_FE_FF 4
296 #define UTF8_WARN_SHORT 5
297 #define UTF8_WARN_OVERFLOW 6
298 #define UTF8_WARN_SURROGATE 7
299 #define UTF8_WARN_BOM 8
300 #define UTF8_WARN_LONG 9
301 #define UTF8_WARN_FFFF 10
304 !(flags & UTF8_ALLOW_EMPTY)) {
305 warning = UTF8_WARN_EMPTY;
309 if (UTF8_IS_INVARIANT(uv)) {
312 return (UV) (NATIVE_TO_UTF(*s));
315 if (UTF8_IS_CONTINUATION(uv) &&
316 !(flags & UTF8_ALLOW_CONTINUATION)) {
317 warning = UTF8_WARN_CONTINUATION;
321 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
322 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
323 warning = UTF8_WARN_NON_CONTINUATION;
328 uv = NATIVE_TO_UTF(uv);
330 if ((uv == 0xfe || uv == 0xff) &&
331 !(flags & UTF8_ALLOW_FE_FF)) {
332 warning = UTF8_WARN_FE_FF;
337 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
338 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
339 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
340 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
342 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
343 else { len = 7; uv &= 0x01; }
345 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
346 else if (!(uv & 0x01)) { len = 7; uv = 0; }
347 else { len = 13; uv = 0; } /* whoa! */
355 if ((curlen < expectlen) &&
356 !(flags & UTF8_ALLOW_SHORT)) {
357 warning = UTF8_WARN_SHORT;
366 if (!UTF8_IS_CONTINUATION(*s) &&
367 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
369 warning = UTF8_WARN_NON_CONTINUATION;
373 uv = UTF8_ACCUMULATE(uv, *s);
375 /* These cannot be allowed. */
377 if (!(flags & UTF8_ALLOW_LONG)) {
378 warning = UTF8_WARN_LONG;
382 else { /* uv < ouv */
383 /* This cannot be allowed. */
384 warning = UTF8_WARN_OVERFLOW;
392 if (UNICODE_IS_SURROGATE(uv) &&
393 !(flags & UTF8_ALLOW_SURROGATE)) {
394 warning = UTF8_WARN_SURROGATE;
396 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
397 !(flags & UTF8_ALLOW_BOM)) {
398 warning = UTF8_WARN_BOM;
400 } else if ((expectlen > UNISKIP(uv)) &&
401 !(flags & UTF8_ALLOW_LONG)) {
402 warning = UTF8_WARN_LONG;
404 } else if (UNICODE_IS_ILLEGAL(uv) &&
405 !(flags & UTF8_ALLOW_FFFF)) {
406 warning = UTF8_WARN_FFFF;
414 if (flags & UTF8_CHECK_ONLY) {
421 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
424 case 0: /* Intentionally empty. */ break;
425 case UTF8_WARN_EMPTY:
426 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
428 case UTF8_WARN_CONTINUATION:
429 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
431 case UTF8_WARN_NON_CONTINUATION:
433 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
434 (UV)s[1], startbyte);
436 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
437 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
440 case UTF8_WARN_FE_FF:
441 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
443 case UTF8_WARN_SHORT:
444 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
445 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
446 expectlen = curlen; /* distance for caller to skip */
448 case UTF8_WARN_OVERFLOW:
449 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
452 case UTF8_WARN_SURROGATE:
453 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
456 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
459 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
460 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
463 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
466 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
474 Perl_warner(aTHX_ packWARN(WARN_UTF8),
475 "%s in %s", s, OP_DESC(PL_op));
477 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
482 *retlen = expectlen ? expectlen : len;
488 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
490 Returns the native character value of the first character in the string C<s>
491 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
492 length, in bytes, of that character.
494 If C<s> does not point to a well-formed UTF8 character, zero is
495 returned and retlen is set, if possible, to -1.
501 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
503 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
507 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
509 Returns the Unicode code point of the first character in the string C<s>
510 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
511 length, in bytes, of that character.
513 This function should only be used when returned UV is considered
514 an index into the Unicode semantic tables (e.g. swashes).
516 If C<s> does not point to a well-formed UTF8 character, zero is
517 returned and retlen is set, if possible, to -1.
523 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
525 /* Call the low level routine asking for checks */
526 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
530 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
532 Return the length of the UTF-8 char encoded string C<s> in characters.
533 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
534 up past C<e>, croaks.
540 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
544 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
545 * the bitops (especially ~) can create illegal UTF-8.
546 * In other words: in Perl UTF-8 is not just for Unicode. */
549 if (ckWARN_d(WARN_UTF8)) {
551 Perl_warner(aTHX_ packWARN(WARN_UTF8),
552 "%s in %s", unees, OP_DESC(PL_op));
554 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
562 if (ckWARN_d(WARN_UTF8)) {
564 Perl_warner(aTHX_ packWARN(WARN_UTF8),
565 unees, OP_DESC(PL_op));
567 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
579 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
581 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
584 WARNING: use only if you *know* that the pointers point inside the
591 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
595 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
596 * the bitops (especially ~) can create illegal UTF-8.
597 * In other words: in Perl UTF-8 is not just for Unicode. */
604 if (ckWARN_d(WARN_UTF8)) {
606 Perl_warner(aTHX_ packWARN(WARN_UTF8),
607 "%s in %s", unees, OP_DESC(PL_op));
609 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
622 if (ckWARN_d(WARN_UTF8)) {
624 Perl_warner(aTHX_ packWARN(WARN_UTF8),
625 "%s in %s", unees, OP_DESC(PL_op));
627 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
640 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
642 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
645 WARNING: do not use the following unless you *know* C<off> is within
646 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
647 on the first byte of character or just after the last byte of a character.
653 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
655 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
656 * the bitops (especially ~) can create illegal UTF-8.
657 * In other words: in Perl UTF-8 is not just for Unicode. */
666 while (UTF8_IS_CONTINUATION(*s))
674 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
676 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
677 Unlike C<bytes_to_utf8>, this over-writes the original string, and
678 updates len to contain the new length.
679 Returns zero on failure, setting C<len> to -1.
685 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
691 /* ensure valid UTF8 and chars < 256 before updating string */
692 for (send = s + *len; s < send; ) {
695 if (!UTF8_IS_INVARIANT(c) &&
696 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
697 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
706 *d++ = (U8)utf8_to_uvchr(s, &ulen);
715 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
717 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
718 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
719 the newly-created string, and updates C<len> to contain the new
720 length. Returns the original string if no conversion occurs, C<len>
721 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
722 0 if C<s> is converted or contains all 7bit characters.
728 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
738 /* ensure valid UTF8 and chars < 256 before converting string */
739 for (send = s + *len; s < send;) {
741 if (!UTF8_IS_INVARIANT(c)) {
742 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
743 (c = *s++) && UTF8_IS_CONTINUATION(c))
752 Newz(801, d, (*len) - count + 1, U8);
753 s = start; start = d;
756 if (!UTF8_IS_INVARIANT(c)) {
757 /* Then it is two-byte encoded */
758 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
759 c = ASCII_TO_NATIVE(c);
769 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
771 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
772 Returns a pointer to the newly-created string, and sets C<len> to
773 reflect the new length.
779 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
786 Newz(801, d, (*len) * 2 + 1, U8);
790 UV uv = NATIVE_TO_ASCII(*s++);
791 if (UNI_IS_INVARIANT(uv))
792 *d++ = UTF_TO_NATIVE(uv);
794 *d++ = UTF8_EIGHT_BIT_HI(uv);
795 *d++ = UTF8_EIGHT_BIT_LO(uv);
804 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
806 * Destination must be pre-extended to 3/2 source. Do not use in-place.
807 * We optimize for native, for obvious reasons. */
810 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
816 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
821 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
828 *d++ = (( uv >> 6) | 0xc0);
829 *d++ = (( uv & 0x3f) | 0x80);
832 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
834 if (low < 0xdc00 || low >= 0xdfff)
835 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
836 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
839 *d++ = (( uv >> 12) | 0xe0);
840 *d++ = (((uv >> 6) & 0x3f) | 0x80);
841 *d++ = (( uv & 0x3f) | 0x80);
845 *d++ = (( uv >> 18) | 0xf0);
846 *d++ = (((uv >> 12) & 0x3f) | 0x80);
847 *d++ = (((uv >> 6) & 0x3f) | 0x80);
848 *d++ = (( uv & 0x3f) | 0x80);
852 *newlen = d - dstart;
856 /* Note: this one is slightly destructive of the source. */
859 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
862 U8* send = s + bytelen;
869 return utf16_to_utf8(p, d, bytelen, newlen);
872 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
875 Perl_is_uni_alnum(pTHX_ UV c)
877 U8 tmpbuf[UTF8_MAXLEN+1];
878 uvchr_to_utf8(tmpbuf, c);
879 return is_utf8_alnum(tmpbuf);
883 Perl_is_uni_alnumc(pTHX_ UV c)
885 U8 tmpbuf[UTF8_MAXLEN+1];
886 uvchr_to_utf8(tmpbuf, c);
887 return is_utf8_alnumc(tmpbuf);
891 Perl_is_uni_idfirst(pTHX_ UV c)
893 U8 tmpbuf[UTF8_MAXLEN+1];
894 uvchr_to_utf8(tmpbuf, c);
895 return is_utf8_idfirst(tmpbuf);
899 Perl_is_uni_alpha(pTHX_ UV c)
901 U8 tmpbuf[UTF8_MAXLEN+1];
902 uvchr_to_utf8(tmpbuf, c);
903 return is_utf8_alpha(tmpbuf);
907 Perl_is_uni_ascii(pTHX_ UV c)
909 U8 tmpbuf[UTF8_MAXLEN+1];
910 uvchr_to_utf8(tmpbuf, c);
911 return is_utf8_ascii(tmpbuf);
915 Perl_is_uni_space(pTHX_ UV c)
917 U8 tmpbuf[UTF8_MAXLEN+1];
918 uvchr_to_utf8(tmpbuf, c);
919 return is_utf8_space(tmpbuf);
923 Perl_is_uni_digit(pTHX_ UV c)
925 U8 tmpbuf[UTF8_MAXLEN+1];
926 uvchr_to_utf8(tmpbuf, c);
927 return is_utf8_digit(tmpbuf);
931 Perl_is_uni_upper(pTHX_ UV c)
933 U8 tmpbuf[UTF8_MAXLEN+1];
934 uvchr_to_utf8(tmpbuf, c);
935 return is_utf8_upper(tmpbuf);
939 Perl_is_uni_lower(pTHX_ UV c)
941 U8 tmpbuf[UTF8_MAXLEN+1];
942 uvchr_to_utf8(tmpbuf, c);
943 return is_utf8_lower(tmpbuf);
947 Perl_is_uni_cntrl(pTHX_ UV c)
949 U8 tmpbuf[UTF8_MAXLEN+1];
950 uvchr_to_utf8(tmpbuf, c);
951 return is_utf8_cntrl(tmpbuf);
955 Perl_is_uni_graph(pTHX_ UV c)
957 U8 tmpbuf[UTF8_MAXLEN+1];
958 uvchr_to_utf8(tmpbuf, c);
959 return is_utf8_graph(tmpbuf);
963 Perl_is_uni_print(pTHX_ UV c)
965 U8 tmpbuf[UTF8_MAXLEN+1];
966 uvchr_to_utf8(tmpbuf, c);
967 return is_utf8_print(tmpbuf);
971 Perl_is_uni_punct(pTHX_ UV c)
973 U8 tmpbuf[UTF8_MAXLEN+1];
974 uvchr_to_utf8(tmpbuf, c);
975 return is_utf8_punct(tmpbuf);
979 Perl_is_uni_xdigit(pTHX_ UV c)
981 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
982 uvchr_to_utf8(tmpbuf, c);
983 return is_utf8_xdigit(tmpbuf);
987 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
990 return to_utf8_upper(p, p, lenp);
994 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
997 return to_utf8_title(p, p, lenp);
1001 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1003 uvchr_to_utf8(p, c);
1004 return to_utf8_lower(p, p, lenp);
1008 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1010 uvchr_to_utf8(p, c);
1011 return to_utf8_fold(p, p, lenp);
1014 /* for now these all assume no locale info available for Unicode > 255 */
1017 Perl_is_uni_alnum_lc(pTHX_ UV c)
1019 return is_uni_alnum(c); /* XXX no locale support yet */
1023 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1025 return is_uni_alnumc(c); /* XXX no locale support yet */
1029 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1031 return is_uni_idfirst(c); /* XXX no locale support yet */
1035 Perl_is_uni_alpha_lc(pTHX_ UV c)
1037 return is_uni_alpha(c); /* XXX no locale support yet */
1041 Perl_is_uni_ascii_lc(pTHX_ UV c)
1043 return is_uni_ascii(c); /* XXX no locale support yet */
1047 Perl_is_uni_space_lc(pTHX_ UV c)
1049 return is_uni_space(c); /* XXX no locale support yet */
1053 Perl_is_uni_digit_lc(pTHX_ UV c)
1055 return is_uni_digit(c); /* XXX no locale support yet */
1059 Perl_is_uni_upper_lc(pTHX_ UV c)
1061 return is_uni_upper(c); /* XXX no locale support yet */
1065 Perl_is_uni_lower_lc(pTHX_ UV c)
1067 return is_uni_lower(c); /* XXX no locale support yet */
1071 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1073 return is_uni_cntrl(c); /* XXX no locale support yet */
1077 Perl_is_uni_graph_lc(pTHX_ UV c)
1079 return is_uni_graph(c); /* XXX no locale support yet */
1083 Perl_is_uni_print_lc(pTHX_ UV c)
1085 return is_uni_print(c); /* XXX no locale support yet */
1089 Perl_is_uni_punct_lc(pTHX_ UV c)
1091 return is_uni_punct(c); /* XXX no locale support yet */
1095 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1097 return is_uni_xdigit(c); /* XXX no locale support yet */
1101 Perl_to_uni_upper_lc(pTHX_ U32 c)
1103 /* XXX returns only the first character -- do not use XXX */
1104 /* XXX no locale support yet */
1106 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1107 return (U32)to_uni_upper(c, tmpbuf, &len);
1111 Perl_to_uni_title_lc(pTHX_ U32 c)
1113 /* XXX returns only the first character XXX -- do not use XXX */
1114 /* XXX no locale support yet */
1116 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1117 return (U32)to_uni_title(c, tmpbuf, &len);
1121 Perl_to_uni_lower_lc(pTHX_ U32 c)
1123 /* XXX returns only the first character -- do not use XXX */
1124 /* XXX no locale support yet */
1126 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1127 return (U32)to_uni_lower(c, tmpbuf, &len);
1131 Perl_is_utf8_alnum(pTHX_ U8 *p)
1133 if (!is_utf8_char(p))
1136 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1137 * descendant of isalnum(3), in other words, it doesn't
1138 * contain the '_'. --jhi */
1139 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1140 return swash_fetch(PL_utf8_alnum, p, TRUE);
1141 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1142 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1144 PL_utf8_alnum = swash_init("utf8", "",
1145 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1146 return swash_fetch(PL_utf8_alnum, p, TRUE);
1151 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1153 if (!is_utf8_char(p))
1156 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1157 return swash_fetch(PL_utf8_alnum, p, TRUE);
1158 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1159 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1161 PL_utf8_alnum = swash_init("utf8", "",
1162 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1163 return swash_fetch(PL_utf8_alnum, p, TRUE);
1168 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1170 return *p == '_' || is_utf8_alpha(p);
1174 Perl_is_utf8_alpha(pTHX_ U8 *p)
1176 if (!is_utf8_char(p))
1179 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1180 return swash_fetch(PL_utf8_alpha, p, TRUE);
1184 Perl_is_utf8_ascii(pTHX_ U8 *p)
1186 if (!is_utf8_char(p))
1189 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1190 return swash_fetch(PL_utf8_ascii, p, TRUE);
1194 Perl_is_utf8_space(pTHX_ U8 *p)
1196 if (!is_utf8_char(p))
1199 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1200 return swash_fetch(PL_utf8_space, p, TRUE);
1204 Perl_is_utf8_digit(pTHX_ U8 *p)
1206 if (!is_utf8_char(p))
1209 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1210 return swash_fetch(PL_utf8_digit, p, TRUE);
1214 Perl_is_utf8_upper(pTHX_ U8 *p)
1216 if (!is_utf8_char(p))
1219 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1220 return swash_fetch(PL_utf8_upper, p, TRUE);
1224 Perl_is_utf8_lower(pTHX_ U8 *p)
1226 if (!is_utf8_char(p))
1229 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1230 return swash_fetch(PL_utf8_lower, p, TRUE);
1234 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1236 if (!is_utf8_char(p))
1239 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1240 return swash_fetch(PL_utf8_cntrl, p, TRUE);
1244 Perl_is_utf8_graph(pTHX_ U8 *p)
1246 if (!is_utf8_char(p))
1249 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1250 return swash_fetch(PL_utf8_graph, p, TRUE);
1254 Perl_is_utf8_print(pTHX_ U8 *p)
1256 if (!is_utf8_char(p))
1259 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1260 return swash_fetch(PL_utf8_print, p, TRUE);
1264 Perl_is_utf8_punct(pTHX_ U8 *p)
1266 if (!is_utf8_char(p))
1269 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1270 return swash_fetch(PL_utf8_punct, p, TRUE);
1274 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1276 if (!is_utf8_char(p))
1278 if (!PL_utf8_xdigit)
1279 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1280 return swash_fetch(PL_utf8_xdigit, p, TRUE);
1284 Perl_is_utf8_mark(pTHX_ U8 *p)
1286 if (!is_utf8_char(p))
1289 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1290 return swash_fetch(PL_utf8_mark, p, TRUE);
1294 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1296 The "p" contains the pointer to the UTF-8 string encoding
1297 the character that is being converted.
1299 The "ustrp" is a pointer to the character buffer to put the
1300 conversion result to. The "lenp" is a pointer to the length
1303 The "swashp" is a pointer to the swash to use.
1305 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1306 and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1307 but not always, a multicharacter mapping), is tried first.
1309 The "special" is a string like "utf8::ToSpecLower", which means the
1310 hash %utf8::ToSpecLower. The access to the hash is through
1311 Perl_to_utf8_case().
1313 The "normal" is a string like "ToLower" which means the swash
1319 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1322 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1325 uv0 = utf8_to_uvchr(p, 0);
1326 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1327 * are necessary in EBCDIC, they are redundant no-ops
1328 * in ASCII-ish platforms, and hopefully optimized away. */
1329 uv1 = NATIVE_TO_UNI(uv0);
1330 uvuni_to_utf8(tmpbuf, uv1);
1332 if (!*swashp) /* load on-demand */
1333 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1336 /* It might be "special" (sometimes, but not always,
1337 * a multicharacter mapping) */
1343 if ((hv = get_hv(special, FALSE)) &&
1344 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1345 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1346 (val = HeVAL(he))) {
1351 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1354 /* If we have EBCDIC we need to remap the characters
1355 * since any characters in the low 256 are Unicode
1356 * code points, not EBCDIC. */
1357 U8 *t = (U8*)s, *tend = t + len, *d;
1364 UV c = utf8_to_uvchr(t, &tlen);
1366 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1375 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1380 Copy(tmpbuf, ustrp, len, U8);
1382 Copy(s, ustrp, len, U8);
1388 if (!len && *swashp) {
1389 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1392 /* It was "normal" (a single character mapping). */
1393 UV uv3 = UNI_TO_NATIVE(uv2);
1395 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1399 if (!len) /* Neither: just copy. */
1400 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1405 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1409 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1411 Convert the UTF-8 encoded character at p to its uppercase version and
1412 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1413 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1414 uppercase version may be longer than the original character (up to two
1417 The first character of the uppercased version is returned
1418 (but note, as explained above, that there may be more.)
1423 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1425 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1426 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1430 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1432 Convert the UTF-8 encoded character at p to its titlecase version and
1433 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1434 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1435 titlecase version may be longer than the original character (up to two
1438 The first character of the titlecased version is returned
1439 (but note, as explained above, that there may be more.)
1444 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1446 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1447 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1451 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1453 Convert the UTF-8 encoded character at p to its lowercase version and
1454 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1455 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1456 lowercase version may be longer than the original character (up to two
1459 The first character of the lowercased version is returned
1460 (but note, as explained above, that there may be more.)
1465 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1467 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1468 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1472 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1474 Convert the UTF-8 encoded character at p to its foldcase version and
1475 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1476 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1477 foldcase version may be longer than the original character (up to
1480 The first character of the foldcased version is returned
1481 (but note, as explained above, that there may be more.)
1486 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1488 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1489 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1492 /* a "swash" is a swatch hash */
1495 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1498 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1500 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1503 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1505 errsv_save = newSVsv(ERRSV);
1506 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1508 sv_setsv(ERRSV, errsv_save);
1509 SvREFCNT_dec(errsv_save);
1513 PUSHSTACKi(PERLSI_MAGIC);
1516 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1517 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1519 PUSHs(sv_2mortal(newSViv(minbits)));
1520 PUSHs(sv_2mortal(newSViv(none)));
1526 if (PL_curcop == &PL_compiling)
1527 /* XXX ought to be handled by lex_start */
1528 sv_setpv(tokenbufsv, PL_tokenbuf);
1529 errsv_save = newSVsv(ERRSV);
1530 if (call_method("SWASHNEW", G_SCALAR))
1531 retval = newSVsv(*PL_stack_sp--);
1533 retval = &PL_sv_undef;
1535 sv_setsv(ERRSV, errsv_save);
1536 SvREFCNT_dec(errsv_save);
1539 if (PL_curcop == &PL_compiling) {
1541 char* pv = SvPV(tokenbufsv, len);
1543 Copy(pv, PL_tokenbuf, len+1, char);
1544 PL_curcop->op_private = PL_hints;
1546 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1548 Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1549 SvPV_nolen(retval));
1550 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1556 /* This API is wrong for special case conversions since we may need to
1557 * return several Unicode characters for a single Unicode character
1558 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1559 * the lower-level routine, and it is similarly broken for returning
1560 * multiple values. --jhi */
1562 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1564 HV* hv = (HV*)SvRV(sv);
1573 UV c = NATIVE_TO_ASCII(*ptr);
1575 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1576 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1577 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1580 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1581 * then the "swatch" is a vec() for al the chars which start
1583 * So the key in the hash (klen) is length of encoded char -1
1585 klen = UTF8SKIP(ptr) - 1;
1590 /* If char in invariant then swatch is for all the invariant chars
1591 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1593 needents = UTF_CONTINUATION_MARK;
1594 off = NATIVE_TO_UTF(ptr[klen]);
1598 /* If char is encoded then swatch is for the prefix */
1599 needents = (1 << UTF_ACCUMULATION_SHIFT);
1600 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1604 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1605 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1606 * it's nothing to sniff at.) Pity we usually come through at least
1607 * two function calls to get here...
1609 * NB: this code assumes that swatches are never modified, once generated!
1612 if (hv == PL_last_swash_hv &&
1613 klen == PL_last_swash_klen &&
1614 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1616 tmps = PL_last_swash_tmps;
1617 slen = PL_last_swash_slen;
1620 /* Try our second-level swatch cache, kept in a hash. */
1621 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1623 /* If not cached, generate it via utf8::SWASHGET */
1624 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1626 /* We use utf8n_to_uvuni() as we want an index into
1627 Unicode tables, not a native character number.
1629 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1634 PUSHSTACKi(PERLSI_MAGIC);
1638 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1639 PUSHs(sv_2mortal(newSViv((klen) ?
1640 (code_point & ~(needents - 1)) : 0)));
1641 PUSHs(sv_2mortal(newSViv(needents)));
1643 errsv_save = newSVsv(ERRSV);
1644 if (call_method("SWASHGET", G_SCALAR))
1645 retval = newSVsv(*PL_stack_sp--);
1647 retval = &PL_sv_undef;
1649 sv_setsv(ERRSV, errsv_save);
1650 SvREFCNT_dec(errsv_save);
1654 if (PL_curcop == &PL_compiling)
1655 PL_curcop->op_private = PL_hints;
1657 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1659 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1660 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1663 PL_last_swash_hv = hv;
1664 PL_last_swash_klen = klen;
1665 PL_last_swash_tmps = tmps;
1666 PL_last_swash_slen = slen;
1668 Copy(ptr, PL_last_swash_key, klen, U8);
1671 switch ((int)((slen << 3) / needents)) {
1673 bit = 1 << (off & 7);
1675 return (tmps[off] & bit) != 0;
1680 return (tmps[off] << 8) + tmps[off + 1] ;
1683 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1685 Perl_croak(aTHX_ "panic: swash_fetch");
1691 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1693 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1694 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1695 bytes available. The return value is the pointer to the byte after the
1696 end of the new character. In other words,
1698 d = uvchr_to_utf8(d, uv);
1700 is the recommended wide native character-aware way of saying
1707 /* On ASCII machines this is normally a macro but we want a
1708 real function in case XS code wants it
1710 #undef Perl_uvchr_to_utf8
1712 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1714 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1718 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1720 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1724 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1726 Returns the native character value of the first character in the string C<s>
1727 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1728 length, in bytes, of that character.
1730 Allows length and flags to be passed to low level routine.
1734 /* On ASCII machines this is normally a macro but we want
1735 a real function in case XS code wants it
1737 #undef Perl_utf8n_to_uvchr
1739 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1741 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1742 return UNI_TO_NATIVE(uv);
1746 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1748 Build to the scalar dsv a displayable version of the string spv,
1749 length len, the displayable version being at most pvlim bytes long
1750 (if longer, the rest is truncated and "..." will be appended).
1752 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1753 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1754 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1755 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1756 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1757 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1759 The pointer to the PV of the dsv is returned.
1763 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1768 sv_setpvn(dsv, "", 0);
1769 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1773 if (pvlim && SvCUR(dsv) >= pvlim) {
1777 u = utf8_to_uvchr((U8*)s, 0);
1779 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1782 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1784 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1786 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1788 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1790 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1792 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1796 /* isPRINT() is the locale-blind version. */
1797 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1798 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1803 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1806 sv_catpvn(dsv, "...", 3);
1812 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1814 Build to the scalar dsv a displayable version of the scalar sv,
1815 the displayable version being at most pvlim bytes long
1816 (if longer, the rest is truncated and "..." will be appended).
1818 The flags argument is as in pv_uni_display().
1820 The pointer to the PV of the dsv is returned.
1824 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1826 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1831 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1833 Return true if the strings s1 and s2 differ case-insensitively, false
1834 if not (if they are equal case-insensitively). If u1 is true, the
1835 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1836 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1837 are false, the respective string is assumed to be in native 8-bit
1840 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1841 in there (they will point at the beginning of the I<next> character).
1842 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1843 pointers beyond which scanning will not continue under any
1844 circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1845 s2+l2 will be used as goal end pointers that will also stop the scan,
1846 and which qualify towards defining a successful match: all the scans
1847 that define an explicit length must reach their goal pointers for
1848 a match to succeed).
1850 For case-insensitiveness, the "casefolding" of Unicode is used
1851 instead of upper/lowercasing both the characters, see
1852 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1856 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1858 register U8 *p1 = (U8*)s1;
1859 register U8 *p2 = (U8*)s2;
1860 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1861 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1862 STRLEN n1 = 0, n2 = 0;
1863 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1864 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1866 STRLEN foldlen1, foldlen2;
1871 if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1875 if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1878 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1879 return 1; /* mismatch; possible infinite loop or false positive */
1882 natbuf[1] = 0; /* Need to terminate the buffer. */
1884 while ((e1 == 0 || p1 < e1) &&
1885 (f1 == 0 || p1 < f1) &&
1886 (e2 == 0 || p2 < e2) &&
1887 (f2 == 0 || p2 < f2)) {
1890 to_utf8_fold(p1, foldbuf1, &foldlen1);
1893 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1900 to_utf8_fold(p2, foldbuf2, &foldlen2);
1903 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1909 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1910 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1911 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1912 return 1; /* mismatch */
1919 p1 += u1 ? UTF8SKIP(p1) : 1;
1921 p2 += u2 ? UTF8SKIP(p2) : 1;
1925 /* A match is defined by all the scans that specified
1926 * an explicit length reaching their final goals. */
1927 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1936 return match ? 0 : 1; /* 0 match, 1 mismatch */