3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 * by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
25 #define PERL_IN_UTF8_C
28 static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
32 =head1 Unicode Support
34 This file contains various utility functions for manipulating UTF8-encoded
35 strings. For the uninitiated, this is a method of representing arbitrary
36 Unicode characters as a variable number of bytes, in such a way that
37 characters in the ASCII range are unmodified, and a zero byte never appears
38 within non-zero characters.
40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
44 bytes available. The return value is the pointer to the byte after the
45 end of the new character. In other words,
47 d = uvuni_to_utf8_flags(d, uv, flags);
51 d = uvuni_to_utf8(d, uv);
53 (which is equivalent to)
55 d = uvuni_to_utf8_flags(d, uv, 0);
57 is the recommended Unicode-aware way of saying
65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
67 if (ckWARN(WARN_UTF8)) {
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
78 * FFFEs and FFFFs beyond 0x10FFFF. */
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
83 "Unicode character 0x%04"UVxf" is illegal", uv);
85 if (UNI_IS_INVARIANT(uv)) {
86 *d++ = (U8)UTF_TO_NATIVE(uv);
91 STRLEN len = UNISKIP(uv);
94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
95 uv >>= UTF_ACCUMULATION_SHIFT;
97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
100 #else /* Non loop style */
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
119 if (uv < 0x4000000) {
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
127 if (uv < 0x80000000) {
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
137 if (uv < UTF8_QUAD_MAX)
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
167 #endif /* Loop style */
172 Tests if some arbitrary number of bytes begins in a valid UTF-8
173 character. Note that an INVARIANT (i.e. ASCII) character is a valid
174 UTF-8 character. The actual number of bytes in the UTF-8 character
175 will be returned if it is valid, otherwise 0.
177 This is the "slow" version as opposed to the "fast" version which is
178 the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179 difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181 you should use the _slow(). In practice this means that the _slow()
182 will be used very rarely, since the maximum Unicode code point (as of
183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
189 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
195 if (UTF8_IS_INVARIANT(u))
198 if (!UTF8_IS_START(u))
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
207 u = NATIVE_TO_UTF(u);
209 u &= UTF_START_MASK(len);
213 if (!UTF8_IS_CONTINUATION(*s))
215 uv = UTF8_ACCUMULATE(uv, *s);
222 if ((STRLEN)UNISKIP(uv) < len)
229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
231 Tests if some arbitrary number of bytes begins in a valid UTF-8
232 character. Note that an INVARIANT (i.e. ASCII) character is a valid
233 UTF-8 character. The actual number of bytes in the UTF-8 character
234 will be returned if it is valid, otherwise 0.
238 Perl_is_utf8_char(pTHX_ const U8 *s)
240 const STRLEN len = UTF8SKIP(s);
243 if (IS_UTF8_CHAR_FAST(len))
244 return IS_UTF8_CHAR(s, len) ? len : 0;
245 #endif /* #ifdef IS_UTF8_CHAR */
246 return is_utf8_char_slow(s, len);
250 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
252 Returns true if first C<len> bytes of the given string form a valid
253 UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
254 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
255 because a valid ASCII string is a valid UTF-8 string.
257 See also is_utf8_string_loclen() and is_utf8_string_loc().
263 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
270 len = strlen((const char *)s);
275 /* Inline the easy bits of is_utf8_char() here for speed... */
276 if (UTF8_IS_INVARIANT(*x))
278 else if (!UTF8_IS_START(*x))
281 /* ... and call is_utf8_char() only if really needed. */
284 if (IS_UTF8_CHAR_FAST(c)) {
285 if (!IS_UTF8_CHAR(x, c))
289 c = is_utf8_char_slow(x, c);
292 #endif /* #ifdef IS_UTF8_CHAR */
307 Implemented as a macro in utf8.h
309 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
311 Like is_utf8_string() but stores the location of the failure (in the
312 case of "utf8ness failure") or the location s+len (in the case of
313 "utf8ness success") in the C<ep>.
315 See also is_utf8_string_loclen() and is_utf8_string().
317 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
319 Like is_utf8_string() but stores the location of the failure (in the
320 case of "utf8ness failure") or the location s+len (in the case of
321 "utf8ness success") in the C<ep>, and the number of UTF-8
322 encoded characters in the C<el>.
324 See also is_utf8_string_loc() and is_utf8_string().
330 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
338 len = strlen((const char *)s);
344 /* Inline the easy bits of is_utf8_char() here for speed... */
345 if (UTF8_IS_INVARIANT(*x))
347 else if (!UTF8_IS_START(*x))
350 /* ... and call is_utf8_char() only if really needed. */
353 if (IS_UTF8_CHAR_FAST(c)) {
354 if (!IS_UTF8_CHAR(x, c))
357 c = is_utf8_char_slow(x, c);
360 #endif /* #ifdef IS_UTF8_CHAR */
380 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
382 Bottom level UTF-8 decode routine.
383 Returns the unicode code point value of the first character in the string C<s>
384 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
385 C<retlen> will be set to the length, in bytes, of that character.
387 If C<s> does not point to a well-formed UTF-8 character, the behaviour
388 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
389 it is assumed that the caller will raise a warning, and this function
390 will silently just set C<retlen> to C<-1> and return zero. If the
391 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
392 malformations will be given, C<retlen> will be set to the expected
393 length of the UTF-8 character in bytes, and zero will be returned.
395 The C<flags> can also contain various flags to allow deviations from
396 the strict UTF-8 encoding (see F<utf8.h>).
398 Most code should use utf8_to_uvchr() rather than call this directly.
404 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
407 const U8 * const s0 = s;
410 const bool dowarn = ckWARN_d(WARN_UTF8);
411 const UV startbyte = *s;
412 STRLEN expectlen = 0;
415 /* This list is a superset of the UTF8_ALLOW_XXX. */
417 #define UTF8_WARN_EMPTY 1
418 #define UTF8_WARN_CONTINUATION 2
419 #define UTF8_WARN_NON_CONTINUATION 3
420 #define UTF8_WARN_FE_FF 4
421 #define UTF8_WARN_SHORT 5
422 #define UTF8_WARN_OVERFLOW 6
423 #define UTF8_WARN_SURROGATE 7
424 #define UTF8_WARN_LONG 8
425 #define UTF8_WARN_FFFF 9 /* Also FFFE. */
428 !(flags & UTF8_ALLOW_EMPTY)) {
429 warning = UTF8_WARN_EMPTY;
433 if (UTF8_IS_INVARIANT(uv)) {
436 return (UV) (NATIVE_TO_UTF(*s));
439 if (UTF8_IS_CONTINUATION(uv) &&
440 !(flags & UTF8_ALLOW_CONTINUATION)) {
441 warning = UTF8_WARN_CONTINUATION;
445 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
446 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
447 warning = UTF8_WARN_NON_CONTINUATION;
452 uv = NATIVE_TO_UTF(uv);
454 if ((uv == 0xfe || uv == 0xff) &&
455 !(flags & UTF8_ALLOW_FE_FF)) {
456 warning = UTF8_WARN_FE_FF;
461 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
462 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
463 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
464 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
466 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
467 else { len = 7; uv &= 0x01; }
469 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
470 else if (!(uv & 0x01)) { len = 7; uv = 0; }
471 else { len = 13; uv = 0; } /* whoa! */
479 if ((curlen < expectlen) &&
480 !(flags & UTF8_ALLOW_SHORT)) {
481 warning = UTF8_WARN_SHORT;
490 if (!UTF8_IS_CONTINUATION(*s) &&
491 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
493 warning = UTF8_WARN_NON_CONTINUATION;
497 uv = UTF8_ACCUMULATE(uv, *s);
499 /* These cannot be allowed. */
501 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
502 warning = UTF8_WARN_LONG;
506 else { /* uv < ouv */
507 /* This cannot be allowed. */
508 warning = UTF8_WARN_OVERFLOW;
516 if (UNICODE_IS_SURROGATE(uv) &&
517 !(flags & UTF8_ALLOW_SURROGATE)) {
518 warning = UTF8_WARN_SURROGATE;
520 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
521 !(flags & UTF8_ALLOW_LONG)) {
522 warning = UTF8_WARN_LONG;
524 } else if (UNICODE_IS_ILLEGAL(uv) &&
525 !(flags & UTF8_ALLOW_FFFF)) {
526 warning = UTF8_WARN_FFFF;
534 if (flags & UTF8_CHECK_ONLY) {
541 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
544 case 0: /* Intentionally empty. */ break;
545 case UTF8_WARN_EMPTY:
546 sv_catpvs(sv, "(empty string)");
548 case UTF8_WARN_CONTINUATION:
549 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
551 case UTF8_WARN_NON_CONTINUATION:
553 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
554 (UV)s[1], startbyte);
556 const int len = (int)(s-s0);
557 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
558 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
562 case UTF8_WARN_FE_FF:
563 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
565 case UTF8_WARN_SHORT:
566 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
567 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
568 expectlen = curlen; /* distance for caller to skip */
570 case UTF8_WARN_OVERFLOW:
571 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
574 case UTF8_WARN_SURROGATE:
575 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
578 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
579 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
582 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
585 sv_catpvs(sv, "(unknown reason)");
590 const char * const s = SvPVX_const(sv);
593 Perl_warner(aTHX_ packWARN(WARN_UTF8),
594 "%s in %s", s, OP_DESC(PL_op));
596 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
601 *retlen = expectlen ? expectlen : len;
607 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
609 Returns the native character value of the first character in the string C<s>
610 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
611 length, in bytes, of that character.
613 If C<s> does not point to a well-formed UTF-8 character, zero is
614 returned and retlen is set, if possible, to -1.
620 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
622 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
623 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
627 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
629 Returns the Unicode code point of the first character in the string C<s>
630 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
631 length, in bytes, of that character.
633 This function should only be used when returned UV is considered
634 an index into the Unicode semantic tables (e.g. swashes).
636 If C<s> does not point to a well-formed UTF-8 character, zero is
637 returned and retlen is set, if possible, to -1.
643 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
645 /* Call the low level routine asking for checks */
646 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
647 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
651 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
653 Return the length of the UTF-8 char encoded string C<s> in characters.
654 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
655 up past C<e>, croaks.
661 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
666 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
667 * the bitops (especially ~) can create illegal UTF-8.
668 * In other words: in Perl UTF-8 is not just for Unicode. */
671 goto warn_and_return;
673 const U8 t = UTF8SKIP(s);
676 if (ckWARN_d(WARN_UTF8)) {
678 Perl_warner(aTHX_ packWARN(WARN_UTF8),
679 "%s in %s", unees, OP_DESC(PL_op));
681 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
693 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
695 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
698 WARNING: use only if you *know* that the pointers point inside the
705 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
707 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
711 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
713 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
716 WARNING: do not use the following unless you *know* C<off> is within
717 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
718 on the first byte of character or just after the last byte of a character.
724 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
727 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
728 * the bitops (especially ~) can create illegal UTF-8.
729 * In other words: in Perl UTF-8 is not just for Unicode. */
738 while (UTF8_IS_CONTINUATION(*s))
746 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
748 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
749 Unlike C<bytes_to_utf8>, this over-writes the original string, and
750 updates len to contain the new length.
751 Returns zero on failure, setting C<len> to -1.
757 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
760 U8 * const send = s + *len;
763 /* ensure valid UTF-8 and chars < 256 before updating string */
767 if (!UTF8_IS_INVARIANT(c) &&
768 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
769 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
778 *d++ = (U8)utf8_to_uvchr(s, &ulen);
787 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
789 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
790 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
791 the newly-created string, and updates C<len> to contain the new
792 length. Returns the original string if no conversion occurs, C<len>
793 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
794 0 if C<s> is converted or contains all 7bit characters.
800 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
811 /* ensure valid UTF-8 and chars < 256 before converting string */
812 for (send = s + *len; s < send;) {
814 if (!UTF8_IS_INVARIANT(c)) {
815 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
816 (c = *s++) && UTF8_IS_CONTINUATION(c))
825 Newx(d, (*len) - count + 1, U8);
826 s = start; start = d;
829 if (!UTF8_IS_INVARIANT(c)) {
830 /* Then it is two-byte encoded */
831 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
832 c = ASCII_TO_NATIVE(c);
842 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
844 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
845 Returns a pointer to the newly-created string, and sets C<len> to
846 reflect the new length.
848 If you want to convert to UTF-8 from other encodings than ASCII,
849 see sv_recode_to_utf8().
855 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
857 const U8 * const send = s + (*len);
862 Newx(d, (*len) * 2 + 1, U8);
866 const UV uv = NATIVE_TO_ASCII(*s++);
867 if (UNI_IS_INVARIANT(uv))
868 *d++ = (U8)UTF_TO_NATIVE(uv);
870 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
871 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
880 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
882 * Destination must be pre-extended to 3/2 source. Do not use in-place.
883 * We optimize for native, for obvious reasons. */
886 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
891 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
898 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
903 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
910 *d++ = (U8)(( uv >> 6) | 0xc0);
911 *d++ = (U8)(( uv & 0x3f) | 0x80);
914 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
915 UV low = (p[0] << 8) + p[1];
917 if (low < 0xdc00 || low >= 0xdfff)
918 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
919 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
922 *d++ = (U8)(( uv >> 12) | 0xe0);
923 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
924 *d++ = (U8)(( uv & 0x3f) | 0x80);
928 *d++ = (U8)(( uv >> 18) | 0xf0);
929 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
930 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
931 *d++ = (U8)(( uv & 0x3f) | 0x80);
935 *newlen = d - dstart;
939 /* Note: this one is slightly destructive of the source. */
942 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
945 U8* const send = s + bytelen;
952 return utf16_to_utf8(p, d, bytelen, newlen);
955 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
958 Perl_is_uni_alnum(pTHX_ UV c)
960 U8 tmpbuf[UTF8_MAXBYTES+1];
961 uvchr_to_utf8(tmpbuf, c);
962 return is_utf8_alnum(tmpbuf);
966 Perl_is_uni_alnumc(pTHX_ UV c)
968 U8 tmpbuf[UTF8_MAXBYTES+1];
969 uvchr_to_utf8(tmpbuf, c);
970 return is_utf8_alnumc(tmpbuf);
974 Perl_is_uni_idfirst(pTHX_ UV c)
976 U8 tmpbuf[UTF8_MAXBYTES+1];
977 uvchr_to_utf8(tmpbuf, c);
978 return is_utf8_idfirst(tmpbuf);
982 Perl_is_uni_alpha(pTHX_ UV c)
984 U8 tmpbuf[UTF8_MAXBYTES+1];
985 uvchr_to_utf8(tmpbuf, c);
986 return is_utf8_alpha(tmpbuf);
990 Perl_is_uni_ascii(pTHX_ UV c)
992 U8 tmpbuf[UTF8_MAXBYTES+1];
993 uvchr_to_utf8(tmpbuf, c);
994 return is_utf8_ascii(tmpbuf);
998 Perl_is_uni_space(pTHX_ UV c)
1000 U8 tmpbuf[UTF8_MAXBYTES+1];
1001 uvchr_to_utf8(tmpbuf, c);
1002 return is_utf8_space(tmpbuf);
1006 Perl_is_uni_digit(pTHX_ UV c)
1008 U8 tmpbuf[UTF8_MAXBYTES+1];
1009 uvchr_to_utf8(tmpbuf, c);
1010 return is_utf8_digit(tmpbuf);
1014 Perl_is_uni_upper(pTHX_ UV c)
1016 U8 tmpbuf[UTF8_MAXBYTES+1];
1017 uvchr_to_utf8(tmpbuf, c);
1018 return is_utf8_upper(tmpbuf);
1022 Perl_is_uni_lower(pTHX_ UV c)
1024 U8 tmpbuf[UTF8_MAXBYTES+1];
1025 uvchr_to_utf8(tmpbuf, c);
1026 return is_utf8_lower(tmpbuf);
1030 Perl_is_uni_cntrl(pTHX_ UV c)
1032 U8 tmpbuf[UTF8_MAXBYTES+1];
1033 uvchr_to_utf8(tmpbuf, c);
1034 return is_utf8_cntrl(tmpbuf);
1038 Perl_is_uni_graph(pTHX_ UV c)
1040 U8 tmpbuf[UTF8_MAXBYTES+1];
1041 uvchr_to_utf8(tmpbuf, c);
1042 return is_utf8_graph(tmpbuf);
1046 Perl_is_uni_print(pTHX_ UV c)
1048 U8 tmpbuf[UTF8_MAXBYTES+1];
1049 uvchr_to_utf8(tmpbuf, c);
1050 return is_utf8_print(tmpbuf);
1054 Perl_is_uni_punct(pTHX_ UV c)
1056 U8 tmpbuf[UTF8_MAXBYTES+1];
1057 uvchr_to_utf8(tmpbuf, c);
1058 return is_utf8_punct(tmpbuf);
1062 Perl_is_uni_xdigit(pTHX_ UV c)
1064 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1065 uvchr_to_utf8(tmpbuf, c);
1066 return is_utf8_xdigit(tmpbuf);
1070 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1072 uvchr_to_utf8(p, c);
1073 return to_utf8_upper(p, p, lenp);
1077 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1079 uvchr_to_utf8(p, c);
1080 return to_utf8_title(p, p, lenp);
1084 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1086 uvchr_to_utf8(p, c);
1087 return to_utf8_lower(p, p, lenp);
1091 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1093 uvchr_to_utf8(p, c);
1094 return to_utf8_fold(p, p, lenp);
1097 /* for now these all assume no locale info available for Unicode > 255 */
1100 Perl_is_uni_alnum_lc(pTHX_ UV c)
1102 return is_uni_alnum(c); /* XXX no locale support yet */
1106 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1108 return is_uni_alnumc(c); /* XXX no locale support yet */
1112 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1114 return is_uni_idfirst(c); /* XXX no locale support yet */
1118 Perl_is_uni_alpha_lc(pTHX_ UV c)
1120 return is_uni_alpha(c); /* XXX no locale support yet */
1124 Perl_is_uni_ascii_lc(pTHX_ UV c)
1126 return is_uni_ascii(c); /* XXX no locale support yet */
1130 Perl_is_uni_space_lc(pTHX_ UV c)
1132 return is_uni_space(c); /* XXX no locale support yet */
1136 Perl_is_uni_digit_lc(pTHX_ UV c)
1138 return is_uni_digit(c); /* XXX no locale support yet */
1142 Perl_is_uni_upper_lc(pTHX_ UV c)
1144 return is_uni_upper(c); /* XXX no locale support yet */
1148 Perl_is_uni_lower_lc(pTHX_ UV c)
1150 return is_uni_lower(c); /* XXX no locale support yet */
1154 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1156 return is_uni_cntrl(c); /* XXX no locale support yet */
1160 Perl_is_uni_graph_lc(pTHX_ UV c)
1162 return is_uni_graph(c); /* XXX no locale support yet */
1166 Perl_is_uni_print_lc(pTHX_ UV c)
1168 return is_uni_print(c); /* XXX no locale support yet */
1172 Perl_is_uni_punct_lc(pTHX_ UV c)
1174 return is_uni_punct(c); /* XXX no locale support yet */
1178 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1180 return is_uni_xdigit(c); /* XXX no locale support yet */
1184 Perl_to_uni_upper_lc(pTHX_ U32 c)
1186 /* XXX returns only the first character -- do not use XXX */
1187 /* XXX no locale support yet */
1189 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1190 return (U32)to_uni_upper(c, tmpbuf, &len);
1194 Perl_to_uni_title_lc(pTHX_ U32 c)
1196 /* XXX returns only the first character XXX -- do not use XXX */
1197 /* XXX no locale support yet */
1199 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1200 return (U32)to_uni_title(c, tmpbuf, &len);
1204 Perl_to_uni_lower_lc(pTHX_ U32 c)
1206 /* XXX returns only the first character -- do not use XXX */
1207 /* XXX no locale support yet */
1209 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1210 return (U32)to_uni_lower(c, tmpbuf, &len);
1214 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1215 const char *const swashname)
1218 if (!is_utf8_char(p))
1221 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1222 return swash_fetch(*swash, p, TRUE) != 0;
1226 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1229 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1230 * descendant of isalnum(3), in other words, it doesn't
1231 * contain the '_'. --jhi */
1232 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1236 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1239 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1243 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1248 /* is_utf8_idstart would be more logical. */
1249 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1253 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1258 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1262 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1265 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1269 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1272 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1276 Perl_is_utf8_space(pTHX_ const U8 *p)
1279 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1283 Perl_is_utf8_digit(pTHX_ const U8 *p)
1286 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1290 Perl_is_utf8_upper(pTHX_ const U8 *p)
1293 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1297 Perl_is_utf8_lower(pTHX_ const U8 *p)
1300 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1304 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1307 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1311 Perl_is_utf8_graph(pTHX_ const U8 *p)
1314 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1318 Perl_is_utf8_print(pTHX_ const U8 *p)
1321 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1325 Perl_is_utf8_punct(pTHX_ const U8 *p)
1328 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1332 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1335 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1339 Perl_is_utf8_mark(pTHX_ const U8 *p)
1342 return is_utf8_common(p, &PL_utf8_mark, "IsM");
1346 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1348 The "p" contains the pointer to the UTF-8 string encoding
1349 the character that is being converted.
1351 The "ustrp" is a pointer to the character buffer to put the
1352 conversion result to. The "lenp" is a pointer to the length
1355 The "swashp" is a pointer to the swash to use.
1357 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1358 and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
1359 but not always, a multicharacter mapping), is tried first.
1361 The "special" is a string like "utf8::ToSpecLower", which means the
1362 hash %utf8::ToSpecLower. The access to the hash is through
1363 Perl_to_utf8_case().
1365 The "normal" is a string like "ToLower" which means the swash
1371 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1372 SV **swashp, const char *normal, const char *special)
1375 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1378 const UV uv0 = utf8_to_uvchr(p, NULL);
1379 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1380 * are necessary in EBCDIC, they are redundant no-ops
1381 * in ASCII-ish platforms, and hopefully optimized away. */
1382 const UV uv1 = NATIVE_TO_UNI(uv0);
1383 uvuni_to_utf8(tmpbuf, uv1);
1385 if (!*swashp) /* load on-demand */
1386 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1388 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1389 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1390 /* It might be "special" (sometimes, but not always,
1391 * a multicharacter mapping) */
1395 if ((hv = get_hv(special, FALSE)) &&
1396 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1400 s = SvPV_const(*svp, len);
1402 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1405 /* If we have EBCDIC we need to remap the characters
1406 * since any characters in the low 256 are Unicode
1407 * code points, not EBCDIC. */
1408 U8 *t = (U8*)s, *tend = t + len, *d;
1415 const UV c = utf8_to_uvchr(t, &tlen);
1417 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1426 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1431 Copy(tmpbuf, ustrp, len, U8);
1433 Copy(s, ustrp, len, U8);
1439 if (!len && *swashp) {
1440 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1443 /* It was "normal" (a single character mapping). */
1444 const UV uv3 = UNI_TO_NATIVE(uv2);
1445 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1449 if (!len) /* Neither: just copy. */
1450 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1455 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1459 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1461 Convert the UTF-8 encoded character at p to its uppercase version and
1462 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1463 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1464 the uppercase version may be longer than the original character.
1466 The first character of the uppercased version is returned
1467 (but note, as explained above, that there may be more.)
1472 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1475 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1476 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1480 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1482 Convert the UTF-8 encoded character at p to its titlecase version and
1483 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1484 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1485 titlecase version may be longer than the original character.
1487 The first character of the titlecased version is returned
1488 (but note, as explained above, that there may be more.)
1493 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1496 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1497 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1501 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1503 Convert the UTF-8 encoded character at p to its lowercase version and
1504 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1505 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1506 lowercase version may be longer than the original character.
1508 The first character of the lowercased version is returned
1509 (but note, as explained above, that there may be more.)
1514 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1517 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1518 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1522 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1524 Convert the UTF-8 encoded character at p to its foldcase version and
1525 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1526 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1527 foldcase version may be longer than the original character (up to
1530 The first character of the foldcased version is returned
1531 (but note, as explained above, that there may be more.)
1536 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1539 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1540 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1544 * A "swash" is a swatch hash.
1545 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1546 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1547 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1550 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1554 SV* const tokenbufsv = sv_newmortal();
1556 const size_t pkg_len = strlen(pkg);
1557 const size_t name_len = strlen(name);
1558 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1561 PUSHSTACKi(PERLSI_MAGIC);
1566 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1568 errsv_save = newSVsv(ERRSV);
1569 /* It is assumed that callers of this routine are not passing in any
1570 user derived data. */
1571 /* Need to do this after save_re_context() as it will set PL_tainted to
1572 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1573 Even line to create errsv_save can turn on PL_tainted. */
1574 SAVEBOOL(PL_tainted);
1576 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1579 sv_setsv(ERRSV, errsv_save);
1580 SvREFCNT_dec(errsv_save);
1586 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1587 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1589 PUSHs(sv_2mortal(newSViv(minbits)));
1590 PUSHs(sv_2mortal(newSViv(none)));
1592 if (IN_PERL_COMPILETIME) {
1593 /* XXX ought to be handled by lex_start */
1596 sv_setpv(tokenbufsv, PL_tokenbuf);
1598 errsv_save = newSVsv(ERRSV);
1599 if (call_method("SWASHNEW", G_SCALAR))
1600 retval = newSVsv(*PL_stack_sp--);
1602 retval = &PL_sv_undef;
1604 sv_setsv(ERRSV, errsv_save);
1605 SvREFCNT_dec(errsv_save);
1608 if (IN_PERL_COMPILETIME) {
1610 const char* const pv = SvPV_const(tokenbufsv, len);
1612 Copy(pv, PL_tokenbuf, len+1, char);
1613 CopHINTS_set(PL_curcop, PL_hints);
1615 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1617 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1619 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1625 /* This API is wrong for special case conversions since we may need to
1626 * return several Unicode characters for a single Unicode character
1627 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1628 * the lower-level routine, and it is similarly broken for returning
1629 * multiple values. --jhi */
1630 /* Now SWASHGET is recasted into S_swash_get in this file. */
1633 * Returns the value of property/mapping C<swash> for the first character
1634 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1635 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1636 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1639 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1642 HV* const hv = (HV*)SvRV(swash);
1647 const U8 *tmps = NULL;
1651 UV c = NATIVE_TO_ASCII(*ptr);
1653 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1654 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1655 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1658 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1659 * then the "swatch" is a vec() for al the chars which start
1661 * So the key in the hash (klen) is length of encoded char -1
1663 klen = UTF8SKIP(ptr) - 1;
1667 /* If char in invariant then swatch is for all the invariant chars
1668 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1670 needents = UTF_CONTINUATION_MARK;
1671 off = NATIVE_TO_UTF(ptr[klen]);
1674 /* If char is encoded then swatch is for the prefix */
1675 needents = (1 << UTF_ACCUMULATION_SHIFT);
1676 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1680 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1681 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1682 * it's nothing to sniff at.) Pity we usually come through at least
1683 * two function calls to get here...
1685 * NB: this code assumes that swatches are never modified, once generated!
1688 if (hv == PL_last_swash_hv &&
1689 klen == PL_last_swash_klen &&
1690 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1692 tmps = PL_last_swash_tmps;
1693 slen = PL_last_swash_slen;
1696 /* Try our second-level swatch cache, kept in a hash. */
1697 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1699 /* If not cached, generate it via swash_get */
1700 if (!svp || !SvPOK(*svp)
1701 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1702 /* We use utf8n_to_uvuni() as we want an index into
1703 Unicode tables, not a native character number.
1705 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1707 0 : UTF8_ALLOW_ANY);
1708 swatch = swash_get(swash,
1709 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1710 (klen) ? (code_point & ~(needents - 1)) : 0,
1713 if (IN_PERL_COMPILETIME)
1714 CopHINTS_set(PL_curcop, PL_hints);
1716 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1718 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1719 || (slen << 3) < needents)
1720 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1723 PL_last_swash_hv = hv;
1724 PL_last_swash_klen = klen;
1725 /* FIXME change interpvar.h? */
1726 PL_last_swash_tmps = (U8 *) tmps;
1727 PL_last_swash_slen = slen;
1729 Copy(ptr, PL_last_swash_key, klen, U8);
1732 switch ((int)((slen << 3) / needents)) {
1734 bit = 1 << (off & 7);
1736 return (tmps[off] & bit) != 0;
1741 return (tmps[off] << 8) + tmps[off + 1] ;
1744 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1746 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1750 * Returns a swatch (a bit vector string) for a code point sequence
1751 * that starts from the value C<start> and comprises the number C<span>.
1752 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1753 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1756 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1759 U8 *l, *lend, *x, *xend, *s;
1760 STRLEN lcur, xcur, scur;
1762 HV* const hv = (HV*)SvRV(swash);
1763 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1764 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1765 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1766 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1767 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1768 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1769 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1770 const STRLEN bits = SvUV(*bitssvp);
1771 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1772 const UV none = SvUV(*nonesvp);
1773 const UV end = start + span;
1775 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1776 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1780 /* create and initialize $swatch */
1781 swatch = newSVpvs("");
1782 scur = octets ? (span * octets) : (span + 7) / 8;
1783 SvGROW(swatch, scur + 1);
1784 s = (U8*)SvPVX(swatch);
1785 if (octets && none) {
1786 const U8* const e = s + scur;
1789 *s++ = (U8)(none & 0xff);
1790 else if (bits == 16) {
1791 *s++ = (U8)((none >> 8) & 0xff);
1792 *s++ = (U8)( none & 0xff);
1794 else if (bits == 32) {
1795 *s++ = (U8)((none >> 24) & 0xff);
1796 *s++ = (U8)((none >> 16) & 0xff);
1797 *s++ = (U8)((none >> 8) & 0xff);
1798 *s++ = (U8)( none & 0xff);
1804 (void)memzero((U8*)s, scur + 1);
1806 SvCUR_set(swatch, scur);
1807 s = (U8*)SvPVX(swatch);
1809 /* read $swash->{LIST} */
1810 l = (U8*)SvPV(*listsvp, lcur);
1813 UV min, max, val, key;
1815 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1817 U8* const nl = (U8*)memchr(l, '\n', lend - l);
1820 min = grok_hex((char *)l, &numlen, &flags, NULL);
1824 l = nl + 1; /* 1 is length of "\n" */
1828 l = lend; /* to LIST's end at which \n is not found */
1834 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1836 max = grok_hex((char *)l, &numlen, &flags, NULL);
1845 flags = PERL_SCAN_SILENT_ILLDIGIT |
1846 PERL_SCAN_DISALLOW_PREFIX;
1848 val = grok_hex((char *)l, &numlen, &flags, NULL);
1857 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1863 val = 0; /* bits == 1, then val should be ignored */
1870 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1874 val = 0; /* bits == 1, then val should be ignored */
1887 if (!none || val < none) {
1892 for (key = min; key <= max; key++) {
1896 /* offset must be non-negative (start <= min <= key < end) */
1897 offset = octets * (key - start);
1899 s[offset] = (U8)(val & 0xff);
1900 else if (bits == 16) {
1901 s[offset ] = (U8)((val >> 8) & 0xff);
1902 s[offset + 1] = (U8)( val & 0xff);
1904 else if (bits == 32) {
1905 s[offset ] = (U8)((val >> 24) & 0xff);
1906 s[offset + 1] = (U8)((val >> 16) & 0xff);
1907 s[offset + 2] = (U8)((val >> 8) & 0xff);
1908 s[offset + 3] = (U8)( val & 0xff);
1911 if (!none || val < none)
1915 else { /* bits == 1, then val should be ignored */
1918 for (key = min; key <= max; key++) {
1919 const STRLEN offset = (STRLEN)(key - start);
1922 s[offset >> 3] |= 1 << (offset & 7);
1928 /* read $swash->{EXTRAS} */
1929 x = (U8*)SvPV(*extssvp, xcur);
1937 SV **otherbitssvp, *other;
1945 nl = (U8*)memchr(x, '\n', xend - x);
1947 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1949 x = nl + 1; /* 1 is length of "\n" */
1953 x = xend; /* to EXTRAS' end at which \n is not found */
1960 namelen = nl - namestr;
1964 namelen = xend - namestr;
1968 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1969 otherhv = (HV*)SvRV(*othersvp);
1970 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1971 otherbits = (STRLEN)SvUV(*otherbitssvp);
1972 if (bits < otherbits)
1973 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1975 /* The "other" swatch must be destroyed after. */
1976 other = swash_get(*othersvp, start, span);
1977 o = (U8*)SvPV(other, olen);
1980 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1982 s = (U8*)SvPV(swatch, slen);
1983 if (bits == 1 && otherbits == 1) {
1985 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2009 STRLEN otheroctets = otherbits >> 3;
2011 U8* send = s + slen;
2016 if (otherbits == 1) {
2017 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2021 STRLEN vlen = otheroctets;
2029 if (opc == '+' && otherval)
2030 NOOP; /* replace with otherval */
2031 else if (opc == '!' && !otherval)
2033 else if (opc == '-' && otherval)
2035 else if (opc == '&' && !otherval)
2038 s += octets; /* no replacement */
2043 *s++ = (U8)( otherval & 0xff);
2044 else if (bits == 16) {
2045 *s++ = (U8)((otherval >> 8) & 0xff);
2046 *s++ = (U8)( otherval & 0xff);
2048 else if (bits == 32) {
2049 *s++ = (U8)((otherval >> 24) & 0xff);
2050 *s++ = (U8)((otherval >> 16) & 0xff);
2051 *s++ = (U8)((otherval >> 8) & 0xff);
2052 *s++ = (U8)( otherval & 0xff);
2056 sv_free(other); /* through with it! */
2062 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2064 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2065 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2066 bytes available. The return value is the pointer to the byte after the
2067 end of the new character. In other words,
2069 d = uvchr_to_utf8(d, uv);
2071 is the recommended wide native character-aware way of saying
2078 /* On ASCII machines this is normally a macro but we want a
2079 real function in case XS code wants it
2082 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2084 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2088 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2090 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2094 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2097 Returns the native character value of the first character in the string
2099 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2100 length, in bytes, of that character.
2102 Allows length and flags to be passed to low level routine.
2106 /* On ASCII machines this is normally a macro but we want
2107 a real function in case XS code wants it
2110 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2113 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2114 return UNI_TO_NATIVE(uv);
2118 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2120 Build to the scalar dsv a displayable version of the string spv,
2121 length len, the displayable version being at most pvlim bytes long
2122 (if longer, the rest is truncated and "..." will be appended).
2124 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2125 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2126 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2127 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2128 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2129 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2131 The pointer to the PV of the dsv is returned.
2135 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2140 sv_setpvn(dsv, "", 0);
2141 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2143 /* This serves double duty as a flag and a character to print after
2144 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2148 if (pvlim && SvCUR(dsv) >= pvlim) {
2152 u = utf8_to_uvchr((U8*)s, 0);
2154 const unsigned char c = (unsigned char)u & 0xFF;
2155 if (flags & UNI_DISPLAY_BACKSLASH) {
2172 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2175 /* isPRINT() is the locale-blind version. */
2176 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2177 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2182 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2185 sv_catpvs(dsv, "...");
2191 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2193 Build to the scalar dsv a displayable version of the scalar sv,
2194 the displayable version being at most pvlim bytes long
2195 (if longer, the rest is truncated and "..." will be appended).
2197 The flags argument is as in pv_uni_display().
2199 The pointer to the PV of the dsv is returned.
2204 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2206 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2207 SvCUR(ssv), pvlim, flags);
2211 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2213 Return true if the strings s1 and s2 differ case-insensitively, false
2214 if not (if they are equal case-insensitively). If u1 is true, the
2215 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
2216 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2217 are false, the respective string is assumed to be in native 8-bit
2220 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2221 in there (they will point at the beginning of the I<next> character).
2222 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2223 pointers beyond which scanning will not continue under any
2224 circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
2225 s2+l2 will be used as goal end pointers that will also stop the scan,
2226 and which qualify towards defining a successful match: all the scans
2227 that define an explicit length must reach their goal pointers for
2228 a match to succeed).
2230 For case-insensitiveness, the "casefolding" of Unicode is used
2231 instead of upper/lowercasing both the characters, see
2232 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2236 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2239 register const U8 *p1 = (const U8*)s1;
2240 register const U8 *p2 = (const U8*)s2;
2241 register const U8 *f1 = NULL;
2242 register const U8 *f2 = NULL;
2243 register U8 *e1 = NULL;
2244 register U8 *q1 = NULL;
2245 register U8 *e2 = NULL;
2246 register U8 *q2 = NULL;
2247 STRLEN n1 = 0, n2 = 0;
2248 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2249 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2251 STRLEN foldlen1, foldlen2;
2256 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2257 f1 = (const U8*)s1 + l1;
2260 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2261 f2 = (const U8*)s2 + l2;
2263 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2264 return 1; /* mismatch; possible infinite loop or false positive */
2267 natbuf[1] = 0; /* Need to terminate the buffer. */
2269 while ((e1 == 0 || p1 < e1) &&
2270 (f1 == 0 || p1 < f1) &&
2271 (e2 == 0 || p2 < e2) &&
2272 (f2 == 0 || p2 < f2)) {
2275 to_utf8_fold(p1, foldbuf1, &foldlen1);
2277 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2278 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2285 to_utf8_fold(p2, foldbuf2, &foldlen2);
2287 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2288 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2294 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2295 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2296 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2297 return 1; /* mismatch */
2304 p1 += u1 ? UTF8SKIP(p1) : 1;
2306 p2 += u2 ? UTF8SKIP(p2) : 1;
2310 /* A match is defined by all the scans that specified
2311 * an explicit length reaching their final goals. */
2312 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2321 return match ? 0 : 1; /* 0 match, 1 mismatch */
2326 * c-indentation-style: bsd
2328 * indent-tabs-mode: t
2331 * ex: set ts=8 sts=4 sw=4 noet: