3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
4 * by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
25 #define PERL_IN_UTF8_C
29 /* Separate prototypes needed because in ASCII systems these
30 * usually macros but they still are compiled as code, too. */
31 PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
32 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
35 static const char unees[] =
36 "Malformed UTF-8 character (unexpected end of string)";
39 =head1 Unicode Support
41 This file contains various utility functions for manipulating UTF8-encoded
42 strings. For the uninitiated, this is a method of representing arbitrary
43 Unicode characters as a variable number of bytes, in such a way that
44 characters in the ASCII range are unmodified, and a zero byte never appears
45 within non-zero characters.
47 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
49 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
50 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
51 bytes available. The return value is the pointer to the byte after the
52 end of the new character. In other words,
54 d = uvuni_to_utf8_flags(d, uv, flags);
58 d = uvuni_to_utf8(d, uv);
60 (which is equivalent to)
62 d = uvuni_to_utf8_flags(d, uv, 0);
64 is the recommended Unicode-aware way of saying
72 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
74 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
76 if (ckWARN(WARN_UTF8)) {
77 if (UNICODE_IS_SURROGATE(uv) &&
78 !(flags & UNICODE_ALLOW_SURROGATE))
79 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
81 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
82 !(flags & UNICODE_ALLOW_FDD0))
84 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
85 !(flags & UNICODE_ALLOW_FFFF))) &&
86 /* UNICODE_ALLOW_SUPER includes
87 * FFFEs and FFFFs beyond 0x10FFFF. */
88 ((uv <= PERL_UNICODE_MAX) ||
89 !(flags & UNICODE_ALLOW_SUPER))
91 Perl_warner(aTHX_ packWARN(WARN_UTF8),
92 "Unicode character 0x%04"UVxf" is illegal", uv);
94 if (UNI_IS_INVARIANT(uv)) {
95 *d++ = (U8)UTF_TO_NATIVE(uv);
100 STRLEN len = UNISKIP(uv);
103 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
104 uv >>= UTF_ACCUMULATION_SHIFT;
106 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
109 #else /* Non loop style */
111 *d++ = (U8)(( uv >> 6) | 0xc0);
112 *d++ = (U8)(( uv & 0x3f) | 0x80);
116 *d++ = (U8)(( uv >> 12) | 0xe0);
117 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
118 *d++ = (U8)(( uv & 0x3f) | 0x80);
122 *d++ = (U8)(( uv >> 18) | 0xf0);
123 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
128 if (uv < 0x4000000) {
129 *d++ = (U8)(( uv >> 24) | 0xf8);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
136 if (uv < 0x80000000) {
137 *d++ = (U8)(( uv >> 30) | 0xfc);
138 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
139 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
140 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
141 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
142 *d++ = (U8)(( uv & 0x3f) | 0x80);
146 if (uv < UTF8_QUAD_MAX)
149 *d++ = 0xfe; /* Can't match U+FEFF! */
150 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
155 *d++ = (U8)(( uv & 0x3f) | 0x80);
160 *d++ = 0xff; /* Can't match U+FFFE! */
161 *d++ = 0x80; /* 6 Reserved bits */
162 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
163 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
164 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
165 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
166 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
167 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
168 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
169 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
170 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
171 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
172 *d++ = (U8)(( uv & 0x3f) | 0x80);
176 #endif /* Loop style */
181 Tests if some arbitrary number of bytes begins in a valid UTF-8
182 character. Note that an INVARIANT (i.e. ASCII) character is a valid
183 UTF-8 character. The actual number of bytes in the UTF-8 character
184 will be returned if it is valid, otherwise 0.
186 This is the "slow" version as opposed to the "fast" version which is
187 the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
188 difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
189 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
190 you should use the _slow(). In practice this means that the _slow()
191 will be used very rarely, since the maximum Unicode code point (as of
192 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
193 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
198 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
204 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
206 if (UTF8_IS_INVARIANT(u))
209 if (!UTF8_IS_START(u))
212 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
218 u = NATIVE_TO_UTF(u);
220 u &= UTF_START_MASK(len);
224 if (!UTF8_IS_CONTINUATION(*s))
226 uv = UTF8_ACCUMULATE(uv, *s);
233 if ((STRLEN)UNISKIP(uv) < len)
240 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
242 Tests if some arbitrary number of bytes begins in a valid UTF-8
243 character. Note that an INVARIANT (i.e. ASCII) character is a valid
244 UTF-8 character. The actual number of bytes in the UTF-8 character
245 will be returned if it is valid, otherwise 0.
249 Perl_is_utf8_char(pTHX_ const U8 *s)
251 const STRLEN len = UTF8SKIP(s);
253 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
256 if (IS_UTF8_CHAR_FAST(len))
257 return IS_UTF8_CHAR(s, len) ? len : 0;
258 #endif /* #ifdef IS_UTF8_CHAR */
259 return is_utf8_char_slow(s, len);
263 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
265 Returns true if first C<len> bytes of the given string form a valid
266 UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
267 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
268 because a valid ASCII string is a valid UTF-8 string.
270 See also is_utf8_string_loclen() and is_utf8_string_loc().
276 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
278 const U8* const send = s + (len ? len : strlen((const char *)s));
281 PERL_ARGS_ASSERT_IS_UTF8_STRING;
286 /* Inline the easy bits of is_utf8_char() here for speed... */
287 if (UTF8_IS_INVARIANT(*x))
289 else if (!UTF8_IS_START(*x))
292 /* ... and call is_utf8_char() only if really needed. */
295 if (IS_UTF8_CHAR_FAST(c)) {
296 if (!IS_UTF8_CHAR(x, c))
300 c = is_utf8_char_slow(x, c);
303 #endif /* #ifdef IS_UTF8_CHAR */
318 Implemented as a macro in utf8.h
320 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
322 Like is_utf8_string() but stores the location of the failure (in the
323 case of "utf8ness failure") or the location s+len (in the case of
324 "utf8ness success") in the C<ep>.
326 See also is_utf8_string_loclen() and is_utf8_string().
328 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
330 Like is_utf8_string() but stores the location of the failure (in the
331 case of "utf8ness failure") or the location s+len (in the case of
332 "utf8ness success") in the C<ep>, and the number of UTF-8
333 encoded characters in the C<el>.
335 See also is_utf8_string_loc() and is_utf8_string().
341 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
343 const U8* const send = s + (len ? len : strlen((const char *)s));
348 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
352 /* Inline the easy bits of is_utf8_char() here for speed... */
353 if (UTF8_IS_INVARIANT(*x))
355 else if (!UTF8_IS_START(*x))
358 /* ... and call is_utf8_char() only if really needed. */
361 if (IS_UTF8_CHAR_FAST(c)) {
362 if (!IS_UTF8_CHAR(x, c))
365 c = is_utf8_char_slow(x, c);
368 #endif /* #ifdef IS_UTF8_CHAR */
387 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
389 Bottom level UTF-8 decode routine.
390 Returns the Unicode code point value of the first character in the string C<s>
391 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
392 C<retlen> will be set to the length, in bytes, of that character.
394 If C<s> does not point to a well-formed UTF-8 character, the behaviour
395 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
396 it is assumed that the caller will raise a warning, and this function
397 will silently just set C<retlen> to C<-1> and return zero. If the
398 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
399 malformations will be given, C<retlen> will be set to the expected
400 length of the UTF-8 character in bytes, and zero will be returned.
402 The C<flags> can also contain various flags to allow deviations from
403 the strict UTF-8 encoding (see F<utf8.h>).
405 Most code should use utf8_to_uvchr() rather than call this directly.
411 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
414 const U8 * const s0 = s;
417 const bool dowarn = ckWARN_d(WARN_UTF8);
418 const UV startbyte = *s;
419 STRLEN expectlen = 0;
422 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
424 /* This list is a superset of the UTF8_ALLOW_XXX. */
426 #define UTF8_WARN_EMPTY 1
427 #define UTF8_WARN_CONTINUATION 2
428 #define UTF8_WARN_NON_CONTINUATION 3
429 #define UTF8_WARN_FE_FF 4
430 #define UTF8_WARN_SHORT 5
431 #define UTF8_WARN_OVERFLOW 6
432 #define UTF8_WARN_SURROGATE 7
433 #define UTF8_WARN_LONG 8
434 #define UTF8_WARN_FFFF 9 /* Also FFFE. */
437 !(flags & UTF8_ALLOW_EMPTY)) {
438 warning = UTF8_WARN_EMPTY;
442 if (UTF8_IS_INVARIANT(uv)) {
445 return (UV) (NATIVE_TO_UTF(*s));
448 if (UTF8_IS_CONTINUATION(uv) &&
449 !(flags & UTF8_ALLOW_CONTINUATION)) {
450 warning = UTF8_WARN_CONTINUATION;
454 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
455 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
456 warning = UTF8_WARN_NON_CONTINUATION;
461 uv = NATIVE_TO_UTF(uv);
463 if ((uv == 0xfe || uv == 0xff) &&
464 !(flags & UTF8_ALLOW_FE_FF)) {
465 warning = UTF8_WARN_FE_FF;
470 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
471 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
472 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
473 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
475 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
476 else { len = 7; uv &= 0x01; }
478 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
479 else if (!(uv & 0x01)) { len = 7; uv = 0; }
480 else { len = 13; uv = 0; } /* whoa! */
488 if ((curlen < expectlen) &&
489 !(flags & UTF8_ALLOW_SHORT)) {
490 warning = UTF8_WARN_SHORT;
499 if (!UTF8_IS_CONTINUATION(*s) &&
500 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
502 warning = UTF8_WARN_NON_CONTINUATION;
506 uv = UTF8_ACCUMULATE(uv, *s);
508 /* These cannot be allowed. */
510 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
511 warning = UTF8_WARN_LONG;
515 else { /* uv < ouv */
516 /* This cannot be allowed. */
517 warning = UTF8_WARN_OVERFLOW;
525 if (UNICODE_IS_SURROGATE(uv) &&
526 !(flags & UTF8_ALLOW_SURROGATE)) {
527 warning = UTF8_WARN_SURROGATE;
529 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
530 !(flags & UTF8_ALLOW_LONG)) {
531 warning = UTF8_WARN_LONG;
533 } else if (UNICODE_IS_ILLEGAL(uv) &&
534 !(flags & UTF8_ALLOW_FFFF)) {
535 warning = UTF8_WARN_FFFF;
543 if (flags & UTF8_CHECK_ONLY) {
545 *retlen = ((STRLEN) -1);
550 SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
553 case 0: /* Intentionally empty. */ break;
554 case UTF8_WARN_EMPTY:
555 sv_catpvs(sv, "(empty string)");
557 case UTF8_WARN_CONTINUATION:
558 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
560 case UTF8_WARN_NON_CONTINUATION:
562 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
563 (UV)s[1], startbyte);
565 const int len = (int)(s-s0);
566 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
567 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
571 case UTF8_WARN_FE_FF:
572 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
574 case UTF8_WARN_SHORT:
575 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
576 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
577 expectlen = curlen; /* distance for caller to skip */
579 case UTF8_WARN_OVERFLOW:
580 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
583 case UTF8_WARN_SURROGATE:
584 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
587 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
588 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
591 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
594 sv_catpvs(sv, "(unknown reason)");
599 const char * const s = SvPVX_const(sv);
602 Perl_warner(aTHX_ packWARN(WARN_UTF8),
603 "%s in %s", s, OP_DESC(PL_op));
605 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
610 *retlen = expectlen ? expectlen : len;
616 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
618 Returns the native character value of the first character in the string C<s>
619 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
620 length, in bytes, of that character.
622 If C<s> does not point to a well-formed UTF-8 character, zero is
623 returned and retlen is set, if possible, to -1.
629 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
631 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
633 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
634 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
638 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
640 Returns the Unicode code point of the first character in the string C<s>
641 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
642 length, in bytes, of that character.
644 This function should only be used when returned UV is considered
645 an index into the Unicode semantic tables (e.g. swashes).
647 If C<s> does not point to a well-formed UTF-8 character, zero is
648 returned and retlen is set, if possible, to -1.
654 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
656 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
658 /* Call the low level routine asking for checks */
659 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
660 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
664 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
666 Return the length of the UTF-8 char encoded string C<s> in characters.
667 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
668 up past C<e>, croaks.
674 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
680 PERL_ARGS_ASSERT_UTF8_LENGTH;
682 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
683 * the bitops (especially ~) can create illegal UTF-8.
684 * In other words: in Perl UTF-8 is not just for Unicode. */
687 goto warn_and_return;
692 if (ckWARN_d(WARN_UTF8)) {
694 Perl_warner(aTHX_ packWARN(WARN_UTF8),
695 "%s in %s", unees, OP_DESC(PL_op));
697 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
709 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
711 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
714 WARNING: use only if you *know* that the pointers point inside the
721 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
723 PERL_ARGS_ASSERT_UTF8_DISTANCE;
725 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
729 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
731 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
734 WARNING: do not use the following unless you *know* C<off> is within
735 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
736 on the first byte of character or just after the last byte of a character.
742 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
744 PERL_ARGS_ASSERT_UTF8_HOP;
747 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
748 * the bitops (especially ~) can create illegal UTF-8.
749 * In other words: in Perl UTF-8 is not just for Unicode. */
758 while (UTF8_IS_CONTINUATION(*s))
766 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
768 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
769 Unlike C<bytes_to_utf8>, this over-writes the original string, and
770 updates len to contain the new length.
771 Returns zero on failure, setting C<len> to -1.
773 If you need a copy of the string, see C<bytes_from_utf8>.
779 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
782 U8 * const send = s + *len;
785 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
787 /* ensure valid UTF-8 and chars < 256 before updating string */
791 if (!UTF8_IS_INVARIANT(c) &&
792 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
793 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
794 *len = ((STRLEN) -1);
802 *d++ = (U8)utf8_to_uvchr(s, &ulen);
811 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
813 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
814 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
815 the newly-created string, and updates C<len> to contain the new
816 length. Returns the original string if no conversion occurs, C<len>
817 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
818 0 if C<s> is converted or contains all 7bit characters.
824 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
831 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
837 /* ensure valid UTF-8 and chars < 256 before converting string */
838 for (send = s + *len; s < send;) {
840 if (!UTF8_IS_INVARIANT(c)) {
841 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
842 (c = *s++) && UTF8_IS_CONTINUATION(c))
851 Newx(d, (*len) - count + 1, U8);
852 s = start; start = d;
855 if (!UTF8_IS_INVARIANT(c)) {
856 /* Then it is two-byte encoded */
857 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
858 c = ASCII_TO_NATIVE(c);
868 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
870 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
871 Returns a pointer to the newly-created string, and sets C<len> to
872 reflect the new length.
874 If you want to convert to UTF-8 from other encodings than ASCII,
875 see sv_recode_to_utf8().
881 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
883 const U8 * const send = s + (*len);
887 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
890 Newx(d, (*len) * 2 + 1, U8);
894 const UV uv = NATIVE_TO_ASCII(*s++);
895 if (UNI_IS_INVARIANT(uv))
896 *d++ = (U8)UTF_TO_NATIVE(uv);
898 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
899 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
908 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
910 * Destination must be pre-extended to 3/2 source. Do not use in-place.
911 * We optimize for native, for obvious reasons. */
914 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
919 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
921 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
928 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
933 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
937 *d++ = UNI_TO_NATIVE(uv);
944 *d++ = (U8)(( uv >> 6) | 0xc0);
945 *d++ = (U8)(( uv & 0x3f) | 0x80);
948 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
949 UV low = (p[0] << 8) + p[1];
951 if (low < 0xdc00 || low >= 0xdfff)
952 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
953 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
956 *d++ = (U8)(( uv >> 12) | 0xe0);
957 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
958 *d++ = (U8)(( uv & 0x3f) | 0x80);
962 *d++ = (U8)(( uv >> 18) | 0xf0);
963 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
964 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
965 *d++ = (U8)(( uv & 0x3f) | 0x80);
969 *newlen = d - dstart;
973 /* Note: this one is slightly destructive of the source. */
976 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
979 U8* const send = s + bytelen;
981 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
989 return utf16_to_utf8(p, d, bytelen, newlen);
992 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
995 Perl_is_uni_alnum(pTHX_ UV c)
997 U8 tmpbuf[UTF8_MAXBYTES+1];
998 uvchr_to_utf8(tmpbuf, c);
999 return is_utf8_alnum(tmpbuf);
1003 Perl_is_uni_alnumc(pTHX_ UV c)
1005 U8 tmpbuf[UTF8_MAXBYTES+1];
1006 uvchr_to_utf8(tmpbuf, c);
1007 return is_utf8_alnumc(tmpbuf);
1011 Perl_is_uni_idfirst(pTHX_ UV c)
1013 U8 tmpbuf[UTF8_MAXBYTES+1];
1014 uvchr_to_utf8(tmpbuf, c);
1015 return is_utf8_idfirst(tmpbuf);
1019 Perl_is_uni_alpha(pTHX_ UV c)
1021 U8 tmpbuf[UTF8_MAXBYTES+1];
1022 uvchr_to_utf8(tmpbuf, c);
1023 return is_utf8_alpha(tmpbuf);
1027 Perl_is_uni_ascii(pTHX_ UV c)
1029 U8 tmpbuf[UTF8_MAXBYTES+1];
1030 uvchr_to_utf8(tmpbuf, c);
1031 return is_utf8_ascii(tmpbuf);
1035 Perl_is_uni_space(pTHX_ UV c)
1037 U8 tmpbuf[UTF8_MAXBYTES+1];
1038 uvchr_to_utf8(tmpbuf, c);
1039 return is_utf8_space(tmpbuf);
1043 Perl_is_uni_digit(pTHX_ UV c)
1045 U8 tmpbuf[UTF8_MAXBYTES+1];
1046 uvchr_to_utf8(tmpbuf, c);
1047 return is_utf8_digit(tmpbuf);
1051 Perl_is_uni_upper(pTHX_ UV c)
1053 U8 tmpbuf[UTF8_MAXBYTES+1];
1054 uvchr_to_utf8(tmpbuf, c);
1055 return is_utf8_upper(tmpbuf);
1059 Perl_is_uni_lower(pTHX_ UV c)
1061 U8 tmpbuf[UTF8_MAXBYTES+1];
1062 uvchr_to_utf8(tmpbuf, c);
1063 return is_utf8_lower(tmpbuf);
1067 Perl_is_uni_cntrl(pTHX_ UV c)
1069 U8 tmpbuf[UTF8_MAXBYTES+1];
1070 uvchr_to_utf8(tmpbuf, c);
1071 return is_utf8_cntrl(tmpbuf);
1075 Perl_is_uni_graph(pTHX_ UV c)
1077 U8 tmpbuf[UTF8_MAXBYTES+1];
1078 uvchr_to_utf8(tmpbuf, c);
1079 return is_utf8_graph(tmpbuf);
1083 Perl_is_uni_print(pTHX_ UV c)
1085 U8 tmpbuf[UTF8_MAXBYTES+1];
1086 uvchr_to_utf8(tmpbuf, c);
1087 return is_utf8_print(tmpbuf);
1091 Perl_is_uni_punct(pTHX_ UV c)
1093 U8 tmpbuf[UTF8_MAXBYTES+1];
1094 uvchr_to_utf8(tmpbuf, c);
1095 return is_utf8_punct(tmpbuf);
1099 Perl_is_uni_xdigit(pTHX_ UV c)
1101 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1102 uvchr_to_utf8(tmpbuf, c);
1103 return is_utf8_xdigit(tmpbuf);
1107 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1109 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1111 uvchr_to_utf8(p, c);
1112 return to_utf8_upper(p, p, lenp);
1116 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1118 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1120 uvchr_to_utf8(p, c);
1121 return to_utf8_title(p, p, lenp);
1125 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1127 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1129 uvchr_to_utf8(p, c);
1130 return to_utf8_lower(p, p, lenp);
1134 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1136 PERL_ARGS_ASSERT_TO_UNI_FOLD;
1138 uvchr_to_utf8(p, c);
1139 return to_utf8_fold(p, p, lenp);
1142 /* for now these all assume no locale info available for Unicode > 255 */
1145 Perl_is_uni_alnum_lc(pTHX_ UV c)
1147 return is_uni_alnum(c); /* XXX no locale support yet */
1151 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1153 return is_uni_alnumc(c); /* XXX no locale support yet */
1157 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1159 return is_uni_idfirst(c); /* XXX no locale support yet */
1163 Perl_is_uni_alpha_lc(pTHX_ UV c)
1165 return is_uni_alpha(c); /* XXX no locale support yet */
1169 Perl_is_uni_ascii_lc(pTHX_ UV c)
1171 return is_uni_ascii(c); /* XXX no locale support yet */
1175 Perl_is_uni_space_lc(pTHX_ UV c)
1177 return is_uni_space(c); /* XXX no locale support yet */
1181 Perl_is_uni_digit_lc(pTHX_ UV c)
1183 return is_uni_digit(c); /* XXX no locale support yet */
1187 Perl_is_uni_upper_lc(pTHX_ UV c)
1189 return is_uni_upper(c); /* XXX no locale support yet */
1193 Perl_is_uni_lower_lc(pTHX_ UV c)
1195 return is_uni_lower(c); /* XXX no locale support yet */
1199 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1201 return is_uni_cntrl(c); /* XXX no locale support yet */
1205 Perl_is_uni_graph_lc(pTHX_ UV c)
1207 return is_uni_graph(c); /* XXX no locale support yet */
1211 Perl_is_uni_print_lc(pTHX_ UV c)
1213 return is_uni_print(c); /* XXX no locale support yet */
1217 Perl_is_uni_punct_lc(pTHX_ UV c)
1219 return is_uni_punct(c); /* XXX no locale support yet */
1223 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1225 return is_uni_xdigit(c); /* XXX no locale support yet */
1229 Perl_to_uni_upper_lc(pTHX_ U32 c)
1231 /* XXX returns only the first character -- do not use XXX */
1232 /* XXX no locale support yet */
1234 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1235 return (U32)to_uni_upper(c, tmpbuf, &len);
1239 Perl_to_uni_title_lc(pTHX_ U32 c)
1241 /* XXX returns only the first character XXX -- do not use XXX */
1242 /* XXX no locale support yet */
1244 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1245 return (U32)to_uni_title(c, tmpbuf, &len);
1249 Perl_to_uni_lower_lc(pTHX_ U32 c)
1251 /* XXX returns only the first character -- do not use XXX */
1252 /* XXX no locale support yet */
1254 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1255 return (U32)to_uni_lower(c, tmpbuf, &len);
1259 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1260 const char *const swashname)
1264 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1266 if (!is_utf8_char(p))
1269 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1270 return swash_fetch(*swash, p, TRUE) != 0;
1274 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1278 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1280 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1281 * descendant of isalnum(3), in other words, it doesn't
1282 * contain the '_'. --jhi */
1283 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1287 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1291 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1293 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1297 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1301 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1305 /* is_utf8_idstart would be more logical. */
1306 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1310 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1314 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1318 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1322 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1326 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1328 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1332 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1336 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1338 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1342 Perl_is_utf8_space(pTHX_ const U8 *p)
1346 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1348 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1352 Perl_is_utf8_digit(pTHX_ const U8 *p)
1356 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1358 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1362 Perl_is_utf8_upper(pTHX_ const U8 *p)
1366 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1368 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1372 Perl_is_utf8_lower(pTHX_ const U8 *p)
1376 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1378 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1382 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1386 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1388 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1392 Perl_is_utf8_graph(pTHX_ const U8 *p)
1396 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1398 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1402 Perl_is_utf8_print(pTHX_ const U8 *p)
1406 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1408 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1412 Perl_is_utf8_punct(pTHX_ const U8 *p)
1416 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1418 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1422 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1426 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1428 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1432 Perl_is_utf8_mark(pTHX_ const U8 *p)
1436 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1438 return is_utf8_common(p, &PL_utf8_mark, "IsM");
1442 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1444 The "p" contains the pointer to the UTF-8 string encoding
1445 the character that is being converted.
1447 The "ustrp" is a pointer to the character buffer to put the
1448 conversion result to. The "lenp" is a pointer to the length
1451 The "swashp" is a pointer to the swash to use.
1453 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1454 and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
1455 but not always, a multicharacter mapping), is tried first.
1457 The "special" is a string like "utf8::ToSpecLower", which means the
1458 hash %utf8::ToSpecLower. The access to the hash is through
1459 Perl_to_utf8_case().
1461 The "normal" is a string like "ToLower" which means the swash
1467 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1468 SV **swashp, const char *normal, const char *special)
1471 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1473 const UV uv0 = utf8_to_uvchr(p, NULL);
1474 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1475 * are necessary in EBCDIC, they are redundant no-ops
1476 * in ASCII-ish platforms, and hopefully optimized away. */
1477 const UV uv1 = NATIVE_TO_UNI(uv0);
1479 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1481 uvuni_to_utf8(tmpbuf, uv1);
1483 if (!*swashp) /* load on-demand */
1484 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1486 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1487 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1488 /* It might be "special" (sometimes, but not always,
1489 * a multicharacter mapping) */
1490 HV * const hv = get_hv(special, FALSE);
1494 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1498 s = SvPV_const(*svp, len);
1500 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1503 /* If we have EBCDIC we need to remap the characters
1504 * since any characters in the low 256 are Unicode
1505 * code points, not EBCDIC. */
1506 U8 *t = (U8*)s, *tend = t + len, *d;
1513 const UV c = utf8_to_uvchr(t, &tlen);
1515 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1524 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1529 Copy(tmpbuf, ustrp, len, U8);
1531 Copy(s, ustrp, len, U8);
1537 if (!len && *swashp) {
1538 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1541 /* It was "normal" (a single character mapping). */
1542 const UV uv3 = UNI_TO_NATIVE(uv2);
1543 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1547 if (!len) /* Neither: just copy. */
1548 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1553 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1557 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1559 Convert the UTF-8 encoded character at p to its uppercase version and
1560 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1561 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1562 the uppercase version may be longer than the original character.
1564 The first character of the uppercased version is returned
1565 (but note, as explained above, that there may be more.)
1570 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1574 PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1576 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1577 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1581 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1583 Convert the UTF-8 encoded character at p to its titlecase version and
1584 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1585 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1586 titlecase version may be longer than the original character.
1588 The first character of the titlecased version is returned
1589 (but note, as explained above, that there may be more.)
1594 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1598 PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1600 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1601 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1605 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1607 Convert the UTF-8 encoded character at p to its lowercase version and
1608 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1609 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1610 lowercase version may be longer than the original character.
1612 The first character of the lowercased version is returned
1613 (but note, as explained above, that there may be more.)
1618 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1622 PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1624 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1625 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1629 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1631 Convert the UTF-8 encoded character at p to its foldcase version and
1632 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1633 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1634 foldcase version may be longer than the original character (up to
1637 The first character of the foldcased version is returned
1638 (but note, as explained above, that there may be more.)
1643 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1647 PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1649 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1650 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1654 * A "swash" is a swatch hash.
1655 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1656 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1657 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1660 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1665 const size_t pkg_len = strlen(pkg);
1666 const size_t name_len = strlen(name);
1667 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1670 PERL_ARGS_ASSERT_SWASH_INIT;
1672 PUSHSTACKi(PERLSI_MAGIC);
1677 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1679 errsv_save = newSVsv(ERRSV);
1680 /* It is assumed that callers of this routine are not passing in any
1681 user derived data. */
1682 /* Need to do this after save_re_context() as it will set PL_tainted to
1683 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1684 Even line to create errsv_save can turn on PL_tainted. */
1685 SAVEBOOL(PL_tainted);
1687 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1690 sv_setsv(ERRSV, errsv_save);
1691 SvREFCNT_dec(errsv_save);
1697 mPUSHp(pkg, pkg_len);
1698 mPUSHp(name, name_len);
1703 errsv_save = newSVsv(ERRSV);
1704 if (call_method("SWASHNEW", G_SCALAR))
1705 retval = newSVsv(*PL_stack_sp--);
1707 retval = &PL_sv_undef;
1709 sv_setsv(ERRSV, errsv_save);
1710 SvREFCNT_dec(errsv_save);
1713 if (IN_PERL_COMPILETIME) {
1714 CopHINTS_set(PL_curcop, PL_hints);
1716 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1718 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1720 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1726 /* This API is wrong for special case conversions since we may need to
1727 * return several Unicode characters for a single Unicode character
1728 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1729 * the lower-level routine, and it is similarly broken for returning
1730 * multiple values. --jhi */
1731 /* Now SWASHGET is recasted into S_swash_get in this file. */
1734 * Returns the value of property/mapping C<swash> for the first character
1735 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1736 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1737 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1740 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1743 HV* const hv = (HV*)SvRV(swash);
1748 const U8 *tmps = NULL;
1752 const UV c = NATIVE_TO_ASCII(*ptr);
1754 PERL_ARGS_ASSERT_SWASH_FETCH;
1756 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1757 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1758 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1761 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1762 * then the "swatch" is a vec() for al the chars which start
1764 * So the key in the hash (klen) is length of encoded char -1
1766 klen = UTF8SKIP(ptr) - 1;
1770 /* If char in invariant then swatch is for all the invariant chars
1771 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1773 needents = UTF_CONTINUATION_MARK;
1774 off = NATIVE_TO_UTF(ptr[klen]);
1777 /* If char is encoded then swatch is for the prefix */
1778 needents = (1 << UTF_ACCUMULATION_SHIFT);
1779 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1783 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1784 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1785 * it's nothing to sniff at.) Pity we usually come through at least
1786 * two function calls to get here...
1788 * NB: this code assumes that swatches are never modified, once generated!
1791 if (hv == PL_last_swash_hv &&
1792 klen == PL_last_swash_klen &&
1793 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1795 tmps = PL_last_swash_tmps;
1796 slen = PL_last_swash_slen;
1799 /* Try our second-level swatch cache, kept in a hash. */
1800 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1802 /* If not cached, generate it via swash_get */
1803 if (!svp || !SvPOK(*svp)
1804 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1805 /* We use utf8n_to_uvuni() as we want an index into
1806 Unicode tables, not a native character number.
1808 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1810 0 : UTF8_ALLOW_ANY);
1811 swatch = swash_get(swash,
1812 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1813 (klen) ? (code_point & ~(needents - 1)) : 0,
1816 if (IN_PERL_COMPILETIME)
1817 CopHINTS_set(PL_curcop, PL_hints);
1819 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1821 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1822 || (slen << 3) < needents)
1823 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1826 PL_last_swash_hv = hv;
1827 assert(klen <= sizeof(PL_last_swash_key));
1828 PL_last_swash_klen = (U8)klen;
1829 /* FIXME change interpvar.h? */
1830 PL_last_swash_tmps = (U8 *) tmps;
1831 PL_last_swash_slen = slen;
1833 Copy(ptr, PL_last_swash_key, klen, U8);
1836 switch ((int)((slen << 3) / needents)) {
1838 bit = 1 << (off & 7);
1840 return (tmps[off] & bit) != 0;
1845 return (tmps[off] << 8) + tmps[off + 1] ;
1848 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1850 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1851 NORETURN_FUNCTION_END;
1855 * Returns a swatch (a bit vector string) for a code point sequence
1856 * that starts from the value C<start> and comprises the number C<span>.
1857 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1858 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1861 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1864 U8 *l, *lend, *x, *xend, *s;
1865 STRLEN lcur, xcur, scur;
1866 HV* const hv = (HV*)SvRV(swash);
1867 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1868 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1869 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1870 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1871 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1872 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1873 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1874 const STRLEN bits = SvUV(*bitssvp);
1875 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1876 const UV none = SvUV(*nonesvp);
1877 const UV end = start + span;
1879 PERL_ARGS_ASSERT_SWASH_GET;
1881 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1882 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1886 /* create and initialize $swatch */
1887 scur = octets ? (span * octets) : (span + 7) / 8;
1888 swatch = newSV(scur);
1890 s = (U8*)SvPVX(swatch);
1891 if (octets && none) {
1892 const U8* const e = s + scur;
1895 *s++ = (U8)(none & 0xff);
1896 else if (bits == 16) {
1897 *s++ = (U8)((none >> 8) & 0xff);
1898 *s++ = (U8)( none & 0xff);
1900 else if (bits == 32) {
1901 *s++ = (U8)((none >> 24) & 0xff);
1902 *s++ = (U8)((none >> 16) & 0xff);
1903 *s++ = (U8)((none >> 8) & 0xff);
1904 *s++ = (U8)( none & 0xff);
1910 (void)memzero((U8*)s, scur + 1);
1912 SvCUR_set(swatch, scur);
1913 s = (U8*)SvPVX(swatch);
1915 /* read $swash->{LIST} */
1916 l = (U8*)SvPV(*listsvp, lcur);
1921 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1923 U8* const nl = (U8*)memchr(l, '\n', lend - l);
1926 min = grok_hex((char *)l, &numlen, &flags, NULL);
1930 l = nl + 1; /* 1 is length of "\n" */
1934 l = lend; /* to LIST's end at which \n is not found */
1940 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1942 max = grok_hex((char *)l, &numlen, &flags, NULL);
1951 flags = PERL_SCAN_SILENT_ILLDIGIT |
1952 PERL_SCAN_DISALLOW_PREFIX;
1954 val = grok_hex((char *)l, &numlen, &flags, NULL);
1963 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1969 val = 0; /* bits == 1, then val should be ignored */
1976 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1980 val = 0; /* bits == 1, then val should be ignored */
1994 if (!none || val < none) {
1999 for (key = min; key <= max; key++) {
2003 /* offset must be non-negative (start <= min <= key < end) */
2004 offset = octets * (key - start);
2006 s[offset] = (U8)(val & 0xff);
2007 else if (bits == 16) {
2008 s[offset ] = (U8)((val >> 8) & 0xff);
2009 s[offset + 1] = (U8)( val & 0xff);
2011 else if (bits == 32) {
2012 s[offset ] = (U8)((val >> 24) & 0xff);
2013 s[offset + 1] = (U8)((val >> 16) & 0xff);
2014 s[offset + 2] = (U8)((val >> 8) & 0xff);
2015 s[offset + 3] = (U8)( val & 0xff);
2018 if (!none || val < none)
2022 else { /* bits == 1, then val should be ignored */
2026 for (key = min; key <= max; key++) {
2027 const STRLEN offset = (STRLEN)(key - start);
2030 s[offset >> 3] |= 1 << (offset & 7);
2036 /* read $swash->{EXTRAS} */
2037 x = (U8*)SvPV(*extssvp, xcur);
2045 SV **otherbitssvp, *other;
2049 const U8 opc = *x++;
2053 nl = (U8*)memchr(x, '\n', xend - x);
2055 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2057 x = nl + 1; /* 1 is length of "\n" */
2061 x = xend; /* to EXTRAS' end at which \n is not found */
2068 namelen = nl - namestr;
2072 namelen = xend - namestr;
2076 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2077 otherhv = (HV*)SvRV(*othersvp);
2078 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2079 otherbits = (STRLEN)SvUV(*otherbitssvp);
2080 if (bits < otherbits)
2081 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2083 /* The "other" swatch must be destroyed after. */
2084 other = swash_get(*othersvp, start, span);
2085 o = (U8*)SvPV(other, olen);
2088 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2090 s = (U8*)SvPV(swatch, slen);
2091 if (bits == 1 && otherbits == 1) {
2093 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2117 STRLEN otheroctets = otherbits >> 3;
2119 U8* const send = s + slen;
2124 if (otherbits == 1) {
2125 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2129 STRLEN vlen = otheroctets;
2137 if (opc == '+' && otherval)
2138 NOOP; /* replace with otherval */
2139 else if (opc == '!' && !otherval)
2141 else if (opc == '-' && otherval)
2143 else if (opc == '&' && !otherval)
2146 s += octets; /* no replacement */
2151 *s++ = (U8)( otherval & 0xff);
2152 else if (bits == 16) {
2153 *s++ = (U8)((otherval >> 8) & 0xff);
2154 *s++ = (U8)( otherval & 0xff);
2156 else if (bits == 32) {
2157 *s++ = (U8)((otherval >> 24) & 0xff);
2158 *s++ = (U8)((otherval >> 16) & 0xff);
2159 *s++ = (U8)((otherval >> 8) & 0xff);
2160 *s++ = (U8)( otherval & 0xff);
2164 sv_free(other); /* through with it! */
2170 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2172 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2173 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2174 bytes available. The return value is the pointer to the byte after the
2175 end of the new character. In other words,
2177 d = uvchr_to_utf8(d, uv);
2179 is the recommended wide native character-aware way of saying
2186 /* On ASCII machines this is normally a macro but we want a
2187 real function in case XS code wants it
2190 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2192 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2194 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2198 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2200 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2202 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2206 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2209 Returns the native character value of the first character in the string
2211 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2212 length, in bytes, of that character.
2214 Allows length and flags to be passed to low level routine.
2218 /* On ASCII machines this is normally a macro but we want
2219 a real function in case XS code wants it
2222 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2225 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2227 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2229 return UNI_TO_NATIVE(uv);
2233 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2235 Build to the scalar dsv a displayable version of the string spv,
2236 length len, the displayable version being at most pvlim bytes long
2237 (if longer, the rest is truncated and "..." will be appended).
2239 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2240 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2241 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2242 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2243 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2244 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2246 The pointer to the PV of the dsv is returned.
2250 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2255 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2257 sv_setpvn(dsv, "", 0);
2259 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2261 /* This serves double duty as a flag and a character to print after
2262 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2266 if (pvlim && SvCUR(dsv) >= pvlim) {
2270 u = utf8_to_uvchr((U8*)s, 0);
2272 const unsigned char c = (unsigned char)u & 0xFF;
2273 if (flags & UNI_DISPLAY_BACKSLASH) {
2290 const char string = ok;
2291 sv_catpvn(dsv, &string, 1);
2294 /* isPRINT() is the locale-blind version. */
2295 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2296 const char string = c;
2297 sv_catpvn(dsv, &string, 1);
2302 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2305 sv_catpvs(dsv, "...");
2311 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2313 Build to the scalar dsv a displayable version of the scalar sv,
2314 the displayable version being at most pvlim bytes long
2315 (if longer, the rest is truncated and "..." will be appended).
2317 The flags argument is as in pv_uni_display().
2319 The pointer to the PV of the dsv is returned.
2324 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2326 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2328 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2329 SvCUR(ssv), pvlim, flags);
2333 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2335 Return true if the strings s1 and s2 differ case-insensitively, false
2336 if not (if they are equal case-insensitively). If u1 is true, the
2337 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
2338 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2339 are false, the respective string is assumed to be in native 8-bit
2342 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2343 in there (they will point at the beginning of the I<next> character).
2344 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2345 pointers beyond which scanning will not continue under any
2346 circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
2347 s2+l2 will be used as goal end pointers that will also stop the scan,
2348 and which qualify towards defining a successful match: all the scans
2349 that define an explicit length must reach their goal pointers for
2350 a match to succeed).
2352 For case-insensitiveness, the "casefolding" of Unicode is used
2353 instead of upper/lowercasing both the characters, see
2354 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2358 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2361 register const U8 *p1 = (const U8*)s1;
2362 register const U8 *p2 = (const U8*)s2;
2363 register const U8 *f1 = NULL;
2364 register const U8 *f2 = NULL;
2365 register U8 *e1 = NULL;
2366 register U8 *q1 = NULL;
2367 register U8 *e2 = NULL;
2368 register U8 *q2 = NULL;
2369 STRLEN n1 = 0, n2 = 0;
2370 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2371 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2373 STRLEN foldlen1, foldlen2;
2376 PERL_ARGS_ASSERT_IBCMP_UTF8;
2380 /* assert(e1 || l1); */
2381 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2382 f1 = (const U8*)s1 + l1;
2385 /* assert(e2 || l2); */
2386 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2387 f2 = (const U8*)s2 + l2;
2389 /* This shouldn't happen. However, putting an assert() there makes some
2391 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2392 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2393 return 1; /* mismatch; possible infinite loop or false positive */
2396 natbuf[1] = 0; /* Need to terminate the buffer. */
2398 while ((e1 == 0 || p1 < e1) &&
2399 (f1 == 0 || p1 < f1) &&
2400 (e2 == 0 || p2 < e2) &&
2401 (f2 == 0 || p2 < f2)) {
2404 to_utf8_fold(p1, foldbuf1, &foldlen1);
2406 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2407 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2414 to_utf8_fold(p2, foldbuf2, &foldlen2);
2416 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2417 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2423 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2424 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2425 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2426 return 1; /* mismatch */
2433 p1 += u1 ? UTF8SKIP(p1) : 1;
2435 p2 += u2 ? UTF8SKIP(p2) : 1;
2439 /* A match is defined by all the scans that specified
2440 * an explicit length reaching their final goals. */
2441 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2450 return match ? 0 : 1; /* 0 match, 1 mismatch */
2455 * c-indentation-style: bsd
2457 * indent-tabs-mode: t
2460 * ex: set ts=8 sts=4 sw=4 noet: