3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 * by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
32 #define PERL_IN_UTF8_C
36 /* Separate prototypes needed because in ASCII systems these
37 * usually macros but they still are compiled as code, too. */
38 PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
42 static const char unees[] =
43 "Malformed UTF-8 character (unexpected end of string)";
46 =head1 Unicode Support
48 This file contains various utility functions for manipulating UTF8-encoded
49 strings. For the uninitiated, this is a method of representing arbitrary
50 Unicode characters as a variable number of bytes, in such a way that
51 characters in the ASCII range are unmodified, and a zero byte never appears
52 within non-zero characters.
58 =for apidoc is_ascii_string
60 Returns true if first C<len> bytes of the given string are ASCII (i.e. none
61 of them even raise the question of UTF-8-ness).
63 See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
69 Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len)
71 const U8* const send = s + (len ? len : strlen((const char *)s));
74 PERL_ARGS_ASSERT_IS_ASCII_STRING;
77 for (; x < send; ++x) {
78 if (!UTF8_IS_INVARIANT(*x))
86 =for apidoc uvuni_to_utf8_flags
88 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
90 bytes available. The return value is the pointer to the byte after the
91 end of the new character. In other words,
93 d = uvuni_to_utf8_flags(d, uv, flags);
97 d = uvuni_to_utf8(d, uv);
99 (which is equivalent to)
101 d = uvuni_to_utf8_flags(d, uv, 0);
103 is the recommended Unicode-aware way of saying
111 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
113 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
115 if (ckWARN(WARN_UTF8)) {
116 if (UNICODE_IS_SURROGATE(uv) &&
117 !(flags & UNICODE_ALLOW_SURROGATE))
118 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
120 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
121 !(flags & UNICODE_ALLOW_FDD0))
123 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
124 !(flags & UNICODE_ALLOW_FFFF))) &&
125 /* UNICODE_ALLOW_SUPER includes
126 * FFFEs and FFFFs beyond 0x10FFFF. */
127 ((uv <= PERL_UNICODE_MAX) ||
128 !(flags & UNICODE_ALLOW_SUPER))
130 Perl_warner(aTHX_ packWARN(WARN_UTF8),
131 "Unicode character 0x%04"UVxf" is illegal", uv);
133 if (UNI_IS_INVARIANT(uv)) {
134 *d++ = (U8)UTF_TO_NATIVE(uv);
139 STRLEN len = UNISKIP(uv);
142 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
143 uv >>= UTF_ACCUMULATION_SHIFT;
145 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
148 #else /* Non loop style */
150 *d++ = (U8)(( uv >> 6) | 0xc0);
151 *d++ = (U8)(( uv & 0x3f) | 0x80);
155 *d++ = (U8)(( uv >> 12) | 0xe0);
156 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
157 *d++ = (U8)(( uv & 0x3f) | 0x80);
161 *d++ = (U8)(( uv >> 18) | 0xf0);
162 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
163 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
164 *d++ = (U8)(( uv & 0x3f) | 0x80);
167 if (uv < 0x4000000) {
168 *d++ = (U8)(( uv >> 24) | 0xf8);
169 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
170 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
171 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
172 *d++ = (U8)(( uv & 0x3f) | 0x80);
175 if (uv < 0x80000000) {
176 *d++ = (U8)(( uv >> 30) | 0xfc);
177 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
178 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
179 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
180 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
181 *d++ = (U8)(( uv & 0x3f) | 0x80);
185 if (uv < UTF8_QUAD_MAX)
188 *d++ = 0xfe; /* Can't match U+FEFF! */
189 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
190 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
191 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
192 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
193 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
194 *d++ = (U8)(( uv & 0x3f) | 0x80);
199 *d++ = 0xff; /* Can't match U+FFFE! */
200 *d++ = 0x80; /* 6 Reserved bits */
201 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
202 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
203 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
204 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
205 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
206 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
207 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
208 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
209 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
210 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
211 *d++ = (U8)(( uv & 0x3f) | 0x80);
215 #endif /* Loop style */
220 Tests if some arbitrary number of bytes begins in a valid UTF-8
221 character. Note that an INVARIANT (i.e. ASCII) character is a valid
222 UTF-8 character. The actual number of bytes in the UTF-8 character
223 will be returned if it is valid, otherwise 0.
225 This is the "slow" version as opposed to the "fast" version which is
226 the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
227 difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
228 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
229 you should use the _slow(). In practice this means that the _slow()
230 will be used very rarely, since the maximum Unicode code point (as of
231 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
232 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
237 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
243 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
245 if (UTF8_IS_INVARIANT(u))
248 if (!UTF8_IS_START(u))
251 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
257 u = NATIVE_TO_UTF(u);
259 u &= UTF_START_MASK(len);
263 if (!UTF8_IS_CONTINUATION(*s))
265 uv = UTF8_ACCUMULATE(uv, *s);
272 if ((STRLEN)UNISKIP(uv) < len)
279 =for apidoc is_utf8_char
281 Tests if some arbitrary number of bytes begins in a valid UTF-8
282 character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
283 character is a valid UTF-8 character. The actual number of bytes in the UTF-8
284 character will be returned if it is valid, otherwise 0.
288 Perl_is_utf8_char(pTHX_ const U8 *s)
290 const STRLEN len = UTF8SKIP(s);
292 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
295 if (IS_UTF8_CHAR_FAST(len))
296 return IS_UTF8_CHAR(s, len) ? len : 0;
297 #endif /* #ifdef IS_UTF8_CHAR */
298 return is_utf8_char_slow(s, len);
303 =for apidoc is_utf8_string
305 Returns true if first C<len> bytes of the given string form a valid
306 UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
307 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
308 because a valid ASCII string is a valid UTF-8 string.
310 See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
316 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
318 const U8* const send = s + (len ? len : strlen((const char *)s));
321 PERL_ARGS_ASSERT_IS_UTF8_STRING;
326 /* Inline the easy bits of is_utf8_char() here for speed... */
327 if (UTF8_IS_INVARIANT(*x))
329 else if (!UTF8_IS_START(*x))
332 /* ... and call is_utf8_char() only if really needed. */
335 if (IS_UTF8_CHAR_FAST(c)) {
336 if (!IS_UTF8_CHAR(x, c))
340 c = is_utf8_char_slow(x, c);
343 #endif /* #ifdef IS_UTF8_CHAR */
358 Implemented as a macro in utf8.h
360 =for apidoc is_utf8_string_loc
362 Like is_utf8_string() but stores the location of the failure (in the
363 case of "utf8ness failure") or the location s+len (in the case of
364 "utf8ness success") in the C<ep>.
366 See also is_utf8_string_loclen() and is_utf8_string().
368 =for apidoc is_utf8_string_loclen
370 Like is_utf8_string() but stores the location of the failure (in the
371 case of "utf8ness failure") or the location s+len (in the case of
372 "utf8ness success") in the C<ep>, and the number of UTF-8
373 encoded characters in the C<el>.
375 See also is_utf8_string_loc() and is_utf8_string().
381 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
383 const U8* const send = s + (len ? len : strlen((const char *)s));
388 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
392 /* Inline the easy bits of is_utf8_char() here for speed... */
393 if (UTF8_IS_INVARIANT(*x))
395 else if (!UTF8_IS_START(*x))
398 /* ... and call is_utf8_char() only if really needed. */
401 if (IS_UTF8_CHAR_FAST(c)) {
402 if (!IS_UTF8_CHAR(x, c))
405 c = is_utf8_char_slow(x, c);
408 #endif /* #ifdef IS_UTF8_CHAR */
427 =for apidoc utf8n_to_uvuni
429 Bottom level UTF-8 decode routine.
430 Returns the Unicode code point value of the first character in the string C<s>
431 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
432 C<retlen> will be set to the length, in bytes, of that character.
434 If C<s> does not point to a well-formed UTF-8 character, the behaviour
435 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
436 it is assumed that the caller will raise a warning, and this function
437 will silently just set C<retlen> to C<-1> and return zero. If the
438 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
439 malformations will be given, C<retlen> will be set to the expected
440 length of the UTF-8 character in bytes, and zero will be returned.
442 The C<flags> can also contain various flags to allow deviations from
443 the strict UTF-8 encoding (see F<utf8.h>).
445 Most code should use utf8_to_uvchr() rather than call this directly.
451 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
454 const U8 * const s0 = s;
457 const bool dowarn = ckWARN_d(WARN_UTF8);
458 const UV startbyte = *s;
459 STRLEN expectlen = 0;
462 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
464 /* This list is a superset of the UTF8_ALLOW_XXX. */
466 #define UTF8_WARN_EMPTY 1
467 #define UTF8_WARN_CONTINUATION 2
468 #define UTF8_WARN_NON_CONTINUATION 3
469 #define UTF8_WARN_FE_FF 4
470 #define UTF8_WARN_SHORT 5
471 #define UTF8_WARN_OVERFLOW 6
472 #define UTF8_WARN_SURROGATE 7
473 #define UTF8_WARN_LONG 8
474 #define UTF8_WARN_FFFF 9 /* Also FFFE. */
477 !(flags & UTF8_ALLOW_EMPTY)) {
478 warning = UTF8_WARN_EMPTY;
482 if (UTF8_IS_INVARIANT(uv)) {
485 return (UV) (NATIVE_TO_UTF(*s));
488 if (UTF8_IS_CONTINUATION(uv) &&
489 !(flags & UTF8_ALLOW_CONTINUATION)) {
490 warning = UTF8_WARN_CONTINUATION;
494 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
495 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
496 warning = UTF8_WARN_NON_CONTINUATION;
501 uv = NATIVE_TO_UTF(uv);
503 if ((uv == 0xfe || uv == 0xff) &&
504 !(flags & UTF8_ALLOW_FE_FF)) {
505 warning = UTF8_WARN_FE_FF;
510 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
511 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
512 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
513 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
515 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
516 else { len = 7; uv &= 0x01; }
518 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
519 else if (!(uv & 0x01)) { len = 7; uv = 0; }
520 else { len = 13; uv = 0; } /* whoa! */
528 if ((curlen < expectlen) &&
529 !(flags & UTF8_ALLOW_SHORT)) {
530 warning = UTF8_WARN_SHORT;
539 if (!UTF8_IS_CONTINUATION(*s) &&
540 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
542 warning = UTF8_WARN_NON_CONTINUATION;
546 uv = UTF8_ACCUMULATE(uv, *s);
548 /* These cannot be allowed. */
550 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
551 warning = UTF8_WARN_LONG;
555 else { /* uv < ouv */
556 /* This cannot be allowed. */
557 warning = UTF8_WARN_OVERFLOW;
565 if (UNICODE_IS_SURROGATE(uv) &&
566 !(flags & UTF8_ALLOW_SURROGATE)) {
567 warning = UTF8_WARN_SURROGATE;
569 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
570 !(flags & UTF8_ALLOW_LONG)) {
571 warning = UTF8_WARN_LONG;
573 } else if (UNICODE_IS_ILLEGAL(uv) &&
574 !(flags & UTF8_ALLOW_FFFF)) {
575 warning = UTF8_WARN_FFFF;
583 if (flags & UTF8_CHECK_ONLY) {
585 *retlen = ((STRLEN) -1);
590 SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
593 case 0: /* Intentionally empty. */ break;
594 case UTF8_WARN_EMPTY:
595 sv_catpvs(sv, "(empty string)");
597 case UTF8_WARN_CONTINUATION:
598 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
600 case UTF8_WARN_NON_CONTINUATION:
602 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
603 (UV)s[1], startbyte);
605 const int len = (int)(s-s0);
606 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
607 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
611 case UTF8_WARN_FE_FF:
612 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
614 case UTF8_WARN_SHORT:
615 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
616 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
617 expectlen = curlen; /* distance for caller to skip */
619 case UTF8_WARN_OVERFLOW:
620 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
623 case UTF8_WARN_SURROGATE:
624 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
627 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
628 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
631 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
634 sv_catpvs(sv, "(unknown reason)");
639 const char * const s = SvPVX_const(sv);
642 Perl_warner(aTHX_ packWARN(WARN_UTF8),
643 "%s in %s", s, OP_DESC(PL_op));
645 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
650 *retlen = expectlen ? expectlen : len;
656 =for apidoc utf8_to_uvchr
658 Returns the native character value of the first character in the string C<s>
659 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
660 length, in bytes, of that character.
662 If C<s> does not point to a well-formed UTF-8 character, zero is
663 returned and retlen is set, if possible, to -1.
669 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
671 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
673 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
674 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
678 =for apidoc utf8_to_uvuni
680 Returns the Unicode code point of the first character in the string C<s>
681 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
682 length, in bytes, of that character.
684 This function should only be used when the returned UV is considered
685 an index into the Unicode semantic tables (e.g. swashes).
687 If C<s> does not point to a well-formed UTF-8 character, zero is
688 returned and retlen is set, if possible, to -1.
694 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
696 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
698 /* Call the low level routine asking for checks */
699 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
700 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
704 =for apidoc utf8_length
706 Return the length of the UTF-8 char encoded string C<s> in characters.
707 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
708 up past C<e>, croaks.
714 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
719 PERL_ARGS_ASSERT_UTF8_LENGTH;
721 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
722 * the bitops (especially ~) can create illegal UTF-8.
723 * In other words: in Perl UTF-8 is not just for Unicode. */
726 goto warn_and_return;
728 if (!UTF8_IS_INVARIANT(*s))
738 if (ckWARN_d(WARN_UTF8)) {
740 Perl_warner(aTHX_ packWARN(WARN_UTF8),
741 "%s in %s", unees, OP_DESC(PL_op));
743 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
751 =for apidoc utf8_distance
753 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
756 WARNING: use only if you *know* that the pointers point inside the
763 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
765 PERL_ARGS_ASSERT_UTF8_DISTANCE;
767 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
773 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
776 WARNING: do not use the following unless you *know* C<off> is within
777 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
778 on the first byte of character or just after the last byte of a character.
784 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
786 PERL_ARGS_ASSERT_UTF8_HOP;
789 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
790 * the bitops (especially ~) can create illegal UTF-8.
791 * In other words: in Perl UTF-8 is not just for Unicode. */
800 while (UTF8_IS_CONTINUATION(*s))
808 =for apidoc utf8_to_bytes
810 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
811 Unlike C<bytes_to_utf8>, this over-writes the original string, and
812 updates len to contain the new length.
813 Returns zero on failure, setting C<len> to -1.
815 If you need a copy of the string, see C<bytes_from_utf8>.
821 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
824 U8 * const send = s + *len;
827 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
829 /* ensure valid UTF-8 and chars < 256 before updating string */
833 if (!UTF8_IS_INVARIANT(c) &&
834 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
835 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
836 *len = ((STRLEN) -1);
844 *d++ = (U8)utf8_to_uvchr(s, &ulen);
853 =for apidoc bytes_from_utf8
855 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
856 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
857 the newly-created string, and updates C<len> to contain the new
858 length. Returns the original string if no conversion occurs, C<len>
859 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
860 0 if C<s> is converted or consisted entirely of characters that are invariant
861 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
867 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
874 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
880 /* ensure valid UTF-8 and chars < 256 before converting string */
881 for (send = s + *len; s < send;) {
883 if (!UTF8_IS_INVARIANT(c)) {
884 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
885 (c = *s++) && UTF8_IS_CONTINUATION(c))
894 Newx(d, (*len) - count + 1, U8);
895 s = start; start = d;
898 if (!UTF8_IS_INVARIANT(c)) {
899 /* Then it is two-byte encoded */
900 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
901 c = ASCII_TO_NATIVE(c);
911 =for apidoc bytes_to_utf8
913 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
914 Returns a pointer to the newly-created string, and sets C<len> to
915 reflect the new length.
917 A NUL character will be written after the end of the string.
919 If you want to convert to UTF-8 from encodings other than
920 the native (Latin1 or EBCDIC),
921 see sv_recode_to_utf8().
927 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
929 const U8 * const send = s + (*len);
933 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
936 Newx(d, (*len) * 2 + 1, U8);
940 const UV uv = NATIVE_TO_ASCII(*s++);
941 if (UNI_IS_INVARIANT(uv))
942 *d++ = (U8)UTF_TO_NATIVE(uv);
944 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
945 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
954 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
956 * Destination must be pre-extended to 3/2 source. Do not use in-place.
957 * We optimize for native, for obvious reasons. */
960 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
965 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
967 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
974 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
979 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
983 *d++ = UNI_TO_NATIVE(uv);
990 *d++ = (U8)(( uv >> 6) | 0xc0);
991 *d++ = (U8)(( uv & 0x3f) | 0x80);
994 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
995 UV low = (p[0] << 8) + p[1];
997 if (low < 0xdc00 || low >= 0xdfff)
998 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
999 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
1002 *d++ = (U8)(( uv >> 12) | 0xe0);
1003 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1004 *d++ = (U8)(( uv & 0x3f) | 0x80);
1008 *d++ = (U8)(( uv >> 18) | 0xf0);
1009 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1010 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1011 *d++ = (U8)(( uv & 0x3f) | 0x80);
1015 *newlen = d - dstart;
1019 /* Note: this one is slightly destructive of the source. */
1022 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1025 U8* const send = s + bytelen;
1027 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1030 const U8 tmp = s[0];
1035 return utf16_to_utf8(p, d, bytelen, newlen);
1038 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1041 Perl_is_uni_alnum(pTHX_ UV c)
1043 U8 tmpbuf[UTF8_MAXBYTES+1];
1044 uvchr_to_utf8(tmpbuf, c);
1045 return is_utf8_alnum(tmpbuf);
1049 Perl_is_uni_alnumc(pTHX_ UV c)
1051 U8 tmpbuf[UTF8_MAXBYTES+1];
1052 uvchr_to_utf8(tmpbuf, c);
1053 return is_utf8_alnumc(tmpbuf);
1057 Perl_is_uni_idfirst(pTHX_ UV c)
1059 U8 tmpbuf[UTF8_MAXBYTES+1];
1060 uvchr_to_utf8(tmpbuf, c);
1061 return is_utf8_idfirst(tmpbuf);
1065 Perl_is_uni_alpha(pTHX_ UV c)
1067 U8 tmpbuf[UTF8_MAXBYTES+1];
1068 uvchr_to_utf8(tmpbuf, c);
1069 return is_utf8_alpha(tmpbuf);
1073 Perl_is_uni_ascii(pTHX_ UV c)
1075 U8 tmpbuf[UTF8_MAXBYTES+1];
1076 uvchr_to_utf8(tmpbuf, c);
1077 return is_utf8_ascii(tmpbuf);
1081 Perl_is_uni_space(pTHX_ UV c)
1083 U8 tmpbuf[UTF8_MAXBYTES+1];
1084 uvchr_to_utf8(tmpbuf, c);
1085 return is_utf8_space(tmpbuf);
1089 Perl_is_uni_digit(pTHX_ UV c)
1091 U8 tmpbuf[UTF8_MAXBYTES+1];
1092 uvchr_to_utf8(tmpbuf, c);
1093 return is_utf8_digit(tmpbuf);
1097 Perl_is_uni_upper(pTHX_ UV c)
1099 U8 tmpbuf[UTF8_MAXBYTES+1];
1100 uvchr_to_utf8(tmpbuf, c);
1101 return is_utf8_upper(tmpbuf);
1105 Perl_is_uni_lower(pTHX_ UV c)
1107 U8 tmpbuf[UTF8_MAXBYTES+1];
1108 uvchr_to_utf8(tmpbuf, c);
1109 return is_utf8_lower(tmpbuf);
1113 Perl_is_uni_cntrl(pTHX_ UV c)
1115 U8 tmpbuf[UTF8_MAXBYTES+1];
1116 uvchr_to_utf8(tmpbuf, c);
1117 return is_utf8_cntrl(tmpbuf);
1121 Perl_is_uni_graph(pTHX_ UV c)
1123 U8 tmpbuf[UTF8_MAXBYTES+1];
1124 uvchr_to_utf8(tmpbuf, c);
1125 return is_utf8_graph(tmpbuf);
1129 Perl_is_uni_print(pTHX_ UV c)
1131 U8 tmpbuf[UTF8_MAXBYTES+1];
1132 uvchr_to_utf8(tmpbuf, c);
1133 return is_utf8_print(tmpbuf);
1137 Perl_is_uni_punct(pTHX_ UV c)
1139 U8 tmpbuf[UTF8_MAXBYTES+1];
1140 uvchr_to_utf8(tmpbuf, c);
1141 return is_utf8_punct(tmpbuf);
1145 Perl_is_uni_xdigit(pTHX_ UV c)
1147 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1148 uvchr_to_utf8(tmpbuf, c);
1149 return is_utf8_xdigit(tmpbuf);
1153 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1155 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1157 uvchr_to_utf8(p, c);
1158 return to_utf8_upper(p, p, lenp);
1162 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1164 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1166 uvchr_to_utf8(p, c);
1167 return to_utf8_title(p, p, lenp);
1171 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1173 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1175 uvchr_to_utf8(p, c);
1176 return to_utf8_lower(p, p, lenp);
1180 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1182 PERL_ARGS_ASSERT_TO_UNI_FOLD;
1184 uvchr_to_utf8(p, c);
1185 return to_utf8_fold(p, p, lenp);
1188 /* for now these all assume no locale info available for Unicode > 255 */
1191 Perl_is_uni_alnum_lc(pTHX_ UV c)
1193 return is_uni_alnum(c); /* XXX no locale support yet */
1197 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1199 return is_uni_alnumc(c); /* XXX no locale support yet */
1203 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1205 return is_uni_idfirst(c); /* XXX no locale support yet */
1209 Perl_is_uni_alpha_lc(pTHX_ UV c)
1211 return is_uni_alpha(c); /* XXX no locale support yet */
1215 Perl_is_uni_ascii_lc(pTHX_ UV c)
1217 return is_uni_ascii(c); /* XXX no locale support yet */
1221 Perl_is_uni_space_lc(pTHX_ UV c)
1223 return is_uni_space(c); /* XXX no locale support yet */
1227 Perl_is_uni_digit_lc(pTHX_ UV c)
1229 return is_uni_digit(c); /* XXX no locale support yet */
1233 Perl_is_uni_upper_lc(pTHX_ UV c)
1235 return is_uni_upper(c); /* XXX no locale support yet */
1239 Perl_is_uni_lower_lc(pTHX_ UV c)
1241 return is_uni_lower(c); /* XXX no locale support yet */
1245 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1247 return is_uni_cntrl(c); /* XXX no locale support yet */
1251 Perl_is_uni_graph_lc(pTHX_ UV c)
1253 return is_uni_graph(c); /* XXX no locale support yet */
1257 Perl_is_uni_print_lc(pTHX_ UV c)
1259 return is_uni_print(c); /* XXX no locale support yet */
1263 Perl_is_uni_punct_lc(pTHX_ UV c)
1265 return is_uni_punct(c); /* XXX no locale support yet */
1269 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1271 return is_uni_xdigit(c); /* XXX no locale support yet */
1275 Perl_to_uni_upper_lc(pTHX_ U32 c)
1277 /* XXX returns only the first character -- do not use XXX */
1278 /* XXX no locale support yet */
1280 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1281 return (U32)to_uni_upper(c, tmpbuf, &len);
1285 Perl_to_uni_title_lc(pTHX_ U32 c)
1287 /* XXX returns only the first character XXX -- do not use XXX */
1288 /* XXX no locale support yet */
1290 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1291 return (U32)to_uni_title(c, tmpbuf, &len);
1295 Perl_to_uni_lower_lc(pTHX_ U32 c)
1297 /* XXX returns only the first character -- do not use XXX */
1298 /* XXX no locale support yet */
1300 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1301 return (U32)to_uni_lower(c, tmpbuf, &len);
1305 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1306 const char *const swashname)
1310 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1312 if (!is_utf8_char(p))
1315 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1316 return swash_fetch(*swash, p, TRUE) != 0;
1320 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1324 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1326 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1327 * descendant of isalnum(3), in other words, it doesn't
1328 * contain the '_'. --jhi */
1329 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1333 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1337 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1339 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1343 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1347 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1351 /* is_utf8_idstart would be more logical. */
1352 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1356 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1360 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1364 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1368 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1372 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1374 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1378 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1382 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1384 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1388 Perl_is_utf8_space(pTHX_ const U8 *p)
1392 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1394 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1398 Perl_is_utf8_digit(pTHX_ const U8 *p)
1402 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1404 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1408 Perl_is_utf8_upper(pTHX_ const U8 *p)
1412 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1414 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1418 Perl_is_utf8_lower(pTHX_ const U8 *p)
1422 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1424 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1428 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1432 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1434 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1438 Perl_is_utf8_graph(pTHX_ const U8 *p)
1442 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1444 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1448 Perl_is_utf8_print(pTHX_ const U8 *p)
1452 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1454 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1458 Perl_is_utf8_punct(pTHX_ const U8 *p)
1462 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1464 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1468 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1472 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1474 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1478 Perl_is_utf8_mark(pTHX_ const U8 *p)
1482 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1484 return is_utf8_common(p, &PL_utf8_mark, "IsM");
1488 =for apidoc to_utf8_case
1490 The "p" contains the pointer to the UTF-8 string encoding
1491 the character that is being converted.
1493 The "ustrp" is a pointer to the character buffer to put the
1494 conversion result to. The "lenp" is a pointer to the length
1497 The "swashp" is a pointer to the swash to use.
1499 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1500 and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
1501 but not always, a multicharacter mapping), is tried first.
1503 The "special" is a string like "utf8::ToSpecLower", which means the
1504 hash %utf8::ToSpecLower. The access to the hash is through
1505 Perl_to_utf8_case().
1507 The "normal" is a string like "ToLower" which means the swash
1513 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1514 SV **swashp, const char *normal, const char *special)
1517 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1519 const UV uv0 = utf8_to_uvchr(p, NULL);
1520 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1521 * are necessary in EBCDIC, they are redundant no-ops
1522 * in ASCII-ish platforms, and hopefully optimized away. */
1523 const UV uv1 = NATIVE_TO_UNI(uv0);
1525 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1527 uvuni_to_utf8(tmpbuf, uv1);
1529 if (!*swashp) /* load on-demand */
1530 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1532 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1533 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1534 /* It might be "special" (sometimes, but not always,
1535 * a multicharacter mapping) */
1536 HV * const hv = get_hv(special, 0);
1540 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1544 s = SvPV_const(*svp, len);
1546 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1549 /* If we have EBCDIC we need to remap the characters
1550 * since any characters in the low 256 are Unicode
1551 * code points, not EBCDIC. */
1552 U8 *t = (U8*)s, *tend = t + len, *d;
1559 const UV c = utf8_to_uvchr(t, &tlen);
1561 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1570 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1575 Copy(tmpbuf, ustrp, len, U8);
1577 Copy(s, ustrp, len, U8);
1583 if (!len && *swashp) {
1584 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1587 /* It was "normal" (a single character mapping). */
1588 const UV uv3 = UNI_TO_NATIVE(uv2);
1589 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1593 if (!len) /* Neither: just copy. */
1594 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1599 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1603 =for apidoc to_utf8_upper
1605 Convert the UTF-8 encoded character at p to its uppercase version and
1606 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1607 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1608 the uppercase version may be longer than the original character.
1610 The first character of the uppercased version is returned
1611 (but note, as explained above, that there may be more.)
1616 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1620 PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1622 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1623 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1627 =for apidoc to_utf8_title
1629 Convert the UTF-8 encoded character at p to its titlecase version and
1630 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1631 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1632 titlecase version may be longer than the original character.
1634 The first character of the titlecased version is returned
1635 (but note, as explained above, that there may be more.)
1640 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1644 PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1646 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1647 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1651 =for apidoc to_utf8_lower
1653 Convert the UTF-8 encoded character at p to its lowercase version and
1654 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1655 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1656 lowercase version may be longer than the original character.
1658 The first character of the lowercased version is returned
1659 (but note, as explained above, that there may be more.)
1664 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1668 PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1670 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1671 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1675 =for apidoc to_utf8_fold
1677 Convert the UTF-8 encoded character at p to its foldcase version and
1678 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1679 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1680 foldcase version may be longer than the original character (up to
1683 The first character of the foldcased version is returned
1684 (but note, as explained above, that there may be more.)
1689 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1693 PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1695 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1696 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1700 * A "swash" is a swatch hash.
1701 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1702 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1703 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1706 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1711 const size_t pkg_len = strlen(pkg);
1712 const size_t name_len = strlen(name);
1713 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1716 PERL_ARGS_ASSERT_SWASH_INIT;
1718 PUSHSTACKi(PERLSI_MAGIC);
1723 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1725 errsv_save = newSVsv(ERRSV);
1726 /* It is assumed that callers of this routine are not passing in any
1727 user derived data. */
1728 /* Need to do this after save_re_context() as it will set PL_tainted to
1729 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1730 Even line to create errsv_save can turn on PL_tainted. */
1731 SAVEBOOL(PL_tainted);
1733 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1736 sv_setsv(ERRSV, errsv_save);
1737 SvREFCNT_dec(errsv_save);
1743 mPUSHp(pkg, pkg_len);
1744 mPUSHp(name, name_len);
1749 errsv_save = newSVsv(ERRSV);
1750 if (call_method("SWASHNEW", G_SCALAR))
1751 retval = newSVsv(*PL_stack_sp--);
1753 retval = &PL_sv_undef;
1755 sv_setsv(ERRSV, errsv_save);
1756 SvREFCNT_dec(errsv_save);
1759 if (IN_PERL_COMPILETIME) {
1760 CopHINTS_set(PL_curcop, PL_hints);
1762 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1764 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1766 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1772 /* This API is wrong for special case conversions since we may need to
1773 * return several Unicode characters for a single Unicode character
1774 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1775 * the lower-level routine, and it is similarly broken for returning
1776 * multiple values. --jhi */
1777 /* Now SWASHGET is recasted into S_swash_get in this file. */
1780 * Returns the value of property/mapping C<swash> for the first character
1781 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1782 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1783 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1786 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1789 HV *const hv = MUTABLE_HV(SvRV(swash));
1794 const U8 *tmps = NULL;
1798 const UV c = NATIVE_TO_ASCII(*ptr);
1800 PERL_ARGS_ASSERT_SWASH_FETCH;
1802 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1803 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1804 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1807 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1808 * then the "swatch" is a vec() for al the chars which start
1810 * So the key in the hash (klen) is length of encoded char -1
1812 klen = UTF8SKIP(ptr) - 1;
1816 /* If char in invariant then swatch is for all the invariant chars
1817 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1819 needents = UTF_CONTINUATION_MARK;
1820 off = NATIVE_TO_UTF(ptr[klen]);
1823 /* If char is encoded then swatch is for the prefix */
1824 needents = (1 << UTF_ACCUMULATION_SHIFT);
1825 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1829 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1830 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1831 * it's nothing to sniff at.) Pity we usually come through at least
1832 * two function calls to get here...
1834 * NB: this code assumes that swatches are never modified, once generated!
1837 if (hv == PL_last_swash_hv &&
1838 klen == PL_last_swash_klen &&
1839 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1841 tmps = PL_last_swash_tmps;
1842 slen = PL_last_swash_slen;
1845 /* Try our second-level swatch cache, kept in a hash. */
1846 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1848 /* If not cached, generate it via swash_get */
1849 if (!svp || !SvPOK(*svp)
1850 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1851 /* We use utf8n_to_uvuni() as we want an index into
1852 Unicode tables, not a native character number.
1854 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1856 0 : UTF8_ALLOW_ANY);
1857 swatch = swash_get(swash,
1858 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1859 (klen) ? (code_point & ~(needents - 1)) : 0,
1862 if (IN_PERL_COMPILETIME)
1863 CopHINTS_set(PL_curcop, PL_hints);
1865 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1867 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1868 || (slen << 3) < needents)
1869 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1872 PL_last_swash_hv = hv;
1873 assert(klen <= sizeof(PL_last_swash_key));
1874 PL_last_swash_klen = (U8)klen;
1875 /* FIXME change interpvar.h? */
1876 PL_last_swash_tmps = (U8 *) tmps;
1877 PL_last_swash_slen = slen;
1879 Copy(ptr, PL_last_swash_key, klen, U8);
1882 switch ((int)((slen << 3) / needents)) {
1884 bit = 1 << (off & 7);
1886 return (tmps[off] & bit) != 0;
1891 return (tmps[off] << 8) + tmps[off + 1] ;
1894 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1896 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1897 NORETURN_FUNCTION_END;
1901 * Returns a swatch (a bit vector string) for a code point sequence
1902 * that starts from the value C<start> and comprises the number C<span>.
1903 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1904 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1907 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1910 U8 *l, *lend, *x, *xend, *s;
1911 STRLEN lcur, xcur, scur;
1912 HV *const hv = MUTABLE_HV(SvRV(swash));
1913 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1914 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1915 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1916 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1917 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1918 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1919 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1920 const STRLEN bits = SvUV(*bitssvp);
1921 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1922 const UV none = SvUV(*nonesvp);
1923 const UV end = start + span;
1925 PERL_ARGS_ASSERT_SWASH_GET;
1927 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1928 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1932 /* create and initialize $swatch */
1933 scur = octets ? (span * octets) : (span + 7) / 8;
1934 swatch = newSV(scur);
1936 s = (U8*)SvPVX(swatch);
1937 if (octets && none) {
1938 const U8* const e = s + scur;
1941 *s++ = (U8)(none & 0xff);
1942 else if (bits == 16) {
1943 *s++ = (U8)((none >> 8) & 0xff);
1944 *s++ = (U8)( none & 0xff);
1946 else if (bits == 32) {
1947 *s++ = (U8)((none >> 24) & 0xff);
1948 *s++ = (U8)((none >> 16) & 0xff);
1949 *s++ = (U8)((none >> 8) & 0xff);
1950 *s++ = (U8)( none & 0xff);
1956 (void)memzero((U8*)s, scur + 1);
1958 SvCUR_set(swatch, scur);
1959 s = (U8*)SvPVX(swatch);
1961 /* read $swash->{LIST} */
1962 l = (U8*)SvPV(*listsvp, lcur);
1967 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1969 U8* const nl = (U8*)memchr(l, '\n', lend - l);
1972 min = grok_hex((char *)l, &numlen, &flags, NULL);
1976 l = nl + 1; /* 1 is length of "\n" */
1980 l = lend; /* to LIST's end at which \n is not found */
1986 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1988 max = grok_hex((char *)l, &numlen, &flags, NULL);
1997 flags = PERL_SCAN_SILENT_ILLDIGIT |
1998 PERL_SCAN_DISALLOW_PREFIX;
2000 val = grok_hex((char *)l, &numlen, &flags, NULL);
2009 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2015 val = 0; /* bits == 1, then val should be ignored */
2022 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2026 val = 0; /* bits == 1, then val should be ignored */
2040 if (!none || val < none) {
2045 for (key = min; key <= max; key++) {
2049 /* offset must be non-negative (start <= min <= key < end) */
2050 offset = octets * (key - start);
2052 s[offset] = (U8)(val & 0xff);
2053 else if (bits == 16) {
2054 s[offset ] = (U8)((val >> 8) & 0xff);
2055 s[offset + 1] = (U8)( val & 0xff);
2057 else if (bits == 32) {
2058 s[offset ] = (U8)((val >> 24) & 0xff);
2059 s[offset + 1] = (U8)((val >> 16) & 0xff);
2060 s[offset + 2] = (U8)((val >> 8) & 0xff);
2061 s[offset + 3] = (U8)( val & 0xff);
2064 if (!none || val < none)
2068 else { /* bits == 1, then val should be ignored */
2072 for (key = min; key <= max; key++) {
2073 const STRLEN offset = (STRLEN)(key - start);
2076 s[offset >> 3] |= 1 << (offset & 7);
2082 /* read $swash->{EXTRAS} */
2083 x = (U8*)SvPV(*extssvp, xcur);
2091 SV **otherbitssvp, *other;
2095 const U8 opc = *x++;
2099 nl = (U8*)memchr(x, '\n', xend - x);
2101 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2103 x = nl + 1; /* 1 is length of "\n" */
2107 x = xend; /* to EXTRAS' end at which \n is not found */
2114 namelen = nl - namestr;
2118 namelen = xend - namestr;
2122 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2123 otherhv = MUTABLE_HV(SvRV(*othersvp));
2124 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2125 otherbits = (STRLEN)SvUV(*otherbitssvp);
2126 if (bits < otherbits)
2127 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2129 /* The "other" swatch must be destroyed after. */
2130 other = swash_get(*othersvp, start, span);
2131 o = (U8*)SvPV(other, olen);
2134 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2136 s = (U8*)SvPV(swatch, slen);
2137 if (bits == 1 && otherbits == 1) {
2139 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2163 STRLEN otheroctets = otherbits >> 3;
2165 U8* const send = s + slen;
2170 if (otherbits == 1) {
2171 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2175 STRLEN vlen = otheroctets;
2183 if (opc == '+' && otherval)
2184 NOOP; /* replace with otherval */
2185 else if (opc == '!' && !otherval)
2187 else if (opc == '-' && otherval)
2189 else if (opc == '&' && !otherval)
2192 s += octets; /* no replacement */
2197 *s++ = (U8)( otherval & 0xff);
2198 else if (bits == 16) {
2199 *s++ = (U8)((otherval >> 8) & 0xff);
2200 *s++ = (U8)( otherval & 0xff);
2202 else if (bits == 32) {
2203 *s++ = (U8)((otherval >> 24) & 0xff);
2204 *s++ = (U8)((otherval >> 16) & 0xff);
2205 *s++ = (U8)((otherval >> 8) & 0xff);
2206 *s++ = (U8)( otherval & 0xff);
2210 sv_free(other); /* through with it! */
2216 =for apidoc uvchr_to_utf8
2218 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2219 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2220 bytes available. The return value is the pointer to the byte after the
2221 end of the new character. In other words,
2223 d = uvchr_to_utf8(d, uv);
2225 is the recommended wide native character-aware way of saying
2232 /* On ASCII machines this is normally a macro but we want a
2233 real function in case XS code wants it
2236 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2238 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2240 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2244 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2246 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2248 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2252 =for apidoc utf8n_to_uvchr
2255 Returns the native character value of the first character in the string
2257 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2258 length, in bytes, of that character.
2260 Allows length and flags to be passed to low level routine.
2264 /* On ASCII machines this is normally a macro but we want
2265 a real function in case XS code wants it
2268 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2271 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2273 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2275 return UNI_TO_NATIVE(uv);
2279 =for apidoc pv_uni_display
2281 Build to the scalar dsv a displayable version of the string spv,
2282 length len, the displayable version being at most pvlim bytes long
2283 (if longer, the rest is truncated and "..." will be appended).
2285 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2286 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2287 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2288 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2289 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2290 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2292 The pointer to the PV of the dsv is returned.
2296 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2301 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2305 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2307 /* This serves double duty as a flag and a character to print after
2308 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2312 if (pvlim && SvCUR(dsv) >= pvlim) {
2316 u = utf8_to_uvchr((U8*)s, 0);
2318 const unsigned char c = (unsigned char)u & 0xFF;
2319 if (flags & UNI_DISPLAY_BACKSLASH) {
2336 const char string = ok;
2337 sv_catpvs(dsv, "\\");
2338 sv_catpvn(dsv, &string, 1);
2341 /* isPRINT() is the locale-blind version. */
2342 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2343 const char string = c;
2344 sv_catpvn(dsv, &string, 1);
2349 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2352 sv_catpvs(dsv, "...");
2358 =for apidoc sv_uni_display
2360 Build to the scalar dsv a displayable version of the scalar sv,
2361 the displayable version being at most pvlim bytes long
2362 (if longer, the rest is truncated and "..." will be appended).
2364 The flags argument is as in pv_uni_display().
2366 The pointer to the PV of the dsv is returned.
2371 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2373 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2375 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2376 SvCUR(ssv), pvlim, flags);
2380 =for apidoc ibcmp_utf8
2382 Return true if the strings s1 and s2 differ case-insensitively, false
2383 if not (if they are equal case-insensitively). If u1 is true, the
2384 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
2385 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2386 are false, the respective string is assumed to be in native 8-bit
2389 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2390 in there (they will point at the beginning of the I<next> character).
2391 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2392 pointers beyond which scanning will not continue under any
2393 circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
2394 s2+l2 will be used as goal end pointers that will also stop the scan,
2395 and which qualify towards defining a successful match: all the scans
2396 that define an explicit length must reach their goal pointers for
2397 a match to succeed).
2399 For case-insensitiveness, the "casefolding" of Unicode is used
2400 instead of upper/lowercasing both the characters, see
2401 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2405 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2408 register const U8 *p1 = (const U8*)s1;
2409 register const U8 *p2 = (const U8*)s2;
2410 register const U8 *f1 = NULL;
2411 register const U8 *f2 = NULL;
2412 register U8 *e1 = NULL;
2413 register U8 *q1 = NULL;
2414 register U8 *e2 = NULL;
2415 register U8 *q2 = NULL;
2416 STRLEN n1 = 0, n2 = 0;
2417 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2418 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2420 STRLEN foldlen1, foldlen2;
2423 PERL_ARGS_ASSERT_IBCMP_UTF8;
2427 /* assert(e1 || l1); */
2428 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2429 f1 = (const U8*)s1 + l1;
2432 /* assert(e2 || l2); */
2433 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2434 f2 = (const U8*)s2 + l2;
2436 /* This shouldn't happen. However, putting an assert() there makes some
2438 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2439 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2440 return 1; /* mismatch; possible infinite loop or false positive */
2443 natbuf[1] = 0; /* Need to terminate the buffer. */
2445 while ((e1 == 0 || p1 < e1) &&
2446 (f1 == 0 || p1 < f1) &&
2447 (e2 == 0 || p2 < e2) &&
2448 (f2 == 0 || p2 < f2)) {
2451 to_utf8_fold(p1, foldbuf1, &foldlen1);
2453 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2454 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2461 to_utf8_fold(p2, foldbuf2, &foldlen2);
2463 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2464 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2470 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2471 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2472 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2473 return 1; /* mismatch */
2480 p1 += u1 ? UTF8SKIP(p1) : 1;
2482 p2 += u2 ? UTF8SKIP(p2) : 1;
2486 /* A match is defined by all the scans that specified
2487 * an explicit length reaching their final goals. */
2488 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2497 return match ? 0 : 1; /* 0 match, 1 mismatch */
2502 * c-indentation-style: bsd
2504 * indent-tabs-mode: t
2507 * ex: set ts=8 sts=4 sw=4 noet: