3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
25 #define PERL_IN_UTF8_C
28 static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
32 =head1 Unicode Support
34 This file contains various utility functions for manipulating UTF8-encoded
35 strings. For the uninitiated, this is a method of representing arbitrary
36 Unicode characters as a variable number of bytes, in such a way that
37 characters in the ASCII range are unmodified, and a zero byte never appears
38 within non-zero characters.
40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
44 bytes available. The return value is the pointer to the byte after the
45 end of the new character. In other words,
47 d = uvuni_to_utf8_flags(d, uv, flags);
51 d = uvuni_to_utf8(d, uv);
53 (which is equivalent to)
55 d = uvuni_to_utf8_flags(d, uv, 0);
57 is the recommended Unicode-aware way of saying
65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
67 if (ckWARN(WARN_UTF8)) {
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
78 * FFFEs and FFFFs beyond 0x10FFFF. */
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
83 "Unicode character 0x%04"UVxf" is illegal", uv);
85 if (UNI_IS_INVARIANT(uv)) {
86 *d++ = (U8)UTF_TO_NATIVE(uv);
91 STRLEN len = UNISKIP(uv);
94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
95 uv >>= UTF_ACCUMULATION_SHIFT;
97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
100 #else /* Non loop style */
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
119 if (uv < 0x4000000) {
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
127 if (uv < 0x80000000) {
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
137 if (uv < UTF8_QUAD_MAX)
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
167 #endif /* Loop style */
171 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
178 Tests if some arbitrary number of bytes begins in a valid UTF-8
179 character. Note that an INVARIANT (i.e. ASCII) character is a valid
180 UTF-8 character. The actual number of bytes in the UTF-8 character
181 will be returned if it is valid, otherwise 0.
183 This is the "slow" version as opposed to the "fast" version which is
184 the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
185 difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
186 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187 you should use the _slow(). In practice this means that the _slow()
188 will be used very rarely, since the maximum Unicode code point (as of
189 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
190 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
195 S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
201 if (UTF8_IS_INVARIANT(u))
204 if (!UTF8_IS_START(u))
207 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
213 u = NATIVE_TO_UTF(u);
215 u &= UTF_START_MASK(len);
219 if (!UTF8_IS_CONTINUATION(*s))
221 uv = UTF8_ACCUMULATE(uv, *s);
228 if ((STRLEN)UNISKIP(uv) < len)
235 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
237 Tests if some arbitrary number of bytes begins in a valid UTF-8
238 character. Note that an INVARIANT (i.e. ASCII) character is a valid
239 UTF-8 character. The actual number of bytes in the UTF-8 character
240 will be returned if it is valid, otherwise 0.
244 Perl_is_utf8_char(pTHX_ const U8 *s)
246 const STRLEN len = UTF8SKIP(s);
248 if (IS_UTF8_CHAR_FAST(len))
249 return IS_UTF8_CHAR(s, len) ? len : 0;
250 #endif /* #ifdef IS_UTF8_CHAR */
251 return is_utf8_char_slow(s, len);
255 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
257 Returns true if first C<len> bytes of the given string form a valid
258 UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
259 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
260 because a valid ASCII string is a valid UTF-8 string.
262 See also is_utf8_string_loclen() and is_utf8_string_loc().
268 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
274 len = strlen((const char *)s);
279 /* Inline the easy bits of is_utf8_char() here for speed... */
280 if (UTF8_IS_INVARIANT(*x))
282 else if (!UTF8_IS_START(*x))
285 /* ... and call is_utf8_char() only if really needed. */
288 if (IS_UTF8_CHAR_FAST(c)) {
289 if (!IS_UTF8_CHAR(x, c))
291 } else if (!is_utf8_char_slow(x, c))
295 #endif /* #ifdef IS_UTF8_CHAR */
310 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
312 Like is_ut8_string() but stores the location of the failure (in the
313 case of "utf8ness failure") or the location s+len (in the case of
314 "utf8ness success") in the C<ep>, and the number of UTF-8
315 encoded characters in the C<el>.
317 See also is_utf8_string_loc() and is_utf8_string().
323 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
330 len = strlen((const char *)s);
336 /* Inline the easy bits of is_utf8_char() here for speed... */
337 if (UTF8_IS_INVARIANT(*x))
339 else if (!UTF8_IS_START(*x))
342 /* ... and call is_utf8_char() only if really needed. */
345 if (IS_UTF8_CHAR_FAST(c)) {
346 if (!IS_UTF8_CHAR(x, c))
349 c = is_utf8_char_slow(x, c);
352 #endif /* #ifdef IS_UTF8_CHAR */
371 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
373 Like is_ut8_string() but stores the location of the failure (in the
374 case of "utf8ness failure") or the location s+len (in the case of
375 "utf8ness success") in the C<ep>.
377 See also is_utf8_string_loclen() and is_utf8_string().
383 Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep)
385 return is_utf8_string_loclen(s, len, ep, 0);
389 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
391 Bottom level UTF-8 decode routine.
392 Returns the unicode code point value of the first character in the string C<s>
393 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
394 C<retlen> will be set to the length, in bytes, of that character.
396 If C<s> does not point to a well-formed UTF-8 character, the behaviour
397 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
398 it is assumed that the caller will raise a warning, and this function
399 will silently just set C<retlen> to C<-1> and return zero. If the
400 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
401 malformations will be given, C<retlen> will be set to the expected
402 length of the UTF-8 character in bytes, and zero will be returned.
404 The C<flags> can also contain various flags to allow deviations from
405 the strict UTF-8 encoding (see F<utf8.h>).
407 Most code should use utf8_to_uvchr() rather than call this directly.
413 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
418 const bool dowarn = ckWARN_d(WARN_UTF8);
419 const UV startbyte = *s;
420 STRLEN expectlen = 0;
423 /* This list is a superset of the UTF8_ALLOW_XXX. */
425 #define UTF8_WARN_EMPTY 1
426 #define UTF8_WARN_CONTINUATION 2
427 #define UTF8_WARN_NON_CONTINUATION 3
428 #define UTF8_WARN_FE_FF 4
429 #define UTF8_WARN_SHORT 5
430 #define UTF8_WARN_OVERFLOW 6
431 #define UTF8_WARN_SURROGATE 7
432 #define UTF8_WARN_LONG 8
433 #define UTF8_WARN_FFFF 9 /* Also FFFE. */
436 !(flags & UTF8_ALLOW_EMPTY)) {
437 warning = UTF8_WARN_EMPTY;
441 if (UTF8_IS_INVARIANT(uv)) {
444 return (UV) (NATIVE_TO_UTF(*s));
447 if (UTF8_IS_CONTINUATION(uv) &&
448 !(flags & UTF8_ALLOW_CONTINUATION)) {
449 warning = UTF8_WARN_CONTINUATION;
453 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
454 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
455 warning = UTF8_WARN_NON_CONTINUATION;
460 uv = NATIVE_TO_UTF(uv);
462 if ((uv == 0xfe || uv == 0xff) &&
463 !(flags & UTF8_ALLOW_FE_FF)) {
464 warning = UTF8_WARN_FE_FF;
469 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
470 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
471 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
472 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
474 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
475 else { len = 7; uv &= 0x01; }
477 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
478 else if (!(uv & 0x01)) { len = 7; uv = 0; }
479 else { len = 13; uv = 0; } /* whoa! */
487 if ((curlen < expectlen) &&
488 !(flags & UTF8_ALLOW_SHORT)) {
489 warning = UTF8_WARN_SHORT;
498 if (!UTF8_IS_CONTINUATION(*s) &&
499 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
501 warning = UTF8_WARN_NON_CONTINUATION;
505 uv = UTF8_ACCUMULATE(uv, *s);
507 /* These cannot be allowed. */
509 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
510 warning = UTF8_WARN_LONG;
514 else { /* uv < ouv */
515 /* This cannot be allowed. */
516 warning = UTF8_WARN_OVERFLOW;
524 if (UNICODE_IS_SURROGATE(uv) &&
525 !(flags & UTF8_ALLOW_SURROGATE)) {
526 warning = UTF8_WARN_SURROGATE;
528 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
529 !(flags & UTF8_ALLOW_LONG)) {
530 warning = UTF8_WARN_LONG;
532 } else if (UNICODE_IS_ILLEGAL(uv) &&
533 !(flags & UTF8_ALLOW_FFFF)) {
534 warning = UTF8_WARN_FFFF;
542 if (flags & UTF8_CHECK_ONLY) {
549 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
552 case 0: /* Intentionally empty. */ break;
553 case UTF8_WARN_EMPTY:
554 Perl_sv_catpv(aTHX_ sv, "(empty string)");
556 case UTF8_WARN_CONTINUATION:
557 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
559 case UTF8_WARN_NON_CONTINUATION:
561 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
562 (UV)s[1], startbyte);
564 const int len = (int)(s-s0);
565 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
566 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
570 case UTF8_WARN_FE_FF:
571 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
573 case UTF8_WARN_SHORT:
574 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
575 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
576 expectlen = curlen; /* distance for caller to skip */
578 case UTF8_WARN_OVERFLOW:
579 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
582 case UTF8_WARN_SURROGATE:
583 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
586 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
587 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
590 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
593 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
598 const char * const s = SvPVX_const(sv);
601 Perl_warner(aTHX_ packWARN(WARN_UTF8),
602 "%s in %s", s, OP_DESC(PL_op));
604 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
609 *retlen = expectlen ? expectlen : len;
615 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
617 Returns the native character value of the first character in the string C<s>
618 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
619 length, in bytes, of that character.
621 If C<s> does not point to a well-formed UTF-8 character, zero is
622 returned and retlen is set, if possible, to -1.
628 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
630 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
631 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
635 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
637 Returns the Unicode code point of the first character in the string C<s>
638 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
639 length, in bytes, of that character.
641 This function should only be used when returned UV is considered
642 an index into the Unicode semantic tables (e.g. swashes).
644 If C<s> does not point to a well-formed UTF-8 character, zero is
645 returned and retlen is set, if possible, to -1.
651 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
653 /* Call the low level routine asking for checks */
654 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
655 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
659 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
661 Return the length of the UTF-8 char encoded string C<s> in characters.
662 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
663 up past C<e>, croaks.
669 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
673 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
674 * the bitops (especially ~) can create illegal UTF-8.
675 * In other words: in Perl UTF-8 is not just for Unicode. */
678 goto warn_and_return;
680 const U8 t = UTF8SKIP(s);
683 if (ckWARN_d(WARN_UTF8)) {
685 Perl_warner(aTHX_ packWARN(WARN_UTF8),
686 "%s in %s", unees, OP_DESC(PL_op));
688 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
700 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
702 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
705 WARNING: use only if you *know* that the pointers point inside the
712 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
716 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
717 * the bitops (especially ~) can create illegal UTF-8.
718 * In other words: in Perl UTF-8 is not just for Unicode. */
722 const U8 c = UTF8SKIP(a);
724 goto warn_and_return;
731 const U8 c = UTF8SKIP(b);
735 if (ckWARN_d(WARN_UTF8)) {
737 Perl_warner(aTHX_ packWARN(WARN_UTF8),
738 "%s in %s", unees, OP_DESC(PL_op));
740 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
753 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
755 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
758 WARNING: do not use the following unless you *know* C<off> is within
759 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
760 on the first byte of character or just after the last byte of a character.
766 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
768 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
769 * the bitops (especially ~) can create illegal UTF-8.
770 * In other words: in Perl UTF-8 is not just for Unicode. */
779 while (UTF8_IS_CONTINUATION(*s))
787 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
789 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
790 Unlike C<bytes_to_utf8>, this over-writes the original string, and
791 updates len to contain the new length.
792 Returns zero on failure, setting C<len> to -1.
798 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
804 /* ensure valid UTF-8 and chars < 256 before updating string */
805 for (send = s + *len; s < send; ) {
808 if (!UTF8_IS_INVARIANT(c) &&
809 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
810 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
819 *d++ = (U8)utf8_to_uvchr(s, &ulen);
828 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
830 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
831 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
832 the newly-created string, and updates C<len> to contain the new
833 length. Returns the original string if no conversion occurs, C<len>
834 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
835 0 if C<s> is converted or contains all 7bit characters.
841 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
851 /* ensure valid UTF-8 and chars < 256 before converting string */
852 for (send = s + *len; s < send;) {
854 if (!UTF8_IS_INVARIANT(c)) {
855 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
856 (c = *s++) && UTF8_IS_CONTINUATION(c))
865 Newxz(d, (*len) - count + 1, U8);
866 s = start; start = d;
869 if (!UTF8_IS_INVARIANT(c)) {
870 /* Then it is two-byte encoded */
871 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
872 c = ASCII_TO_NATIVE(c);
882 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
884 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
885 Returns a pointer to the newly-created string, and sets C<len> to
886 reflect the new length.
888 If you want to convert to UTF-8 from other encodings than ASCII,
889 see sv_recode_to_utf8().
895 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
897 const U8 * const send = s + (*len);
901 Newxz(d, (*len) * 2 + 1, U8);
905 const UV uv = NATIVE_TO_ASCII(*s++);
906 if (UNI_IS_INVARIANT(uv))
907 *d++ = (U8)UTF_TO_NATIVE(uv);
909 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
910 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
919 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
921 * Destination must be pre-extended to 3/2 source. Do not use in-place.
922 * We optimize for native, for obvious reasons. */
925 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
930 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
937 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
942 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
949 *d++ = (U8)(( uv >> 6) | 0xc0);
950 *d++ = (U8)(( uv & 0x3f) | 0x80);
953 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
954 UV low = (p[0] << 8) + p[1];
956 if (low < 0xdc00 || low >= 0xdfff)
957 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
958 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
961 *d++ = (U8)(( uv >> 12) | 0xe0);
962 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
963 *d++ = (U8)(( uv & 0x3f) | 0x80);
967 *d++ = (U8)(( uv >> 18) | 0xf0);
968 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
969 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
970 *d++ = (U8)(( uv & 0x3f) | 0x80);
974 *newlen = d - dstart;
978 /* Note: this one is slightly destructive of the source. */
981 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
984 U8* send = s + bytelen;
991 return utf16_to_utf8(p, d, bytelen, newlen);
994 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
997 Perl_is_uni_alnum(pTHX_ UV c)
999 U8 tmpbuf[UTF8_MAXBYTES+1];
1000 uvchr_to_utf8(tmpbuf, c);
1001 return is_utf8_alnum(tmpbuf);
1005 Perl_is_uni_alnumc(pTHX_ UV c)
1007 U8 tmpbuf[UTF8_MAXBYTES+1];
1008 uvchr_to_utf8(tmpbuf, c);
1009 return is_utf8_alnumc(tmpbuf);
1013 Perl_is_uni_idfirst(pTHX_ UV c)
1015 U8 tmpbuf[UTF8_MAXBYTES+1];
1016 uvchr_to_utf8(tmpbuf, c);
1017 return is_utf8_idfirst(tmpbuf);
1021 Perl_is_uni_alpha(pTHX_ UV c)
1023 U8 tmpbuf[UTF8_MAXBYTES+1];
1024 uvchr_to_utf8(tmpbuf, c);
1025 return is_utf8_alpha(tmpbuf);
1029 Perl_is_uni_ascii(pTHX_ UV c)
1031 U8 tmpbuf[UTF8_MAXBYTES+1];
1032 uvchr_to_utf8(tmpbuf, c);
1033 return is_utf8_ascii(tmpbuf);
1037 Perl_is_uni_space(pTHX_ UV c)
1039 U8 tmpbuf[UTF8_MAXBYTES+1];
1040 uvchr_to_utf8(tmpbuf, c);
1041 return is_utf8_space(tmpbuf);
1045 Perl_is_uni_digit(pTHX_ UV c)
1047 U8 tmpbuf[UTF8_MAXBYTES+1];
1048 uvchr_to_utf8(tmpbuf, c);
1049 return is_utf8_digit(tmpbuf);
1053 Perl_is_uni_upper(pTHX_ UV c)
1055 U8 tmpbuf[UTF8_MAXBYTES+1];
1056 uvchr_to_utf8(tmpbuf, c);
1057 return is_utf8_upper(tmpbuf);
1061 Perl_is_uni_lower(pTHX_ UV c)
1063 U8 tmpbuf[UTF8_MAXBYTES+1];
1064 uvchr_to_utf8(tmpbuf, c);
1065 return is_utf8_lower(tmpbuf);
1069 Perl_is_uni_cntrl(pTHX_ UV c)
1071 U8 tmpbuf[UTF8_MAXBYTES+1];
1072 uvchr_to_utf8(tmpbuf, c);
1073 return is_utf8_cntrl(tmpbuf);
1077 Perl_is_uni_graph(pTHX_ UV c)
1079 U8 tmpbuf[UTF8_MAXBYTES+1];
1080 uvchr_to_utf8(tmpbuf, c);
1081 return is_utf8_graph(tmpbuf);
1085 Perl_is_uni_print(pTHX_ UV c)
1087 U8 tmpbuf[UTF8_MAXBYTES+1];
1088 uvchr_to_utf8(tmpbuf, c);
1089 return is_utf8_print(tmpbuf);
1093 Perl_is_uni_punct(pTHX_ UV c)
1095 U8 tmpbuf[UTF8_MAXBYTES+1];
1096 uvchr_to_utf8(tmpbuf, c);
1097 return is_utf8_punct(tmpbuf);
1101 Perl_is_uni_xdigit(pTHX_ UV c)
1103 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1104 uvchr_to_utf8(tmpbuf, c);
1105 return is_utf8_xdigit(tmpbuf);
1109 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1111 uvchr_to_utf8(p, c);
1112 return to_utf8_upper(p, p, lenp);
1116 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1118 uvchr_to_utf8(p, c);
1119 return to_utf8_title(p, p, lenp);
1123 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1125 uvchr_to_utf8(p, c);
1126 return to_utf8_lower(p, p, lenp);
1130 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1132 uvchr_to_utf8(p, c);
1133 return to_utf8_fold(p, p, lenp);
1136 /* for now these all assume no locale info available for Unicode > 255 */
1139 Perl_is_uni_alnum_lc(pTHX_ UV c)
1141 return is_uni_alnum(c); /* XXX no locale support yet */
1145 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1147 return is_uni_alnumc(c); /* XXX no locale support yet */
1151 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1153 return is_uni_idfirst(c); /* XXX no locale support yet */
1157 Perl_is_uni_alpha_lc(pTHX_ UV c)
1159 return is_uni_alpha(c); /* XXX no locale support yet */
1163 Perl_is_uni_ascii_lc(pTHX_ UV c)
1165 return is_uni_ascii(c); /* XXX no locale support yet */
1169 Perl_is_uni_space_lc(pTHX_ UV c)
1171 return is_uni_space(c); /* XXX no locale support yet */
1175 Perl_is_uni_digit_lc(pTHX_ UV c)
1177 return is_uni_digit(c); /* XXX no locale support yet */
1181 Perl_is_uni_upper_lc(pTHX_ UV c)
1183 return is_uni_upper(c); /* XXX no locale support yet */
1187 Perl_is_uni_lower_lc(pTHX_ UV c)
1189 return is_uni_lower(c); /* XXX no locale support yet */
1193 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1195 return is_uni_cntrl(c); /* XXX no locale support yet */
1199 Perl_is_uni_graph_lc(pTHX_ UV c)
1201 return is_uni_graph(c); /* XXX no locale support yet */
1205 Perl_is_uni_print_lc(pTHX_ UV c)
1207 return is_uni_print(c); /* XXX no locale support yet */
1211 Perl_is_uni_punct_lc(pTHX_ UV c)
1213 return is_uni_punct(c); /* XXX no locale support yet */
1217 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1219 return is_uni_xdigit(c); /* XXX no locale support yet */
1223 Perl_to_uni_upper_lc(pTHX_ U32 c)
1225 /* XXX returns only the first character -- do not use XXX */
1226 /* XXX no locale support yet */
1228 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1229 return (U32)to_uni_upper(c, tmpbuf, &len);
1233 Perl_to_uni_title_lc(pTHX_ U32 c)
1235 /* XXX returns only the first character XXX -- do not use XXX */
1236 /* XXX no locale support yet */
1238 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1239 return (U32)to_uni_title(c, tmpbuf, &len);
1243 Perl_to_uni_lower_lc(pTHX_ U32 c)
1245 /* XXX returns only the first character -- do not use XXX */
1246 /* XXX no locale support yet */
1248 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1249 return (U32)to_uni_lower(c, tmpbuf, &len);
1253 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1255 if (!is_utf8_char(p))
1258 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1259 * descendant of isalnum(3), in other words, it doesn't
1260 * contain the '_'. --jhi */
1261 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1262 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1263 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1264 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1266 PL_utf8_alnum = swash_init("utf8", "",
1267 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1268 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1273 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1275 if (!is_utf8_char(p))
1278 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1279 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1280 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1281 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1283 PL_utf8_alnum = swash_init("utf8", "",
1284 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1285 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1290 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1294 if (!is_utf8_char(p))
1296 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1297 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1298 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1302 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1306 if (!is_utf8_char(p))
1308 if (!PL_utf8_idcont)
1309 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1310 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1314 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1316 if (!is_utf8_char(p))
1319 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1320 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1324 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1326 if (!is_utf8_char(p))
1329 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1330 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1334 Perl_is_utf8_space(pTHX_ const U8 *p)
1336 if (!is_utf8_char(p))
1339 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1340 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1344 Perl_is_utf8_digit(pTHX_ const U8 *p)
1346 if (!is_utf8_char(p))
1349 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1350 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1354 Perl_is_utf8_upper(pTHX_ const U8 *p)
1356 if (!is_utf8_char(p))
1359 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1360 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1364 Perl_is_utf8_lower(pTHX_ const U8 *p)
1366 if (!is_utf8_char(p))
1369 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1370 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1374 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1376 if (!is_utf8_char(p))
1379 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1380 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1384 Perl_is_utf8_graph(pTHX_ const U8 *p)
1386 if (!is_utf8_char(p))
1389 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1390 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1394 Perl_is_utf8_print(pTHX_ const U8 *p)
1396 if (!is_utf8_char(p))
1399 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1400 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1404 Perl_is_utf8_punct(pTHX_ const U8 *p)
1406 if (!is_utf8_char(p))
1409 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1410 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1414 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1416 if (!is_utf8_char(p))
1418 if (!PL_utf8_xdigit)
1419 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1420 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1424 Perl_is_utf8_mark(pTHX_ const U8 *p)
1426 if (!is_utf8_char(p))
1429 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1430 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1434 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1436 The "p" contains the pointer to the UTF-8 string encoding
1437 the character that is being converted.
1439 The "ustrp" is a pointer to the character buffer to put the
1440 conversion result to. The "lenp" is a pointer to the length
1443 The "swashp" is a pointer to the swash to use.
1445 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1446 and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1447 but not always, a multicharacter mapping), is tried first.
1449 The "special" is a string like "utf8::ToSpecLower", which means the
1450 hash %utf8::ToSpecLower. The access to the hash is through
1451 Perl_to_utf8_case().
1453 The "normal" is a string like "ToLower" which means the swash
1459 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1460 SV **swashp, const char *normal, const char *special)
1462 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1465 const UV uv0 = utf8_to_uvchr(p, NULL);
1466 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1467 * are necessary in EBCDIC, they are redundant no-ops
1468 * in ASCII-ish platforms, and hopefully optimized away. */
1469 const UV uv1 = NATIVE_TO_UNI(uv0);
1470 uvuni_to_utf8(tmpbuf, uv1);
1472 if (!*swashp) /* load on-demand */
1473 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1475 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1476 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1477 /* It might be "special" (sometimes, but not always,
1478 * a multicharacter mapping) */
1482 if ((hv = get_hv(special, FALSE)) &&
1483 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1487 s = SvPV_const(*svp, len);
1489 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1492 /* If we have EBCDIC we need to remap the characters
1493 * since any characters in the low 256 are Unicode
1494 * code points, not EBCDIC. */
1495 U8 *t = (U8*)s, *tend = t + len, *d;
1502 UV c = utf8_to_uvchr(t, &tlen);
1504 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1513 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1518 Copy(tmpbuf, ustrp, len, U8);
1520 Copy(s, ustrp, len, U8);
1526 if (!len && *swashp) {
1527 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1530 /* It was "normal" (a single character mapping). */
1531 UV uv3 = UNI_TO_NATIVE(uv2);
1533 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1537 if (!len) /* Neither: just copy. */
1538 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1543 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1547 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1549 Convert the UTF-8 encoded character at p to its uppercase version and
1550 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1551 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1552 the uppercase version may be longer than the original character.
1554 The first character of the uppercased version is returned
1555 (but note, as explained above, that there may be more.)
1560 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1562 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1563 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1567 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1569 Convert the UTF-8 encoded character at p to its titlecase version and
1570 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1571 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1572 titlecase version may be longer than the original character.
1574 The first character of the titlecased version is returned
1575 (but note, as explained above, that there may be more.)
1580 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1582 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1583 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1587 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1589 Convert the UTF-8 encoded character at p to its lowercase version and
1590 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1591 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1592 lowercase version may be longer than the original character.
1594 The first character of the lowercased version is returned
1595 (but note, as explained above, that there may be more.)
1600 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1602 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1603 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1607 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1609 Convert the UTF-8 encoded character at p to its foldcase version and
1610 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1611 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1612 foldcase version may be longer than the original character (up to
1615 The first character of the foldcased version is returned
1616 (but note, as explained above, that there may be more.)
1621 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1623 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1624 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1627 /* a "swash" is a swatch hash */
1630 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1634 SV* const tokenbufsv = sv_newmortal();
1636 const size_t pkg_len = strlen(pkg);
1637 const size_t name_len = strlen(name);
1638 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1641 PUSHSTACKi(PERLSI_MAGIC);
1646 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1648 errsv_save = newSVsv(ERRSV);
1649 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1652 sv_setsv(ERRSV, errsv_save);
1653 SvREFCNT_dec(errsv_save);
1659 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1660 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1662 PUSHs(sv_2mortal(newSViv(minbits)));
1663 PUSHs(sv_2mortal(newSViv(none)));
1665 if (IN_PERL_COMPILETIME) {
1666 /* XXX ought to be handled by lex_start */
1669 sv_setpv(tokenbufsv, PL_tokenbuf);
1671 errsv_save = newSVsv(ERRSV);
1672 if (call_method("SWASHNEW", G_SCALAR))
1673 retval = newSVsv(*PL_stack_sp--);
1675 retval = &PL_sv_undef;
1677 sv_setsv(ERRSV, errsv_save);
1678 SvREFCNT_dec(errsv_save);
1681 if (IN_PERL_COMPILETIME) {
1683 const char* const pv = SvPV_const(tokenbufsv, len);
1685 Copy(pv, PL_tokenbuf, len+1, char);
1686 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1688 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1690 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1692 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1698 /* This API is wrong for special case conversions since we may need to
1699 * return several Unicode characters for a single Unicode character
1700 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1701 * the lower-level routine, and it is similarly broken for returning
1702 * multiple values. --jhi */
1704 Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
1707 HV* const hv = (HV*)SvRV(sv);
1712 const U8 *tmps = NULL;
1716 UV c = NATIVE_TO_ASCII(*ptr);
1718 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1719 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1720 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1723 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1724 * then the "swatch" is a vec() for al the chars which start
1726 * So the key in the hash (klen) is length of encoded char -1
1728 klen = UTF8SKIP(ptr) - 1;
1733 /* If char in invariant then swatch is for all the invariant chars
1734 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1736 needents = UTF_CONTINUATION_MARK;
1737 off = NATIVE_TO_UTF(ptr[klen]);
1741 /* If char is encoded then swatch is for the prefix */
1742 needents = (1 << UTF_ACCUMULATION_SHIFT);
1743 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1747 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1748 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1749 * it's nothing to sniff at.) Pity we usually come through at least
1750 * two function calls to get here...
1752 * NB: this code assumes that swatches are never modified, once generated!
1755 if (hv == PL_last_swash_hv &&
1756 klen == PL_last_swash_klen &&
1757 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1759 tmps = PL_last_swash_tmps;
1760 slen = PL_last_swash_slen;
1763 /* Try our second-level swatch cache, kept in a hash. */
1764 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1766 /* If not cached, generate it via utf8::SWASHGET */
1767 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1769 /* We use utf8n_to_uvuni() as we want an index into
1770 Unicode tables, not a native character number.
1772 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1774 0 : UTF8_ALLOW_ANY);
1779 PUSHSTACKi(PERLSI_MAGIC);
1783 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1784 PUSHs(sv_2mortal(newSViv((klen) ?
1785 (code_point & ~(needents - 1)) : 0)));
1786 PUSHs(sv_2mortal(newSViv(needents)));
1788 errsv_save = newSVsv(ERRSV);
1789 if (call_method("SWASHGET", G_SCALAR))
1790 retval = newSVsv(*PL_stack_sp--);
1792 retval = &PL_sv_undef;
1794 sv_setsv(ERRSV, errsv_save);
1795 SvREFCNT_dec(errsv_save);
1799 if (IN_PERL_COMPILETIME)
1800 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1802 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1804 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1805 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1808 PL_last_swash_hv = hv;
1809 PL_last_swash_klen = klen;
1810 /* FIXME change interpvar.h? */
1811 PL_last_swash_tmps = (U8 *) tmps;
1812 PL_last_swash_slen = slen;
1814 Copy(ptr, PL_last_swash_key, klen, U8);
1817 switch ((int)((slen << 3) / needents)) {
1819 bit = 1 << (off & 7);
1821 return (tmps[off] & bit) != 0;
1826 return (tmps[off] << 8) + tmps[off + 1] ;
1829 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1831 Perl_croak(aTHX_ "panic: swash_fetch");
1837 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1839 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1840 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
1841 bytes available. The return value is the pointer to the byte after the
1842 end of the new character. In other words,
1844 d = uvchr_to_utf8(d, uv);
1846 is the recommended wide native character-aware way of saying
1853 /* On ASCII machines this is normally a macro but we want a
1854 real function in case XS code wants it
1856 #undef Perl_uvchr_to_utf8
1858 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1860 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1864 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1866 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1870 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1872 Returns the native character value of the first character in the string C<s>
1873 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1874 length, in bytes, of that character.
1876 Allows length and flags to be passed to low level routine.
1880 /* On ASCII machines this is normally a macro but we want
1881 a real function in case XS code wants it
1883 #undef Perl_utf8n_to_uvchr
1885 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1887 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1888 return UNI_TO_NATIVE(uv);
1892 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1894 Build to the scalar dsv a displayable version of the string spv,
1895 length len, the displayable version being at most pvlim bytes long
1896 (if longer, the rest is truncated and "..." will be appended).
1898 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1899 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1900 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1901 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1902 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1903 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1905 The pointer to the PV of the dsv is returned.
1909 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1914 sv_setpvn(dsv, "", 0);
1915 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1917 /* This serves double duty as a flag and a character to print after
1918 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1922 if (pvlim && SvCUR(dsv) >= pvlim) {
1926 u = utf8_to_uvchr((U8*)s, 0);
1928 const unsigned char c = (unsigned char)u & 0xFF;
1929 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1946 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1949 /* isPRINT() is the locale-blind version. */
1950 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1951 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1956 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1959 sv_catpvn(dsv, "...", 3);
1965 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1967 Build to the scalar dsv a displayable version of the scalar sv,
1968 the displayable version being at most pvlim bytes long
1969 (if longer, the rest is truncated and "..." will be appended).
1971 The flags argument is as in pv_uni_display().
1973 The pointer to the PV of the dsv is returned.
1977 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1979 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1980 SvCUR(ssv), pvlim, flags);
1984 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1986 Return true if the strings s1 and s2 differ case-insensitively, false
1987 if not (if they are equal case-insensitively). If u1 is true, the
1988 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1989 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1990 are false, the respective string is assumed to be in native 8-bit
1993 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1994 in there (they will point at the beginning of the I<next> character).
1995 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1996 pointers beyond which scanning will not continue under any
1997 circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1998 s2+l2 will be used as goal end pointers that will also stop the scan,
1999 and which qualify towards defining a successful match: all the scans
2000 that define an explicit length must reach their goal pointers for
2001 a match to succeed).
2003 For case-insensitiveness, the "casefolding" of Unicode is used
2004 instead of upper/lowercasing both the characters, see
2005 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2009 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2011 register const U8 *p1 = (const U8*)s1;
2012 register const U8 *p2 = (const U8*)s2;
2013 register const U8 *f1 = 0, *f2 = 0;
2014 register U8 *e1 = 0, *q1 = 0;
2015 register U8 *e2 = 0, *q2 = 0;
2016 STRLEN n1 = 0, n2 = 0;
2017 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2018 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2020 STRLEN foldlen1, foldlen2;
2025 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2026 f1 = (const U8*)s1 + l1;
2029 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2030 f2 = (const U8*)s2 + l2;
2032 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2033 return 1; /* mismatch; possible infinite loop or false positive */
2036 natbuf[1] = 0; /* Need to terminate the buffer. */
2038 while ((e1 == 0 || p1 < e1) &&
2039 (f1 == 0 || p1 < f1) &&
2040 (e2 == 0 || p2 < e2) &&
2041 (f2 == 0 || p2 < f2)) {
2044 to_utf8_fold(p1, foldbuf1, &foldlen1);
2046 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2047 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2054 to_utf8_fold(p2, foldbuf2, &foldlen2);
2056 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2057 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2063 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2064 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2065 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2066 return 1; /* mismatch */
2073 p1 += u1 ? UTF8SKIP(p1) : 1;
2075 p2 += u2 ? UTF8SKIP(p2) : 1;
2079 /* A match is defined by all the scans that specified
2080 * an explicit length reaching their final goals. */
2081 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2090 return match ? 0 : 1; /* 0 match, 1 mismatch */
2095 * c-indentation-style: bsd
2097 * indent-tabs-mode: t
2100 * ex: set ts=8 sts=4 sw=4 noet: