3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 * by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
25 #define PERL_IN_UTF8_C
29 /* Separate prototypes needed because in ASCII systems these
30 * usually macros but they still are compiled as code, too. */
31 PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
32 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
35 static const char unees[] =
36 "Malformed UTF-8 character (unexpected end of string)";
39 =head1 Unicode Support
41 This file contains various utility functions for manipulating UTF8-encoded
42 strings. For the uninitiated, this is a method of representing arbitrary
43 Unicode characters as a variable number of bytes, in such a way that
44 characters in the ASCII range are unmodified, and a zero byte never appears
45 within non-zero characters.
47 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
49 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
50 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
51 bytes available. The return value is the pointer to the byte after the
52 end of the new character. In other words,
54 d = uvuni_to_utf8_flags(d, uv, flags);
58 d = uvuni_to_utf8(d, uv);
60 (which is equivalent to)
62 d = uvuni_to_utf8_flags(d, uv, 0);
64 is the recommended Unicode-aware way of saying
72 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
74 if (ckWARN(WARN_UTF8)) {
75 if (UNICODE_IS_SURROGATE(uv) &&
76 !(flags & UNICODE_ALLOW_SURROGATE))
77 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
79 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
80 !(flags & UNICODE_ALLOW_FDD0))
82 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
83 !(flags & UNICODE_ALLOW_FFFF))) &&
84 /* UNICODE_ALLOW_SUPER includes
85 * FFFEs and FFFFs beyond 0x10FFFF. */
86 ((uv <= PERL_UNICODE_MAX) ||
87 !(flags & UNICODE_ALLOW_SUPER))
89 Perl_warner(aTHX_ packWARN(WARN_UTF8),
90 "Unicode character 0x%04"UVxf" is illegal", uv);
92 if (UNI_IS_INVARIANT(uv)) {
93 *d++ = (U8)UTF_TO_NATIVE(uv);
98 STRLEN len = UNISKIP(uv);
101 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
102 uv >>= UTF_ACCUMULATION_SHIFT;
104 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
107 #else /* Non loop style */
109 *d++ = (U8)(( uv >> 6) | 0xc0);
110 *d++ = (U8)(( uv & 0x3f) | 0x80);
114 *d++ = (U8)(( uv >> 12) | 0xe0);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
120 *d++ = (U8)(( uv >> 18) | 0xf0);
121 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
123 *d++ = (U8)(( uv & 0x3f) | 0x80);
126 if (uv < 0x4000000) {
127 *d++ = (U8)(( uv >> 24) | 0xf8);
128 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
129 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
131 *d++ = (U8)(( uv & 0x3f) | 0x80);
134 if (uv < 0x80000000) {
135 *d++ = (U8)(( uv >> 30) | 0xfc);
136 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
138 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
139 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
140 *d++ = (U8)(( uv & 0x3f) | 0x80);
144 if (uv < UTF8_QUAD_MAX)
147 *d++ = 0xfe; /* Can't match U+FEFF! */
148 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
153 *d++ = (U8)(( uv & 0x3f) | 0x80);
158 *d++ = 0xff; /* Can't match U+FFFE! */
159 *d++ = 0x80; /* 6 Reserved bits */
160 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
161 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
163 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
164 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
165 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
166 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
167 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
168 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
169 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
170 *d++ = (U8)(( uv & 0x3f) | 0x80);
174 #endif /* Loop style */
179 Tests if some arbitrary number of bytes begins in a valid UTF-8
180 character. Note that an INVARIANT (i.e. ASCII) character is a valid
181 UTF-8 character. The actual number of bytes in the UTF-8 character
182 will be returned if it is valid, otherwise 0.
184 This is the "slow" version as opposed to the "fast" version which is
185 the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
186 difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
187 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
188 you should use the _slow(). In practice this means that the _slow()
189 will be used very rarely, since the maximum Unicode code point (as of
190 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
191 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
196 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
202 if (UTF8_IS_INVARIANT(u))
205 if (!UTF8_IS_START(u))
208 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
214 u = NATIVE_TO_UTF(u);
216 u &= UTF_START_MASK(len);
220 if (!UTF8_IS_CONTINUATION(*s))
222 uv = UTF8_ACCUMULATE(uv, *s);
229 if ((STRLEN)UNISKIP(uv) < len)
236 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
238 Tests if some arbitrary number of bytes begins in a valid UTF-8
239 character. Note that an INVARIANT (i.e. ASCII) character is a valid
240 UTF-8 character. The actual number of bytes in the UTF-8 character
241 will be returned if it is valid, otherwise 0.
245 Perl_is_utf8_char(pTHX_ const U8 *s)
247 const STRLEN len = UTF8SKIP(s);
250 if (IS_UTF8_CHAR_FAST(len))
251 return IS_UTF8_CHAR(s, len) ? len : 0;
252 #endif /* #ifdef IS_UTF8_CHAR */
253 return is_utf8_char_slow(s, len);
257 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
259 Returns true if first C<len> bytes of the given string form a valid
260 UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
261 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
262 because a valid ASCII string is a valid UTF-8 string.
264 See also is_utf8_string_loclen() and is_utf8_string_loc().
270 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
272 const U8* const send = s + (len ? len : strlen((const char *)s));
279 /* Inline the easy bits of is_utf8_char() here for speed... */
280 if (UTF8_IS_INVARIANT(*x))
282 else if (!UTF8_IS_START(*x))
285 /* ... and call is_utf8_char() only if really needed. */
288 if (IS_UTF8_CHAR_FAST(c)) {
289 if (!IS_UTF8_CHAR(x, c))
293 c = is_utf8_char_slow(x, c);
296 #endif /* #ifdef IS_UTF8_CHAR */
311 Implemented as a macro in utf8.h
313 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
315 Like is_utf8_string() but stores the location of the failure (in the
316 case of "utf8ness failure") or the location s+len (in the case of
317 "utf8ness success") in the C<ep>.
319 See also is_utf8_string_loclen() and is_utf8_string().
321 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
323 Like is_utf8_string() but stores the location of the failure (in the
324 case of "utf8ness failure") or the location s+len (in the case of
325 "utf8ness success") in the C<ep>, and the number of UTF-8
326 encoded characters in the C<el>.
328 See also is_utf8_string_loc() and is_utf8_string().
334 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
336 const U8* const send = s + (len ? len : strlen((const char *)s));
343 /* Inline the easy bits of is_utf8_char() here for speed... */
344 if (UTF8_IS_INVARIANT(*x))
346 else if (!UTF8_IS_START(*x))
349 /* ... and call is_utf8_char() only if really needed. */
352 if (IS_UTF8_CHAR_FAST(c)) {
353 if (!IS_UTF8_CHAR(x, c))
356 c = is_utf8_char_slow(x, c);
359 #endif /* #ifdef IS_UTF8_CHAR */
378 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
380 Bottom level UTF-8 decode routine.
381 Returns the unicode code point value of the first character in the string C<s>
382 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
383 C<retlen> will be set to the length, in bytes, of that character.
385 If C<s> does not point to a well-formed UTF-8 character, the behaviour
386 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
387 it is assumed that the caller will raise a warning, and this function
388 will silently just set C<retlen> to C<-1> and return zero. If the
389 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
390 malformations will be given, C<retlen> will be set to the expected
391 length of the UTF-8 character in bytes, and zero will be returned.
393 The C<flags> can also contain various flags to allow deviations from
394 the strict UTF-8 encoding (see F<utf8.h>).
396 Most code should use utf8_to_uvchr() rather than call this directly.
402 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
405 const U8 * const s0 = s;
408 const bool dowarn = ckWARN_d(WARN_UTF8);
409 const UV startbyte = *s;
410 STRLEN expectlen = 0;
413 /* This list is a superset of the UTF8_ALLOW_XXX. */
415 #define UTF8_WARN_EMPTY 1
416 #define UTF8_WARN_CONTINUATION 2
417 #define UTF8_WARN_NON_CONTINUATION 3
418 #define UTF8_WARN_FE_FF 4
419 #define UTF8_WARN_SHORT 5
420 #define UTF8_WARN_OVERFLOW 6
421 #define UTF8_WARN_SURROGATE 7
422 #define UTF8_WARN_LONG 8
423 #define UTF8_WARN_FFFF 9 /* Also FFFE. */
426 !(flags & UTF8_ALLOW_EMPTY)) {
427 warning = UTF8_WARN_EMPTY;
431 if (UTF8_IS_INVARIANT(uv)) {
434 return (UV) (NATIVE_TO_UTF(*s));
437 if (UTF8_IS_CONTINUATION(uv) &&
438 !(flags & UTF8_ALLOW_CONTINUATION)) {
439 warning = UTF8_WARN_CONTINUATION;
443 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
444 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
445 warning = UTF8_WARN_NON_CONTINUATION;
450 uv = NATIVE_TO_UTF(uv);
452 if ((uv == 0xfe || uv == 0xff) &&
453 !(flags & UTF8_ALLOW_FE_FF)) {
454 warning = UTF8_WARN_FE_FF;
459 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
460 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
461 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
462 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
465 else { len = 7; uv &= 0x01; }
467 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
468 else if (!(uv & 0x01)) { len = 7; uv = 0; }
469 else { len = 13; uv = 0; } /* whoa! */
477 if ((curlen < expectlen) &&
478 !(flags & UTF8_ALLOW_SHORT)) {
479 warning = UTF8_WARN_SHORT;
488 if (!UTF8_IS_CONTINUATION(*s) &&
489 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
491 warning = UTF8_WARN_NON_CONTINUATION;
495 uv = UTF8_ACCUMULATE(uv, *s);
497 /* These cannot be allowed. */
499 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
500 warning = UTF8_WARN_LONG;
504 else { /* uv < ouv */
505 /* This cannot be allowed. */
506 warning = UTF8_WARN_OVERFLOW;
514 if (UNICODE_IS_SURROGATE(uv) &&
515 !(flags & UTF8_ALLOW_SURROGATE)) {
516 warning = UTF8_WARN_SURROGATE;
518 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
519 !(flags & UTF8_ALLOW_LONG)) {
520 warning = UTF8_WARN_LONG;
522 } else if (UNICODE_IS_ILLEGAL(uv) &&
523 !(flags & UTF8_ALLOW_FFFF)) {
524 warning = UTF8_WARN_FFFF;
532 if (flags & UTF8_CHECK_ONLY) {
534 *retlen = ((STRLEN) -1);
539 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
542 case 0: /* Intentionally empty. */ break;
543 case UTF8_WARN_EMPTY:
544 sv_catpvs(sv, "(empty string)");
546 case UTF8_WARN_CONTINUATION:
547 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
549 case UTF8_WARN_NON_CONTINUATION:
551 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
552 (UV)s[1], startbyte);
554 const int len = (int)(s-s0);
555 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
556 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
560 case UTF8_WARN_FE_FF:
561 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
563 case UTF8_WARN_SHORT:
564 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
565 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
566 expectlen = curlen; /* distance for caller to skip */
568 case UTF8_WARN_OVERFLOW:
569 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
572 case UTF8_WARN_SURROGATE:
573 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
576 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
577 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
580 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
583 sv_catpvs(sv, "(unknown reason)");
588 const char * const s = SvPVX_const(sv);
591 Perl_warner(aTHX_ packWARN(WARN_UTF8),
592 "%s in %s", s, OP_DESC(PL_op));
594 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
599 *retlen = expectlen ? expectlen : len;
605 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
607 Returns the native character value of the first character in the string C<s>
608 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
609 length, in bytes, of that character.
611 If C<s> does not point to a well-formed UTF-8 character, zero is
612 returned and retlen is set, if possible, to -1.
618 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
620 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
621 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
625 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
627 Returns the Unicode code point of the first character in the string C<s>
628 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
629 length, in bytes, of that character.
631 This function should only be used when returned UV is considered
632 an index into the Unicode semantic tables (e.g. swashes).
634 If C<s> does not point to a well-formed UTF-8 character, zero is
635 returned and retlen is set, if possible, to -1.
641 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
643 /* Call the low level routine asking for checks */
644 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
645 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
649 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
651 Return the length of the UTF-8 char encoded string C<s> in characters.
652 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
653 up past C<e>, croaks.
659 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
665 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
666 * the bitops (especially ~) can create illegal UTF-8.
667 * In other words: in Perl UTF-8 is not just for Unicode. */
670 goto warn_and_return;
675 if (ckWARN_d(WARN_UTF8)) {
677 Perl_warner(aTHX_ packWARN(WARN_UTF8),
678 "%s in %s", unees, OP_DESC(PL_op));
680 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
692 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
694 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
697 WARNING: use only if you *know* that the pointers point inside the
704 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
706 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
710 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
712 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
715 WARNING: do not use the following unless you *know* C<off> is within
716 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
717 on the first byte of character or just after the last byte of a character.
723 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
726 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
727 * the bitops (especially ~) can create illegal UTF-8.
728 * In other words: in Perl UTF-8 is not just for Unicode. */
737 while (UTF8_IS_CONTINUATION(*s))
745 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
747 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
748 Unlike C<bytes_to_utf8>, this over-writes the original string, and
749 updates len to contain the new length.
750 Returns zero on failure, setting C<len> to -1.
752 If you need a copy of the string, see C<bytes_from_utf8>.
758 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
761 U8 * const send = s + *len;
764 /* ensure valid UTF-8 and chars < 256 before updating string */
768 if (!UTF8_IS_INVARIANT(c) &&
769 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
770 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
771 *len = ((STRLEN) -1);
779 *d++ = (U8)utf8_to_uvchr(s, &ulen);
788 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
790 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
791 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
792 the newly-created string, and updates C<len> to contain the new
793 length. Returns the original string if no conversion occurs, C<len>
794 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
795 0 if C<s> is converted or contains all 7bit characters.
801 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
812 /* ensure valid UTF-8 and chars < 256 before converting string */
813 for (send = s + *len; s < send;) {
815 if (!UTF8_IS_INVARIANT(c)) {
816 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
817 (c = *s++) && UTF8_IS_CONTINUATION(c))
826 Newx(d, (*len) - count + 1, U8);
827 s = start; start = d;
830 if (!UTF8_IS_INVARIANT(c)) {
831 /* Then it is two-byte encoded */
832 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
833 c = ASCII_TO_NATIVE(c);
843 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
845 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
846 Returns a pointer to the newly-created string, and sets C<len> to
847 reflect the new length.
849 If you want to convert to UTF-8 from other encodings than ASCII,
850 see sv_recode_to_utf8().
856 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
858 const U8 * const send = s + (*len);
863 Newx(d, (*len) * 2 + 1, U8);
867 const UV uv = NATIVE_TO_ASCII(*s++);
868 if (UNI_IS_INVARIANT(uv))
869 *d++ = (U8)UTF_TO_NATIVE(uv);
871 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
872 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
881 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
883 * Destination must be pre-extended to 3/2 source. Do not use in-place.
884 * We optimize for native, for obvious reasons. */
887 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
892 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
899 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
904 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
908 *d++ = UNI_TO_NATIVE(uv);
915 *d++ = (U8)(( uv >> 6) | 0xc0);
916 *d++ = (U8)(( uv & 0x3f) | 0x80);
919 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
920 UV low = (p[0] << 8) + p[1];
922 if (low < 0xdc00 || low >= 0xdfff)
923 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
924 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
927 *d++ = (U8)(( uv >> 12) | 0xe0);
928 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
929 *d++ = (U8)(( uv & 0x3f) | 0x80);
933 *d++ = (U8)(( uv >> 18) | 0xf0);
934 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
935 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
936 *d++ = (U8)(( uv & 0x3f) | 0x80);
940 *newlen = d - dstart;
944 /* Note: this one is slightly destructive of the source. */
947 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
950 U8* const send = s + bytelen;
957 return utf16_to_utf8(p, d, bytelen, newlen);
960 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
963 Perl_is_uni_alnum(pTHX_ UV c)
965 U8 tmpbuf[UTF8_MAXBYTES+1];
966 uvchr_to_utf8(tmpbuf, c);
967 return is_utf8_alnum(tmpbuf);
971 Perl_is_uni_alnumc(pTHX_ UV c)
973 U8 tmpbuf[UTF8_MAXBYTES+1];
974 uvchr_to_utf8(tmpbuf, c);
975 return is_utf8_alnumc(tmpbuf);
979 Perl_is_uni_idfirst(pTHX_ UV c)
981 U8 tmpbuf[UTF8_MAXBYTES+1];
982 uvchr_to_utf8(tmpbuf, c);
983 return is_utf8_idfirst(tmpbuf);
987 Perl_is_uni_alpha(pTHX_ UV c)
989 U8 tmpbuf[UTF8_MAXBYTES+1];
990 uvchr_to_utf8(tmpbuf, c);
991 return is_utf8_alpha(tmpbuf);
995 Perl_is_uni_ascii(pTHX_ UV c)
997 U8 tmpbuf[UTF8_MAXBYTES+1];
998 uvchr_to_utf8(tmpbuf, c);
999 return is_utf8_ascii(tmpbuf);
1003 Perl_is_uni_space(pTHX_ UV c)
1005 U8 tmpbuf[UTF8_MAXBYTES+1];
1006 uvchr_to_utf8(tmpbuf, c);
1007 return is_utf8_space(tmpbuf);
1011 Perl_is_uni_digit(pTHX_ UV c)
1013 U8 tmpbuf[UTF8_MAXBYTES+1];
1014 uvchr_to_utf8(tmpbuf, c);
1015 return is_utf8_digit(tmpbuf);
1019 Perl_is_uni_upper(pTHX_ UV c)
1021 U8 tmpbuf[UTF8_MAXBYTES+1];
1022 uvchr_to_utf8(tmpbuf, c);
1023 return is_utf8_upper(tmpbuf);
1027 Perl_is_uni_lower(pTHX_ UV c)
1029 U8 tmpbuf[UTF8_MAXBYTES+1];
1030 uvchr_to_utf8(tmpbuf, c);
1031 return is_utf8_lower(tmpbuf);
1035 Perl_is_uni_cntrl(pTHX_ UV c)
1037 U8 tmpbuf[UTF8_MAXBYTES+1];
1038 uvchr_to_utf8(tmpbuf, c);
1039 return is_utf8_cntrl(tmpbuf);
1043 Perl_is_uni_graph(pTHX_ UV c)
1045 U8 tmpbuf[UTF8_MAXBYTES+1];
1046 uvchr_to_utf8(tmpbuf, c);
1047 return is_utf8_graph(tmpbuf);
1051 Perl_is_uni_print(pTHX_ UV c)
1053 U8 tmpbuf[UTF8_MAXBYTES+1];
1054 uvchr_to_utf8(tmpbuf, c);
1055 return is_utf8_print(tmpbuf);
1059 Perl_is_uni_punct(pTHX_ UV c)
1061 U8 tmpbuf[UTF8_MAXBYTES+1];
1062 uvchr_to_utf8(tmpbuf, c);
1063 return is_utf8_punct(tmpbuf);
1067 Perl_is_uni_xdigit(pTHX_ UV c)
1069 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1070 uvchr_to_utf8(tmpbuf, c);
1071 return is_utf8_xdigit(tmpbuf);
1075 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1077 uvchr_to_utf8(p, c);
1078 return to_utf8_upper(p, p, lenp);
1082 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1084 uvchr_to_utf8(p, c);
1085 return to_utf8_title(p, p, lenp);
1089 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1091 uvchr_to_utf8(p, c);
1092 return to_utf8_lower(p, p, lenp);
1096 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1098 uvchr_to_utf8(p, c);
1099 return to_utf8_fold(p, p, lenp);
1102 /* for now these all assume no locale info available for Unicode > 255 */
1105 Perl_is_uni_alnum_lc(pTHX_ UV c)
1107 return is_uni_alnum(c); /* XXX no locale support yet */
1111 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1113 return is_uni_alnumc(c); /* XXX no locale support yet */
1117 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1119 return is_uni_idfirst(c); /* XXX no locale support yet */
1123 Perl_is_uni_alpha_lc(pTHX_ UV c)
1125 return is_uni_alpha(c); /* XXX no locale support yet */
1129 Perl_is_uni_ascii_lc(pTHX_ UV c)
1131 return is_uni_ascii(c); /* XXX no locale support yet */
1135 Perl_is_uni_space_lc(pTHX_ UV c)
1137 return is_uni_space(c); /* XXX no locale support yet */
1141 Perl_is_uni_digit_lc(pTHX_ UV c)
1143 return is_uni_digit(c); /* XXX no locale support yet */
1147 Perl_is_uni_upper_lc(pTHX_ UV c)
1149 return is_uni_upper(c); /* XXX no locale support yet */
1153 Perl_is_uni_lower_lc(pTHX_ UV c)
1155 return is_uni_lower(c); /* XXX no locale support yet */
1159 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1161 return is_uni_cntrl(c); /* XXX no locale support yet */
1165 Perl_is_uni_graph_lc(pTHX_ UV c)
1167 return is_uni_graph(c); /* XXX no locale support yet */
1171 Perl_is_uni_print_lc(pTHX_ UV c)
1173 return is_uni_print(c); /* XXX no locale support yet */
1177 Perl_is_uni_punct_lc(pTHX_ UV c)
1179 return is_uni_punct(c); /* XXX no locale support yet */
1183 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1185 return is_uni_xdigit(c); /* XXX no locale support yet */
1189 Perl_to_uni_upper_lc(pTHX_ U32 c)
1191 /* XXX returns only the first character -- do not use XXX */
1192 /* XXX no locale support yet */
1194 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1195 return (U32)to_uni_upper(c, tmpbuf, &len);
1199 Perl_to_uni_title_lc(pTHX_ U32 c)
1201 /* XXX returns only the first character XXX -- do not use XXX */
1202 /* XXX no locale support yet */
1204 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1205 return (U32)to_uni_title(c, tmpbuf, &len);
1209 Perl_to_uni_lower_lc(pTHX_ U32 c)
1211 /* XXX returns only the first character -- do not use XXX */
1212 /* XXX no locale support yet */
1214 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1215 return (U32)to_uni_lower(c, tmpbuf, &len);
1219 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1220 const char *const swashname)
1223 if (!is_utf8_char(p))
1226 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1227 return swash_fetch(*swash, p, TRUE) != 0;
1231 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1234 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1235 * descendant of isalnum(3), in other words, it doesn't
1236 * contain the '_'. --jhi */
1237 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1241 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1244 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1248 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1253 /* is_utf8_idstart would be more logical. */
1254 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1258 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1263 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1267 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1270 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1274 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1277 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1281 Perl_is_utf8_space(pTHX_ const U8 *p)
1284 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1288 Perl_is_utf8_digit(pTHX_ const U8 *p)
1291 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1295 Perl_is_utf8_upper(pTHX_ const U8 *p)
1298 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1302 Perl_is_utf8_lower(pTHX_ const U8 *p)
1305 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1309 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1312 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1316 Perl_is_utf8_graph(pTHX_ const U8 *p)
1319 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1323 Perl_is_utf8_print(pTHX_ const U8 *p)
1326 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1330 Perl_is_utf8_punct(pTHX_ const U8 *p)
1333 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1337 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1340 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1344 Perl_is_utf8_mark(pTHX_ const U8 *p)
1347 return is_utf8_common(p, &PL_utf8_mark, "IsM");
1351 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1353 The "p" contains the pointer to the UTF-8 string encoding
1354 the character that is being converted.
1356 The "ustrp" is a pointer to the character buffer to put the
1357 conversion result to. The "lenp" is a pointer to the length
1360 The "swashp" is a pointer to the swash to use.
1362 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1363 and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
1364 but not always, a multicharacter mapping), is tried first.
1366 The "special" is a string like "utf8::ToSpecLower", which means the
1367 hash %utf8::ToSpecLower. The access to the hash is through
1368 Perl_to_utf8_case().
1370 The "normal" is a string like "ToLower" which means the swash
1376 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1377 SV **swashp, const char *normal, const char *special)
1380 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1383 const UV uv0 = utf8_to_uvchr(p, NULL);
1384 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1385 * are necessary in EBCDIC, they are redundant no-ops
1386 * in ASCII-ish platforms, and hopefully optimized away. */
1387 const UV uv1 = NATIVE_TO_UNI(uv0);
1388 uvuni_to_utf8(tmpbuf, uv1);
1390 if (!*swashp) /* load on-demand */
1391 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1393 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1394 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1395 /* It might be "special" (sometimes, but not always,
1396 * a multicharacter mapping) */
1397 HV * const hv = get_hv(special, FALSE);
1401 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1405 s = SvPV_const(*svp, len);
1407 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1410 /* If we have EBCDIC we need to remap the characters
1411 * since any characters in the low 256 are Unicode
1412 * code points, not EBCDIC. */
1413 U8 *t = (U8*)s, *tend = t + len, *d;
1420 const UV c = utf8_to_uvchr(t, &tlen);
1422 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1431 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1436 Copy(tmpbuf, ustrp, len, U8);
1438 Copy(s, ustrp, len, U8);
1444 if (!len && *swashp) {
1445 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1448 /* It was "normal" (a single character mapping). */
1449 const UV uv3 = UNI_TO_NATIVE(uv2);
1450 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1454 if (!len) /* Neither: just copy. */
1455 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1460 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1464 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1466 Convert the UTF-8 encoded character at p to its uppercase version and
1467 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1468 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1469 the uppercase version may be longer than the original character.
1471 The first character of the uppercased version is returned
1472 (but note, as explained above, that there may be more.)
1477 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1480 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1481 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1485 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1487 Convert the UTF-8 encoded character at p to its titlecase version and
1488 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1489 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1490 titlecase version may be longer than the original character.
1492 The first character of the titlecased version is returned
1493 (but note, as explained above, that there may be more.)
1498 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1501 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1502 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1506 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1508 Convert the UTF-8 encoded character at p to its lowercase version and
1509 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1510 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1511 lowercase version may be longer than the original character.
1513 The first character of the lowercased version is returned
1514 (but note, as explained above, that there may be more.)
1519 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1522 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1523 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1527 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1529 Convert the UTF-8 encoded character at p to its foldcase version and
1530 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1531 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1532 foldcase version may be longer than the original character (up to
1535 The first character of the foldcased version is returned
1536 (but note, as explained above, that there may be more.)
1541 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1544 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1549 * A "swash" is a swatch hash.
1550 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1551 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1552 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1555 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1559 SV* const tokenbufsv = sv_newmortal();
1561 const size_t pkg_len = strlen(pkg);
1562 const size_t name_len = strlen(name);
1563 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1566 PUSHSTACKi(PERLSI_MAGIC);
1571 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1573 errsv_save = newSVsv(ERRSV);
1574 /* It is assumed that callers of this routine are not passing in any
1575 user derived data. */
1576 /* Need to do this after save_re_context() as it will set PL_tainted to
1577 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1578 Even line to create errsv_save can turn on PL_tainted. */
1579 SAVEBOOL(PL_tainted);
1581 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1584 sv_setsv(ERRSV, errsv_save);
1585 SvREFCNT_dec(errsv_save);
1591 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1592 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1594 PUSHs(sv_2mortal(newSViv(minbits)));
1595 PUSHs(sv_2mortal(newSViv(none)));
1597 if (IN_PERL_COMPILETIME) {
1598 /* XXX ought to be handled by lex_start */
1601 sv_setpv(tokenbufsv, PL_tokenbuf);
1603 errsv_save = newSVsv(ERRSV);
1604 if (call_method("SWASHNEW", G_SCALAR))
1605 retval = newSVsv(*PL_stack_sp--);
1607 retval = &PL_sv_undef;
1609 sv_setsv(ERRSV, errsv_save);
1610 SvREFCNT_dec(errsv_save);
1613 if (IN_PERL_COMPILETIME) {
1615 const char* const pv = SvPV_const(tokenbufsv, len);
1617 Copy(pv, PL_tokenbuf, len+1, char);
1618 CopHINTS_set(PL_curcop, PL_hints);
1620 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1622 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1624 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1630 /* This API is wrong for special case conversions since we may need to
1631 * return several Unicode characters for a single Unicode character
1632 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1633 * the lower-level routine, and it is similarly broken for returning
1634 * multiple values. --jhi */
1635 /* Now SWASHGET is recasted into S_swash_get in this file. */
1638 * Returns the value of property/mapping C<swash> for the first character
1639 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1640 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1641 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1644 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1647 HV* const hv = (HV*)SvRV(swash);
1652 const U8 *tmps = NULL;
1656 const UV c = NATIVE_TO_ASCII(*ptr);
1658 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1659 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1660 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1663 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1664 * then the "swatch" is a vec() for al the chars which start
1666 * So the key in the hash (klen) is length of encoded char -1
1668 klen = UTF8SKIP(ptr) - 1;
1672 /* If char in invariant then swatch is for all the invariant chars
1673 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1675 needents = UTF_CONTINUATION_MARK;
1676 off = NATIVE_TO_UTF(ptr[klen]);
1679 /* If char is encoded then swatch is for the prefix */
1680 needents = (1 << UTF_ACCUMULATION_SHIFT);
1681 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1685 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1686 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1687 * it's nothing to sniff at.) Pity we usually come through at least
1688 * two function calls to get here...
1690 * NB: this code assumes that swatches are never modified, once generated!
1693 if (hv == PL_last_swash_hv &&
1694 klen == PL_last_swash_klen &&
1695 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1697 tmps = PL_last_swash_tmps;
1698 slen = PL_last_swash_slen;
1701 /* Try our second-level swatch cache, kept in a hash. */
1702 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1704 /* If not cached, generate it via swash_get */
1705 if (!svp || !SvPOK(*svp)
1706 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1707 /* We use utf8n_to_uvuni() as we want an index into
1708 Unicode tables, not a native character number.
1710 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1712 0 : UTF8_ALLOW_ANY);
1713 swatch = swash_get(swash,
1714 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1715 (klen) ? (code_point & ~(needents - 1)) : 0,
1718 if (IN_PERL_COMPILETIME)
1719 CopHINTS_set(PL_curcop, PL_hints);
1721 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1723 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1724 || (slen << 3) < needents)
1725 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1728 PL_last_swash_hv = hv;
1729 PL_last_swash_klen = klen;
1730 /* FIXME change interpvar.h? */
1731 PL_last_swash_tmps = (U8 *) tmps;
1732 PL_last_swash_slen = slen;
1734 Copy(ptr, PL_last_swash_key, klen, U8);
1737 switch ((int)((slen << 3) / needents)) {
1739 bit = 1 << (off & 7);
1741 return (tmps[off] & bit) != 0;
1746 return (tmps[off] << 8) + tmps[off + 1] ;
1749 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1751 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1752 NORETURN_FUNCTION_END;
1756 * Returns a swatch (a bit vector string) for a code point sequence
1757 * that starts from the value C<start> and comprises the number C<span>.
1758 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1759 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1762 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1765 U8 *l, *lend, *x, *xend, *s;
1766 STRLEN lcur, xcur, scur;
1768 HV* const hv = (HV*)SvRV(swash);
1769 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1770 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1771 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1772 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1773 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1774 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1775 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1776 const STRLEN bits = SvUV(*bitssvp);
1777 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1778 const UV none = SvUV(*nonesvp);
1779 const UV end = start + span;
1781 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1782 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1786 /* create and initialize $swatch */
1787 swatch = newSVpvs("");
1788 scur = octets ? (span * octets) : (span + 7) / 8;
1789 SvGROW(swatch, scur + 1);
1790 s = (U8*)SvPVX(swatch);
1791 if (octets && none) {
1792 const U8* const e = s + scur;
1795 *s++ = (U8)(none & 0xff);
1796 else if (bits == 16) {
1797 *s++ = (U8)((none >> 8) & 0xff);
1798 *s++ = (U8)( none & 0xff);
1800 else if (bits == 32) {
1801 *s++ = (U8)((none >> 24) & 0xff);
1802 *s++ = (U8)((none >> 16) & 0xff);
1803 *s++ = (U8)((none >> 8) & 0xff);
1804 *s++ = (U8)( none & 0xff);
1810 (void)memzero((U8*)s, scur + 1);
1812 SvCUR_set(swatch, scur);
1813 s = (U8*)SvPVX(swatch);
1815 /* read $swash->{LIST} */
1816 l = (U8*)SvPV(*listsvp, lcur);
1821 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1823 U8* const nl = (U8*)memchr(l, '\n', lend - l);
1826 min = grok_hex((char *)l, &numlen, &flags, NULL);
1830 l = nl + 1; /* 1 is length of "\n" */
1834 l = lend; /* to LIST's end at which \n is not found */
1840 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1842 max = grok_hex((char *)l, &numlen, &flags, NULL);
1851 flags = PERL_SCAN_SILENT_ILLDIGIT |
1852 PERL_SCAN_DISALLOW_PREFIX;
1854 val = grok_hex((char *)l, &numlen, &flags, NULL);
1863 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1869 val = 0; /* bits == 1, then val should be ignored */
1876 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1880 val = 0; /* bits == 1, then val should be ignored */
1894 if (!none || val < none) {
1899 for (key = min; key <= max; key++) {
1903 /* offset must be non-negative (start <= min <= key < end) */
1904 offset = octets * (key - start);
1906 s[offset] = (U8)(val & 0xff);
1907 else if (bits == 16) {
1908 s[offset ] = (U8)((val >> 8) & 0xff);
1909 s[offset + 1] = (U8)( val & 0xff);
1911 else if (bits == 32) {
1912 s[offset ] = (U8)((val >> 24) & 0xff);
1913 s[offset + 1] = (U8)((val >> 16) & 0xff);
1914 s[offset + 2] = (U8)((val >> 8) & 0xff);
1915 s[offset + 3] = (U8)( val & 0xff);
1918 if (!none || val < none)
1922 else { /* bits == 1, then val should be ignored */
1926 for (key = min; key <= max; key++) {
1927 const STRLEN offset = (STRLEN)(key - start);
1930 s[offset >> 3] |= 1 << (offset & 7);
1936 /* read $swash->{EXTRAS} */
1937 x = (U8*)SvPV(*extssvp, xcur);
1945 SV **otherbitssvp, *other;
1949 const U8 opc = *x++;
1953 nl = (U8*)memchr(x, '\n', xend - x);
1955 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1957 x = nl + 1; /* 1 is length of "\n" */
1961 x = xend; /* to EXTRAS' end at which \n is not found */
1968 namelen = nl - namestr;
1972 namelen = xend - namestr;
1976 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1977 otherhv = (HV*)SvRV(*othersvp);
1978 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1979 otherbits = (STRLEN)SvUV(*otherbitssvp);
1980 if (bits < otherbits)
1981 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1983 /* The "other" swatch must be destroyed after. */
1984 other = swash_get(*othersvp, start, span);
1985 o = (U8*)SvPV(other, olen);
1988 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1990 s = (U8*)SvPV(swatch, slen);
1991 if (bits == 1 && otherbits == 1) {
1993 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2017 STRLEN otheroctets = otherbits >> 3;
2019 U8* const send = s + slen;
2024 if (otherbits == 1) {
2025 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2029 STRLEN vlen = otheroctets;
2037 if (opc == '+' && otherval)
2038 NOOP; /* replace with otherval */
2039 else if (opc == '!' && !otherval)
2041 else if (opc == '-' && otherval)
2043 else if (opc == '&' && !otherval)
2046 s += octets; /* no replacement */
2051 *s++ = (U8)( otherval & 0xff);
2052 else if (bits == 16) {
2053 *s++ = (U8)((otherval >> 8) & 0xff);
2054 *s++ = (U8)( otherval & 0xff);
2056 else if (bits == 32) {
2057 *s++ = (U8)((otherval >> 24) & 0xff);
2058 *s++ = (U8)((otherval >> 16) & 0xff);
2059 *s++ = (U8)((otherval >> 8) & 0xff);
2060 *s++ = (U8)( otherval & 0xff);
2064 sv_free(other); /* through with it! */
2070 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2072 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2073 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2074 bytes available. The return value is the pointer to the byte after the
2075 end of the new character. In other words,
2077 d = uvchr_to_utf8(d, uv);
2079 is the recommended wide native character-aware way of saying
2086 /* On ASCII machines this is normally a macro but we want a
2087 real function in case XS code wants it
2090 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2092 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2096 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2098 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2102 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2105 Returns the native character value of the first character in the string
2107 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2108 length, in bytes, of that character.
2110 Allows length and flags to be passed to low level routine.
2114 /* On ASCII machines this is normally a macro but we want
2115 a real function in case XS code wants it
2118 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2121 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2122 return UNI_TO_NATIVE(uv);
2126 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2128 Build to the scalar dsv a displayable version of the string spv,
2129 length len, the displayable version being at most pvlim bytes long
2130 (if longer, the rest is truncated and "..." will be appended).
2132 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2133 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2134 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2135 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2136 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2137 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2139 The pointer to the PV of the dsv is returned.
2143 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2148 sv_setpvn(dsv, "", 0);
2149 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2151 /* This serves double duty as a flag and a character to print after
2152 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2156 if (pvlim && SvCUR(dsv) >= pvlim) {
2160 u = utf8_to_uvchr((U8*)s, 0);
2162 const unsigned char c = (unsigned char)u & 0xFF;
2163 if (flags & UNI_DISPLAY_BACKSLASH) {
2180 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2183 /* isPRINT() is the locale-blind version. */
2184 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2185 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2190 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2193 sv_catpvs(dsv, "...");
2199 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2201 Build to the scalar dsv a displayable version of the scalar sv,
2202 the displayable version being at most pvlim bytes long
2203 (if longer, the rest is truncated and "..." will be appended).
2205 The flags argument is as in pv_uni_display().
2207 The pointer to the PV of the dsv is returned.
2212 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2214 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2215 SvCUR(ssv), pvlim, flags);
2219 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2221 Return true if the strings s1 and s2 differ case-insensitively, false
2222 if not (if they are equal case-insensitively). If u1 is true, the
2223 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
2224 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2225 are false, the respective string is assumed to be in native 8-bit
2228 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2229 in there (they will point at the beginning of the I<next> character).
2230 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2231 pointers beyond which scanning will not continue under any
2232 circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
2233 s2+l2 will be used as goal end pointers that will also stop the scan,
2234 and which qualify towards defining a successful match: all the scans
2235 that define an explicit length must reach their goal pointers for
2236 a match to succeed).
2238 For case-insensitiveness, the "casefolding" of Unicode is used
2239 instead of upper/lowercasing both the characters, see
2240 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2244 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2247 register const U8 *p1 = (const U8*)s1;
2248 register const U8 *p2 = (const U8*)s2;
2249 register const U8 *f1 = NULL;
2250 register const U8 *f2 = NULL;
2251 register U8 *e1 = NULL;
2252 register U8 *q1 = NULL;
2253 register U8 *e2 = NULL;
2254 register U8 *q2 = NULL;
2255 STRLEN n1 = 0, n2 = 0;
2256 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2257 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2259 STRLEN foldlen1, foldlen2;
2264 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2265 f1 = (const U8*)s1 + l1;
2268 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2269 f2 = (const U8*)s2 + l2;
2271 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2272 return 1; /* mismatch; possible infinite loop or false positive */
2275 natbuf[1] = 0; /* Need to terminate the buffer. */
2277 while ((e1 == 0 || p1 < e1) &&
2278 (f1 == 0 || p1 < f1) &&
2279 (e2 == 0 || p2 < e2) &&
2280 (f2 == 0 || p2 < f2)) {
2283 to_utf8_fold(p1, foldbuf1, &foldlen1);
2285 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2286 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2293 to_utf8_fold(p2, foldbuf2, &foldlen2);
2295 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2296 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2302 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2303 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2304 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2305 return 1; /* mismatch */
2312 p1 += u1 ? UTF8SKIP(p1) : 1;
2314 p2 += u2 ? UTF8SKIP(p2) : 1;
2318 /* A match is defined by all the scans that specified
2319 * an explicit length reaching their final goals. */
2320 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2329 return match ? 0 : 1; /* 0 match, 1 mismatch */
2334 * c-indentation-style: bsd
2336 * indent-tabs-mode: t
2339 * ex: set ts=8 sts=4 sw=4 noet: