3 * Copyright (c) 1998-2002, Larry Wall
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
24 #define PERL_IN_UTF8_C
27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
30 =head1 Unicode Support
32 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
34 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
35 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36 bytes available. The return value is the pointer to the byte after the
37 end of the new character. In other words,
39 d = uvuni_to_utf8_flags(d, uv, flags);
43 d = uvuni_to_utf8(d, uv);
45 (which is equivalent to)
47 d = uvuni_to_utf8_flags(d, uv, 0);
49 is the recommended Unicode-aware way of saying
57 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
59 if (ckWARN(WARN_UTF8)) {
60 if (UNICODE_IS_SURROGATE(uv) &&
61 !(flags & UNICODE_ALLOW_SURROGATE))
62 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
64 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65 !(flags & UNICODE_ALLOW_FDD0))
67 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
68 !(flags & UNICODE_ALLOW_FFFF))) &&
69 /* UNICODE_ALLOW_SUPER includes
70 * FFFEs and FFFFs beyond 0x10FFFF. */
71 ((uv <= PERL_UNICODE_MAX) ||
72 !(flags & UNICODE_ALLOW_SUPER))
74 Perl_warner(aTHX_ packWARN(WARN_UTF8),
75 "Unicode character 0x%04"UVxf" is illegal", uv);
77 if (UNI_IS_INVARIANT(uv)) {
78 *d++ = (U8)UTF_TO_NATIVE(uv);
83 STRLEN len = UNISKIP(uv);
86 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
87 uv >>= UTF_ACCUMULATION_SHIFT;
89 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
92 #else /* Non loop style */
94 *d++ = (U8)(( uv >> 6) | 0xc0);
95 *d++ = (U8)(( uv & 0x3f) | 0x80);
99 *d++ = (U8)(( uv >> 12) | 0xe0);
100 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
101 *d++ = (U8)(( uv & 0x3f) | 0x80);
105 *d++ = (U8)(( uv >> 18) | 0xf0);
106 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
107 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
108 *d++ = (U8)(( uv & 0x3f) | 0x80);
111 if (uv < 0x4000000) {
112 *d++ = (U8)(( uv >> 24) | 0xf8);
113 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
119 if (uv < 0x80000000) {
120 *d++ = (U8)(( uv >> 30) | 0xfc);
121 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
129 if (uv < UTF8_QUAD_MAX)
132 *d++ = 0xfe; /* Can't match U+FEFF! */
133 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
134 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
135 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
136 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
138 *d++ = (U8)(( uv & 0x3f) | 0x80);
143 *d++ = 0xff; /* Can't match U+FFFE! */
144 *d++ = 0x80; /* 6 Reserved bits */
145 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
146 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
147 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
148 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
155 *d++ = (U8)(( uv & 0x3f) | 0x80);
159 #endif /* Loop style */
163 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
165 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
170 =for apidoc A|STRLEN|is_utf8_char|U8 *s
172 Tests if some arbitrary number of bytes begins in a valid UTF-8
173 character. Note that an INVARIANT (i.e. ASCII) character is a valid
174 UTF-8 character. The actual number of bytes in the UTF-8 character
175 will be returned if it is valid, otherwise 0.
179 Perl_is_utf8_char(pTHX_ U8 *s)
185 if (UTF8_IS_INVARIANT(u))
188 if (!UTF8_IS_START(u))
193 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
198 u &= UTF_START_MASK(len);
202 if (!UTF8_IS_CONTINUATION(*s))
204 uv = UTF8_ACCUMULATE(uv, *s);
211 if ((STRLEN)UNISKIP(uv) < len)
218 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
220 Returns true if first C<len> bytes of the given string form a valid UTF8
221 string, false otherwise. Note that 'a valid UTF8 string' does not mean
222 'a string that contains UTF8' because a valid ASCII string is a valid
229 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
236 len = strlen((char *)s);
252 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
254 Bottom level UTF-8 decode routine.
255 Returns the unicode code point value of the first character in the string C<s>
256 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
257 C<retlen> will be set to the length, in bytes, of that character.
259 If C<s> does not point to a well-formed UTF8 character, the behaviour
260 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
261 it is assumed that the caller will raise a warning, and this function
262 will silently just set C<retlen> to C<-1> and return zero. If the
263 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
264 malformations will be given, C<retlen> will be set to the expected
265 length of the UTF-8 character in bytes, and zero will be returned.
267 The C<flags> can also contain various flags to allow deviations from
268 the strict UTF-8 encoding (see F<utf8.h>).
270 Most code should use utf8_to_uvchr() rather than call this directly.
276 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
281 bool dowarn = ckWARN_d(WARN_UTF8);
283 STRLEN expectlen = 0;
286 /* This list is a superset of the UTF8_ALLOW_XXX. */
288 #define UTF8_WARN_EMPTY 1
289 #define UTF8_WARN_CONTINUATION 2
290 #define UTF8_WARN_NON_CONTINUATION 3
291 #define UTF8_WARN_FE_FF 4
292 #define UTF8_WARN_SHORT 5
293 #define UTF8_WARN_OVERFLOW 6
294 #define UTF8_WARN_SURROGATE 7
295 #define UTF8_WARN_LONG 8
296 #define UTF8_WARN_FFFF 9 /* Also FFFE. */
299 !(flags & UTF8_ALLOW_EMPTY)) {
300 warning = UTF8_WARN_EMPTY;
304 if (UTF8_IS_INVARIANT(uv)) {
307 return (UV) (NATIVE_TO_UTF(*s));
310 if (UTF8_IS_CONTINUATION(uv) &&
311 !(flags & UTF8_ALLOW_CONTINUATION)) {
312 warning = UTF8_WARN_CONTINUATION;
316 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
317 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
318 warning = UTF8_WARN_NON_CONTINUATION;
323 uv = NATIVE_TO_UTF(uv);
325 if ((uv == 0xfe || uv == 0xff) &&
326 !(flags & UTF8_ALLOW_FE_FF)) {
327 warning = UTF8_WARN_FE_FF;
332 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
333 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
334 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
335 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
337 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
338 else { len = 7; uv &= 0x01; }
340 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
341 else if (!(uv & 0x01)) { len = 7; uv = 0; }
342 else { len = 13; uv = 0; } /* whoa! */
350 if ((curlen < expectlen) &&
351 !(flags & UTF8_ALLOW_SHORT)) {
352 warning = UTF8_WARN_SHORT;
361 if (!UTF8_IS_CONTINUATION(*s) &&
362 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
364 warning = UTF8_WARN_NON_CONTINUATION;
368 uv = UTF8_ACCUMULATE(uv, *s);
370 /* These cannot be allowed. */
372 if (!(flags & UTF8_ALLOW_LONG)) {
373 warning = UTF8_WARN_LONG;
377 else { /* uv < ouv */
378 /* This cannot be allowed. */
379 warning = UTF8_WARN_OVERFLOW;
387 if (UNICODE_IS_SURROGATE(uv) &&
388 !(flags & UTF8_ALLOW_SURROGATE)) {
389 warning = UTF8_WARN_SURROGATE;
391 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
392 !(flags & UTF8_ALLOW_LONG)) {
393 warning = UTF8_WARN_LONG;
395 } else if (UNICODE_IS_ILLEGAL(uv) &&
396 !(flags & UTF8_ALLOW_FFFF)) {
397 warning = UTF8_WARN_FFFF;
405 if (flags & UTF8_CHECK_ONLY) {
412 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
415 case 0: /* Intentionally empty. */ break;
416 case UTF8_WARN_EMPTY:
417 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
419 case UTF8_WARN_CONTINUATION:
420 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
422 case UTF8_WARN_NON_CONTINUATION:
424 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
425 (UV)s[1], startbyte);
427 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
428 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
431 case UTF8_WARN_FE_FF:
432 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
434 case UTF8_WARN_SHORT:
435 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
436 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
437 expectlen = curlen; /* distance for caller to skip */
439 case UTF8_WARN_OVERFLOW:
440 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
443 case UTF8_WARN_SURROGATE:
444 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
447 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
448 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
451 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
454 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
462 Perl_warner(aTHX_ packWARN(WARN_UTF8),
463 "%s in %s", s, OP_DESC(PL_op));
465 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
470 *retlen = expectlen ? expectlen : len;
476 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
478 Returns the native character value of the first character in the string C<s>
479 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
480 length, in bytes, of that character.
482 If C<s> does not point to a well-formed UTF8 character, zero is
483 returned and retlen is set, if possible, to -1.
489 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
491 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
492 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
496 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
498 Returns the Unicode code point of the first character in the string C<s>
499 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
500 length, in bytes, of that character.
502 This function should only be used when returned UV is considered
503 an index into the Unicode semantic tables (e.g. swashes).
505 If C<s> does not point to a well-formed UTF8 character, zero is
506 returned and retlen is set, if possible, to -1.
512 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
514 /* Call the low level routine asking for checks */
515 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
516 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
520 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
522 Return the length of the UTF-8 char encoded string C<s> in characters.
523 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
524 up past C<e>, croaks.
530 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
534 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
535 * the bitops (especially ~) can create illegal UTF-8.
536 * In other words: in Perl UTF-8 is not just for Unicode. */
539 if (ckWARN_d(WARN_UTF8)) {
541 Perl_warner(aTHX_ packWARN(WARN_UTF8),
542 "%s in %s", unees, OP_DESC(PL_op));
544 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
552 if (ckWARN_d(WARN_UTF8)) {
554 Perl_warner(aTHX_ packWARN(WARN_UTF8),
555 unees, OP_DESC(PL_op));
557 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
569 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
571 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
574 WARNING: use only if you *know* that the pointers point inside the
581 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
585 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
586 * the bitops (especially ~) can create illegal UTF-8.
587 * In other words: in Perl UTF-8 is not just for Unicode. */
594 if (ckWARN_d(WARN_UTF8)) {
596 Perl_warner(aTHX_ packWARN(WARN_UTF8),
597 "%s in %s", unees, OP_DESC(PL_op));
599 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
612 if (ckWARN_d(WARN_UTF8)) {
614 Perl_warner(aTHX_ packWARN(WARN_UTF8),
615 "%s in %s", unees, OP_DESC(PL_op));
617 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
630 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
632 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
635 WARNING: do not use the following unless you *know* C<off> is within
636 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
637 on the first byte of character or just after the last byte of a character.
643 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
645 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
646 * the bitops (especially ~) can create illegal UTF-8.
647 * In other words: in Perl UTF-8 is not just for Unicode. */
656 while (UTF8_IS_CONTINUATION(*s))
664 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
666 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
667 Unlike C<bytes_to_utf8>, this over-writes the original string, and
668 updates len to contain the new length.
669 Returns zero on failure, setting C<len> to -1.
675 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
681 /* ensure valid UTF8 and chars < 256 before updating string */
682 for (send = s + *len; s < send; ) {
685 if (!UTF8_IS_INVARIANT(c) &&
686 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
687 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
696 *d++ = (U8)utf8_to_uvchr(s, &ulen);
705 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
707 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
708 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
709 the newly-created string, and updates C<len> to contain the new
710 length. Returns the original string if no conversion occurs, C<len>
711 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
712 0 if C<s> is converted or contains all 7bit characters.
718 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
728 /* ensure valid UTF8 and chars < 256 before converting string */
729 for (send = s + *len; s < send;) {
731 if (!UTF8_IS_INVARIANT(c)) {
732 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
733 (c = *s++) && UTF8_IS_CONTINUATION(c))
742 Newz(801, d, (*len) - count + 1, U8);
743 s = start; start = d;
746 if (!UTF8_IS_INVARIANT(c)) {
747 /* Then it is two-byte encoded */
748 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
749 c = ASCII_TO_NATIVE(c);
759 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
761 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
762 Returns a pointer to the newly-created string, and sets C<len> to
763 reflect the new length.
769 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
776 Newz(801, d, (*len) * 2 + 1, U8);
780 UV uv = NATIVE_TO_ASCII(*s++);
781 if (UNI_IS_INVARIANT(uv))
782 *d++ = (U8)UTF_TO_NATIVE(uv);
784 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
785 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
794 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
796 * Destination must be pre-extended to 3/2 source. Do not use in-place.
797 * We optimize for native, for obvious reasons. */
800 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
806 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
811 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
818 *d++ = (U8)(( uv >> 6) | 0xc0);
819 *d++ = (U8)(( uv & 0x3f) | 0x80);
822 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
824 if (low < 0xdc00 || low >= 0xdfff)
825 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
826 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
829 *d++ = (U8)(( uv >> 12) | 0xe0);
830 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
831 *d++ = (U8)(( uv & 0x3f) | 0x80);
835 *d++ = (U8)(( uv >> 18) | 0xf0);
836 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
837 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
838 *d++ = (U8)(( uv & 0x3f) | 0x80);
842 *newlen = d - dstart;
846 /* Note: this one is slightly destructive of the source. */
849 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
852 U8* send = s + bytelen;
859 return utf16_to_utf8(p, d, bytelen, newlen);
862 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
865 Perl_is_uni_alnum(pTHX_ UV c)
867 U8 tmpbuf[UTF8_MAXLEN+1];
868 uvchr_to_utf8(tmpbuf, c);
869 return is_utf8_alnum(tmpbuf);
873 Perl_is_uni_alnumc(pTHX_ UV c)
875 U8 tmpbuf[UTF8_MAXLEN+1];
876 uvchr_to_utf8(tmpbuf, c);
877 return is_utf8_alnumc(tmpbuf);
881 Perl_is_uni_idfirst(pTHX_ UV c)
883 U8 tmpbuf[UTF8_MAXLEN+1];
884 uvchr_to_utf8(tmpbuf, c);
885 return is_utf8_idfirst(tmpbuf);
889 Perl_is_uni_alpha(pTHX_ UV c)
891 U8 tmpbuf[UTF8_MAXLEN+1];
892 uvchr_to_utf8(tmpbuf, c);
893 return is_utf8_alpha(tmpbuf);
897 Perl_is_uni_ascii(pTHX_ UV c)
899 U8 tmpbuf[UTF8_MAXLEN+1];
900 uvchr_to_utf8(tmpbuf, c);
901 return is_utf8_ascii(tmpbuf);
905 Perl_is_uni_space(pTHX_ UV c)
907 U8 tmpbuf[UTF8_MAXLEN+1];
908 uvchr_to_utf8(tmpbuf, c);
909 return is_utf8_space(tmpbuf);
913 Perl_is_uni_digit(pTHX_ UV c)
915 U8 tmpbuf[UTF8_MAXLEN+1];
916 uvchr_to_utf8(tmpbuf, c);
917 return is_utf8_digit(tmpbuf);
921 Perl_is_uni_upper(pTHX_ UV c)
923 U8 tmpbuf[UTF8_MAXLEN+1];
924 uvchr_to_utf8(tmpbuf, c);
925 return is_utf8_upper(tmpbuf);
929 Perl_is_uni_lower(pTHX_ UV c)
931 U8 tmpbuf[UTF8_MAXLEN+1];
932 uvchr_to_utf8(tmpbuf, c);
933 return is_utf8_lower(tmpbuf);
937 Perl_is_uni_cntrl(pTHX_ UV c)
939 U8 tmpbuf[UTF8_MAXLEN+1];
940 uvchr_to_utf8(tmpbuf, c);
941 return is_utf8_cntrl(tmpbuf);
945 Perl_is_uni_graph(pTHX_ UV c)
947 U8 tmpbuf[UTF8_MAXLEN+1];
948 uvchr_to_utf8(tmpbuf, c);
949 return is_utf8_graph(tmpbuf);
953 Perl_is_uni_print(pTHX_ UV c)
955 U8 tmpbuf[UTF8_MAXLEN+1];
956 uvchr_to_utf8(tmpbuf, c);
957 return is_utf8_print(tmpbuf);
961 Perl_is_uni_punct(pTHX_ UV c)
963 U8 tmpbuf[UTF8_MAXLEN+1];
964 uvchr_to_utf8(tmpbuf, c);
965 return is_utf8_punct(tmpbuf);
969 Perl_is_uni_xdigit(pTHX_ UV c)
971 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
972 uvchr_to_utf8(tmpbuf, c);
973 return is_utf8_xdigit(tmpbuf);
977 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
980 return to_utf8_upper(p, p, lenp);
984 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
987 return to_utf8_title(p, p, lenp);
991 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
994 return to_utf8_lower(p, p, lenp);
998 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1000 uvchr_to_utf8(p, c);
1001 return to_utf8_fold(p, p, lenp);
1004 /* for now these all assume no locale info available for Unicode > 255 */
1007 Perl_is_uni_alnum_lc(pTHX_ UV c)
1009 return is_uni_alnum(c); /* XXX no locale support yet */
1013 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1015 return is_uni_alnumc(c); /* XXX no locale support yet */
1019 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1021 return is_uni_idfirst(c); /* XXX no locale support yet */
1025 Perl_is_uni_alpha_lc(pTHX_ UV c)
1027 return is_uni_alpha(c); /* XXX no locale support yet */
1031 Perl_is_uni_ascii_lc(pTHX_ UV c)
1033 return is_uni_ascii(c); /* XXX no locale support yet */
1037 Perl_is_uni_space_lc(pTHX_ UV c)
1039 return is_uni_space(c); /* XXX no locale support yet */
1043 Perl_is_uni_digit_lc(pTHX_ UV c)
1045 return is_uni_digit(c); /* XXX no locale support yet */
1049 Perl_is_uni_upper_lc(pTHX_ UV c)
1051 return is_uni_upper(c); /* XXX no locale support yet */
1055 Perl_is_uni_lower_lc(pTHX_ UV c)
1057 return is_uni_lower(c); /* XXX no locale support yet */
1061 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1063 return is_uni_cntrl(c); /* XXX no locale support yet */
1067 Perl_is_uni_graph_lc(pTHX_ UV c)
1069 return is_uni_graph(c); /* XXX no locale support yet */
1073 Perl_is_uni_print_lc(pTHX_ UV c)
1075 return is_uni_print(c); /* XXX no locale support yet */
1079 Perl_is_uni_punct_lc(pTHX_ UV c)
1081 return is_uni_punct(c); /* XXX no locale support yet */
1085 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1087 return is_uni_xdigit(c); /* XXX no locale support yet */
1091 Perl_to_uni_upper_lc(pTHX_ U32 c)
1093 /* XXX returns only the first character -- do not use XXX */
1094 /* XXX no locale support yet */
1096 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1097 return (U32)to_uni_upper(c, tmpbuf, &len);
1101 Perl_to_uni_title_lc(pTHX_ U32 c)
1103 /* XXX returns only the first character XXX -- do not use XXX */
1104 /* XXX no locale support yet */
1106 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1107 return (U32)to_uni_title(c, tmpbuf, &len);
1111 Perl_to_uni_lower_lc(pTHX_ U32 c)
1113 /* XXX returns only the first character -- do not use XXX */
1114 /* XXX no locale support yet */
1116 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1117 return (U32)to_uni_lower(c, tmpbuf, &len);
1121 Perl_is_utf8_alnum(pTHX_ U8 *p)
1123 if (!is_utf8_char(p))
1126 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1127 * descendant of isalnum(3), in other words, it doesn't
1128 * contain the '_'. --jhi */
1129 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1130 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1131 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1132 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1134 PL_utf8_alnum = swash_init("utf8", "",
1135 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1136 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1141 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1143 if (!is_utf8_char(p))
1146 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1147 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1148 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1149 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1151 PL_utf8_alnum = swash_init("utf8", "",
1152 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1153 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1158 Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1162 if (!is_utf8_char(p))
1164 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1165 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1166 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1170 Perl_is_utf8_idcont(pTHX_ U8 *p)
1174 if (!is_utf8_char(p))
1176 if (!PL_utf8_idcont)
1177 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1178 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1182 Perl_is_utf8_alpha(pTHX_ U8 *p)
1184 if (!is_utf8_char(p))
1187 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1188 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1192 Perl_is_utf8_ascii(pTHX_ U8 *p)
1194 if (!is_utf8_char(p))
1197 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1198 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1202 Perl_is_utf8_space(pTHX_ U8 *p)
1204 if (!is_utf8_char(p))
1207 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1208 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1212 Perl_is_utf8_digit(pTHX_ U8 *p)
1214 if (!is_utf8_char(p))
1217 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1218 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1222 Perl_is_utf8_upper(pTHX_ U8 *p)
1224 if (!is_utf8_char(p))
1227 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1228 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1232 Perl_is_utf8_lower(pTHX_ U8 *p)
1234 if (!is_utf8_char(p))
1237 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1238 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1242 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1244 if (!is_utf8_char(p))
1247 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1248 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1252 Perl_is_utf8_graph(pTHX_ U8 *p)
1254 if (!is_utf8_char(p))
1257 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1258 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1262 Perl_is_utf8_print(pTHX_ U8 *p)
1264 if (!is_utf8_char(p))
1267 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1268 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1272 Perl_is_utf8_punct(pTHX_ U8 *p)
1274 if (!is_utf8_char(p))
1277 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1278 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1282 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1284 if (!is_utf8_char(p))
1286 if (!PL_utf8_xdigit)
1287 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1288 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1292 Perl_is_utf8_mark(pTHX_ U8 *p)
1294 if (!is_utf8_char(p))
1297 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1298 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1302 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1304 The "p" contains the pointer to the UTF-8 string encoding
1305 the character that is being converted.
1307 The "ustrp" is a pointer to the character buffer to put the
1308 conversion result to. The "lenp" is a pointer to the length
1311 The "swashp" is a pointer to the swash to use.
1313 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1314 and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1315 but not always, a multicharacter mapping), is tried first.
1317 The "special" is a string like "utf8::ToSpecLower", which means the
1318 hash %utf8::ToSpecLower. The access to the hash is through
1319 Perl_to_utf8_case().
1321 The "normal" is a string like "ToLower" which means the swash
1327 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1330 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1333 uv0 = utf8_to_uvchr(p, 0);
1334 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1335 * are necessary in EBCDIC, they are redundant no-ops
1336 * in ASCII-ish platforms, and hopefully optimized away. */
1337 uv1 = NATIVE_TO_UNI(uv0);
1338 uvuni_to_utf8(tmpbuf, uv1);
1340 if (!*swashp) /* load on-demand */
1341 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1344 /* It might be "special" (sometimes, but not always,
1345 * a multicharacter mapping) */
1351 if ((hv = get_hv(special, FALSE)) &&
1352 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1353 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1354 (val = HeVAL(he))) {
1359 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1362 /* If we have EBCDIC we need to remap the characters
1363 * since any characters in the low 256 are Unicode
1364 * code points, not EBCDIC. */
1365 U8 *t = (U8*)s, *tend = t + len, *d;
1372 UV c = utf8_to_uvchr(t, &tlen);
1374 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1383 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1388 Copy(tmpbuf, ustrp, len, U8);
1390 Copy(s, ustrp, len, U8);
1396 if (!len && *swashp) {
1397 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1400 /* It was "normal" (a single character mapping). */
1401 UV uv3 = UNI_TO_NATIVE(uv2);
1403 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1407 if (!len) /* Neither: just copy. */
1408 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1413 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1417 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1419 Convert the UTF-8 encoded character at p to its uppercase version and
1420 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1421 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1422 uppercase version may be longer than the original character (up to two
1425 The first character of the uppercased version is returned
1426 (but note, as explained above, that there may be more.)
1431 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1433 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1434 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1438 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1440 Convert the UTF-8 encoded character at p to its titlecase version and
1441 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1442 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1443 titlecase version may be longer than the original character (up to two
1446 The first character of the titlecased version is returned
1447 (but note, as explained above, that there may be more.)
1452 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1454 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1455 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1459 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1461 Convert the UTF-8 encoded character at p to its lowercase version and
1462 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1463 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1464 lowercase version may be longer than the original character (up to two
1467 The first character of the lowercased version is returned
1468 (but note, as explained above, that there may be more.)
1473 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1475 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1476 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1480 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1482 Convert the UTF-8 encoded character at p to its foldcase version and
1483 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1484 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1485 foldcase version may be longer than the original character (up to
1488 The first character of the foldcased version is returned
1489 (but note, as explained above, that there may be more.)
1494 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1496 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1497 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1500 /* a "swash" is a swatch hash */
1503 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1506 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1508 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1511 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1513 errsv_save = newSVsv(ERRSV);
1514 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1516 sv_setsv(ERRSV, errsv_save);
1517 SvREFCNT_dec(errsv_save);
1521 PUSHSTACKi(PERLSI_MAGIC);
1524 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1525 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1527 PUSHs(sv_2mortal(newSViv(minbits)));
1528 PUSHs(sv_2mortal(newSViv(none)));
1534 if (PL_curcop == &PL_compiling) {
1535 /* XXX ought to be handled by lex_start */
1537 sv_setpv(tokenbufsv, PL_tokenbuf);
1539 errsv_save = newSVsv(ERRSV);
1540 if (call_method("SWASHNEW", G_SCALAR))
1541 retval = newSVsv(*PL_stack_sp--);
1543 retval = &PL_sv_undef;
1545 sv_setsv(ERRSV, errsv_save);
1546 SvREFCNT_dec(errsv_save);
1549 if (PL_curcop == &PL_compiling) {
1551 char* pv = SvPV(tokenbufsv, len);
1553 Copy(pv, PL_tokenbuf, len+1, char);
1554 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1556 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1558 Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1559 SvPV_nolen(retval));
1560 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1566 /* This API is wrong for special case conversions since we may need to
1567 * return several Unicode characters for a single Unicode character
1568 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1569 * the lower-level routine, and it is similarly broken for returning
1570 * multiple values. --jhi */
1572 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1574 HV* hv = (HV*)SvRV(sv);
1583 UV c = NATIVE_TO_ASCII(*ptr);
1585 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1586 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1587 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1590 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1591 * then the "swatch" is a vec() for al the chars which start
1593 * So the key in the hash (klen) is length of encoded char -1
1595 klen = UTF8SKIP(ptr) - 1;
1600 /* If char in invariant then swatch is for all the invariant chars
1601 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1603 needents = UTF_CONTINUATION_MARK;
1604 off = NATIVE_TO_UTF(ptr[klen]);
1608 /* If char is encoded then swatch is for the prefix */
1609 needents = (1 << UTF_ACCUMULATION_SHIFT);
1610 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1614 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1615 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1616 * it's nothing to sniff at.) Pity we usually come through at least
1617 * two function calls to get here...
1619 * NB: this code assumes that swatches are never modified, once generated!
1622 if (hv == PL_last_swash_hv &&
1623 klen == PL_last_swash_klen &&
1624 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1626 tmps = PL_last_swash_tmps;
1627 slen = PL_last_swash_slen;
1630 /* Try our second-level swatch cache, kept in a hash. */
1631 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1633 /* If not cached, generate it via utf8::SWASHGET */
1634 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1636 /* We use utf8n_to_uvuni() as we want an index into
1637 Unicode tables, not a native character number.
1639 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1641 0 : UTF8_ALLOW_ANY);
1646 PUSHSTACKi(PERLSI_MAGIC);
1650 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1651 PUSHs(sv_2mortal(newSViv((klen) ?
1652 (code_point & ~(needents - 1)) : 0)));
1653 PUSHs(sv_2mortal(newSViv(needents)));
1655 errsv_save = newSVsv(ERRSV);
1656 if (call_method("SWASHGET", G_SCALAR))
1657 retval = newSVsv(*PL_stack_sp--);
1659 retval = &PL_sv_undef;
1661 sv_setsv(ERRSV, errsv_save);
1662 SvREFCNT_dec(errsv_save);
1666 if (PL_curcop == &PL_compiling)
1667 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1669 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1671 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1672 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1675 PL_last_swash_hv = hv;
1676 PL_last_swash_klen = klen;
1677 PL_last_swash_tmps = tmps;
1678 PL_last_swash_slen = slen;
1680 Copy(ptr, PL_last_swash_key, klen, U8);
1683 switch ((int)((slen << 3) / needents)) {
1685 bit = 1 << (off & 7);
1687 return (tmps[off] & bit) != 0;
1692 return (tmps[off] << 8) + tmps[off + 1] ;
1695 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1697 Perl_croak(aTHX_ "panic: swash_fetch");
1703 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1705 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1706 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1707 bytes available. The return value is the pointer to the byte after the
1708 end of the new character. In other words,
1710 d = uvchr_to_utf8(d, uv);
1712 is the recommended wide native character-aware way of saying
1719 /* On ASCII machines this is normally a macro but we want a
1720 real function in case XS code wants it
1722 #undef Perl_uvchr_to_utf8
1724 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1726 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1730 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1732 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1736 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1738 Returns the native character value of the first character in the string C<s>
1739 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1740 length, in bytes, of that character.
1742 Allows length and flags to be passed to low level routine.
1746 /* On ASCII machines this is normally a macro but we want
1747 a real function in case XS code wants it
1749 #undef Perl_utf8n_to_uvchr
1751 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1753 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1754 return UNI_TO_NATIVE(uv);
1758 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1760 Build to the scalar dsv a displayable version of the string spv,
1761 length len, the displayable version being at most pvlim bytes long
1762 (if longer, the rest is truncated and "..." will be appended).
1764 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1765 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1766 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1767 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1768 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1769 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1771 The pointer to the PV of the dsv is returned.
1775 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1780 sv_setpvn(dsv, "", 0);
1781 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1785 if (pvlim && SvCUR(dsv) >= pvlim) {
1789 u = utf8_to_uvchr((U8*)s, 0);
1791 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1794 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1796 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1798 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1800 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1802 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1804 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1808 /* isPRINT() is the locale-blind version. */
1809 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1810 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1815 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1818 sv_catpvn(dsv, "...", 3);
1824 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1826 Build to the scalar dsv a displayable version of the scalar sv,
1827 the displayable version being at most pvlim bytes long
1828 (if longer, the rest is truncated and "..." will be appended).
1830 The flags argument is as in pv_uni_display().
1832 The pointer to the PV of the dsv is returned.
1836 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1838 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1843 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1845 Return true if the strings s1 and s2 differ case-insensitively, false
1846 if not (if they are equal case-insensitively). If u1 is true, the
1847 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1848 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1849 are false, the respective string is assumed to be in native 8-bit
1852 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1853 in there (they will point at the beginning of the I<next> character).
1854 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1855 pointers beyond which scanning will not continue under any
1856 circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1857 s2+l2 will be used as goal end pointers that will also stop the scan,
1858 and which qualify towards defining a successful match: all the scans
1859 that define an explicit length must reach their goal pointers for
1860 a match to succeed).
1862 For case-insensitiveness, the "casefolding" of Unicode is used
1863 instead of upper/lowercasing both the characters, see
1864 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1868 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1870 register U8 *p1 = (U8*)s1;
1871 register U8 *p2 = (U8*)s2;
1872 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1873 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1874 STRLEN n1 = 0, n2 = 0;
1875 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1876 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1878 STRLEN foldlen1, foldlen2;
1883 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1887 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1890 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1891 return 1; /* mismatch; possible infinite loop or false positive */
1894 natbuf[1] = 0; /* Need to terminate the buffer. */
1896 while ((e1 == 0 || p1 < e1) &&
1897 (f1 == 0 || p1 < f1) &&
1898 (e2 == 0 || p2 < e2) &&
1899 (f2 == 0 || p2 < f2)) {
1902 to_utf8_fold(p1, foldbuf1, &foldlen1);
1905 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1912 to_utf8_fold(p2, foldbuf2, &foldlen2);
1915 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1921 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1922 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1923 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1924 return 1; /* mismatch */
1931 p1 += u1 ? UTF8SKIP(p1) : 1;
1933 p2 += u2 ? UTF8SKIP(p2) : 1;
1937 /* A match is defined by all the scans that specified
1938 * an explicit length reaching their final goals. */
1939 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1948 return match ? 0 : 1; /* 0 match, 1 mismatch */