3 * Copyright (c) 1998-2002, Larry Wall
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
24 #define PERL_IN_UTF8_C
27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
30 =head1 Unicode Support
32 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
34 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
35 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36 bytes available. The return value is the pointer to the byte after the
37 end of the new character. In other words,
39 d = uvuni_to_utf8_flags(d, uv, flags);
43 d = uvuni_to_utf8(d, uv);
45 (which is equivalent to)
47 d = uvuni_to_utf8_flags(d, uv, 0);
49 is the recommended Unicode-aware way of saying
57 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
59 if (ckWARN(WARN_UTF8)) {
60 if (UNICODE_IS_SURROGATE(uv) &&
61 !(flags & UNICODE_ALLOW_SURROGATE))
62 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
64 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65 !(flags & UNICODE_ALLOW_FDD0))
67 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
68 !(flags & UNICODE_ALLOW_FFFF))) &&
69 /* UNICODE_ALLOW_SUPER includes
70 * FFFEs and FFFFs beyond 0x10FFFF. */
71 ((uv <= PERL_UNICODE_MAX) ||
72 !(flags & UNICODE_ALLOW_SUPER))
74 Perl_warner(aTHX_ packWARN(WARN_UTF8),
75 "Unicode character 0x%04"UVxf" is illegal", uv);
77 if (UNI_IS_INVARIANT(uv)) {
78 *d++ = (U8)UTF_TO_NATIVE(uv);
83 STRLEN len = UNISKIP(uv);
86 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
87 uv >>= UTF_ACCUMULATION_SHIFT;
89 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
92 #else /* Non loop style */
94 *d++ = (U8)(( uv >> 6) | 0xc0);
95 *d++ = (U8)(( uv & 0x3f) | 0x80);
99 *d++ = (U8)(( uv >> 12) | 0xe0);
100 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
101 *d++ = (U8)(( uv & 0x3f) | 0x80);
105 *d++ = (U8)(( uv >> 18) | 0xf0);
106 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
107 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
108 *d++ = (U8)(( uv & 0x3f) | 0x80);
111 if (uv < 0x4000000) {
112 *d++ = (U8)(( uv >> 24) | 0xf8);
113 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
119 if (uv < 0x80000000) {
120 *d++ = (U8)(( uv >> 30) | 0xfc);
121 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
129 if (uv < UTF8_QUAD_MAX)
132 *d++ = 0xfe; /* Can't match U+FEFF! */
133 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
134 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
135 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
136 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
138 *d++ = (U8)(( uv & 0x3f) | 0x80);
143 *d++ = 0xff; /* Can't match U+FFFE! */
144 *d++ = 0x80; /* 6 Reserved bits */
145 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
146 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
147 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
148 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
155 *d++ = (U8)(( uv & 0x3f) | 0x80);
159 #endif /* Loop style */
163 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
165 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
170 =for apidoc A|STRLEN|is_utf8_char|U8 *s
172 Tests if some arbitrary number of bytes begins in a valid UTF-8
173 character. Note that an INVARIANT (i.e. ASCII) character is a valid
174 UTF-8 character. The actual number of bytes in the UTF-8 character
175 will be returned if it is valid, otherwise 0.
179 Perl_is_utf8_char(pTHX_ U8 *s)
185 if (UTF8_IS_INVARIANT(u))
188 if (!UTF8_IS_START(u))
193 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
198 u &= UTF_START_MASK(len);
202 if (!UTF8_IS_CONTINUATION(*s))
204 uv = UTF8_ACCUMULATE(uv, *s);
211 if ((STRLEN)UNISKIP(uv) < len)
218 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
220 Returns true if first C<len> bytes of the given string form a valid UTF8
221 string, false otherwise. Note that 'a valid UTF8 string' does not mean
222 'a string that contains UTF8' because a valid ASCII string is a valid
229 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
236 len = strlen((char *)s);
252 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
254 Bottom level UTF-8 decode routine.
255 Returns the unicode code point value of the first character in the string C<s>
256 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
257 C<retlen> will be set to the length, in bytes, of that character.
259 If C<s> does not point to a well-formed UTF8 character, the behaviour
260 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
261 it is assumed that the caller will raise a warning, and this function
262 will silently just set C<retlen> to C<-1> and return zero. If the
263 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
264 malformations will be given, C<retlen> will be set to the expected
265 length of the UTF-8 character in bytes, and zero will be returned.
267 The C<flags> can also contain various flags to allow deviations from
268 the strict UTF-8 encoding (see F<utf8.h>).
270 Most code should use utf8_to_uvchr() rather than call this directly.
276 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
281 bool dowarn = ckWARN_d(WARN_UTF8);
283 STRLEN expectlen = 0;
286 /* This list is a superset of the UTF8_ALLOW_XXX. */
288 #define UTF8_WARN_EMPTY 1
289 #define UTF8_WARN_CONTINUATION 2
290 #define UTF8_WARN_NON_CONTINUATION 3
291 #define UTF8_WARN_FE_FF 4
292 #define UTF8_WARN_SHORT 5
293 #define UTF8_WARN_OVERFLOW 6
294 #define UTF8_WARN_SURROGATE 7
295 #define UTF8_WARN_LONG 8
296 #define UTF8_WARN_FFFF 9 /* Also FFFE. */
299 !(flags & UTF8_ALLOW_EMPTY)) {
300 warning = UTF8_WARN_EMPTY;
304 if (UTF8_IS_INVARIANT(uv)) {
307 return (UV) (NATIVE_TO_UTF(*s));
310 if (UTF8_IS_CONTINUATION(uv) &&
311 !(flags & UTF8_ALLOW_CONTINUATION)) {
312 warning = UTF8_WARN_CONTINUATION;
316 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
317 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
318 warning = UTF8_WARN_NON_CONTINUATION;
323 uv = NATIVE_TO_UTF(uv);
325 if ((uv == 0xfe || uv == 0xff) &&
326 !(flags & UTF8_ALLOW_FE_FF)) {
327 warning = UTF8_WARN_FE_FF;
332 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
333 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
334 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
335 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
337 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
338 else { len = 7; uv &= 0x01; }
340 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
341 else if (!(uv & 0x01)) { len = 7; uv = 0; }
342 else { len = 13; uv = 0; } /* whoa! */
350 if ((curlen < expectlen) &&
351 !(flags & UTF8_ALLOW_SHORT)) {
352 warning = UTF8_WARN_SHORT;
361 if (!UTF8_IS_CONTINUATION(*s) &&
362 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
364 warning = UTF8_WARN_NON_CONTINUATION;
368 uv = UTF8_ACCUMULATE(uv, *s);
370 /* These cannot be allowed. */
372 if (!(flags & UTF8_ALLOW_LONG)) {
373 warning = UTF8_WARN_LONG;
377 else { /* uv < ouv */
378 /* This cannot be allowed. */
379 warning = UTF8_WARN_OVERFLOW;
387 if (UNICODE_IS_SURROGATE(uv) &&
388 !(flags & UTF8_ALLOW_SURROGATE)) {
389 warning = UTF8_WARN_SURROGATE;
391 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
392 !(flags & UTF8_ALLOW_LONG)) {
393 warning = UTF8_WARN_LONG;
395 } else if (UNICODE_IS_ILLEGAL(uv) &&
396 !(flags & UTF8_ALLOW_FFFF)) {
397 warning = UTF8_WARN_FFFF;
405 if (flags & UTF8_CHECK_ONLY) {
412 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
415 case 0: /* Intentionally empty. */ break;
416 case UTF8_WARN_EMPTY:
417 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
419 case UTF8_WARN_CONTINUATION:
420 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
422 case UTF8_WARN_NON_CONTINUATION:
424 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
425 (UV)s[1], startbyte);
427 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
428 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
431 case UTF8_WARN_FE_FF:
432 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
434 case UTF8_WARN_SHORT:
435 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
436 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
437 expectlen = curlen; /* distance for caller to skip */
439 case UTF8_WARN_OVERFLOW:
440 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
443 case UTF8_WARN_SURROGATE:
444 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
447 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
448 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
451 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
454 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
462 Perl_warner(aTHX_ packWARN(WARN_UTF8),
463 "%s in %s", s, OP_DESC(PL_op));
465 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
470 *retlen = expectlen ? expectlen : len;
476 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
478 Returns the native character value of the first character in the string C<s>
479 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
480 length, in bytes, of that character.
482 If C<s> does not point to a well-formed UTF8 character, zero is
483 returned and retlen is set, if possible, to -1.
489 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
491 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
492 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
496 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
498 Returns the Unicode code point of the first character in the string C<s>
499 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
500 length, in bytes, of that character.
502 This function should only be used when returned UV is considered
503 an index into the Unicode semantic tables (e.g. swashes).
505 If C<s> does not point to a well-formed UTF8 character, zero is
506 returned and retlen is set, if possible, to -1.
512 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
514 /* Call the low level routine asking for checks */
515 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
516 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
520 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
522 Return the length of the UTF-8 char encoded string C<s> in characters.
523 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
524 up past C<e>, croaks.
530 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
534 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
535 * the bitops (especially ~) can create illegal UTF-8.
536 * In other words: in Perl UTF-8 is not just for Unicode. */
539 if (ckWARN_d(WARN_UTF8)) {
541 Perl_warner(aTHX_ packWARN(WARN_UTF8),
542 "%s in %s", unees, OP_DESC(PL_op));
544 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
552 if (ckWARN_d(WARN_UTF8)) {
554 Perl_warner(aTHX_ packWARN(WARN_UTF8),
555 unees, OP_DESC(PL_op));
557 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
569 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
571 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
574 WARNING: use only if you *know* that the pointers point inside the
581 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
585 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
586 * the bitops (especially ~) can create illegal UTF-8.
587 * In other words: in Perl UTF-8 is not just for Unicode. */
594 if (ckWARN_d(WARN_UTF8)) {
596 Perl_warner(aTHX_ packWARN(WARN_UTF8),
597 "%s in %s", unees, OP_DESC(PL_op));
599 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
612 if (ckWARN_d(WARN_UTF8)) {
614 Perl_warner(aTHX_ packWARN(WARN_UTF8),
615 "%s in %s", unees, OP_DESC(PL_op));
617 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
630 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
632 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
635 WARNING: do not use the following unless you *know* C<off> is within
636 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
637 on the first byte of character or just after the last byte of a character.
643 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
645 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
646 * the bitops (especially ~) can create illegal UTF-8.
647 * In other words: in Perl UTF-8 is not just for Unicode. */
656 while (UTF8_IS_CONTINUATION(*s))
664 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
666 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
667 Unlike C<bytes_to_utf8>, this over-writes the original string, and
668 updates len to contain the new length.
669 Returns zero on failure, setting C<len> to -1.
675 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
681 /* ensure valid UTF8 and chars < 256 before updating string */
682 for (send = s + *len; s < send; ) {
685 if (!UTF8_IS_INVARIANT(c) &&
686 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
687 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
696 *d++ = (U8)utf8_to_uvchr(s, &ulen);
705 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
707 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
708 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
709 the newly-created string, and updates C<len> to contain the new
710 length. Returns the original string if no conversion occurs, C<len>
711 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
712 0 if C<s> is converted or contains all 7bit characters.
718 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
728 /* ensure valid UTF8 and chars < 256 before converting string */
729 for (send = s + *len; s < send;) {
731 if (!UTF8_IS_INVARIANT(c)) {
732 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
733 (c = *s++) && UTF8_IS_CONTINUATION(c))
742 Newz(801, d, (*len) - count + 1, U8);
743 s = start; start = d;
746 if (!UTF8_IS_INVARIANT(c)) {
747 /* Then it is two-byte encoded */
748 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
749 c = ASCII_TO_NATIVE(c);
759 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
761 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
762 Returns a pointer to the newly-created string, and sets C<len> to
763 reflect the new length.
769 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
776 Newz(801, d, (*len) * 2 + 1, U8);
780 UV uv = NATIVE_TO_ASCII(*s++);
781 if (UNI_IS_INVARIANT(uv))
782 *d++ = (U8)UTF_TO_NATIVE(uv);
784 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
785 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
794 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
796 * Destination must be pre-extended to 3/2 source. Do not use in-place.
797 * We optimize for native, for obvious reasons. */
800 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
806 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
811 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
818 *d++ = (U8)(( uv >> 6) | 0xc0);
819 *d++ = (U8)(( uv & 0x3f) | 0x80);
822 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
824 if (low < 0xdc00 || low >= 0xdfff)
825 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
826 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
829 *d++ = (U8)(( uv >> 12) | 0xe0);
830 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
831 *d++ = (U8)(( uv & 0x3f) | 0x80);
835 *d++ = (U8)(( uv >> 18) | 0xf0);
836 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
837 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
838 *d++ = (U8)(( uv & 0x3f) | 0x80);
842 *newlen = d - dstart;
846 /* Note: this one is slightly destructive of the source. */
849 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
852 U8* send = s + bytelen;
859 return utf16_to_utf8(p, d, bytelen, newlen);
862 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
865 Perl_is_uni_alnum(pTHX_ UV c)
867 U8 tmpbuf[UTF8_MAXLEN+1];
868 uvchr_to_utf8(tmpbuf, c);
869 return is_utf8_alnum(tmpbuf);
873 Perl_is_uni_alnumc(pTHX_ UV c)
875 U8 tmpbuf[UTF8_MAXLEN+1];
876 uvchr_to_utf8(tmpbuf, c);
877 return is_utf8_alnumc(tmpbuf);
881 Perl_is_uni_idfirst(pTHX_ UV c)
883 U8 tmpbuf[UTF8_MAXLEN+1];
884 uvchr_to_utf8(tmpbuf, c);
885 return is_utf8_idfirst(tmpbuf);
889 Perl_is_uni_alpha(pTHX_ UV c)
891 U8 tmpbuf[UTF8_MAXLEN+1];
892 uvchr_to_utf8(tmpbuf, c);
893 return is_utf8_alpha(tmpbuf);
897 Perl_is_uni_ascii(pTHX_ UV c)
899 U8 tmpbuf[UTF8_MAXLEN+1];
900 uvchr_to_utf8(tmpbuf, c);
901 return is_utf8_ascii(tmpbuf);
905 Perl_is_uni_space(pTHX_ UV c)
907 U8 tmpbuf[UTF8_MAXLEN+1];
908 uvchr_to_utf8(tmpbuf, c);
909 return is_utf8_space(tmpbuf);
913 Perl_is_uni_digit(pTHX_ UV c)
915 U8 tmpbuf[UTF8_MAXLEN+1];
916 uvchr_to_utf8(tmpbuf, c);
917 return is_utf8_digit(tmpbuf);
921 Perl_is_uni_upper(pTHX_ UV c)
923 U8 tmpbuf[UTF8_MAXLEN+1];
924 uvchr_to_utf8(tmpbuf, c);
925 return is_utf8_upper(tmpbuf);
929 Perl_is_uni_lower(pTHX_ UV c)
931 U8 tmpbuf[UTF8_MAXLEN+1];
932 uvchr_to_utf8(tmpbuf, c);
933 return is_utf8_lower(tmpbuf);
937 Perl_is_uni_cntrl(pTHX_ UV c)
939 U8 tmpbuf[UTF8_MAXLEN+1];
940 uvchr_to_utf8(tmpbuf, c);
941 return is_utf8_cntrl(tmpbuf);
945 Perl_is_uni_graph(pTHX_ UV c)
947 U8 tmpbuf[UTF8_MAXLEN+1];
948 uvchr_to_utf8(tmpbuf, c);
949 return is_utf8_graph(tmpbuf);
953 Perl_is_uni_print(pTHX_ UV c)
955 U8 tmpbuf[UTF8_MAXLEN+1];
956 uvchr_to_utf8(tmpbuf, c);
957 return is_utf8_print(tmpbuf);
961 Perl_is_uni_punct(pTHX_ UV c)
963 U8 tmpbuf[UTF8_MAXLEN+1];
964 uvchr_to_utf8(tmpbuf, c);
965 return is_utf8_punct(tmpbuf);
969 Perl_is_uni_xdigit(pTHX_ UV c)
971 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
972 uvchr_to_utf8(tmpbuf, c);
973 return is_utf8_xdigit(tmpbuf);
977 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
980 return to_utf8_upper(p, p, lenp);
984 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
987 return to_utf8_title(p, p, lenp);
991 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
994 return to_utf8_lower(p, p, lenp);
998 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1000 uvchr_to_utf8(p, c);
1001 return to_utf8_fold(p, p, lenp);
1004 /* for now these all assume no locale info available for Unicode > 255 */
1007 Perl_is_uni_alnum_lc(pTHX_ UV c)
1009 return is_uni_alnum(c); /* XXX no locale support yet */
1013 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1015 return is_uni_alnumc(c); /* XXX no locale support yet */
1019 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1021 return is_uni_idfirst(c); /* XXX no locale support yet */
1025 Perl_is_uni_alpha_lc(pTHX_ UV c)
1027 return is_uni_alpha(c); /* XXX no locale support yet */
1031 Perl_is_uni_ascii_lc(pTHX_ UV c)
1033 return is_uni_ascii(c); /* XXX no locale support yet */
1037 Perl_is_uni_space_lc(pTHX_ UV c)
1039 return is_uni_space(c); /* XXX no locale support yet */
1043 Perl_is_uni_digit_lc(pTHX_ UV c)
1045 return is_uni_digit(c); /* XXX no locale support yet */
1049 Perl_is_uni_upper_lc(pTHX_ UV c)
1051 return is_uni_upper(c); /* XXX no locale support yet */
1055 Perl_is_uni_lower_lc(pTHX_ UV c)
1057 return is_uni_lower(c); /* XXX no locale support yet */
1061 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1063 return is_uni_cntrl(c); /* XXX no locale support yet */
1067 Perl_is_uni_graph_lc(pTHX_ UV c)
1069 return is_uni_graph(c); /* XXX no locale support yet */
1073 Perl_is_uni_print_lc(pTHX_ UV c)
1075 return is_uni_print(c); /* XXX no locale support yet */
1079 Perl_is_uni_punct_lc(pTHX_ UV c)
1081 return is_uni_punct(c); /* XXX no locale support yet */
1085 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1087 return is_uni_xdigit(c); /* XXX no locale support yet */
1091 Perl_to_uni_upper_lc(pTHX_ U32 c)
1093 /* XXX returns only the first character -- do not use XXX */
1094 /* XXX no locale support yet */
1096 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1097 return (U32)to_uni_upper(c, tmpbuf, &len);
1101 Perl_to_uni_title_lc(pTHX_ U32 c)
1103 /* XXX returns only the first character XXX -- do not use XXX */
1104 /* XXX no locale support yet */
1106 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1107 return (U32)to_uni_title(c, tmpbuf, &len);
1111 Perl_to_uni_lower_lc(pTHX_ U32 c)
1113 /* XXX returns only the first character -- do not use XXX */
1114 /* XXX no locale support yet */
1116 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1117 return (U32)to_uni_lower(c, tmpbuf, &len);
1121 Perl_is_utf8_alnum(pTHX_ U8 *p)
1123 if (!is_utf8_char(p))
1126 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1127 * descendant of isalnum(3), in other words, it doesn't
1128 * contain the '_'. --jhi */
1129 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1130 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1131 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1132 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1134 PL_utf8_alnum = swash_init("utf8", "",
1135 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1136 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1141 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1143 if (!is_utf8_char(p))
1146 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1147 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1148 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1149 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1151 PL_utf8_alnum = swash_init("utf8", "",
1152 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1153 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1158 Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1162 if (!is_utf8_char(p))
1164 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1165 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1166 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1170 Perl_is_utf8_idcont(pTHX_ U8 *p)
1174 if (!is_utf8_char(p))
1176 if (!PL_utf8_idcont)
1177 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1178 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1182 Perl_is_utf8_alpha(pTHX_ U8 *p)
1184 if (!is_utf8_char(p))
1187 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1188 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1192 Perl_is_utf8_ascii(pTHX_ U8 *p)
1194 if (!is_utf8_char(p))
1197 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1198 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1202 Perl_is_utf8_space(pTHX_ U8 *p)
1204 if (!is_utf8_char(p))
1207 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1208 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1212 Perl_is_utf8_digit(pTHX_ U8 *p)
1214 if (!is_utf8_char(p))
1217 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1218 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1222 Perl_is_utf8_upper(pTHX_ U8 *p)
1224 if (!is_utf8_char(p))
1227 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1228 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1232 Perl_is_utf8_lower(pTHX_ U8 *p)
1234 if (!is_utf8_char(p))
1237 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1238 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1242 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1244 if (!is_utf8_char(p))
1247 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1248 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1252 Perl_is_utf8_graph(pTHX_ U8 *p)
1254 if (!is_utf8_char(p))
1257 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1258 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1262 Perl_is_utf8_print(pTHX_ U8 *p)
1264 if (!is_utf8_char(p))
1267 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1268 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1272 Perl_is_utf8_punct(pTHX_ U8 *p)
1274 if (!is_utf8_char(p))
1277 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1278 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1282 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1284 if (!is_utf8_char(p))
1286 if (!PL_utf8_xdigit)
1287 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1288 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1292 Perl_is_utf8_mark(pTHX_ U8 *p)
1294 if (!is_utf8_char(p))
1297 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1298 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1302 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1304 The "p" contains the pointer to the UTF-8 string encoding
1305 the character that is being converted.
1307 The "ustrp" is a pointer to the character buffer to put the
1308 conversion result to. The "lenp" is a pointer to the length
1311 The "swashp" is a pointer to the swash to use.
1313 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1314 and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1315 but not always, a multicharacter mapping), is tried first.
1317 The "special" is a string like "utf8::ToSpecLower", which means the
1318 hash %utf8::ToSpecLower. The access to the hash is through
1319 Perl_to_utf8_case().
1321 The "normal" is a string like "ToLower" which means the swash
1327 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1330 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1333 uv0 = utf8_to_uvchr(p, 0);
1334 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1335 * are necessary in EBCDIC, they are redundant no-ops
1336 * in ASCII-ish platforms, and hopefully optimized away. */
1337 uv1 = NATIVE_TO_UNI(uv0);
1338 uvuni_to_utf8(tmpbuf, uv1);
1340 if (!*swashp) /* load on-demand */
1341 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1344 /* It might be "special" (sometimes, but not always,
1345 * a multicharacter mapping) */
1351 #if defined(UNDER_CE) && defined(MIPS)
1352 /*strange: compiler complaints that I redefine macro UVXf and points where
1353 it was first defined. I copied line from there without any changes.
1354 Nothing should change.
1355 But when I do not do this, there is an error on a line with
1356 Perl_newSVpvf(aTHX_ "%04"UVXf, uv1)
1358 #define UVXf "lX" /**/
1360 if ((hv = get_hv(special, FALSE)) &&
1361 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1362 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1363 (val = HeVAL(he))) {
1368 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1371 /* If we have EBCDIC we need to remap the characters
1372 * since any characters in the low 256 are Unicode
1373 * code points, not EBCDIC. */
1374 U8 *t = (U8*)s, *tend = t + len, *d;
1381 UV c = utf8_to_uvchr(t, &tlen);
1383 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1392 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1397 Copy(tmpbuf, ustrp, len, U8);
1399 Copy(s, ustrp, len, U8);
1405 if (!len && *swashp) {
1406 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1409 /* It was "normal" (a single character mapping). */
1410 UV uv3 = UNI_TO_NATIVE(uv2);
1412 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1416 if (!len) /* Neither: just copy. */
1417 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1422 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1426 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1428 Convert the UTF-8 encoded character at p to its uppercase version and
1429 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1430 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1431 uppercase version may be longer than the original character (up to two
1434 The first character of the uppercased version is returned
1435 (but note, as explained above, that there may be more.)
1440 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1442 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1443 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1447 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1449 Convert the UTF-8 encoded character at p to its titlecase version and
1450 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1451 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1452 titlecase version may be longer than the original character (up to two
1455 The first character of the titlecased version is returned
1456 (but note, as explained above, that there may be more.)
1461 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1463 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1464 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1468 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1470 Convert the UTF-8 encoded character at p to its lowercase version and
1471 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1472 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1473 lowercase version may be longer than the original character (up to two
1476 The first character of the lowercased version is returned
1477 (but note, as explained above, that there may be more.)
1482 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1484 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1485 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1489 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1491 Convert the UTF-8 encoded character at p to its foldcase version and
1492 store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1493 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1494 foldcase version may be longer than the original character (up to
1497 The first character of the foldcased version is returned
1498 (but note, as explained above, that there may be more.)
1503 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1505 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1506 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1509 /* a "swash" is a swatch hash */
1512 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1515 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1517 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1520 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1522 errsv_save = newSVsv(ERRSV);
1523 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1525 sv_setsv(ERRSV, errsv_save);
1526 SvREFCNT_dec(errsv_save);
1530 PUSHSTACKi(PERLSI_MAGIC);
1533 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1534 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1536 PUSHs(sv_2mortal(newSViv(minbits)));
1537 PUSHs(sv_2mortal(newSViv(none)));
1543 if (PL_curcop == &PL_compiling) {
1544 /* XXX ought to be handled by lex_start */
1546 sv_setpv(tokenbufsv, PL_tokenbuf);
1548 errsv_save = newSVsv(ERRSV);
1549 if (call_method("SWASHNEW", G_SCALAR))
1550 retval = newSVsv(*PL_stack_sp--);
1552 retval = &PL_sv_undef;
1554 sv_setsv(ERRSV, errsv_save);
1555 SvREFCNT_dec(errsv_save);
1558 if (PL_curcop == &PL_compiling) {
1560 char* pv = SvPV(tokenbufsv, len);
1562 Copy(pv, PL_tokenbuf, len+1, char);
1563 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1565 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1567 Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1568 SvPV_nolen(retval));
1569 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1575 /* This API is wrong for special case conversions since we may need to
1576 * return several Unicode characters for a single Unicode character
1577 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1578 * the lower-level routine, and it is similarly broken for returning
1579 * multiple values. --jhi */
1581 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1583 HV* hv = (HV*)SvRV(sv);
1592 UV c = NATIVE_TO_ASCII(*ptr);
1594 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1595 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1596 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1599 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1600 * then the "swatch" is a vec() for al the chars which start
1602 * So the key in the hash (klen) is length of encoded char -1
1604 klen = UTF8SKIP(ptr) - 1;
1609 /* If char in invariant then swatch is for all the invariant chars
1610 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1612 needents = UTF_CONTINUATION_MARK;
1613 off = NATIVE_TO_UTF(ptr[klen]);
1617 /* If char is encoded then swatch is for the prefix */
1618 needents = (1 << UTF_ACCUMULATION_SHIFT);
1619 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1623 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1624 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1625 * it's nothing to sniff at.) Pity we usually come through at least
1626 * two function calls to get here...
1628 * NB: this code assumes that swatches are never modified, once generated!
1631 if (hv == PL_last_swash_hv &&
1632 klen == PL_last_swash_klen &&
1633 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1635 tmps = PL_last_swash_tmps;
1636 slen = PL_last_swash_slen;
1639 /* Try our second-level swatch cache, kept in a hash. */
1640 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1642 /* If not cached, generate it via utf8::SWASHGET */
1643 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1645 /* We use utf8n_to_uvuni() as we want an index into
1646 Unicode tables, not a native character number.
1648 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1650 0 : UTF8_ALLOW_ANY);
1655 PUSHSTACKi(PERLSI_MAGIC);
1659 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1660 PUSHs(sv_2mortal(newSViv((klen) ?
1661 (code_point & ~(needents - 1)) : 0)));
1662 PUSHs(sv_2mortal(newSViv(needents)));
1664 errsv_save = newSVsv(ERRSV);
1665 if (call_method("SWASHGET", G_SCALAR))
1666 retval = newSVsv(*PL_stack_sp--);
1668 retval = &PL_sv_undef;
1670 sv_setsv(ERRSV, errsv_save);
1671 SvREFCNT_dec(errsv_save);
1675 if (PL_curcop == &PL_compiling)
1676 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1678 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1680 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1681 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1684 PL_last_swash_hv = hv;
1685 PL_last_swash_klen = klen;
1686 PL_last_swash_tmps = tmps;
1687 PL_last_swash_slen = slen;
1689 Copy(ptr, PL_last_swash_key, klen, U8);
1692 switch ((int)((slen << 3) / needents)) {
1694 bit = 1 << (off & 7);
1696 return (tmps[off] & bit) != 0;
1701 return (tmps[off] << 8) + tmps[off + 1] ;
1704 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1706 Perl_croak(aTHX_ "panic: swash_fetch");
1712 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1714 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1715 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1716 bytes available. The return value is the pointer to the byte after the
1717 end of the new character. In other words,
1719 d = uvchr_to_utf8(d, uv);
1721 is the recommended wide native character-aware way of saying
1728 /* On ASCII machines this is normally a macro but we want a
1729 real function in case XS code wants it
1731 #undef Perl_uvchr_to_utf8
1733 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1735 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1739 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1741 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1745 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1747 Returns the native character value of the first character in the string C<s>
1748 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1749 length, in bytes, of that character.
1751 Allows length and flags to be passed to low level routine.
1755 /* On ASCII machines this is normally a macro but we want
1756 a real function in case XS code wants it
1758 #undef Perl_utf8n_to_uvchr
1760 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1762 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1763 return UNI_TO_NATIVE(uv);
1767 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1769 Build to the scalar dsv a displayable version of the string spv,
1770 length len, the displayable version being at most pvlim bytes long
1771 (if longer, the rest is truncated and "..." will be appended).
1773 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1774 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1775 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1776 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1777 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1778 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1780 The pointer to the PV of the dsv is returned.
1784 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1789 sv_setpvn(dsv, "", 0);
1790 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1794 if (pvlim && SvCUR(dsv) >= pvlim) {
1798 u = utf8_to_uvchr((U8*)s, 0);
1800 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1803 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1805 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1807 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1809 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1811 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1813 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1817 /* isPRINT() is the locale-blind version. */
1818 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1819 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1824 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1827 sv_catpvn(dsv, "...", 3);
1833 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1835 Build to the scalar dsv a displayable version of the scalar sv,
1836 the displayable version being at most pvlim bytes long
1837 (if longer, the rest is truncated and "..." will be appended).
1839 The flags argument is as in pv_uni_display().
1841 The pointer to the PV of the dsv is returned.
1845 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1847 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1852 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1854 Return true if the strings s1 and s2 differ case-insensitively, false
1855 if not (if they are equal case-insensitively). If u1 is true, the
1856 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1857 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1858 are false, the respective string is assumed to be in native 8-bit
1861 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1862 in there (they will point at the beginning of the I<next> character).
1863 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1864 pointers beyond which scanning will not continue under any
1865 circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1866 s2+l2 will be used as goal end pointers that will also stop the scan,
1867 and which qualify towards defining a successful match: all the scans
1868 that define an explicit length must reach their goal pointers for
1869 a match to succeed).
1871 For case-insensitiveness, the "casefolding" of Unicode is used
1872 instead of upper/lowercasing both the characters, see
1873 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1877 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1879 register U8 *p1 = (U8*)s1;
1880 register U8 *p2 = (U8*)s2;
1881 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1882 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1883 STRLEN n1 = 0, n2 = 0;
1884 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1885 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1887 STRLEN foldlen1, foldlen2;
1892 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1896 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1899 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1900 return 1; /* mismatch; possible infinite loop or false positive */
1903 natbuf[1] = 0; /* Need to terminate the buffer. */
1905 while ((e1 == 0 || p1 < e1) &&
1906 (f1 == 0 || p1 < f1) &&
1907 (e2 == 0 || p2 < e2) &&
1908 (f2 == 0 || p2 < f2)) {
1911 to_utf8_fold(p1, foldbuf1, &foldlen1);
1914 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1921 to_utf8_fold(p2, foldbuf2, &foldlen2);
1924 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1930 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1931 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1932 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1933 return 1; /* mismatch */
1940 p1 += u1 ? UTF8SKIP(p1) : 1;
1942 p2 += u2 ? UTF8SKIP(p2) : 1;
1946 /* A match is defined by all the scans that specified
1947 * an explicit length reaching their final goals. */
1948 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1957 return match ? 0 : 1; /* 0 match, 1 mismatch */