3 * Copyright (c) 1998-2001, Larry Wall
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
24 #define PERL_IN_UTF8_C
30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34 bytes available. The return value is the pointer to the byte after the
35 end of the new character. In other words,
37 d = uvuni_to_utf8(d, uv);
39 is the recommended Unicode-aware way of saying
47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
49 if (UNI_IS_INVARIANT(uv)) {
50 *d++ = UTF_TO_NATIVE(uv);
53 #if defined(EBCDIC) || 1 /* always for testing */
55 STRLEN len = UNISKIP(uv);
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
64 #else /* Non loop style */
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
101 if (uv < UTF8_QUAD_MAX)
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
115 *d++ = 0xff; /* Can't match U+FFFE! */
116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
131 #endif /* Loop style */
137 =for apidoc A|STRLEN|is_utf8_char|U8 *s
139 Tests if some arbitrary number of bytes begins in a valid UTF-8
140 character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
141 The actual number of bytes in the UTF-8 character will be returned if
142 it is valid, otherwise 0.
147 Perl_is_utf8_char(pTHX_ U8 *s)
153 if (UTF8_IS_INVARIANT(u))
156 if (!UTF8_IS_START(u))
161 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
166 u &= UTF_START_MASK(len);
170 if (!UTF8_IS_CONTINUATION(*s))
172 uv = UTF8_ACCUMULATE(uv, *s);
179 if (UNISKIP(uv) < len)
186 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
188 Returns true if first C<len> bytes of the given string form a valid UTF8
189 string, false otherwise. Note that 'a valid UTF8 string' does not mean
190 'a string that contains UTF8' because a valid ASCII string is a valid
197 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
204 len = strlen((char *)s);
220 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
222 Bottom level UTF-8 decode routine.
223 Returns the unicode code point value of the first character in the string C<s>
224 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
225 C<retlen> will be set to the length, in bytes, of that character.
227 If C<s> does not point to a well-formed UTF8 character, the behaviour
228 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
229 it is assumed that the caller will raise a warning, and this function
230 will silently just set C<retlen> to C<-1> and return zero. If the
231 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
232 malformations will be given, C<retlen> will be set to the expected
233 length of the UTF-8 character in bytes, and zero will be returned.
235 The C<flags> can also contain various flags to allow deviations from
236 the strict UTF-8 encoding (see F<utf8.h>).
238 Most code should use utf8_to_uvchr() rather than call this directly.
244 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
248 bool dowarn = ckWARN_d(WARN_UTF8);
249 STRLEN expectlen = 0;
252 /* This list is a superset of the UTF8_ALLOW_XXX. */
254 #define UTF8_WARN_EMPTY 1
255 #define UTF8_WARN_CONTINUATION 2
256 #define UTF8_WARN_NON_CONTINUATION 3
257 #define UTF8_WARN_FE_FF 4
258 #define UTF8_WARN_SHORT 5
259 #define UTF8_WARN_OVERFLOW 6
260 #define UTF8_WARN_SURROGATE 7
261 #define UTF8_WARN_BOM 8
262 #define UTF8_WARN_LONG 9
263 #define UTF8_WARN_FFFF 10
266 !(flags & UTF8_ALLOW_EMPTY)) {
267 warning = UTF8_WARN_EMPTY;
271 if (UTF8_IS_INVARIANT(uv)) {
274 return (UV) (NATIVE_TO_UTF(*s));
277 if (UTF8_IS_CONTINUATION(uv) &&
278 !(flags & UTF8_ALLOW_CONTINUATION)) {
279 warning = UTF8_WARN_CONTINUATION;
283 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
284 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
285 warning = UTF8_WARN_NON_CONTINUATION;
290 uv = NATIVE_TO_UTF(uv);
292 if ((uv == 0xfe || uv == 0xff) &&
293 !(flags & UTF8_ALLOW_FE_FF)) {
294 warning = UTF8_WARN_FE_FF;
299 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
300 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
301 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
302 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
304 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
305 else { len = 7; uv &= 0x01; }
307 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
308 else if (!(uv & 0x01)) { len = 7; uv = 0; }
309 else { len = 13; uv = 0; } /* whoa! */
317 if ((curlen < expectlen) &&
318 !(flags & UTF8_ALLOW_SHORT)) {
319 warning = UTF8_WARN_SHORT;
328 if (!UTF8_IS_CONTINUATION(*s) &&
329 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
331 warning = UTF8_WARN_NON_CONTINUATION;
335 uv = UTF8_ACCUMULATE(uv, *s);
337 /* These cannot be allowed. */
339 if (!(flags & UTF8_ALLOW_LONG)) {
340 warning = UTF8_WARN_LONG;
344 else { /* uv < ouv */
345 /* This cannot be allowed. */
346 warning = UTF8_WARN_OVERFLOW;
354 if (UNICODE_IS_SURROGATE(uv) &&
355 !(flags & UTF8_ALLOW_SURROGATE)) {
356 warning = UTF8_WARN_SURROGATE;
358 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
359 !(flags & UTF8_ALLOW_BOM)) {
360 warning = UTF8_WARN_BOM;
362 } else if ((expectlen > UNISKIP(uv)) &&
363 !(flags & UTF8_ALLOW_LONG)) {
364 warning = UTF8_WARN_LONG;
366 } else if (UNICODE_IS_ILLEGAL(uv) &&
367 !(flags & UTF8_ALLOW_FFFF)) {
368 warning = UTF8_WARN_FFFF;
376 if (flags & UTF8_CHECK_ONLY) {
383 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
386 case 0: /* Intentionally empty. */ break;
387 case UTF8_WARN_EMPTY:
388 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
390 case UTF8_WARN_CONTINUATION:
391 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
393 case UTF8_WARN_NON_CONTINUATION:
394 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
397 case UTF8_WARN_FE_FF:
398 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
400 case UTF8_WARN_SHORT:
401 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
402 curlen, curlen == 1 ? "" : "s", expectlen);
404 case UTF8_WARN_OVERFLOW:
405 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
408 case UTF8_WARN_SURROGATE:
409 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
412 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
415 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
416 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
419 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
422 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
430 Perl_warner(aTHX_ WARN_UTF8,
431 "%s in %s", s, OP_DESC(PL_op));
433 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
438 *retlen = expectlen ? expectlen : len;
444 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
446 Returns the native character value of the first character in the string C<s>
447 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
448 length, in bytes, of that character.
450 If C<s> does not point to a well-formed UTF8 character, zero is
451 returned and retlen is set, if possible, to -1.
457 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
459 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
463 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
465 Returns the Unicode code point of the first character in the string C<s>
466 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
467 length, in bytes, of that character.
469 This function should only be used when returned UV is considered
470 an index into the Unicode semantic tables (e.g. swashes).
472 If C<s> does not point to a well-formed UTF8 character, zero is
473 returned and retlen is set, if possible, to -1.
479 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
481 /* Call the low level routine asking for checks */
482 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
486 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
488 Return the length of the UTF-8 char encoded string C<s> in characters.
489 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
490 up past C<e>, croaks.
496 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
500 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
501 * the bitops (especially ~) can create illegal UTF-8.
502 * In other words: in Perl UTF-8 is not just for Unicode. */
505 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
510 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
519 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
521 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
524 WARNING: use only if you *know* that the pointers point inside the
531 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
535 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
536 * the bitops (especially ~) can create illegal UTF-8.
537 * In other words: in Perl UTF-8 is not just for Unicode. */
544 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
554 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
564 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
566 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
569 WARNING: do not use the following unless you *know* C<off> is within
570 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
571 on the first byte of character or just after the last byte of a character.
577 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
579 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
580 * the bitops (especially ~) can create illegal UTF-8.
581 * In other words: in Perl UTF-8 is not just for Unicode. */
590 while (UTF8_IS_CONTINUATION(*s))
598 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
600 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
601 Unlike C<bytes_to_utf8>, this over-writes the original string, and
602 updates len to contain the new length.
603 Returns zero on failure, setting C<len> to -1.
609 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
615 /* ensure valid UTF8 and chars < 256 before updating string */
616 for (send = s + *len; s < send; ) {
619 if (!UTF8_IS_INVARIANT(c) &&
620 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
621 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
630 *d++ = (U8)utf8_to_uvchr(s, &ulen);
639 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
641 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
642 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
643 the newly-created string, and updates C<len> to contain the new
644 length. Returns the original string if no conversion occurs, C<len>
645 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
646 0 if C<s> is converted or contains all 7bit characters.
652 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
662 /* ensure valid UTF8 and chars < 256 before converting string */
663 for (send = s + *len; s < send;) {
665 if (!UTF8_IS_INVARIANT(c)) {
666 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
667 (c = *s++) && UTF8_IS_CONTINUATION(c))
676 Newz(801, d, (*len) - count + 1, U8);
677 s = start; start = d;
680 if (!UTF8_IS_INVARIANT(c)) {
681 /* Then it is two-byte encoded */
682 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
683 c = ASCII_TO_NATIVE(c);
693 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
695 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
696 Returns a pointer to the newly-created string, and sets C<len> to
697 reflect the new length.
703 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
710 Newz(801, d, (*len) * 2 + 1, U8);
714 UV uv = NATIVE_TO_ASCII(*s++);
715 if (UNI_IS_INVARIANT(uv))
716 *d++ = UTF_TO_NATIVE(uv);
718 *d++ = UTF8_EIGHT_BIT_HI(uv);
719 *d++ = UTF8_EIGHT_BIT_LO(uv);
728 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
730 * Destination must be pre-extended to 3/2 source. Do not use in-place.
731 * We optimize for native, for obvious reasons. */
734 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
740 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
745 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
752 *d++ = (( uv >> 6) | 0xc0);
753 *d++ = (( uv & 0x3f) | 0x80);
756 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
758 if (low < 0xdc00 || low >= 0xdfff)
759 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
760 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
763 *d++ = (( uv >> 12) | 0xe0);
764 *d++ = (((uv >> 6) & 0x3f) | 0x80);
765 *d++ = (( uv & 0x3f) | 0x80);
769 *d++ = (( uv >> 18) | 0xf0);
770 *d++ = (((uv >> 12) & 0x3f) | 0x80);
771 *d++ = (((uv >> 6) & 0x3f) | 0x80);
772 *d++ = (( uv & 0x3f) | 0x80);
776 *newlen = d - dstart;
780 /* Note: this one is slightly destructive of the source. */
783 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
786 U8* send = s + bytelen;
793 return utf16_to_utf8(p, d, bytelen, newlen);
796 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
799 Perl_is_uni_alnum(pTHX_ UV c)
801 U8 tmpbuf[UTF8_MAXLEN+1];
802 uvchr_to_utf8(tmpbuf, (UV)c);
803 return is_utf8_alnum(tmpbuf);
807 Perl_is_uni_alnumc(pTHX_ UV c)
809 U8 tmpbuf[UTF8_MAXLEN+1];
810 uvchr_to_utf8(tmpbuf, (UV)c);
811 return is_utf8_alnumc(tmpbuf);
815 Perl_is_uni_idfirst(pTHX_ UV c)
817 U8 tmpbuf[UTF8_MAXLEN+1];
818 uvchr_to_utf8(tmpbuf, (UV)c);
819 return is_utf8_idfirst(tmpbuf);
823 Perl_is_uni_alpha(pTHX_ UV c)
825 U8 tmpbuf[UTF8_MAXLEN+1];
826 uvchr_to_utf8(tmpbuf, (UV)c);
827 return is_utf8_alpha(tmpbuf);
831 Perl_is_uni_ascii(pTHX_ UV c)
833 U8 tmpbuf[UTF8_MAXLEN+1];
834 uvchr_to_utf8(tmpbuf, (UV)c);
835 return is_utf8_ascii(tmpbuf);
839 Perl_is_uni_space(pTHX_ UV c)
841 U8 tmpbuf[UTF8_MAXLEN+1];
842 uvchr_to_utf8(tmpbuf, (UV)c);
843 return is_utf8_space(tmpbuf);
847 Perl_is_uni_digit(pTHX_ UV c)
849 U8 tmpbuf[UTF8_MAXLEN+1];
850 uvchr_to_utf8(tmpbuf, (UV)c);
851 return is_utf8_digit(tmpbuf);
855 Perl_is_uni_upper(pTHX_ UV c)
857 U8 tmpbuf[UTF8_MAXLEN+1];
858 uvchr_to_utf8(tmpbuf, (UV)c);
859 return is_utf8_upper(tmpbuf);
863 Perl_is_uni_lower(pTHX_ UV c)
865 U8 tmpbuf[UTF8_MAXLEN+1];
866 uvchr_to_utf8(tmpbuf, (UV)c);
867 return is_utf8_lower(tmpbuf);
871 Perl_is_uni_cntrl(pTHX_ UV c)
873 U8 tmpbuf[UTF8_MAXLEN+1];
874 uvchr_to_utf8(tmpbuf, (UV)c);
875 return is_utf8_cntrl(tmpbuf);
879 Perl_is_uni_graph(pTHX_ UV c)
881 U8 tmpbuf[UTF8_MAXLEN+1];
882 uvchr_to_utf8(tmpbuf, (UV)c);
883 return is_utf8_graph(tmpbuf);
887 Perl_is_uni_print(pTHX_ UV c)
889 U8 tmpbuf[UTF8_MAXLEN+1];
890 uvchr_to_utf8(tmpbuf, (UV)c);
891 return is_utf8_print(tmpbuf);
895 Perl_is_uni_punct(pTHX_ UV c)
897 U8 tmpbuf[UTF8_MAXLEN+1];
898 uvchr_to_utf8(tmpbuf, (UV)c);
899 return is_utf8_punct(tmpbuf);
903 Perl_is_uni_xdigit(pTHX_ UV c)
905 U8 tmpbuf[UTF8_MAXLEN*2+1];
906 uvchr_to_utf8(tmpbuf, (UV)c);
907 return is_utf8_xdigit(tmpbuf);
911 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
913 U8 tmpbuf[UTF8_MAXLEN*2+1];
914 uvchr_to_utf8(tmpbuf, (UV)c);
915 return to_utf8_upper(tmpbuf, p, lenp);
919 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
921 U8 tmpbuf[UTF8_MAXLEN*2+1];
922 uvchr_to_utf8(tmpbuf, (UV)c);
923 return to_utf8_title(tmpbuf, p, lenp);
927 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
929 U8 tmpbuf[UTF8_MAXLEN+1];
930 uvchr_to_utf8(tmpbuf, (UV)c);
931 return to_utf8_lower(tmpbuf, p, lenp);
935 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
937 U8 tmpbuf[UTF8_MAXLEN+1];
938 uvchr_to_utf8(tmpbuf, (UV)c);
939 return to_utf8_fold(tmpbuf, p, lenp);
942 /* for now these all assume no locale info available for Unicode > 255 */
945 Perl_is_uni_alnum_lc(pTHX_ UV c)
947 return is_uni_alnum(c); /* XXX no locale support yet */
951 Perl_is_uni_alnumc_lc(pTHX_ UV c)
953 return is_uni_alnumc(c); /* XXX no locale support yet */
957 Perl_is_uni_idfirst_lc(pTHX_ UV c)
959 return is_uni_idfirst(c); /* XXX no locale support yet */
963 Perl_is_uni_alpha_lc(pTHX_ UV c)
965 return is_uni_alpha(c); /* XXX no locale support yet */
969 Perl_is_uni_ascii_lc(pTHX_ UV c)
971 return is_uni_ascii(c); /* XXX no locale support yet */
975 Perl_is_uni_space_lc(pTHX_ UV c)
977 return is_uni_space(c); /* XXX no locale support yet */
981 Perl_is_uni_digit_lc(pTHX_ UV c)
983 return is_uni_digit(c); /* XXX no locale support yet */
987 Perl_is_uni_upper_lc(pTHX_ UV c)
989 return is_uni_upper(c); /* XXX no locale support yet */
993 Perl_is_uni_lower_lc(pTHX_ UV c)
995 return is_uni_lower(c); /* XXX no locale support yet */
999 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1001 return is_uni_cntrl(c); /* XXX no locale support yet */
1005 Perl_is_uni_graph_lc(pTHX_ UV c)
1007 return is_uni_graph(c); /* XXX no locale support yet */
1011 Perl_is_uni_print_lc(pTHX_ UV c)
1013 return is_uni_print(c); /* XXX no locale support yet */
1017 Perl_is_uni_punct_lc(pTHX_ UV c)
1019 return is_uni_punct(c); /* XXX no locale support yet */
1023 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1025 return is_uni_xdigit(c); /* XXX no locale support yet */
1029 Perl_is_utf8_alnum(pTHX_ U8 *p)
1031 if (!is_utf8_char(p))
1034 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1035 * descendant of isalnum(3), in other words, it doesn't
1036 * contain the '_'. --jhi */
1037 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1038 return swash_fetch(PL_utf8_alnum, p, TRUE);
1039 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1040 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1042 PL_utf8_alnum = swash_init("utf8", "",
1043 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1044 return swash_fetch(PL_utf8_alnum, p, TRUE);
1049 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1051 if (!is_utf8_char(p))
1054 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1055 return swash_fetch(PL_utf8_alnum, p, TRUE);
1056 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1057 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1059 PL_utf8_alnum = swash_init("utf8", "",
1060 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1061 return swash_fetch(PL_utf8_alnum, p, TRUE);
1066 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1068 return *p == '_' || is_utf8_alpha(p);
1072 Perl_is_utf8_alpha(pTHX_ U8 *p)
1074 if (!is_utf8_char(p))
1077 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1078 return swash_fetch(PL_utf8_alpha, p, TRUE);
1082 Perl_is_utf8_ascii(pTHX_ U8 *p)
1084 if (!is_utf8_char(p))
1087 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1088 return swash_fetch(PL_utf8_ascii, p, TRUE);
1092 Perl_is_utf8_space(pTHX_ U8 *p)
1094 if (!is_utf8_char(p))
1097 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1098 return swash_fetch(PL_utf8_space, p, TRUE);
1102 Perl_is_utf8_digit(pTHX_ U8 *p)
1104 if (!is_utf8_char(p))
1107 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1108 return swash_fetch(PL_utf8_digit, p, TRUE);
1112 Perl_is_utf8_upper(pTHX_ U8 *p)
1114 if (!is_utf8_char(p))
1117 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1118 return swash_fetch(PL_utf8_upper, p, TRUE);
1122 Perl_is_utf8_lower(pTHX_ U8 *p)
1124 if (!is_utf8_char(p))
1127 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1128 return swash_fetch(PL_utf8_lower, p, TRUE);
1132 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1134 if (!is_utf8_char(p))
1137 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1138 return swash_fetch(PL_utf8_cntrl, p, TRUE);
1142 Perl_is_utf8_graph(pTHX_ U8 *p)
1144 if (!is_utf8_char(p))
1147 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1148 return swash_fetch(PL_utf8_graph, p, TRUE);
1152 Perl_is_utf8_print(pTHX_ U8 *p)
1154 if (!is_utf8_char(p))
1157 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1158 return swash_fetch(PL_utf8_print, p, TRUE);
1162 Perl_is_utf8_punct(pTHX_ U8 *p)
1164 if (!is_utf8_char(p))
1167 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1168 return swash_fetch(PL_utf8_punct, p, TRUE);
1172 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1174 if (!is_utf8_char(p))
1176 if (!PL_utf8_xdigit)
1177 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1178 return swash_fetch(PL_utf8_xdigit, p, TRUE);
1182 Perl_is_utf8_mark(pTHX_ U8 *p)
1184 if (!is_utf8_char(p))
1187 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1188 return swash_fetch(PL_utf8_mark, p, TRUE);
1192 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1194 The "p" contains the pointer to the UTF-8 string encoding
1195 the character that is being converted.
1197 The "ustrp" is a pointer to the character buffer to put the
1198 conversion result to. The "lenp" is a pointer to the length
1201 The "swash" is a pointer to the swash to use.
1203 The "normal" is a string like "ToLower" which means the swash
1204 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1205 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1207 The "special" is a string like "utf8::ToSpecLower", which means
1208 the hash %utf8::ToSpecLower, which is stored in the same file,
1209 lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1210 to the hash is by Perl_to_utf8_case().
1216 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1221 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1222 uv = swash_fetch(*swashp, p, TRUE);
1224 uv = UNI_TO_NATIVE(uv);
1230 uv = utf8_to_uvchr(p, 0);
1232 uv = NATIVE_TO_UTF(uv);
1234 if ((hv = get_hv(special, FALSE)) &&
1235 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1236 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1237 SV *val = HeVAL(he);
1238 char *s = SvPV(val, *lenp);
1240 if (*lenp > 1 || UNI_IS_INVARIANT(c))
1241 Copy(s, ustrp, *lenp, U8);
1243 c = UTF_TO_NATIVE(c);
1244 /* something in the 0x80..0xFF range */
1245 ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1246 ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1252 *lenp = UNISKIP(uv);
1253 uvuni_to_utf8(ustrp, uv);
1258 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1260 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1261 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1265 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1267 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1268 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1272 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1274 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1275 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1279 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1281 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1282 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1285 /* a "swash" is a swatch hash */
1288 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1291 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1293 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1296 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1298 errsv_save = newSVsv(ERRSV);
1299 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1301 sv_setsv(ERRSV, errsv_save);
1302 SvREFCNT_dec(errsv_save);
1306 PUSHSTACKi(PERLSI_MAGIC);
1309 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1310 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1312 PUSHs(sv_2mortal(newSViv(minbits)));
1313 PUSHs(sv_2mortal(newSViv(none)));
1319 if (PL_curcop == &PL_compiling)
1320 /* XXX ought to be handled by lex_start */
1321 sv_setpv(tokenbufsv, PL_tokenbuf);
1322 errsv_save = newSVsv(ERRSV);
1323 if (call_method("SWASHNEW", G_SCALAR))
1324 retval = newSVsv(*PL_stack_sp--);
1326 retval = &PL_sv_undef;
1328 sv_setsv(ERRSV, errsv_save);
1329 SvREFCNT_dec(errsv_save);
1332 if (PL_curcop == &PL_compiling) {
1334 char* pv = SvPV(tokenbufsv, len);
1336 Copy(pv, PL_tokenbuf, len+1, char);
1337 PL_curcop->op_private = PL_hints;
1339 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1340 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1345 /* This API is wrong for special case conversions since we may need to
1346 * return several Unicode characters for a single Unicode character
1347 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1348 * the lower-level routine, and it is similarly broken for returning
1349 * multiple values. --jhi */
1351 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1353 HV* hv = (HV*)SvRV(sv);
1362 UV c = NATIVE_TO_ASCII(*ptr);
1364 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1365 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1366 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1369 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1370 * then the "swatch" is a vec() for al the chars which start
1372 * So the key in the hash (klen) is length of encoded char -1
1374 klen = UTF8SKIP(ptr) - 1;
1379 /* If char in invariant then swatch is for all the invariant chars
1380 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1382 needents = UTF_CONTINUATION_MARK;
1383 off = NATIVE_TO_UTF(ptr[klen]);
1387 /* If char is encoded then swatch is for the prefix */
1388 needents = (1 << UTF_ACCUMULATION_SHIFT);
1389 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1393 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1394 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1395 * it's nothing to sniff at.) Pity we usually come through at least
1396 * two function calls to get here...
1398 * NB: this code assumes that swatches are never modified, once generated!
1401 if (hv == PL_last_swash_hv &&
1402 klen == PL_last_swash_klen &&
1403 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1405 tmps = PL_last_swash_tmps;
1406 slen = PL_last_swash_slen;
1409 /* Try our second-level swatch cache, kept in a hash. */
1410 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1412 /* If not cached, generate it via utf8::SWASHGET */
1413 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1415 /* We use utf8n_to_uvuni() as we want an index into
1416 Unicode tables, not a native character number.
1418 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1423 PUSHSTACKi(PERLSI_MAGIC);
1427 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1428 PUSHs(sv_2mortal(newSViv((klen) ?
1429 (code_point & ~(needents - 1)) : 0)));
1430 PUSHs(sv_2mortal(newSViv(needents)));
1432 errsv_save = newSVsv(ERRSV);
1433 if (call_method("SWASHGET", G_SCALAR))
1434 retval = newSVsv(*PL_stack_sp--);
1436 retval = &PL_sv_undef;
1438 sv_setsv(ERRSV, errsv_save);
1439 SvREFCNT_dec(errsv_save);
1443 if (PL_curcop == &PL_compiling)
1444 PL_curcop->op_private = PL_hints;
1446 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1448 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1449 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1452 PL_last_swash_hv = hv;
1453 PL_last_swash_klen = klen;
1454 PL_last_swash_tmps = tmps;
1455 PL_last_swash_slen = slen;
1457 Copy(ptr, PL_last_swash_key, klen, U8);
1460 switch ((int)((slen << 3) / needents)) {
1462 bit = 1 << (off & 7);
1464 return (tmps[off] & bit) != 0;
1469 return (tmps[off] << 8) + tmps[off + 1] ;
1472 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1474 Perl_croak(aTHX_ "panic: swash_fetch");
1480 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1482 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1483 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1484 bytes available. The return value is the pointer to the byte after the
1485 end of the new character. In other words,
1487 d = uvchr_to_utf8(d, uv);
1489 is the recommended wide native character-aware way of saying
1496 /* On ASCII machines this is normally a macro but we want a
1497 real function in case XS code wants it
1499 #undef Perl_uvchr_to_utf8
1501 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1503 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1508 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1510 Returns the native character value of the first character in the string C<s>
1511 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1512 length, in bytes, of that character.
1514 Allows length and flags to be passed to low level routine.
1518 /* On ASCII machines this is normally a macro but we want a
1519 real function in case XS code wants it
1521 #undef Perl_utf8n_to_uvchr
1523 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1525 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1526 return UNI_TO_NATIVE(uv);
1530 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1535 sv_setpvn(dsv, "", 0);
1536 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1538 if (pvlim && SvCUR(dsv) >= pvlim) {
1542 u = utf8_to_uvchr((U8*)s, 0);
1543 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1546 sv_catpvn(dsv, "...", 3);
1552 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1554 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1559 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, const char *s2, bool u2, register I32 len)
1561 register U8 *a = (U8*)s1;
1562 register U8 *b = (U8*)s2;
1565 STRLEN ulen1, ulen2;
1566 U8 tmpbuf1[UTF8_MAXLEN*3+1];
1567 U8 tmpbuf2[UTF8_MAXLEN*3+1];
1571 ca = utf8_to_uvchr((U8*)a, &la);
1577 cb = utf8_to_uvchr((U8*)b, &lb);
1584 to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1588 to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1592 || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1593 || memNE(tmpbuf1, tmpbuf2, ulen1))