3 * Copyright (c) 1998-2001, Larry Wall
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
24 #define PERL_IN_UTF8_C
30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34 bytes available. The return value is the pointer to the byte after the
35 end of the new character. In other words,
37 d = uvuni_to_utf8(d, uv);
39 is the recommended Unicode-aware way of saying
47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
49 if (ckWARN_d(WARN_UTF8)) {
50 if (UNICODE_IS_SURROGATE(uv))
51 Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
52 else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
53 (uv == 0xFFFE || uv == 0xFFFF))
54 Perl_warner(aTHX_ WARN_UTF8,
55 "Unicode character 0x%04"UVxf" is illegal", uv);
57 if (UNI_IS_INVARIANT(uv)) {
58 *d++ = UTF_TO_NATIVE(uv);
63 STRLEN len = UNISKIP(uv);
66 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
67 uv >>= UTF_ACCUMULATION_SHIFT;
69 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
72 #else /* Non loop style */
74 *d++ = (( uv >> 6) | 0xc0);
75 *d++ = (( uv & 0x3f) | 0x80);
79 *d++ = (( uv >> 12) | 0xe0);
80 *d++ = (((uv >> 6) & 0x3f) | 0x80);
81 *d++ = (( uv & 0x3f) | 0x80);
85 *d++ = (( uv >> 18) | 0xf0);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
92 *d++ = (( uv >> 24) | 0xf8);
93 *d++ = (((uv >> 18) & 0x3f) | 0x80);
94 *d++ = (((uv >> 12) & 0x3f) | 0x80);
95 *d++ = (((uv >> 6) & 0x3f) | 0x80);
96 *d++ = (( uv & 0x3f) | 0x80);
99 if (uv < 0x80000000) {
100 *d++ = (( uv >> 30) | 0xfc);
101 *d++ = (((uv >> 24) & 0x3f) | 0x80);
102 *d++ = (((uv >> 18) & 0x3f) | 0x80);
103 *d++ = (((uv >> 12) & 0x3f) | 0x80);
104 *d++ = (((uv >> 6) & 0x3f) | 0x80);
105 *d++ = (( uv & 0x3f) | 0x80);
109 if (uv < UTF8_QUAD_MAX)
112 *d++ = 0xfe; /* Can't match U+FEFF! */
113 *d++ = (((uv >> 30) & 0x3f) | 0x80);
114 *d++ = (((uv >> 24) & 0x3f) | 0x80);
115 *d++ = (((uv >> 18) & 0x3f) | 0x80);
116 *d++ = (((uv >> 12) & 0x3f) | 0x80);
117 *d++ = (((uv >> 6) & 0x3f) | 0x80);
118 *d++ = (( uv & 0x3f) | 0x80);
123 *d++ = 0xff; /* Can't match U+FFFE! */
124 *d++ = 0x80; /* 6 Reserved bits */
125 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
126 *d++ = (((uv >> 54) & 0x3f) | 0x80);
127 *d++ = (((uv >> 48) & 0x3f) | 0x80);
128 *d++ = (((uv >> 42) & 0x3f) | 0x80);
129 *d++ = (((uv >> 36) & 0x3f) | 0x80);
130 *d++ = (((uv >> 30) & 0x3f) | 0x80);
131 *d++ = (((uv >> 24) & 0x3f) | 0x80);
132 *d++ = (((uv >> 18) & 0x3f) | 0x80);
133 *d++ = (((uv >> 12) & 0x3f) | 0x80);
134 *d++ = (((uv >> 6) & 0x3f) | 0x80);
135 *d++ = (( uv & 0x3f) | 0x80);
139 #endif /* Loop style */
145 =for apidoc A|STRLEN|is_utf8_char|U8 *s
147 Tests if some arbitrary number of bytes begins in a valid UTF-8
148 character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
149 The actual number of bytes in the UTF-8 character will be returned if
150 it is valid, otherwise 0.
155 Perl_is_utf8_char(pTHX_ U8 *s)
161 if (UTF8_IS_INVARIANT(u))
164 if (!UTF8_IS_START(u))
169 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
174 u &= UTF_START_MASK(len);
178 if (!UTF8_IS_CONTINUATION(*s))
180 uv = UTF8_ACCUMULATE(uv, *s);
187 if (UNISKIP(uv) < len)
194 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
196 Returns true if first C<len> bytes of the given string form a valid UTF8
197 string, false otherwise. Note that 'a valid UTF8 string' does not mean
198 'a string that contains UTF8' because a valid ASCII string is a valid
205 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
212 len = strlen((char *)s);
228 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
230 Bottom level UTF-8 decode routine.
231 Returns the unicode code point value of the first character in the string C<s>
232 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
233 C<retlen> will be set to the length, in bytes, of that character.
235 If C<s> does not point to a well-formed UTF8 character, the behaviour
236 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
237 it is assumed that the caller will raise a warning, and this function
238 will silently just set C<retlen> to C<-1> and return zero. If the
239 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
240 malformations will be given, C<retlen> will be set to the expected
241 length of the UTF-8 character in bytes, and zero will be returned.
243 The C<flags> can also contain various flags to allow deviations from
244 the strict UTF-8 encoding (see F<utf8.h>).
246 Most code should use utf8_to_uvchr() rather than call this directly.
252 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
257 bool dowarn = ckWARN_d(WARN_UTF8);
259 STRLEN expectlen = 0;
262 /* This list is a superset of the UTF8_ALLOW_XXX. */
264 #define UTF8_WARN_EMPTY 1
265 #define UTF8_WARN_CONTINUATION 2
266 #define UTF8_WARN_NON_CONTINUATION 3
267 #define UTF8_WARN_FE_FF 4
268 #define UTF8_WARN_SHORT 5
269 #define UTF8_WARN_OVERFLOW 6
270 #define UTF8_WARN_SURROGATE 7
271 #define UTF8_WARN_BOM 8
272 #define UTF8_WARN_LONG 9
273 #define UTF8_WARN_FFFF 10
276 !(flags & UTF8_ALLOW_EMPTY)) {
277 warning = UTF8_WARN_EMPTY;
281 if (UTF8_IS_INVARIANT(uv)) {
284 return (UV) (NATIVE_TO_UTF(*s));
287 if (UTF8_IS_CONTINUATION(uv) &&
288 !(flags & UTF8_ALLOW_CONTINUATION)) {
289 warning = UTF8_WARN_CONTINUATION;
293 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
294 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
295 warning = UTF8_WARN_NON_CONTINUATION;
300 uv = NATIVE_TO_UTF(uv);
302 if ((uv == 0xfe || uv == 0xff) &&
303 !(flags & UTF8_ALLOW_FE_FF)) {
304 warning = UTF8_WARN_FE_FF;
309 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
310 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
311 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
312 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
314 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
315 else { len = 7; uv &= 0x01; }
317 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
318 else if (!(uv & 0x01)) { len = 7; uv = 0; }
319 else { len = 13; uv = 0; } /* whoa! */
327 if ((curlen < expectlen) &&
328 !(flags & UTF8_ALLOW_SHORT)) {
329 warning = UTF8_WARN_SHORT;
338 if (!UTF8_IS_CONTINUATION(*s) &&
339 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
341 warning = UTF8_WARN_NON_CONTINUATION;
345 uv = UTF8_ACCUMULATE(uv, *s);
347 /* These cannot be allowed. */
349 if (!(flags & UTF8_ALLOW_LONG)) {
350 warning = UTF8_WARN_LONG;
354 else { /* uv < ouv */
355 /* This cannot be allowed. */
356 warning = UTF8_WARN_OVERFLOW;
364 if (UNICODE_IS_SURROGATE(uv) &&
365 !(flags & UTF8_ALLOW_SURROGATE)) {
366 warning = UTF8_WARN_SURROGATE;
368 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
369 !(flags & UTF8_ALLOW_BOM)) {
370 warning = UTF8_WARN_BOM;
372 } else if ((expectlen > UNISKIP(uv)) &&
373 !(flags & UTF8_ALLOW_LONG)) {
374 warning = UTF8_WARN_LONG;
376 } else if (UNICODE_IS_ILLEGAL(uv) &&
377 !(flags & UTF8_ALLOW_FFFF)) {
378 warning = UTF8_WARN_FFFF;
386 if (flags & UTF8_CHECK_ONLY) {
393 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
396 case 0: /* Intentionally empty. */ break;
397 case UTF8_WARN_EMPTY:
398 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
400 case UTF8_WARN_CONTINUATION:
401 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
403 case UTF8_WARN_NON_CONTINUATION:
405 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
406 (UV)s[1], startbyte);
408 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
409 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
412 case UTF8_WARN_FE_FF:
413 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
415 case UTF8_WARN_SHORT:
416 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
417 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
418 expectlen = curlen; /* distance for caller to skip */
420 case UTF8_WARN_OVERFLOW:
421 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
424 case UTF8_WARN_SURROGATE:
425 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
428 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
431 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
432 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
435 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
438 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
446 Perl_warner(aTHX_ WARN_UTF8,
447 "%s in %s", s, OP_DESC(PL_op));
449 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
454 *retlen = expectlen ? expectlen : len;
460 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
462 Returns the native character value of the first character in the string C<s>
463 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
464 length, in bytes, of that character.
466 If C<s> does not point to a well-formed UTF8 character, zero is
467 returned and retlen is set, if possible, to -1.
473 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
475 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
479 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
481 Returns the Unicode code point of the first character in the string C<s>
482 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
483 length, in bytes, of that character.
485 This function should only be used when returned UV is considered
486 an index into the Unicode semantic tables (e.g. swashes).
488 If C<s> does not point to a well-formed UTF8 character, zero is
489 returned and retlen is set, if possible, to -1.
495 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
497 /* Call the low level routine asking for checks */
498 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
502 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
504 Return the length of the UTF-8 char encoded string C<s> in characters.
505 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
506 up past C<e>, croaks.
512 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
516 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
517 * the bitops (especially ~) can create illegal UTF-8.
518 * In other words: in Perl UTF-8 is not just for Unicode. */
521 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
526 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
535 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
537 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
540 WARNING: use only if you *know* that the pointers point inside the
547 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
551 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
552 * the bitops (especially ~) can create illegal UTF-8.
553 * In other words: in Perl UTF-8 is not just for Unicode. */
560 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
570 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
580 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
582 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
585 WARNING: do not use the following unless you *know* C<off> is within
586 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
587 on the first byte of character or just after the last byte of a character.
593 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
595 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
596 * the bitops (especially ~) can create illegal UTF-8.
597 * In other words: in Perl UTF-8 is not just for Unicode. */
606 while (UTF8_IS_CONTINUATION(*s))
614 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
616 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
617 Unlike C<bytes_to_utf8>, this over-writes the original string, and
618 updates len to contain the new length.
619 Returns zero on failure, setting C<len> to -1.
625 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
631 /* ensure valid UTF8 and chars < 256 before updating string */
632 for (send = s + *len; s < send; ) {
635 if (!UTF8_IS_INVARIANT(c) &&
636 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
637 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
646 *d++ = (U8)utf8_to_uvchr(s, &ulen);
655 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
657 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
658 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
659 the newly-created string, and updates C<len> to contain the new
660 length. Returns the original string if no conversion occurs, C<len>
661 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
662 0 if C<s> is converted or contains all 7bit characters.
668 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
678 /* ensure valid UTF8 and chars < 256 before converting string */
679 for (send = s + *len; s < send;) {
681 if (!UTF8_IS_INVARIANT(c)) {
682 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
683 (c = *s++) && UTF8_IS_CONTINUATION(c))
692 Newz(801, d, (*len) - count + 1, U8);
693 s = start; start = d;
696 if (!UTF8_IS_INVARIANT(c)) {
697 /* Then it is two-byte encoded */
698 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
699 c = ASCII_TO_NATIVE(c);
709 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
711 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
712 Returns a pointer to the newly-created string, and sets C<len> to
713 reflect the new length.
719 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
726 Newz(801, d, (*len) * 2 + 1, U8);
730 UV uv = NATIVE_TO_ASCII(*s++);
731 if (UNI_IS_INVARIANT(uv))
732 *d++ = UTF_TO_NATIVE(uv);
734 *d++ = UTF8_EIGHT_BIT_HI(uv);
735 *d++ = UTF8_EIGHT_BIT_LO(uv);
744 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
746 * Destination must be pre-extended to 3/2 source. Do not use in-place.
747 * We optimize for native, for obvious reasons. */
750 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
756 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
761 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
768 *d++ = (( uv >> 6) | 0xc0);
769 *d++ = (( uv & 0x3f) | 0x80);
772 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
774 if (low < 0xdc00 || low >= 0xdfff)
775 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
776 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
779 *d++ = (( uv >> 12) | 0xe0);
780 *d++ = (((uv >> 6) & 0x3f) | 0x80);
781 *d++ = (( uv & 0x3f) | 0x80);
785 *d++ = (( uv >> 18) | 0xf0);
786 *d++ = (((uv >> 12) & 0x3f) | 0x80);
787 *d++ = (((uv >> 6) & 0x3f) | 0x80);
788 *d++ = (( uv & 0x3f) | 0x80);
792 *newlen = d - dstart;
796 /* Note: this one is slightly destructive of the source. */
799 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
802 U8* send = s + bytelen;
809 return utf16_to_utf8(p, d, bytelen, newlen);
812 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
815 Perl_is_uni_alnum(pTHX_ UV c)
817 U8 tmpbuf[UTF8_MAXLEN+1];
818 uvchr_to_utf8(tmpbuf, (UV)c);
819 return is_utf8_alnum(tmpbuf);
823 Perl_is_uni_alnumc(pTHX_ UV c)
825 U8 tmpbuf[UTF8_MAXLEN+1];
826 uvchr_to_utf8(tmpbuf, (UV)c);
827 return is_utf8_alnumc(tmpbuf);
831 Perl_is_uni_idfirst(pTHX_ UV c)
833 U8 tmpbuf[UTF8_MAXLEN+1];
834 uvchr_to_utf8(tmpbuf, (UV)c);
835 return is_utf8_idfirst(tmpbuf);
839 Perl_is_uni_alpha(pTHX_ UV c)
841 U8 tmpbuf[UTF8_MAXLEN+1];
842 uvchr_to_utf8(tmpbuf, (UV)c);
843 return is_utf8_alpha(tmpbuf);
847 Perl_is_uni_ascii(pTHX_ UV c)
849 U8 tmpbuf[UTF8_MAXLEN+1];
850 uvchr_to_utf8(tmpbuf, (UV)c);
851 return is_utf8_ascii(tmpbuf);
855 Perl_is_uni_space(pTHX_ UV c)
857 U8 tmpbuf[UTF8_MAXLEN+1];
858 uvchr_to_utf8(tmpbuf, (UV)c);
859 return is_utf8_space(tmpbuf);
863 Perl_is_uni_digit(pTHX_ UV c)
865 U8 tmpbuf[UTF8_MAXLEN+1];
866 uvchr_to_utf8(tmpbuf, (UV)c);
867 return is_utf8_digit(tmpbuf);
871 Perl_is_uni_upper(pTHX_ UV c)
873 U8 tmpbuf[UTF8_MAXLEN+1];
874 uvchr_to_utf8(tmpbuf, (UV)c);
875 return is_utf8_upper(tmpbuf);
879 Perl_is_uni_lower(pTHX_ UV c)
881 U8 tmpbuf[UTF8_MAXLEN+1];
882 uvchr_to_utf8(tmpbuf, (UV)c);
883 return is_utf8_lower(tmpbuf);
887 Perl_is_uni_cntrl(pTHX_ UV c)
889 U8 tmpbuf[UTF8_MAXLEN+1];
890 uvchr_to_utf8(tmpbuf, (UV)c);
891 return is_utf8_cntrl(tmpbuf);
895 Perl_is_uni_graph(pTHX_ UV c)
897 U8 tmpbuf[UTF8_MAXLEN+1];
898 uvchr_to_utf8(tmpbuf, (UV)c);
899 return is_utf8_graph(tmpbuf);
903 Perl_is_uni_print(pTHX_ UV c)
905 U8 tmpbuf[UTF8_MAXLEN+1];
906 uvchr_to_utf8(tmpbuf, (UV)c);
907 return is_utf8_print(tmpbuf);
911 Perl_is_uni_punct(pTHX_ UV c)
913 U8 tmpbuf[UTF8_MAXLEN+1];
914 uvchr_to_utf8(tmpbuf, (UV)c);
915 return is_utf8_punct(tmpbuf);
919 Perl_is_uni_xdigit(pTHX_ UV c)
921 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
922 uvchr_to_utf8(tmpbuf, (UV)c);
923 return is_utf8_xdigit(tmpbuf);
927 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
929 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
930 uvchr_to_utf8(tmpbuf, (UV)c);
931 return to_utf8_upper(tmpbuf, p, lenp);
935 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
937 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
938 uvchr_to_utf8(tmpbuf, (UV)c);
939 return to_utf8_title(tmpbuf, p, lenp);
943 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
945 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
946 uvchr_to_utf8(tmpbuf, (UV)c);
947 return to_utf8_lower(tmpbuf, p, lenp);
951 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
953 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
954 uvchr_to_utf8(tmpbuf, (UV)c);
955 return to_utf8_fold(tmpbuf, p, lenp);
958 /* for now these all assume no locale info available for Unicode > 255 */
961 Perl_is_uni_alnum_lc(pTHX_ UV c)
963 return is_uni_alnum(c); /* XXX no locale support yet */
967 Perl_is_uni_alnumc_lc(pTHX_ UV c)
969 return is_uni_alnumc(c); /* XXX no locale support yet */
973 Perl_is_uni_idfirst_lc(pTHX_ UV c)
975 return is_uni_idfirst(c); /* XXX no locale support yet */
979 Perl_is_uni_alpha_lc(pTHX_ UV c)
981 return is_uni_alpha(c); /* XXX no locale support yet */
985 Perl_is_uni_ascii_lc(pTHX_ UV c)
987 return is_uni_ascii(c); /* XXX no locale support yet */
991 Perl_is_uni_space_lc(pTHX_ UV c)
993 return is_uni_space(c); /* XXX no locale support yet */
997 Perl_is_uni_digit_lc(pTHX_ UV c)
999 return is_uni_digit(c); /* XXX no locale support yet */
1003 Perl_is_uni_upper_lc(pTHX_ UV c)
1005 return is_uni_upper(c); /* XXX no locale support yet */
1009 Perl_is_uni_lower_lc(pTHX_ UV c)
1011 return is_uni_lower(c); /* XXX no locale support yet */
1015 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1017 return is_uni_cntrl(c); /* XXX no locale support yet */
1021 Perl_is_uni_graph_lc(pTHX_ UV c)
1023 return is_uni_graph(c); /* XXX no locale support yet */
1027 Perl_is_uni_print_lc(pTHX_ UV c)
1029 return is_uni_print(c); /* XXX no locale support yet */
1033 Perl_is_uni_punct_lc(pTHX_ UV c)
1035 return is_uni_punct(c); /* XXX no locale support yet */
1039 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1041 return is_uni_xdigit(c); /* XXX no locale support yet */
1045 Perl_is_utf8_alnum(pTHX_ U8 *p)
1047 if (!is_utf8_char(p))
1050 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1051 * descendant of isalnum(3), in other words, it doesn't
1052 * contain the '_'. --jhi */
1053 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1054 return swash_fetch(PL_utf8_alnum, p, TRUE);
1055 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1056 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1058 PL_utf8_alnum = swash_init("utf8", "",
1059 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1060 return swash_fetch(PL_utf8_alnum, p, TRUE);
1065 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1067 if (!is_utf8_char(p))
1070 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1071 return swash_fetch(PL_utf8_alnum, p, TRUE);
1072 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1073 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1075 PL_utf8_alnum = swash_init("utf8", "",
1076 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1077 return swash_fetch(PL_utf8_alnum, p, TRUE);
1082 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1084 return *p == '_' || is_utf8_alpha(p);
1088 Perl_is_utf8_alpha(pTHX_ U8 *p)
1090 if (!is_utf8_char(p))
1093 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1094 return swash_fetch(PL_utf8_alpha, p, TRUE);
1098 Perl_is_utf8_ascii(pTHX_ U8 *p)
1100 if (!is_utf8_char(p))
1103 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1104 return swash_fetch(PL_utf8_ascii, p, TRUE);
1108 Perl_is_utf8_space(pTHX_ U8 *p)
1110 if (!is_utf8_char(p))
1113 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1114 return swash_fetch(PL_utf8_space, p, TRUE);
1118 Perl_is_utf8_digit(pTHX_ U8 *p)
1120 if (!is_utf8_char(p))
1123 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1124 return swash_fetch(PL_utf8_digit, p, TRUE);
1128 Perl_is_utf8_upper(pTHX_ U8 *p)
1130 if (!is_utf8_char(p))
1133 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1134 return swash_fetch(PL_utf8_upper, p, TRUE);
1138 Perl_is_utf8_lower(pTHX_ U8 *p)
1140 if (!is_utf8_char(p))
1143 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1144 return swash_fetch(PL_utf8_lower, p, TRUE);
1148 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1150 if (!is_utf8_char(p))
1153 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1154 return swash_fetch(PL_utf8_cntrl, p, TRUE);
1158 Perl_is_utf8_graph(pTHX_ U8 *p)
1160 if (!is_utf8_char(p))
1163 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1164 return swash_fetch(PL_utf8_graph, p, TRUE);
1168 Perl_is_utf8_print(pTHX_ U8 *p)
1170 if (!is_utf8_char(p))
1173 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1174 return swash_fetch(PL_utf8_print, p, TRUE);
1178 Perl_is_utf8_punct(pTHX_ U8 *p)
1180 if (!is_utf8_char(p))
1183 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1184 return swash_fetch(PL_utf8_punct, p, TRUE);
1188 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1190 if (!is_utf8_char(p))
1192 if (!PL_utf8_xdigit)
1193 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1194 return swash_fetch(PL_utf8_xdigit, p, TRUE);
1198 Perl_is_utf8_mark(pTHX_ U8 *p)
1200 if (!is_utf8_char(p))
1203 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1204 return swash_fetch(PL_utf8_mark, p, TRUE);
1208 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1210 The "p" contains the pointer to the UTF-8 string encoding
1211 the character that is being converted.
1213 The "ustrp" is a pointer to the character buffer to put the
1214 conversion result to. The "lenp" is a pointer to the length
1217 The "swash" is a pointer to the swash to use.
1219 The "normal" is a string like "ToLower" which means the swash
1220 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1221 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1223 The "special" is a string like "utf8::ToSpecLower", which means
1224 the hash %utf8::ToSpecLower, which is stored in the same file,
1225 lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1226 to the hash is by Perl_to_utf8_case().
1232 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
1237 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1238 uv = swash_fetch(*swashp, p, TRUE);
1240 uv = UNI_TO_NATIVE(uv);
1246 uv = utf8_to_uvchr(p, 0);
1248 if ((hv = get_hv(special, FALSE)) &&
1249 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1250 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1251 SV *val = HeVAL(he);
1252 char *s = SvPV(val, *lenp);
1254 if (*lenp > 1 || UNI_IS_INVARIANT(c))
1255 Copy(s, ustrp, *lenp, U8);
1257 /* something in the 0x80..0xFF range */
1258 ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1259 ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1266 *lenp = UNISKIP(uv);
1267 uvuni_to_utf8(ustrp, uv);
1272 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1274 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1275 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1279 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1281 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1282 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1286 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1288 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1289 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1293 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1295 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1296 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1299 /* a "swash" is a swatch hash */
1302 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1305 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1307 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1310 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1312 errsv_save = newSVsv(ERRSV);
1313 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1315 sv_setsv(ERRSV, errsv_save);
1316 SvREFCNT_dec(errsv_save);
1320 PUSHSTACKi(PERLSI_MAGIC);
1323 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1324 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1326 PUSHs(sv_2mortal(newSViv(minbits)));
1327 PUSHs(sv_2mortal(newSViv(none)));
1333 if (PL_curcop == &PL_compiling)
1334 /* XXX ought to be handled by lex_start */
1335 sv_setpv(tokenbufsv, PL_tokenbuf);
1336 errsv_save = newSVsv(ERRSV);
1337 if (call_method("SWASHNEW", G_SCALAR))
1338 retval = newSVsv(*PL_stack_sp--);
1340 retval = &PL_sv_undef;
1342 sv_setsv(ERRSV, errsv_save);
1343 SvREFCNT_dec(errsv_save);
1346 if (PL_curcop == &PL_compiling) {
1348 char* pv = SvPV(tokenbufsv, len);
1350 Copy(pv, PL_tokenbuf, len+1, char);
1351 PL_curcop->op_private = PL_hints;
1353 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1354 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1359 /* This API is wrong for special case conversions since we may need to
1360 * return several Unicode characters for a single Unicode character
1361 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1362 * the lower-level routine, and it is similarly broken for returning
1363 * multiple values. --jhi */
1365 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1367 HV* hv = (HV*)SvRV(sv);
1376 UV c = NATIVE_TO_ASCII(*ptr);
1378 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1379 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1380 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1383 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1384 * then the "swatch" is a vec() for al the chars which start
1386 * So the key in the hash (klen) is length of encoded char -1
1388 klen = UTF8SKIP(ptr) - 1;
1393 /* If char in invariant then swatch is for all the invariant chars
1394 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1396 needents = UTF_CONTINUATION_MARK;
1397 off = NATIVE_TO_UTF(ptr[klen]);
1401 /* If char is encoded then swatch is for the prefix */
1402 needents = (1 << UTF_ACCUMULATION_SHIFT);
1403 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1407 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1408 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1409 * it's nothing to sniff at.) Pity we usually come through at least
1410 * two function calls to get here...
1412 * NB: this code assumes that swatches are never modified, once generated!
1415 if (hv == PL_last_swash_hv &&
1416 klen == PL_last_swash_klen &&
1417 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1419 tmps = PL_last_swash_tmps;
1420 slen = PL_last_swash_slen;
1423 /* Try our second-level swatch cache, kept in a hash. */
1424 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1426 /* If not cached, generate it via utf8::SWASHGET */
1427 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1429 /* We use utf8n_to_uvuni() as we want an index into
1430 Unicode tables, not a native character number.
1432 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1437 PUSHSTACKi(PERLSI_MAGIC);
1441 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1442 PUSHs(sv_2mortal(newSViv((klen) ?
1443 (code_point & ~(needents - 1)) : 0)));
1444 PUSHs(sv_2mortal(newSViv(needents)));
1446 errsv_save = newSVsv(ERRSV);
1447 if (call_method("SWASHGET", G_SCALAR))
1448 retval = newSVsv(*PL_stack_sp--);
1450 retval = &PL_sv_undef;
1452 sv_setsv(ERRSV, errsv_save);
1453 SvREFCNT_dec(errsv_save);
1457 if (PL_curcop == &PL_compiling)
1458 PL_curcop->op_private = PL_hints;
1460 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1462 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1463 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1466 PL_last_swash_hv = hv;
1467 PL_last_swash_klen = klen;
1468 PL_last_swash_tmps = tmps;
1469 PL_last_swash_slen = slen;
1471 Copy(ptr, PL_last_swash_key, klen, U8);
1474 switch ((int)((slen << 3) / needents)) {
1476 bit = 1 << (off & 7);
1478 return (tmps[off] & bit) != 0;
1483 return (tmps[off] << 8) + tmps[off + 1] ;
1486 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1488 Perl_croak(aTHX_ "panic: swash_fetch");
1494 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1496 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1497 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1498 bytes available. The return value is the pointer to the byte after the
1499 end of the new character. In other words,
1501 d = uvchr_to_utf8(d, uv);
1503 is the recommended wide native character-aware way of saying
1510 /* On ASCII machines this is normally a macro but we want a
1511 real function in case XS code wants it
1513 #undef Perl_uvchr_to_utf8
1515 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1517 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1522 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1524 Returns the native character value of the first character in the string C<s>
1525 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1526 length, in bytes, of that character.
1528 Allows length and flags to be passed to low level routine.
1532 /* On ASCII machines this is normally a macro but we want a
1533 real function in case XS code wants it
1535 #undef Perl_utf8n_to_uvchr
1537 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1539 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1540 return UNI_TO_NATIVE(uv);
1544 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1546 Build to the scalar dsv a displayable version of the string spv,
1547 length len, the displayable version being at most pvlim bytes long
1548 (if longer, the rest is truncated and "..." will be appended).
1549 The flags argument is currently unused but available for future extensions.
1550 The pointer to the PV of the dsv is returned.
1554 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1559 sv_setpvn(dsv, "", 0);
1560 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1562 if (pvlim && SvCUR(dsv) >= pvlim) {
1566 u = utf8_to_uvchr((U8*)s, 0);
1567 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1570 sv_catpvn(dsv, "...", 3);
1576 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1578 Build to the scalar dsv a displayable version of the scalar sv,
1579 he displayable version being at most pvlim bytes long
1580 (if longer, the rest is truncated and "..." will be appended).
1581 The flags argument is currently unused but available for future extensions.
1582 The pointer to the PV of the dsv is returned.
1586 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1588 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1593 =for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1595 Return true if the strings s1 and s2 differ case-insensitively, false
1596 if not (if they are equal case-insensitively). If u1 is true, the
1597 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1598 the string s2 is assumed to be in UTF-8-encoded Unicode.
1600 For case-insensitiveness, the "casefolding" of Unicode is used
1601 instead of upper/lowercasing both the characters, see
1602 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1606 Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1608 register U8 *a = (U8*)s1;
1609 register U8 *b = (U8*)s2;
1610 register U8 *ae = b + len1;
1611 register U8 *be = b + len2;
1614 STRLEN ulen1, ulen2;
1615 U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1616 U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
1618 while (a < ae && b < be) {
1620 if (a + UTF8SKIP(a) > ae)
1622 ca = utf8_to_uvchr((U8*)a, &la);
1628 if (b + UTF8SKIP(b) > be)
1630 cb = utf8_to_uvchr((U8*)b, &lb);
1637 to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1641 to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1645 || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1646 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1647 return 1; /* mismatch */
1652 return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */