3 * Copyright (c) 1998-2001, Larry Wall
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
24 #define PERL_IN_UTF8_C
30 =for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34 bytes available. The return value is the pointer to the byte after the
35 end of the new character. In other words,
37 d = uvuni_to_utf8(d, uv);
39 is the recommended Unicode-aware way of saying
47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
49 if (UTF8_IS_INVARIANT(uv)) {
53 #if defined(EBCDIC) || 1 /* always for testing */
55 STRLEN len = UNISKIP(uv);
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
64 #else /* Non loop style */
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
101 if (uv < UTF8_QUAD_MAX)
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
115 *d++ = 0xff; /* Can't match U+FFFE! */
116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
131 #endif /* Loop style */
135 =for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
137 Adds the UTF8 representation of the Native codepoint C<uv> to the end
138 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
139 bytes available. The return value is the pointer to the byte after the
140 end of the new character. In other words,
142 d = uvchr_to_utf8(d, uv);
144 is the recommended wide native character-aware way of saying
152 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
155 uv = NATIVE_TO_ASCII(uv);
156 return Perl_uvuni_to_utf8(aTHX_ d, uv);
161 =for apidoc A|STRLEN|is_utf8_char|U8 *s
163 Tests if some arbitrary number of bytes begins in a valid UTF-8
164 character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
165 The actual number of bytes in the UTF-8 character will be returned if
166 it is valid, otherwise 0.
170 Perl_is_utf8_char(pTHX_ U8 *s)
176 if (UTF8_IS_INVARIANT(u))
179 if (!UTF8_IS_START(u))
184 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
189 /* The initial value is dubious */
193 if (!UTF8_IS_CONTINUATION(*s))
195 uv = UTF8_ACCUMULATE(uv, *s);
202 if (UNISKIP(uv) < len)
209 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
211 Returns true if first C<len> bytes of the given string form a valid UTF8
212 string, false otherwise. Note that 'a valid UTF8 string' does not mean
213 'a string that contains UTF8' because a valid ASCII string is a valid
220 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
227 len = strlen((char *)s);
243 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
245 Bottom level UTF-8 decode routine.
246 Returns the unicode code point value of the first character in the string C<s>
247 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
248 C<retlen> will be set to the length, in bytes, of that character.
250 If C<s> does not point to a well-formed UTF8 character, the behaviour
251 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
252 it is assumed that the caller will raise a warning, and this function
253 will silently just set C<retlen> to C<-1> and return zero. If the
254 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
255 malformations will be given, C<retlen> will be set to the expected
256 length of the UTF-8 character in bytes, and zero will be returned.
258 The C<flags> can also contain various flags to allow deviations from
259 the strict UTF-8 encoding (see F<utf8.h>).
261 Most code should use utf8_to_uvchr() rather than call this directly.
266 Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
270 bool dowarn = ckWARN_d(WARN_UTF8);
271 STRLEN expectlen = 0;
274 /* This list is a superset of the UTF8_ALLOW_XXX. */
276 #define UTF8_WARN_EMPTY 1
277 #define UTF8_WARN_CONTINUATION 2
278 #define UTF8_WARN_NON_CONTINUATION 3
279 #define UTF8_WARN_FE_FF 4
280 #define UTF8_WARN_SHORT 5
281 #define UTF8_WARN_OVERFLOW 6
282 #define UTF8_WARN_SURROGATE 7
283 #define UTF8_WARN_BOM 8
284 #define UTF8_WARN_LONG 9
285 #define UTF8_WARN_FFFF 10
288 !(flags & UTF8_ALLOW_EMPTY)) {
289 warning = UTF8_WARN_EMPTY;
293 if (UTF8_IS_INVARIANT(uv)) {
299 if (UTF8_IS_CONTINUATION(uv) &&
300 !(flags & UTF8_ALLOW_CONTINUATION)) {
301 warning = UTF8_WARN_CONTINUATION;
305 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
306 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
307 warning = UTF8_WARN_NON_CONTINUATION;
312 uv = NATIVE_TO_UTF(uv);
314 if ((uv == 0xfe || uv == 0xff) &&
315 !(flags & UTF8_ALLOW_FE_FF)) {
316 warning = UTF8_WARN_FE_FF;
321 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
322 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
323 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
324 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
326 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
327 else { len = 7; uv &= 0x01; }
329 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
330 else if (!(uv & 0x01)) { len = 7; uv = 0; }
331 else { len = 13; uv = 0; } /* whoa! */
339 if ((curlen < expectlen) &&
340 !(flags & UTF8_ALLOW_SHORT)) {
341 warning = UTF8_WARN_SHORT;
350 if (!UTF8_IS_CONTINUATION(*s) &&
351 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
353 warning = UTF8_WARN_NON_CONTINUATION;
357 uv = UTF8_ACCUMULATE(uv, *s);
359 /* These cannot be allowed. */
361 if (!(flags & UTF8_ALLOW_LONG)) {
362 warning = UTF8_WARN_LONG;
366 else { /* uv < ouv */
367 /* This cannot be allowed. */
368 warning = UTF8_WARN_OVERFLOW;
376 if (UNICODE_IS_SURROGATE(uv) &&
377 !(flags & UTF8_ALLOW_SURROGATE)) {
378 warning = UTF8_WARN_SURROGATE;
380 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
381 !(flags & UTF8_ALLOW_BOM)) {
382 warning = UTF8_WARN_BOM;
384 } else if ((expectlen > UNISKIP(uv)) &&
385 !(flags & UTF8_ALLOW_LONG)) {
386 warning = UTF8_WARN_LONG;
388 } else if (UNICODE_IS_ILLEGAL(uv) &&
389 !(flags & UTF8_ALLOW_FFFF)) {
390 warning = UTF8_WARN_FFFF;
398 if (flags & UTF8_CHECK_ONLY) {
405 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
408 case 0: /* Intentionally empty. */ break;
409 case UTF8_WARN_EMPTY:
410 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
412 case UTF8_WARN_CONTINUATION:
413 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
415 case UTF8_WARN_NON_CONTINUATION:
416 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
419 case UTF8_WARN_FE_FF:
420 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
422 case UTF8_WARN_SHORT:
423 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
424 curlen, curlen == 1 ? "" : "s", expectlen);
426 case UTF8_WARN_OVERFLOW:
427 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
430 case UTF8_WARN_SURROGATE:
431 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
434 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
437 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
438 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
441 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
444 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
452 Perl_warner(aTHX_ WARN_UTF8,
453 "%s in %s", s, PL_op_desc[PL_op->op_type]);
455 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
460 *retlen = expectlen ? expectlen : len;
466 =for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
468 Returns the native character value of the first character in the string C<s>
469 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
470 length, in bytes, of that character.
472 Allows length and flags to be passed to low level routine.
478 Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
480 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
482 return (UV) ASCII_TO_NATIVE(uv);
487 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
489 Returns the native character value of the first character in the string C<s>
490 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
491 length, in bytes, of that character.
493 If C<s> does not point to a well-formed UTF8 character, zero is
494 returned and retlen is set, if possible, to -1.
500 Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
502 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
506 =for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
508 Returns the Unicode code point of the first character in the string C<s>
509 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
510 length, in bytes, of that character.
512 This function should only be used when returned UV is considered
513 an index into the Unicode semantic tables (e.g. swashes).
515 If C<s> does not point to a well-formed UTF8 character, zero is
516 returned and retlen is set, if possible, to -1.
522 Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
524 /* Call the low level routine asking for checks */
525 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
529 =for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
531 Return the length of the UTF-8 char encoded string C<s> in characters.
532 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
533 up past C<e>, croaks.
539 Perl_utf8_length(pTHX_ U8* s, U8* e)
543 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
544 * the bitops (especially ~) can create illegal UTF-8.
545 * In other words: in Perl UTF-8 is not just for Unicode. */
548 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
553 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
562 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
564 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
567 WARNING: use only if you *know* that the pointers point inside the
573 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
577 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
578 * the bitops (especially ~) can create illegal UTF-8.
579 * In other words: in Perl UTF-8 is not just for Unicode. */
586 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
596 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
606 =for apidoc A|U8*|utf8_hop|U8 *s|I32 off
608 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
611 WARNING: do not use the following unless you *know* C<off> is within
612 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
613 on the first byte of character or just after the last byte of a character.
618 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
620 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
621 * the bitops (especially ~) can create illegal UTF-8.
622 * In other words: in Perl UTF-8 is not just for Unicode. */
631 while (UTF8_IS_CONTINUATION(*s))
639 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
641 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
642 Unlike C<bytes_to_utf8>, this over-writes the original string, and
643 updates len to contain the new length.
644 Returns zero on failure, setting C<len> to -1.
650 Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
656 /* ensure valid UTF8 and chars < 256 before updating string */
657 for (send = s + *len; s < send; ) {
660 if (!UTF8_IS_INVARIANT(c) &&
661 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
662 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
671 *d++ = (U8)utf8_to_uvchr(s, &ulen);
680 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
682 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
683 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
684 the newly-created string, and updates C<len> to contain the new
685 length. Returns the original string if no conversion occurs, C<len>
686 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
687 0 if C<s> is converted or contains all 7bit characters.
692 Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
702 /* ensure valid UTF8 and chars < 256 before converting string */
703 for (send = s + *len; s < send;) {
705 if (!UTF8_IS_INVARIANT(c)) {
706 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
707 (c = *s++) && UTF8_IS_CONTINUATION(c))
717 /* Can use as-is if no high chars */
722 Newz(801, d, (*len) - count + 1, U8);
723 s = start; start = d;
726 if (!UTF8_IS_INVARIANT(c))
727 c = UTF8_ACCUMULATE(c, *s++);
728 *d++ = ASCII_TO_NATIVE(c);
736 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
738 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
739 Returns a pointer to the newly-created string, and sets C<len> to
740 reflect the new length.
746 Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
753 Newz(801, d, (*len) * 2 + 1, U8);
757 UV uv = NATIVE_TO_ASCII(*s++);
758 if (UTF8_IS_INVARIANT(uv))
761 *d++ = UTF8_EIGHT_BIT_HI(uv);
762 *d++ = UTF8_EIGHT_BIT_LO(uv);
771 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
773 * Destination must be pre-extended to 3/2 source. Do not use in-place.
774 * We optimize for native, for obvious reasons. */
777 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
783 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
788 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
795 *d++ = (( uv >> 6) | 0xc0);
796 *d++ = (( uv & 0x3f) | 0x80);
799 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
801 if (low < 0xdc00 || low >= 0xdfff)
802 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
803 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
806 *d++ = (( uv >> 12) | 0xe0);
807 *d++ = (((uv >> 6) & 0x3f) | 0x80);
808 *d++ = (( uv & 0x3f) | 0x80);
812 *d++ = (( uv >> 18) | 0xf0);
813 *d++ = (((uv >> 12) & 0x3f) | 0x80);
814 *d++ = (((uv >> 6) & 0x3f) | 0x80);
815 *d++ = (( uv & 0x3f) | 0x80);
819 *newlen = d - dstart;
823 /* Note: this one is slightly destructive of the source. */
826 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
829 U8* send = s + bytelen;
836 return utf16_to_utf8(p, d, bytelen, newlen);
839 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
842 Perl_is_uni_alnum(pTHX_ U32 c)
844 U8 tmpbuf[UTF8_MAXLEN+1];
845 uvuni_to_utf8(tmpbuf, (UV)c);
846 return is_utf8_alnum(tmpbuf);
850 Perl_is_uni_alnumc(pTHX_ U32 c)
852 U8 tmpbuf[UTF8_MAXLEN+1];
853 uvuni_to_utf8(tmpbuf, (UV)c);
854 return is_utf8_alnumc(tmpbuf);
858 Perl_is_uni_idfirst(pTHX_ U32 c)
860 U8 tmpbuf[UTF8_MAXLEN+1];
861 uvuni_to_utf8(tmpbuf, (UV)c);
862 return is_utf8_idfirst(tmpbuf);
866 Perl_is_uni_alpha(pTHX_ U32 c)
868 U8 tmpbuf[UTF8_MAXLEN+1];
869 uvuni_to_utf8(tmpbuf, (UV)c);
870 return is_utf8_alpha(tmpbuf);
874 Perl_is_uni_ascii(pTHX_ U32 c)
876 U8 tmpbuf[UTF8_MAXLEN+1];
877 uvuni_to_utf8(tmpbuf, (UV)c);
878 return is_utf8_ascii(tmpbuf);
882 Perl_is_uni_space(pTHX_ U32 c)
884 U8 tmpbuf[UTF8_MAXLEN+1];
885 uvuni_to_utf8(tmpbuf, (UV)c);
886 return is_utf8_space(tmpbuf);
890 Perl_is_uni_digit(pTHX_ U32 c)
892 U8 tmpbuf[UTF8_MAXLEN+1];
893 uvuni_to_utf8(tmpbuf, (UV)c);
894 return is_utf8_digit(tmpbuf);
898 Perl_is_uni_upper(pTHX_ U32 c)
900 U8 tmpbuf[UTF8_MAXLEN+1];
901 uvuni_to_utf8(tmpbuf, (UV)c);
902 return is_utf8_upper(tmpbuf);
906 Perl_is_uni_lower(pTHX_ U32 c)
908 U8 tmpbuf[UTF8_MAXLEN+1];
909 uvuni_to_utf8(tmpbuf, (UV)c);
910 return is_utf8_lower(tmpbuf);
914 Perl_is_uni_cntrl(pTHX_ U32 c)
916 U8 tmpbuf[UTF8_MAXLEN+1];
917 uvuni_to_utf8(tmpbuf, (UV)c);
918 return is_utf8_cntrl(tmpbuf);
922 Perl_is_uni_graph(pTHX_ U32 c)
924 U8 tmpbuf[UTF8_MAXLEN+1];
925 uvuni_to_utf8(tmpbuf, (UV)c);
926 return is_utf8_graph(tmpbuf);
930 Perl_is_uni_print(pTHX_ U32 c)
932 U8 tmpbuf[UTF8_MAXLEN+1];
933 uvuni_to_utf8(tmpbuf, (UV)c);
934 return is_utf8_print(tmpbuf);
938 Perl_is_uni_punct(pTHX_ U32 c)
940 U8 tmpbuf[UTF8_MAXLEN+1];
941 uvuni_to_utf8(tmpbuf, (UV)c);
942 return is_utf8_punct(tmpbuf);
946 Perl_is_uni_xdigit(pTHX_ U32 c)
948 U8 tmpbuf[UTF8_MAXLEN+1];
949 uvuni_to_utf8(tmpbuf, (UV)c);
950 return is_utf8_xdigit(tmpbuf);
954 Perl_to_uni_upper(pTHX_ U32 c)
956 U8 tmpbuf[UTF8_MAXLEN+1];
957 uvuni_to_utf8(tmpbuf, (UV)c);
958 return to_utf8_upper(tmpbuf);
962 Perl_to_uni_title(pTHX_ U32 c)
964 U8 tmpbuf[UTF8_MAXLEN+1];
965 uvuni_to_utf8(tmpbuf, (UV)c);
966 return to_utf8_title(tmpbuf);
970 Perl_to_uni_lower(pTHX_ U32 c)
972 U8 tmpbuf[UTF8_MAXLEN+1];
973 uvuni_to_utf8(tmpbuf, (UV)c);
974 return to_utf8_lower(tmpbuf);
977 /* for now these all assume no locale info available for Unicode > 255 */
980 Perl_is_uni_alnum_lc(pTHX_ U32 c)
982 return is_uni_alnum(c); /* XXX no locale support yet */
986 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
988 return is_uni_alnumc(c); /* XXX no locale support yet */
992 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
994 return is_uni_idfirst(c); /* XXX no locale support yet */
998 Perl_is_uni_alpha_lc(pTHX_ U32 c)
1000 return is_uni_alpha(c); /* XXX no locale support yet */
1004 Perl_is_uni_ascii_lc(pTHX_ U32 c)
1006 return is_uni_ascii(c); /* XXX no locale support yet */
1010 Perl_is_uni_space_lc(pTHX_ U32 c)
1012 return is_uni_space(c); /* XXX no locale support yet */
1016 Perl_is_uni_digit_lc(pTHX_ U32 c)
1018 return is_uni_digit(c); /* XXX no locale support yet */
1022 Perl_is_uni_upper_lc(pTHX_ U32 c)
1024 return is_uni_upper(c); /* XXX no locale support yet */
1028 Perl_is_uni_lower_lc(pTHX_ U32 c)
1030 return is_uni_lower(c); /* XXX no locale support yet */
1034 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1036 return is_uni_cntrl(c); /* XXX no locale support yet */
1040 Perl_is_uni_graph_lc(pTHX_ U32 c)
1042 return is_uni_graph(c); /* XXX no locale support yet */
1046 Perl_is_uni_print_lc(pTHX_ U32 c)
1048 return is_uni_print(c); /* XXX no locale support yet */
1052 Perl_is_uni_punct_lc(pTHX_ U32 c)
1054 return is_uni_punct(c); /* XXX no locale support yet */
1058 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1060 return is_uni_xdigit(c); /* XXX no locale support yet */
1064 Perl_to_uni_upper_lc(pTHX_ U32 c)
1066 return to_uni_upper(c); /* XXX no locale support yet */
1070 Perl_to_uni_title_lc(pTHX_ U32 c)
1072 return to_uni_title(c); /* XXX no locale support yet */
1076 Perl_to_uni_lower_lc(pTHX_ U32 c)
1078 return to_uni_lower(c); /* XXX no locale support yet */
1082 Perl_is_utf8_alnum(pTHX_ U8 *p)
1084 if (!is_utf8_char(p))
1087 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1088 * descendant of isalnum(3), in other words, it doesn't
1089 * contain the '_'. --jhi */
1090 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1091 return swash_fetch(PL_utf8_alnum, p);
1092 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1093 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1095 PL_utf8_alnum = swash_init("utf8", "",
1096 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1097 return swash_fetch(PL_utf8_alnum, p);
1102 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1104 if (!is_utf8_char(p))
1107 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1108 return swash_fetch(PL_utf8_alnum, p);
1109 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1110 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1112 PL_utf8_alnum = swash_init("utf8", "",
1113 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1114 return swash_fetch(PL_utf8_alnum, p);
1119 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1121 return *p == '_' || is_utf8_alpha(p);
1125 Perl_is_utf8_alpha(pTHX_ U8 *p)
1127 if (!is_utf8_char(p))
1130 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1131 return swash_fetch(PL_utf8_alpha, p);
1135 Perl_is_utf8_ascii(pTHX_ U8 *p)
1137 if (!is_utf8_char(p))
1140 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1141 return swash_fetch(PL_utf8_ascii, p);
1145 Perl_is_utf8_space(pTHX_ U8 *p)
1147 if (!is_utf8_char(p))
1150 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1151 return swash_fetch(PL_utf8_space, p);
1155 Perl_is_utf8_digit(pTHX_ U8 *p)
1157 if (!is_utf8_char(p))
1160 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1161 return swash_fetch(PL_utf8_digit, p);
1165 Perl_is_utf8_upper(pTHX_ U8 *p)
1167 if (!is_utf8_char(p))
1170 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1171 return swash_fetch(PL_utf8_upper, p);
1175 Perl_is_utf8_lower(pTHX_ U8 *p)
1177 if (!is_utf8_char(p))
1180 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1181 return swash_fetch(PL_utf8_lower, p);
1185 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1187 if (!is_utf8_char(p))
1190 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1191 return swash_fetch(PL_utf8_cntrl, p);
1195 Perl_is_utf8_graph(pTHX_ U8 *p)
1197 if (!is_utf8_char(p))
1200 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1201 return swash_fetch(PL_utf8_graph, p);
1205 Perl_is_utf8_print(pTHX_ U8 *p)
1207 if (!is_utf8_char(p))
1210 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1211 return swash_fetch(PL_utf8_print, p);
1215 Perl_is_utf8_punct(pTHX_ U8 *p)
1217 if (!is_utf8_char(p))
1220 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1221 return swash_fetch(PL_utf8_punct, p);
1225 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1227 if (!is_utf8_char(p))
1229 if (!PL_utf8_xdigit)
1230 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1231 return swash_fetch(PL_utf8_xdigit, p);
1235 Perl_is_utf8_mark(pTHX_ U8 *p)
1237 if (!is_utf8_char(p))
1240 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1241 return swash_fetch(PL_utf8_mark, p);
1245 Perl_to_utf8_upper(pTHX_ U8 *p)
1249 if (!PL_utf8_toupper)
1250 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1251 uv = swash_fetch(PL_utf8_toupper, p);
1252 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1256 Perl_to_utf8_title(pTHX_ U8 *p)
1260 if (!PL_utf8_totitle)
1261 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1262 uv = swash_fetch(PL_utf8_totitle, p);
1263 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1267 Perl_to_utf8_lower(pTHX_ U8 *p)
1271 if (!PL_utf8_tolower)
1272 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1273 uv = swash_fetch(PL_utf8_tolower, p);
1274 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1277 /* a "swash" is a swatch hash */
1280 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1283 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1285 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1287 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1289 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1293 PUSHSTACKi(PERLSI_MAGIC);
1296 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1297 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1299 PUSHs(sv_2mortal(newSViv(minbits)));
1300 PUSHs(sv_2mortal(newSViv(none)));
1306 if (PL_curcop == &PL_compiling)
1307 /* XXX ought to be handled by lex_start */
1308 sv_setpv(tokenbufsv, PL_tokenbuf);
1309 if (call_method("SWASHNEW", G_SCALAR))
1310 retval = newSVsv(*PL_stack_sp--);
1312 retval = &PL_sv_undef;
1315 if (PL_curcop == &PL_compiling) {
1317 char* pv = SvPV(tokenbufsv, len);
1319 Copy(pv, PL_tokenbuf, len+1, char);
1320 PL_curcop->op_private = PL_hints;
1322 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1323 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1328 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1330 HV* hv = (HV*)SvRV(sv);
1331 U32 klen = UTF8SKIP(ptr) - 1;
1332 U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
1334 STRLEN needents = (klen ? 64 : 128);
1340 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1341 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1342 * it's nothing to sniff at.) Pity we usually come through at least
1343 * two function calls to get here...
1345 * NB: this code assumes that swatches are never modified, once generated!
1348 if (hv == PL_last_swash_hv &&
1349 klen == PL_last_swash_klen &&
1350 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1352 tmps = PL_last_swash_tmps;
1353 slen = PL_last_swash_slen;
1356 /* Try our second-level swatch cache, kept in a hash. */
1357 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1359 /* If not cached, generate it via utf8::SWASHGET */
1360 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1365 PUSHSTACKi(PERLSI_MAGIC);
1369 /* We call utf8_to_uni as we want and index into Unicode tables,
1370 not a native character number.
1372 PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
1373 PUSHs(sv_2mortal(newSViv(needents)));
1375 if (call_method("SWASHGET", G_SCALAR))
1376 retval = newSVsv(*PL_stack_sp--);
1378 retval = &PL_sv_undef;
1382 if (PL_curcop == &PL_compiling)
1383 PL_curcop->op_private = PL_hints;
1385 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1387 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
1388 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1391 PL_last_swash_hv = hv;
1392 PL_last_swash_klen = klen;
1393 PL_last_swash_tmps = tmps;
1394 PL_last_swash_slen = slen;
1396 Copy(ptr, PL_last_swash_key, klen, U8);
1399 switch ((int)((slen << 3) / needents)) {
1401 bit = 1 << (off & 7);
1403 return (tmps[off] & bit) != 0;
1408 return (tmps[off] << 8) + tmps[off + 1] ;
1411 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1413 Perl_croak(aTHX_ "panic: swash_fetch");