3 * Copyright (c) 1998-2001, Larry Wall
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
24 #define PERL_IN_UTF8_C
30 =for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34 bytes available. The return value is the pointer to the byte after the
35 end of the new character. In other words,
37 d = uvuni_to_utf8(d, uv);
39 is the recommended Unicode-aware way of saying
47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
54 *d++ = (( uv >> 6) | 0xc0);
55 *d++ = (( uv & 0x3f) | 0x80);
59 *d++ = (( uv >> 12) | 0xe0);
60 *d++ = (((uv >> 6) & 0x3f) | 0x80);
61 *d++ = (( uv & 0x3f) | 0x80);
65 *d++ = (( uv >> 18) | 0xf0);
66 *d++ = (((uv >> 12) & 0x3f) | 0x80);
67 *d++ = (((uv >> 6) & 0x3f) | 0x80);
68 *d++ = (( uv & 0x3f) | 0x80);
72 *d++ = (( uv >> 24) | 0xf8);
73 *d++ = (((uv >> 18) & 0x3f) | 0x80);
74 *d++ = (((uv >> 12) & 0x3f) | 0x80);
75 *d++ = (((uv >> 6) & 0x3f) | 0x80);
76 *d++ = (( uv & 0x3f) | 0x80);
79 if (uv < 0x80000000) {
80 *d++ = (( uv >> 30) | 0xfc);
81 *d++ = (((uv >> 24) & 0x3f) | 0x80);
82 *d++ = (((uv >> 18) & 0x3f) | 0x80);
83 *d++ = (((uv >> 12) & 0x3f) | 0x80);
84 *d++ = (((uv >> 6) & 0x3f) | 0x80);
85 *d++ = (( uv & 0x3f) | 0x80);
89 if (uv < UTF8_QUAD_MAX)
92 *d++ = 0xfe; /* Can't match U+FEFF! */
93 *d++ = (((uv >> 30) & 0x3f) | 0x80);
94 *d++ = (((uv >> 24) & 0x3f) | 0x80);
95 *d++ = (((uv >> 18) & 0x3f) | 0x80);
96 *d++ = (((uv >> 12) & 0x3f) | 0x80);
97 *d++ = (((uv >> 6) & 0x3f) | 0x80);
98 *d++ = (( uv & 0x3f) | 0x80);
103 *d++ = 0xff; /* Can't match U+FFFE! */
104 *d++ = 0x80; /* 6 Reserved bits */
105 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
106 *d++ = (((uv >> 54) & 0x3f) | 0x80);
107 *d++ = (((uv >> 48) & 0x3f) | 0x80);
108 *d++ = (((uv >> 42) & 0x3f) | 0x80);
109 *d++ = (((uv >> 36) & 0x3f) | 0x80);
110 *d++ = (((uv >> 30) & 0x3f) | 0x80);
111 *d++ = (((uv >> 24) & 0x3f) | 0x80);
112 *d++ = (((uv >> 18) & 0x3f) | 0x80);
113 *d++ = (((uv >> 12) & 0x3f) | 0x80);
114 *d++ = (((uv >> 6) & 0x3f) | 0x80);
115 *d++ = (( uv & 0x3f) | 0x80);
122 =for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
124 Adds the UTF8 representation of the Native codepoint C<uv> to the end
125 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
126 bytes available. The return value is the pointer to the byte after the
127 end of the new character. In other words,
129 d = uvchr_to_utf8(d, uv);
131 is the recommended wide native character-aware way of saying
139 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
142 uv = NATIVE_TO_ASCII(uv);
143 return Perl_uvuni_to_utf8(aTHX_ d, uv);
148 =for apidoc A|STRLEN|is_utf8_char|U8 *s
150 Tests if some arbitrary number of bytes begins in a valid UTF-8
151 character. Note that an ASCII character is a valid UTF-8 character.
152 The actual number of bytes in the UTF-8 character will be returned if
153 it is valid, otherwise 0.
157 Perl_is_utf8_char(pTHX_ U8 *s)
163 if (UTF8_IS_ASCII(u))
166 if (!UTF8_IS_START(u))
171 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
179 if (!UTF8_IS_CONTINUATION(*s))
181 uv = UTF8_ACCUMULATE(uv, *s);
188 if (UNISKIP(uv) < len)
195 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
197 Returns true if first C<len> bytes of the given string form a valid UTF8
198 string, false otherwise. Note that 'a valid UTF8 string' does not mean
199 'a string that contains UTF8' because a valid ASCII string is a valid
206 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
213 len = strlen((char *)s);
229 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
231 Bottom level UTF-8 decode routine.
232 Returns the unicode code point value of the first character in the string C<s>
233 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
234 C<retlen> will be set to the length, in bytes, of that character.
236 If C<s> does not point to a well-formed UTF8 character, the behaviour
237 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
238 it is assumed that the caller will raise a warning, and this function
239 will silently just set C<retlen> to C<-1> and return zero. If the
240 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
241 malformations will be given, C<retlen> will be set to the expected
242 length of the UTF-8 character in bytes, and zero will be returned.
244 The C<flags> can also contain various flags to allow deviations from
245 the strict UTF-8 encoding (see F<utf8.h>).
247 Most code should use utf8_to_uvchr() rather than call this directly.
252 Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
259 bool dowarn = ckWARN_d(WARN_UTF8);
261 STRLEN expectlen = 0;
264 /* This list is a superset of the UTF8_ALLOW_XXX. */
266 #define UTF8_WARN_EMPTY 1
267 #define UTF8_WARN_CONTINUATION 2
268 #define UTF8_WARN_NON_CONTINUATION 3
269 #define UTF8_WARN_FE_FF 4
270 #define UTF8_WARN_SHORT 5
271 #define UTF8_WARN_OVERFLOW 6
272 #define UTF8_WARN_SURROGATE 7
273 #define UTF8_WARN_BOM 8
274 #define UTF8_WARN_LONG 9
275 #define UTF8_WARN_FFFF 10
278 !(flags & UTF8_ALLOW_EMPTY)) {
279 warning = UTF8_WARN_EMPTY;
283 if (UTF8_IS_ASCII(uv)) {
289 if (UTF8_IS_CONTINUATION(uv) &&
290 !(flags & UTF8_ALLOW_CONTINUATION)) {
291 warning = UTF8_WARN_CONTINUATION;
295 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
296 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
297 warning = UTF8_WARN_NON_CONTINUATION;
301 if ((uv == 0xfe || uv == 0xff) &&
302 !(flags & UTF8_ALLOW_FE_FF)) {
303 warning = UTF8_WARN_FE_FF;
307 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
308 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
309 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
310 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
311 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
312 else if (!(uv & 0x01)) { len = 7; uv = 0; }
313 else { len = 13; uv = 0; } /* whoa! */
320 if ((curlen < expectlen) &&
321 !(flags & UTF8_ALLOW_SHORT)) {
322 warning = UTF8_WARN_SHORT;
331 if (!UTF8_IS_CONTINUATION(*s) &&
332 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
334 warning = UTF8_WARN_NON_CONTINUATION;
338 uv = UTF8_ACCUMULATE(uv, *s);
340 /* These cannot be allowed. */
342 if (!(flags & UTF8_ALLOW_LONG)) {
343 warning = UTF8_WARN_LONG;
347 else { /* uv < ouv */
348 /* This cannot be allowed. */
349 warning = UTF8_WARN_OVERFLOW;
357 if (UNICODE_IS_SURROGATE(uv) &&
358 !(flags & UTF8_ALLOW_SURROGATE)) {
359 warning = UTF8_WARN_SURROGATE;
361 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
362 !(flags & UTF8_ALLOW_BOM)) {
363 warning = UTF8_WARN_BOM;
365 } else if ((expectlen > UNISKIP(uv)) &&
366 !(flags & UTF8_ALLOW_LONG)) {
367 warning = UTF8_WARN_LONG;
369 } else if (UNICODE_IS_ILLEGAL(uv) &&
370 !(flags & UTF8_ALLOW_FFFF)) {
371 warning = UTF8_WARN_FFFF;
379 if (flags & UTF8_CHECK_ONLY) {
386 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
389 case 0: /* Intentionally empty. */ break;
390 case UTF8_WARN_EMPTY:
391 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
393 case UTF8_WARN_CONTINUATION:
394 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
396 case UTF8_WARN_NON_CONTINUATION:
397 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
400 case UTF8_WARN_FE_FF:
401 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
403 case UTF8_WARN_SHORT:
404 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
405 curlen, curlen == 1 ? "" : "s", expectlen);
407 case UTF8_WARN_OVERFLOW:
408 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
411 case UTF8_WARN_SURROGATE:
412 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
415 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
418 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
419 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
422 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
425 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
433 Perl_warner(aTHX_ WARN_UTF8,
434 "%s in %s", s, PL_op_desc[PL_op->op_type]);
436 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
441 *retlen = expectlen ? expectlen : len;
447 =for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
449 Returns the native character value of the first character in the string C<s>
450 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
451 length, in bytes, of that character.
453 Allows length and flags to be passed to low level routine.
459 Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
461 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
463 return (UV) ASCII_TO_NATIVE(uv);
468 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
470 Returns the native character value of the first character in the string C<s>
471 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
472 length, in bytes, of that character.
474 If C<s> does not point to a well-formed UTF8 character, zero is
475 returned and retlen is set, if possible, to -1.
481 Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
483 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
487 =for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
489 Returns the Unicode code point of the first character in the string C<s>
490 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
491 length, in bytes, of that character.
493 This function should only be used when returned UV is considered
494 an index into the Unicode semantic tables (e.g. swashes).
496 If C<s> does not point to a well-formed UTF8 character, zero is
497 returned and retlen is set, if possible, to -1.
503 Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
505 /* Call the low level routine asking for checks */
506 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
510 =for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
512 Return the length of the UTF-8 char encoded string C<s> in characters.
513 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
514 up past C<e>, croaks.
520 Perl_utf8_length(pTHX_ U8* s, U8* e)
524 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
525 * the bitops (especially ~) can create illegal UTF-8.
526 * In other words: in Perl UTF-8 is not just for Unicode. */
529 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
534 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
543 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
545 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
548 WARNING: use only if you *know* that the pointers point inside the
554 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
558 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
559 * the bitops (especially ~) can create illegal UTF-8.
560 * In other words: in Perl UTF-8 is not just for Unicode. */
567 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
577 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
587 =for apidoc A|U8*|utf8_hop|U8 *s|I32 off
589 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
592 WARNING: do not use the following unless you *know* C<off> is within
593 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
594 on the first byte of character or just after the last byte of a character.
599 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
601 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
602 * the bitops (especially ~) can create illegal UTF-8.
603 * In other words: in Perl UTF-8 is not just for Unicode. */
612 while (UTF8_IS_CONTINUATION(*s))
620 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
622 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
623 Unlike C<bytes_to_utf8>, this over-writes the original string, and
624 updates len to contain the new length.
625 Returns zero on failure, setting C<len> to -1.
631 Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
637 /* ensure valid UTF8 and chars < 256 before updating string */
638 for (send = s + *len; s < send; ) {
643 ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
652 *d++ = (U8)utf8_to_uvchr(s, &ulen);
661 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
663 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
664 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
665 the newly-created string, and updates C<len> to contain the new
666 length. Returns the original string if no conversion occurs, C<len>
667 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
668 0 if C<s> is converted or contains all 7bit characters.
673 Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
683 /* ensure valid UTF8 and chars < 256 before converting string */
684 for (send = s + *len; s < send;) {
686 if (!UTF8_IS_ASCII(c)) {
687 if (UTF8_IS_CONTINUATION(c) || s >= send ||
688 !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c))
699 Newz(801, d, (*len) - count + 1, U8);
700 s = start; start = d;
704 if (UTF8_IS_ASCII(c))
707 *d++ = UTF8_ACCUMULATE(c, *s++);
715 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
717 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
718 Returns a pointer to the newly-created string, and sets C<len> to
719 reflect the new length.
725 Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
732 Newz(801, d, (*len) * 2 + 1, U8);
736 if (UTF8_IS_ASCII(*s))
741 *d++ = UTF8_EIGHT_BIT_HI(uv);
742 *d++ = UTF8_EIGHT_BIT_LO(uv);
751 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
753 * Destination must be pre-extended to 3/2 source. Do not use in-place.
754 * We optimize for native, for obvious reasons. */
757 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
763 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
768 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
775 *d++ = (( uv >> 6) | 0xc0);
776 *d++ = (( uv & 0x3f) | 0x80);
779 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
781 if (low < 0xdc00 || low >= 0xdfff)
782 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
783 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
786 *d++ = (( uv >> 12) | 0xe0);
787 *d++ = (((uv >> 6) & 0x3f) | 0x80);
788 *d++ = (( uv & 0x3f) | 0x80);
792 *d++ = (( uv >> 18) | 0xf0);
793 *d++ = (((uv >> 12) & 0x3f) | 0x80);
794 *d++ = (((uv >> 6) & 0x3f) | 0x80);
795 *d++ = (( uv & 0x3f) | 0x80);
799 *newlen = d - dstart;
803 /* Note: this one is slightly destructive of the source. */
806 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
809 U8* send = s + bytelen;
816 return utf16_to_utf8(p, d, bytelen, newlen);
819 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
822 Perl_is_uni_alnum(pTHX_ U32 c)
824 U8 tmpbuf[UTF8_MAXLEN+1];
825 uvuni_to_utf8(tmpbuf, (UV)c);
826 return is_utf8_alnum(tmpbuf);
830 Perl_is_uni_alnumc(pTHX_ U32 c)
832 U8 tmpbuf[UTF8_MAXLEN+1];
833 uvuni_to_utf8(tmpbuf, (UV)c);
834 return is_utf8_alnumc(tmpbuf);
838 Perl_is_uni_idfirst(pTHX_ U32 c)
840 U8 tmpbuf[UTF8_MAXLEN+1];
841 uvuni_to_utf8(tmpbuf, (UV)c);
842 return is_utf8_idfirst(tmpbuf);
846 Perl_is_uni_alpha(pTHX_ U32 c)
848 U8 tmpbuf[UTF8_MAXLEN+1];
849 uvuni_to_utf8(tmpbuf, (UV)c);
850 return is_utf8_alpha(tmpbuf);
854 Perl_is_uni_ascii(pTHX_ U32 c)
856 U8 tmpbuf[UTF8_MAXLEN+1];
857 uvuni_to_utf8(tmpbuf, (UV)c);
858 return is_utf8_ascii(tmpbuf);
862 Perl_is_uni_space(pTHX_ U32 c)
864 U8 tmpbuf[UTF8_MAXLEN+1];
865 uvuni_to_utf8(tmpbuf, (UV)c);
866 return is_utf8_space(tmpbuf);
870 Perl_is_uni_digit(pTHX_ U32 c)
872 U8 tmpbuf[UTF8_MAXLEN+1];
873 uvuni_to_utf8(tmpbuf, (UV)c);
874 return is_utf8_digit(tmpbuf);
878 Perl_is_uni_upper(pTHX_ U32 c)
880 U8 tmpbuf[UTF8_MAXLEN+1];
881 uvuni_to_utf8(tmpbuf, (UV)c);
882 return is_utf8_upper(tmpbuf);
886 Perl_is_uni_lower(pTHX_ U32 c)
888 U8 tmpbuf[UTF8_MAXLEN+1];
889 uvuni_to_utf8(tmpbuf, (UV)c);
890 return is_utf8_lower(tmpbuf);
894 Perl_is_uni_cntrl(pTHX_ U32 c)
896 U8 tmpbuf[UTF8_MAXLEN+1];
897 uvuni_to_utf8(tmpbuf, (UV)c);
898 return is_utf8_cntrl(tmpbuf);
902 Perl_is_uni_graph(pTHX_ U32 c)
904 U8 tmpbuf[UTF8_MAXLEN+1];
905 uvuni_to_utf8(tmpbuf, (UV)c);
906 return is_utf8_graph(tmpbuf);
910 Perl_is_uni_print(pTHX_ U32 c)
912 U8 tmpbuf[UTF8_MAXLEN+1];
913 uvuni_to_utf8(tmpbuf, (UV)c);
914 return is_utf8_print(tmpbuf);
918 Perl_is_uni_punct(pTHX_ U32 c)
920 U8 tmpbuf[UTF8_MAXLEN+1];
921 uvuni_to_utf8(tmpbuf, (UV)c);
922 return is_utf8_punct(tmpbuf);
926 Perl_is_uni_xdigit(pTHX_ U32 c)
928 U8 tmpbuf[UTF8_MAXLEN+1];
929 uvuni_to_utf8(tmpbuf, (UV)c);
930 return is_utf8_xdigit(tmpbuf);
934 Perl_to_uni_upper(pTHX_ U32 c)
936 U8 tmpbuf[UTF8_MAXLEN+1];
937 uvuni_to_utf8(tmpbuf, (UV)c);
938 return to_utf8_upper(tmpbuf);
942 Perl_to_uni_title(pTHX_ U32 c)
944 U8 tmpbuf[UTF8_MAXLEN+1];
945 uvuni_to_utf8(tmpbuf, (UV)c);
946 return to_utf8_title(tmpbuf);
950 Perl_to_uni_lower(pTHX_ U32 c)
952 U8 tmpbuf[UTF8_MAXLEN+1];
953 uvuni_to_utf8(tmpbuf, (UV)c);
954 return to_utf8_lower(tmpbuf);
957 /* for now these all assume no locale info available for Unicode > 255 */
960 Perl_is_uni_alnum_lc(pTHX_ U32 c)
962 return is_uni_alnum(c); /* XXX no locale support yet */
966 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
968 return is_uni_alnumc(c); /* XXX no locale support yet */
972 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
974 return is_uni_idfirst(c); /* XXX no locale support yet */
978 Perl_is_uni_alpha_lc(pTHX_ U32 c)
980 return is_uni_alpha(c); /* XXX no locale support yet */
984 Perl_is_uni_ascii_lc(pTHX_ U32 c)
986 return is_uni_ascii(c); /* XXX no locale support yet */
990 Perl_is_uni_space_lc(pTHX_ U32 c)
992 return is_uni_space(c); /* XXX no locale support yet */
996 Perl_is_uni_digit_lc(pTHX_ U32 c)
998 return is_uni_digit(c); /* XXX no locale support yet */
1002 Perl_is_uni_upper_lc(pTHX_ U32 c)
1004 return is_uni_upper(c); /* XXX no locale support yet */
1008 Perl_is_uni_lower_lc(pTHX_ U32 c)
1010 return is_uni_lower(c); /* XXX no locale support yet */
1014 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1016 return is_uni_cntrl(c); /* XXX no locale support yet */
1020 Perl_is_uni_graph_lc(pTHX_ U32 c)
1022 return is_uni_graph(c); /* XXX no locale support yet */
1026 Perl_is_uni_print_lc(pTHX_ U32 c)
1028 return is_uni_print(c); /* XXX no locale support yet */
1032 Perl_is_uni_punct_lc(pTHX_ U32 c)
1034 return is_uni_punct(c); /* XXX no locale support yet */
1038 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1040 return is_uni_xdigit(c); /* XXX no locale support yet */
1044 Perl_to_uni_upper_lc(pTHX_ U32 c)
1046 return to_uni_upper(c); /* XXX no locale support yet */
1050 Perl_to_uni_title_lc(pTHX_ U32 c)
1052 return to_uni_title(c); /* XXX no locale support yet */
1056 Perl_to_uni_lower_lc(pTHX_ U32 c)
1058 return to_uni_lower(c); /* XXX no locale support yet */
1062 Perl_is_utf8_alnum(pTHX_ U8 *p)
1064 if (!is_utf8_char(p))
1067 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1068 * descendant of isalnum(3), in other words, it doesn't
1069 * contain the '_'. --jhi */
1070 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1071 return swash_fetch(PL_utf8_alnum, p);
1072 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1073 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1075 PL_utf8_alnum = swash_init("utf8", "",
1076 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1077 return swash_fetch(PL_utf8_alnum, p);
1082 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1084 if (!is_utf8_char(p))
1087 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1088 return swash_fetch(PL_utf8_alnum, p);
1089 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1090 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1092 PL_utf8_alnum = swash_init("utf8", "",
1093 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1094 return swash_fetch(PL_utf8_alnum, p);
1099 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1101 return *p == '_' || is_utf8_alpha(p);
1105 Perl_is_utf8_alpha(pTHX_ U8 *p)
1107 if (!is_utf8_char(p))
1110 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1111 return swash_fetch(PL_utf8_alpha, p);
1115 Perl_is_utf8_ascii(pTHX_ U8 *p)
1117 if (!is_utf8_char(p))
1120 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1121 return swash_fetch(PL_utf8_ascii, p);
1125 Perl_is_utf8_space(pTHX_ U8 *p)
1127 if (!is_utf8_char(p))
1130 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1131 return swash_fetch(PL_utf8_space, p);
1135 Perl_is_utf8_digit(pTHX_ U8 *p)
1137 if (!is_utf8_char(p))
1140 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1141 return swash_fetch(PL_utf8_digit, p);
1145 Perl_is_utf8_upper(pTHX_ U8 *p)
1147 if (!is_utf8_char(p))
1150 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1151 return swash_fetch(PL_utf8_upper, p);
1155 Perl_is_utf8_lower(pTHX_ U8 *p)
1157 if (!is_utf8_char(p))
1160 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1161 return swash_fetch(PL_utf8_lower, p);
1165 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1167 if (!is_utf8_char(p))
1170 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1171 return swash_fetch(PL_utf8_cntrl, p);
1175 Perl_is_utf8_graph(pTHX_ U8 *p)
1177 if (!is_utf8_char(p))
1180 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1181 return swash_fetch(PL_utf8_graph, p);
1185 Perl_is_utf8_print(pTHX_ U8 *p)
1187 if (!is_utf8_char(p))
1190 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1191 return swash_fetch(PL_utf8_print, p);
1195 Perl_is_utf8_punct(pTHX_ U8 *p)
1197 if (!is_utf8_char(p))
1200 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1201 return swash_fetch(PL_utf8_punct, p);
1205 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1207 if (!is_utf8_char(p))
1209 if (!PL_utf8_xdigit)
1210 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1211 return swash_fetch(PL_utf8_xdigit, p);
1215 Perl_is_utf8_mark(pTHX_ U8 *p)
1217 if (!is_utf8_char(p))
1220 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1221 return swash_fetch(PL_utf8_mark, p);
1225 Perl_to_utf8_upper(pTHX_ U8 *p)
1229 if (!PL_utf8_toupper)
1230 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1231 uv = swash_fetch(PL_utf8_toupper, p);
1232 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1236 Perl_to_utf8_title(pTHX_ U8 *p)
1240 if (!PL_utf8_totitle)
1241 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1242 uv = swash_fetch(PL_utf8_totitle, p);
1243 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1247 Perl_to_utf8_lower(pTHX_ U8 *p)
1251 if (!PL_utf8_tolower)
1252 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1253 uv = swash_fetch(PL_utf8_tolower, p);
1254 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1257 /* a "swash" is a swatch hash */
1260 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1263 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1266 if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */
1268 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1272 PUSHSTACKi(PERLSI_MAGIC);
1275 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1276 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1278 PUSHs(sv_2mortal(newSViv(minbits)));
1279 PUSHs(sv_2mortal(newSViv(none)));
1285 if (PL_curcop == &PL_compiling)
1286 /* XXX ought to be handled by lex_start */
1287 sv_setpv(tokenbufsv, PL_tokenbuf);
1288 if (call_method("SWASHNEW", G_SCALAR))
1289 retval = newSVsv(*PL_stack_sp--);
1291 retval = &PL_sv_undef;
1294 if (PL_curcop == &PL_compiling) {
1296 char* pv = SvPV(tokenbufsv, len);
1298 Copy(pv, PL_tokenbuf, len+1, char);
1299 PL_curcop->op_private = PL_hints;
1301 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1302 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1307 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1309 HV* hv = (HV*)SvRV(sv);
1310 U32 klen = UTF8SKIP(ptr) - 1;
1311 U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
1313 STRLEN needents = (klen ? 64 : 128);
1319 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1320 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1321 * it's nothing to sniff at.) Pity we usually come through at least
1322 * two function calls to get here...
1324 * NB: this code assumes that swatches are never modified, once generated!
1327 if (hv == PL_last_swash_hv &&
1328 klen == PL_last_swash_klen &&
1329 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1331 tmps = PL_last_swash_tmps;
1332 slen = PL_last_swash_slen;
1335 /* Try our second-level swatch cache, kept in a hash. */
1336 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1338 /* If not cached, generate it via utf8::SWASHGET */
1339 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1344 PUSHSTACKi(PERLSI_MAGIC);
1348 /* We call utf8_to_uni as we want and index into Unicode tables,
1349 not a native character number.
1351 PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
1352 PUSHs(sv_2mortal(newSViv(needents)));
1354 if (call_method("SWASHGET", G_SCALAR))
1355 retval = newSVsv(*PL_stack_sp--);
1357 retval = &PL_sv_undef;
1361 if (PL_curcop == &PL_compiling)
1362 PL_curcop->op_private = PL_hints;
1364 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1366 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
1367 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1370 PL_last_swash_hv = hv;
1371 PL_last_swash_klen = klen;
1372 PL_last_swash_tmps = tmps;
1373 PL_last_swash_slen = slen;
1375 Copy(ptr, PL_last_swash_key, klen, U8);
1378 switch ((int)((slen << 3) / needents)) {
1380 bit = 1 << (off & 7);
1382 return (tmps[off] & bit) != 0;
1387 return (tmps[off] << 8) + tmps[off + 1] ;
1390 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1392 Perl_croak(aTHX_ "panic: swash_fetch");