3 * Copyright (c) 1998-2001, Larry Wall
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
24 #define PERL_IN_UTF8_C
30 =for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34 bytes available. The return value is the pointer to the byte after the
35 end of the new character. In other words,
37 d = uvuni_to_utf8(d, uv);
39 is the recommended Unicode-aware way of saying
47 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
49 if (UNI_IS_INVARIANT(uv)) {
50 *d++ = UTF_TO_NATIVE(uv);
53 #if defined(EBCDIC) || 1 /* always for testing */
55 STRLEN len = UNISKIP(uv);
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
64 #else /* Non loop style */
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
101 if (uv < UTF8_QUAD_MAX)
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
115 *d++ = 0xff; /* Can't match U+FFFE! */
116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
131 #endif /* Loop style */
137 =for apidoc A|STRLEN|is_utf8_char|U8 *s
139 Tests if some arbitrary number of bytes begins in a valid UTF-8
140 character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
141 The actual number of bytes in the UTF-8 character will be returned if
142 it is valid, otherwise 0.
147 Perl_is_utf8_char(pTHX_ U8 *s)
153 if (UTF8_IS_INVARIANT(u))
156 if (!UTF8_IS_START(u))
161 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
166 /* The initial value is dubious */
170 if (!UTF8_IS_CONTINUATION(*s))
172 uv = UTF8_ACCUMULATE(uv, *s);
174 /* Depending on the compiler the wrap of the value takig pladve
175 * between 5 and 6 bytes of UTF-8 encoding either works or not.
176 * See similar spot in utf8_to_uvuni(). --jhi */
184 if (UNISKIP(uv) < len)
191 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
193 Returns true if first C<len> bytes of the given string form a valid UTF8
194 string, false otherwise. Note that 'a valid UTF8 string' does not mean
195 'a string that contains UTF8' because a valid ASCII string is a valid
202 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
209 len = strlen((char *)s);
225 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
227 Bottom level UTF-8 decode routine.
228 Returns the unicode code point value of the first character in the string C<s>
229 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
230 C<retlen> will be set to the length, in bytes, of that character.
232 If C<s> does not point to a well-formed UTF8 character, the behaviour
233 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
234 it is assumed that the caller will raise a warning, and this function
235 will silently just set C<retlen> to C<-1> and return zero. If the
236 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
237 malformations will be given, C<retlen> will be set to the expected
238 length of the UTF-8 character in bytes, and zero will be returned.
240 The C<flags> can also contain various flags to allow deviations from
241 the strict UTF-8 encoding (see F<utf8.h>).
243 Most code should use utf8_to_uvchr() rather than call this directly.
249 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
253 bool dowarn = ckWARN_d(WARN_UTF8);
254 STRLEN expectlen = 0;
257 /* This list is a superset of the UTF8_ALLOW_XXX. */
259 #define UTF8_WARN_EMPTY 1
260 #define UTF8_WARN_CONTINUATION 2
261 #define UTF8_WARN_NON_CONTINUATION 3
262 #define UTF8_WARN_FE_FF 4
263 #define UTF8_WARN_SHORT 5
264 #define UTF8_WARN_OVERFLOW 6
265 #define UTF8_WARN_SURROGATE 7
266 #define UTF8_WARN_BOM 8
267 #define UTF8_WARN_LONG 9
268 #define UTF8_WARN_FFFF 10
271 !(flags & UTF8_ALLOW_EMPTY)) {
272 warning = UTF8_WARN_EMPTY;
276 if (UTF8_IS_INVARIANT(uv)) {
279 return (UV) (NATIVE_TO_UTF(*s));
282 if (UTF8_IS_CONTINUATION(uv) &&
283 !(flags & UTF8_ALLOW_CONTINUATION)) {
284 warning = UTF8_WARN_CONTINUATION;
288 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
289 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
290 warning = UTF8_WARN_NON_CONTINUATION;
295 uv = NATIVE_TO_UTF(uv);
297 if ((uv == 0xfe || uv == 0xff) &&
298 !(flags & UTF8_ALLOW_FE_FF)) {
299 warning = UTF8_WARN_FE_FF;
304 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
305 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
306 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
307 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
309 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
310 else { len = 7; uv &= 0x01; }
312 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
313 else if (!(uv & 0x01)) { len = 7; uv = 0; }
314 else { len = 13; uv = 0; } /* whoa! */
322 if ((curlen < expectlen) &&
323 !(flags & UTF8_ALLOW_SHORT)) {
324 warning = UTF8_WARN_SHORT;
333 if (!UTF8_IS_CONTINUATION(*s) &&
334 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
336 warning = UTF8_WARN_NON_CONTINUATION;
340 uv = UTF8_ACCUMULATE(uv, *s);
342 /* These cannot be allowed. */
344 if (!(flags & UTF8_ALLOW_LONG)) {
345 warning = UTF8_WARN_LONG;
349 else { /* uv < ouv */
351 /* Depending on the compiler the wrap of the value takig pladve
352 * between 5 and 6 bytes of UTF-8 encoding either works or not.
353 * See similar spot in is_utf8_char(). --jhi */
354 /* This cannot be allowed. */
355 warning = UTF8_WARN_OVERFLOW;
364 if (UNICODE_IS_SURROGATE(uv) &&
365 !(flags & UTF8_ALLOW_SURROGATE)) {
366 warning = UTF8_WARN_SURROGATE;
368 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
369 !(flags & UTF8_ALLOW_BOM)) {
370 warning = UTF8_WARN_BOM;
372 } else if ((expectlen > UNISKIP(uv)) &&
373 !(flags & UTF8_ALLOW_LONG)) {
374 warning = UTF8_WARN_LONG;
376 } else if (UNICODE_IS_ILLEGAL(uv) &&
377 !(flags & UTF8_ALLOW_FFFF)) {
378 warning = UTF8_WARN_FFFF;
386 if (flags & UTF8_CHECK_ONLY) {
393 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
396 case 0: /* Intentionally empty. */ break;
397 case UTF8_WARN_EMPTY:
398 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
400 case UTF8_WARN_CONTINUATION:
401 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
403 case UTF8_WARN_NON_CONTINUATION:
404 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
407 case UTF8_WARN_FE_FF:
408 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
410 case UTF8_WARN_SHORT:
411 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
412 curlen, curlen == 1 ? "" : "s", expectlen);
414 case UTF8_WARN_OVERFLOW:
415 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
418 case UTF8_WARN_SURROGATE:
419 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
422 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
425 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
426 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
429 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
432 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
440 Perl_warner(aTHX_ WARN_UTF8,
441 "%s in %s", s, PL_op_desc[PL_op->op_type]);
443 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
448 *retlen = expectlen ? expectlen : len;
454 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
456 Returns the native character value of the first character in the string C<s>
457 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
458 length, in bytes, of that character.
460 If C<s> does not point to a well-formed UTF8 character, zero is
461 returned and retlen is set, if possible, to -1.
467 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
469 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
473 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
475 Returns the Unicode code point of the first character in the string C<s>
476 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
477 length, in bytes, of that character.
479 This function should only be used when returned UV is considered
480 an index into the Unicode semantic tables (e.g. swashes).
482 If C<s> does not point to a well-formed UTF8 character, zero is
483 returned and retlen is set, if possible, to -1.
489 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
491 /* Call the low level routine asking for checks */
492 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
496 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
498 Return the length of the UTF-8 char encoded string C<s> in characters.
499 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
500 up past C<e>, croaks.
506 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
510 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
511 * the bitops (especially ~) can create illegal UTF-8.
512 * In other words: in Perl UTF-8 is not just for Unicode. */
515 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
520 Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
529 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
531 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
534 WARNING: use only if you *know* that the pointers point inside the
541 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
545 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
546 * the bitops (especially ~) can create illegal UTF-8.
547 * In other words: in Perl UTF-8 is not just for Unicode. */
554 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
564 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
574 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
576 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
579 WARNING: do not use the following unless you *know* C<off> is within
580 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
581 on the first byte of character or just after the last byte of a character.
587 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
589 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
590 * the bitops (especially ~) can create illegal UTF-8.
591 * In other words: in Perl UTF-8 is not just for Unicode. */
600 while (UTF8_IS_CONTINUATION(*s))
608 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
610 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
611 Unlike C<bytes_to_utf8>, this over-writes the original string, and
612 updates len to contain the new length.
613 Returns zero on failure, setting C<len> to -1.
619 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
625 /* ensure valid UTF8 and chars < 256 before updating string */
626 for (send = s + *len; s < send; ) {
629 if (!UTF8_IS_INVARIANT(c) &&
630 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
631 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
640 *d++ = (U8)utf8_to_uvchr(s, &ulen);
649 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
651 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
652 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
653 the newly-created string, and updates C<len> to contain the new
654 length. Returns the original string if no conversion occurs, C<len>
655 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
656 0 if C<s> is converted or contains all 7bit characters.
662 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
672 /* ensure valid UTF8 and chars < 256 before converting string */
673 for (send = s + *len; s < send;) {
675 if (!UTF8_IS_INVARIANT(c)) {
676 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
677 (c = *s++) && UTF8_IS_CONTINUATION(c))
686 Newz(801, d, (*len) - count + 1, U8);
687 s = start; start = d;
690 if (!UTF8_IS_INVARIANT(c)) {
691 /* Then it is two-byte encoded */
692 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
693 c = ASCII_TO_NATIVE(c);
703 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
705 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
706 Returns a pointer to the newly-created string, and sets C<len> to
707 reflect the new length.
713 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
720 Newz(801, d, (*len) * 2 + 1, U8);
724 UV uv = NATIVE_TO_ASCII(*s++);
725 if (UNI_IS_INVARIANT(uv))
726 *d++ = UTF_TO_NATIVE(uv);
728 *d++ = UTF8_EIGHT_BIT_HI(uv);
729 *d++ = UTF8_EIGHT_BIT_LO(uv);
738 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
740 * Destination must be pre-extended to 3/2 source. Do not use in-place.
741 * We optimize for native, for obvious reasons. */
744 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
750 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
755 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
762 *d++ = (( uv >> 6) | 0xc0);
763 *d++ = (( uv & 0x3f) | 0x80);
766 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
768 if (low < 0xdc00 || low >= 0xdfff)
769 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
770 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
773 *d++ = (( uv >> 12) | 0xe0);
774 *d++ = (((uv >> 6) & 0x3f) | 0x80);
775 *d++ = (( uv & 0x3f) | 0x80);
779 *d++ = (( uv >> 18) | 0xf0);
780 *d++ = (((uv >> 12) & 0x3f) | 0x80);
781 *d++ = (((uv >> 6) & 0x3f) | 0x80);
782 *d++ = (( uv & 0x3f) | 0x80);
786 *newlen = d - dstart;
790 /* Note: this one is slightly destructive of the source. */
793 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
796 U8* send = s + bytelen;
803 return utf16_to_utf8(p, d, bytelen, newlen);
806 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
809 Perl_is_uni_alnum(pTHX_ U32 c)
811 U8 tmpbuf[UTF8_MAXLEN+1];
812 uvchr_to_utf8(tmpbuf, (UV)c);
813 return is_utf8_alnum(tmpbuf);
817 Perl_is_uni_alnumc(pTHX_ U32 c)
819 U8 tmpbuf[UTF8_MAXLEN+1];
820 uvchr_to_utf8(tmpbuf, (UV)c);
821 return is_utf8_alnumc(tmpbuf);
825 Perl_is_uni_idfirst(pTHX_ U32 c)
827 U8 tmpbuf[UTF8_MAXLEN+1];
828 uvchr_to_utf8(tmpbuf, (UV)c);
829 return is_utf8_idfirst(tmpbuf);
833 Perl_is_uni_alpha(pTHX_ U32 c)
835 U8 tmpbuf[UTF8_MAXLEN+1];
836 uvchr_to_utf8(tmpbuf, (UV)c);
837 return is_utf8_alpha(tmpbuf);
841 Perl_is_uni_ascii(pTHX_ U32 c)
843 U8 tmpbuf[UTF8_MAXLEN+1];
844 uvchr_to_utf8(tmpbuf, (UV)c);
845 return is_utf8_ascii(tmpbuf);
849 Perl_is_uni_space(pTHX_ U32 c)
851 U8 tmpbuf[UTF8_MAXLEN+1];
852 uvchr_to_utf8(tmpbuf, (UV)c);
853 return is_utf8_space(tmpbuf);
857 Perl_is_uni_digit(pTHX_ U32 c)
859 U8 tmpbuf[UTF8_MAXLEN+1];
860 uvchr_to_utf8(tmpbuf, (UV)c);
861 return is_utf8_digit(tmpbuf);
865 Perl_is_uni_upper(pTHX_ U32 c)
867 U8 tmpbuf[UTF8_MAXLEN+1];
868 uvchr_to_utf8(tmpbuf, (UV)c);
869 return is_utf8_upper(tmpbuf);
873 Perl_is_uni_lower(pTHX_ U32 c)
875 U8 tmpbuf[UTF8_MAXLEN+1];
876 uvchr_to_utf8(tmpbuf, (UV)c);
877 return is_utf8_lower(tmpbuf);
881 Perl_is_uni_cntrl(pTHX_ U32 c)
883 U8 tmpbuf[UTF8_MAXLEN+1];
884 uvchr_to_utf8(tmpbuf, (UV)c);
885 return is_utf8_cntrl(tmpbuf);
889 Perl_is_uni_graph(pTHX_ U32 c)
891 U8 tmpbuf[UTF8_MAXLEN+1];
892 uvchr_to_utf8(tmpbuf, (UV)c);
893 return is_utf8_graph(tmpbuf);
897 Perl_is_uni_print(pTHX_ U32 c)
899 U8 tmpbuf[UTF8_MAXLEN+1];
900 uvchr_to_utf8(tmpbuf, (UV)c);
901 return is_utf8_print(tmpbuf);
905 Perl_is_uni_punct(pTHX_ U32 c)
907 U8 tmpbuf[UTF8_MAXLEN+1];
908 uvchr_to_utf8(tmpbuf, (UV)c);
909 return is_utf8_punct(tmpbuf);
913 Perl_is_uni_xdigit(pTHX_ U32 c)
915 U8 tmpbuf[UTF8_MAXLEN+1];
916 uvchr_to_utf8(tmpbuf, (UV)c);
917 return is_utf8_xdigit(tmpbuf);
921 Perl_to_uni_upper(pTHX_ U32 c)
923 U8 tmpbuf[UTF8_MAXLEN+1];
924 uvchr_to_utf8(tmpbuf, (UV)c);
925 return to_utf8_upper(tmpbuf);
929 Perl_to_uni_title(pTHX_ U32 c)
931 U8 tmpbuf[UTF8_MAXLEN+1];
932 uvchr_to_utf8(tmpbuf, (UV)c);
933 return to_utf8_title(tmpbuf);
937 Perl_to_uni_lower(pTHX_ U32 c)
939 U8 tmpbuf[UTF8_MAXLEN+1];
940 uvchr_to_utf8(tmpbuf, (UV)c);
941 return to_utf8_lower(tmpbuf);
944 /* for now these all assume no locale info available for Unicode > 255 */
947 Perl_is_uni_alnum_lc(pTHX_ U32 c)
949 return is_uni_alnum(c); /* XXX no locale support yet */
953 Perl_is_uni_alnumc_lc(pTHX_ U32 c)
955 return is_uni_alnumc(c); /* XXX no locale support yet */
959 Perl_is_uni_idfirst_lc(pTHX_ U32 c)
961 return is_uni_idfirst(c); /* XXX no locale support yet */
965 Perl_is_uni_alpha_lc(pTHX_ U32 c)
967 return is_uni_alpha(c); /* XXX no locale support yet */
971 Perl_is_uni_ascii_lc(pTHX_ U32 c)
973 return is_uni_ascii(c); /* XXX no locale support yet */
977 Perl_is_uni_space_lc(pTHX_ U32 c)
979 return is_uni_space(c); /* XXX no locale support yet */
983 Perl_is_uni_digit_lc(pTHX_ U32 c)
985 return is_uni_digit(c); /* XXX no locale support yet */
989 Perl_is_uni_upper_lc(pTHX_ U32 c)
991 return is_uni_upper(c); /* XXX no locale support yet */
995 Perl_is_uni_lower_lc(pTHX_ U32 c)
997 return is_uni_lower(c); /* XXX no locale support yet */
1001 Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1003 return is_uni_cntrl(c); /* XXX no locale support yet */
1007 Perl_is_uni_graph_lc(pTHX_ U32 c)
1009 return is_uni_graph(c); /* XXX no locale support yet */
1013 Perl_is_uni_print_lc(pTHX_ U32 c)
1015 return is_uni_print(c); /* XXX no locale support yet */
1019 Perl_is_uni_punct_lc(pTHX_ U32 c)
1021 return is_uni_punct(c); /* XXX no locale support yet */
1025 Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1027 return is_uni_xdigit(c); /* XXX no locale support yet */
1031 Perl_to_uni_upper_lc(pTHX_ U32 c)
1033 return to_uni_upper(c); /* XXX no locale support yet */
1037 Perl_to_uni_title_lc(pTHX_ U32 c)
1039 return to_uni_title(c); /* XXX no locale support yet */
1043 Perl_to_uni_lower_lc(pTHX_ U32 c)
1045 return to_uni_lower(c); /* XXX no locale support yet */
1049 Perl_is_utf8_alnum(pTHX_ U8 *p)
1051 if (!is_utf8_char(p))
1054 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1055 * descendant of isalnum(3), in other words, it doesn't
1056 * contain the '_'. --jhi */
1057 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1058 return swash_fetch(PL_utf8_alnum, p);
1059 /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1060 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1062 PL_utf8_alnum = swash_init("utf8", "",
1063 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1064 return swash_fetch(PL_utf8_alnum, p);
1069 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1071 if (!is_utf8_char(p))
1074 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1075 return swash_fetch(PL_utf8_alnum, p);
1076 /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1077 #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1079 PL_utf8_alnum = swash_init("utf8", "",
1080 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1081 return swash_fetch(PL_utf8_alnum, p);
1086 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1088 return *p == '_' || is_utf8_alpha(p);
1092 Perl_is_utf8_alpha(pTHX_ U8 *p)
1094 if (!is_utf8_char(p))
1097 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1098 return swash_fetch(PL_utf8_alpha, p);
1102 Perl_is_utf8_ascii(pTHX_ U8 *p)
1104 if (!is_utf8_char(p))
1107 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1108 return swash_fetch(PL_utf8_ascii, p);
1112 Perl_is_utf8_space(pTHX_ U8 *p)
1114 if (!is_utf8_char(p))
1117 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1118 return swash_fetch(PL_utf8_space, p);
1122 Perl_is_utf8_digit(pTHX_ U8 *p)
1124 if (!is_utf8_char(p))
1127 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1128 return swash_fetch(PL_utf8_digit, p);
1132 Perl_is_utf8_upper(pTHX_ U8 *p)
1134 if (!is_utf8_char(p))
1137 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1138 return swash_fetch(PL_utf8_upper, p);
1142 Perl_is_utf8_lower(pTHX_ U8 *p)
1144 if (!is_utf8_char(p))
1147 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1148 return swash_fetch(PL_utf8_lower, p);
1152 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1154 if (!is_utf8_char(p))
1157 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1158 return swash_fetch(PL_utf8_cntrl, p);
1162 Perl_is_utf8_graph(pTHX_ U8 *p)
1164 if (!is_utf8_char(p))
1167 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1168 return swash_fetch(PL_utf8_graph, p);
1172 Perl_is_utf8_print(pTHX_ U8 *p)
1174 if (!is_utf8_char(p))
1177 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1178 return swash_fetch(PL_utf8_print, p);
1182 Perl_is_utf8_punct(pTHX_ U8 *p)
1184 if (!is_utf8_char(p))
1187 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1188 return swash_fetch(PL_utf8_punct, p);
1192 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1194 if (!is_utf8_char(p))
1196 if (!PL_utf8_xdigit)
1197 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1198 return swash_fetch(PL_utf8_xdigit, p);
1202 Perl_is_utf8_mark(pTHX_ U8 *p)
1204 if (!is_utf8_char(p))
1207 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1208 return swash_fetch(PL_utf8_mark, p);
1212 Perl_to_utf8_upper(pTHX_ U8 *p)
1216 if (!PL_utf8_toupper)
1217 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1218 uv = swash_fetch(PL_utf8_toupper, p);
1219 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1223 Perl_to_utf8_title(pTHX_ U8 *p)
1227 if (!PL_utf8_totitle)
1228 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1229 uv = swash_fetch(PL_utf8_totitle, p);
1230 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1234 Perl_to_utf8_lower(pTHX_ U8 *p)
1238 if (!PL_utf8_tolower)
1239 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1240 uv = swash_fetch(PL_utf8_tolower, p);
1241 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1244 /* a "swash" is a swatch hash */
1247 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1250 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1252 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1254 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1256 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1260 PUSHSTACKi(PERLSI_MAGIC);
1263 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1264 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1266 PUSHs(sv_2mortal(newSViv(minbits)));
1267 PUSHs(sv_2mortal(newSViv(none)));
1273 if (PL_curcop == &PL_compiling)
1274 /* XXX ought to be handled by lex_start */
1275 sv_setpv(tokenbufsv, PL_tokenbuf);
1276 if (call_method("SWASHNEW", G_SCALAR))
1277 retval = newSVsv(*PL_stack_sp--);
1279 retval = &PL_sv_undef;
1282 if (PL_curcop == &PL_compiling) {
1284 char* pv = SvPV(tokenbufsv, len);
1286 Copy(pv, PL_tokenbuf, len+1, char);
1287 PL_curcop->op_private = PL_hints;
1289 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1290 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1295 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
1297 HV* hv = (HV*)SvRV(sv);
1298 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1299 then the "swatch" is a vec() for al the chars which start
1301 So the key in the hash is length of encoded char -1
1303 U32 klen = UTF8SKIP(ptr) - 1;
1304 U32 off = ptr[klen];
1313 /* If char in invariant then swatch is for all the invariant chars
1314 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1316 needents = UTF_CONTINUATION_MARK;
1317 off = NATIVE_TO_UTF(ptr[klen]);
1321 /* If char is encoded then swatch is for the prefix */
1322 needents = (1 << UTF_ACCUMULATION_SHIFT);
1323 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1327 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1328 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1329 * it's nothing to sniff at.) Pity we usually come through at least
1330 * two function calls to get here...
1332 * NB: this code assumes that swatches are never modified, once generated!
1335 if (hv == PL_last_swash_hv &&
1336 klen == PL_last_swash_klen &&
1337 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
1339 tmps = PL_last_swash_tmps;
1340 slen = PL_last_swash_slen;
1343 /* Try our second-level swatch cache, kept in a hash. */
1344 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1346 /* If not cached, generate it via utf8::SWASHGET */
1347 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1349 /* We use utf8n_to_uvuni() as we want an index into
1350 Unicode tables, not a native character number.
1352 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1356 PUSHSTACKi(PERLSI_MAGIC);
1360 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1361 PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
1362 PUSHs(sv_2mortal(newSViv(needents)));
1364 if (call_method("SWASHGET", G_SCALAR))
1365 retval = newSVsv(*PL_stack_sp--);
1367 retval = &PL_sv_undef;
1371 if (PL_curcop == &PL_compiling)
1372 PL_curcop->op_private = PL_hints;
1374 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1376 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1377 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1380 PL_last_swash_hv = hv;
1381 PL_last_swash_klen = klen;
1382 PL_last_swash_tmps = tmps;
1383 PL_last_swash_slen = slen;
1385 Copy(ptr, PL_last_swash_key, klen, U8);
1388 switch ((int)((slen << 3) / needents)) {
1390 bit = 1 << (off & 7);
1392 return (tmps[off] & bit) != 0;
1397 return (tmps[off] << 8) + tmps[off + 1] ;
1400 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1402 Perl_croak(aTHX_ "panic: swash_fetch");
1408 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1410 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1411 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1412 bytes available. The return value is the pointer to the byte after the
1413 end of the new character. In other words,
1415 d = uvchr_to_utf8(d, uv);
1417 is the recommended wide native character-aware way of saying
1424 /* On ASCII machines this is normally a macro but we want a
1425 real function in case XS code wants it
1427 #undef Perl_uvchr_to_utf8
1429 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1431 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1436 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1438 Returns the native character value of the first character in the string C<s>
1439 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1440 length, in bytes, of that character.
1442 Allows length and flags to be passed to low level routine.
1446 /* On ASCII machines this is normally a macro but we want a
1447 real function in case XS code wants it
1449 #undef Perl_utf8n_to_uvchr
1451 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1453 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1454 return UNI_TO_NATIVE(uv);