#include "perl.h"
#ifndef EBCDIC
-/* Separate prototypes needed because in ASCII systems these
+/* Separate prototypes needed because in ASCII systems these are
* usually macros but they still are compiled as code, too. */
PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
-/*
+/*
=head1 Unicode Support
This file contains various utility functions for manipulating UTF8-encoded
/*
=for apidoc is_ascii_string
-Returns true if first C<len> bytes of the given string are ASCII (i.e. none
-of them even raise the question of UTF-8-ness).
+Returns true if the first C<len> bytes of the given string are the same whether
+or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
+is, if they are invariant. On ASCII-ish machines, only ASCII characters
+fit this definition, hence the function's name.
See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
if (!UTF8_IS_CONTINUATION(*s))
return 0;
uv = UTF8_ACCUMULATE(uv, *s);
- if (uv < ouv)
+ if (uv < ouv)
return 0;
ouv = uv;
s++;
const UV startbyte = *s;
STRLEN expectlen = 0;
U32 warning = 0;
+ SV* sv;
PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
-/* This list is a superset of the UTF8_ALLOW_XXX. */
+/* This list is a superset of the UTF8_ALLOW_XXX. BUT it isn't, eg SUPER missing XXX */
#define UTF8_WARN_EMPTY 1
#define UTF8_WARN_CONTINUATION 2
}
if (dowarn) {
- SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
+ if (warning == UTF8_WARN_FFFF) {
+ sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP);
+ Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv);
+ }
+ else {
+ sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
+
+ switch (warning) {
+ case 0: /* Intentionally empty. */ break;
+ case UTF8_WARN_EMPTY:
+ sv_catpvs(sv, "(empty string)");
+ break;
+ case UTF8_WARN_CONTINUATION:
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
+ break;
+ case UTF8_WARN_NON_CONTINUATION:
+ if (s == s0)
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
+ (UV)s[1], startbyte);
+ else {
+ const int len = (int)(s-s0);
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
+ (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
+ }
- switch (warning) {
- case 0: /* Intentionally empty. */ break;
- case UTF8_WARN_EMPTY:
- sv_catpvs(sv, "(empty string)");
- break;
- case UTF8_WARN_CONTINUATION:
- Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
- break;
- case UTF8_WARN_NON_CONTINUATION:
- if (s == s0)
- Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
- (UV)s[1], startbyte);
- else {
- const int len = (int)(s-s0);
- Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
- (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
+ break;
+ case UTF8_WARN_FE_FF:
+ Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
+ break;
+ case UTF8_WARN_SHORT:
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+ (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
+ expectlen = curlen; /* distance for caller to skip */
+ break;
+ case UTF8_WARN_OVERFLOW:
+ Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
+ ouv, *s, startbyte);
+ break;
+ case UTF8_WARN_SURROGATE:
+ Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
+ break;
+ case UTF8_WARN_LONG:
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+ (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
+ break;
+ default:
+ sv_catpvs(sv, "(unknown reason)");
+ break;
}
-
- break;
- case UTF8_WARN_FE_FF:
- Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
- break;
- case UTF8_WARN_SHORT:
- Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
- (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
- expectlen = curlen; /* distance for caller to skip */
- break;
- case UTF8_WARN_OVERFLOW:
- Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
- ouv, *s, startbyte);
- break;
- case UTF8_WARN_SURROGATE:
- Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
- break;
- case UTF8_WARN_LONG:
- Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
- (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
- break;
- case UTF8_WARN_FFFF:
- Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
- break;
- default:
- sv_catpvs(sv, "(unknown reason)");
- break;
}
if (warning) {
PERL_ARGS_ASSERT_UTF16_TO_UTF8;
- if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
- d[0] = 0;
- *newlen = 1;
- return d;
- }
-
if (bytelen & 1)
Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
return is_utf8_common(p, &PL_utf8_mark, "IsM");
}
+bool
+Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
+
+ return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
+}
+
+bool
+Perl_is_utf8_X_extend(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
+
+ return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
+}
+
+bool
+Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
+
+ return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
+}
+
+bool
+Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
+
+ return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
+}
+
+bool
+Perl_is_utf8_X_L(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_L;
+
+ return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
+}
+
+bool
+Perl_is_utf8_X_LV(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_LV;
+
+ return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
+}
+
+bool
+Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
+
+ return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
+}
+
+bool
+Perl_is_utf8_X_T(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_T;
+
+ return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
+}
+
+bool
+Perl_is_utf8_X_V(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_V;
+
+ return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
+}
+
+bool
+Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
+
+ return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
+}
+
/*
=for apidoc to_utf8_case
if (!*swashp) /* load on-demand */
*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
+ /* This is the beginnings of a skeleton of code to read the info section
+ * that is in all the swashes in case we ever want to do that, so one can
+ * read things whose maps aren't code points, and whose default if missing
+ * is not to the code point itself. This was just to see if it actually
+ * worked. Details on what the possibilities are are in perluniprops.pod
+ HV * const hv = get_hv("utf8::SwashInfo", 0);
+ if (hv) {
+ SV **svp;
+ svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
+ const char *s;
+
+ HV * const this_hash = SvRV(*svp);
+ svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
+ s = SvPV_const(*svp, len);
+ }
+ }*/
- /* The 0xDF is the only special casing Unicode code point below 0x100. */
- if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
+ if (special) {
/* It might be "special" (sometimes, but not always,
* a multicharacter mapping) */
HV * const hv = get_hv(special, 0);
}
}
- if (!len) /* Neither: just copy. */
+ if (!len) /* Neither: just copy. In other words, there was no mapping
+ defined, which means that the code point maps to itself */
len = uvchr_to_utf8(ustrp, uv0) - ustrp;
if (lenp)
PUSHSTACKi(PERLSI_MAGIC);
ENTER;
- SAVEI32(PL_hints);
- PL_hints = 0;
+ SAVEHINTS();
save_re_context();
if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ENTER;
ptr = tmputf8;
}
/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
- * then the "swatch" is a vec() for al the chars which start
+ * then the "swatch" is a vec() for all the chars which start
* with 0xAA..0xYY
* So the key in the hash (klen) is length of encoded char -1
*/
off = ptr[klen];
if (klen == 0) {
- /* If char in invariant then swatch is for all the invariant chars
+ /* If char is invariant then swatch is for all the invariant chars
* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
*/
needents = UTF_CONTINUATION_MARK;
=for apidoc utf8n_to_uvchr
flags
-Returns the native character value of the first character in the string
+Returns the native character value of the first character in the string
C<s>
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
a real function in case XS code wants it
*/
UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
U32 flags)
{
const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
}
if (truncated)
sv_catpvs(dsv, "...");
-
+
return SvPVX(dsv);
}
}
/*
-=for apidoc ibcmp_utf8
-
-Return true if the strings s1 and s2 differ case-insensitively, false
-if not (if they are equal case-insensitively). If u1 is true, the
-string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
-the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
-are false, the respective string is assumed to be in native 8-bit
-encoding.
-
-If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
-in there (they will point at the beginning of the I<next> character).
-If the pointers behind pe1 or pe2 are non-NULL, they are the end
-pointers beyond which scanning will not continue under any
-circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
-s2+l2 will be used as goal end pointers that will also stop the scan,
-and which qualify towards defining a successful match: all the scans
-that define an explicit length must reach their goal pointers for
-a match to succeed).
+=for apidoc foldEQ_utf8
+
+Returns true if the leading portions of the strings s1 and s2 (either or both
+of which may be in UTF-8) are the same case-insensitively; false otherwise.
+How far into the strings to compare is determined by other input parameters.
+
+If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
+otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
+with respect to s2.
+
+If the byte length l1 is non-zero, it says how far into s1 to check for fold
+equality. In other words, s1+l1 will be used as a goal to reach. The
+scan will not be considered to be a match unless the goal is reached, and
+scanning won't continue past that goal. Correspondingly for l2 with respect to
+s2.
+
+If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
+considered an end pointer beyond which scanning of s1 will not continue under
+any circumstances. This means that if both l1 and pe1 are specified, and pe1
+is less than s1+l1, the match will never be successful because it can never
+get as far as its goal (and in fact is asserted against). Correspondingly for
+pe2 with respect to s2.
+
+At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
+non-zero), and if both do, both have to be
+reached for a successful match. Also, if the fold of a character is multiple
+characters, all of them must be matched (see tr21 reference below for
+'folding').
+
+Upon a successful match, if pe1 is non-NULL,
+it will be set to point to the beginning of the I<next> character of s1 beyond
+what was matched. Correspondingly for pe2 and s2.
For case-insensitiveness, the "casefolding" of Unicode is used
instead of upper/lowercasing both the characters, see
=cut */
I32
-Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
-{
- dVAR;
- register const U8 *p1 = (const U8*)s1;
- register const U8 *p2 = (const U8*)s2;
- register const U8 *f1 = NULL;
- register const U8 *f2 = NULL;
- register U8 *e1 = NULL;
- register U8 *q1 = NULL;
- register U8 *e2 = NULL;
- register U8 *q2 = NULL;
- STRLEN n1 = 0, n2 = 0;
- U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
- U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
- U8 natbuf[1+1];
- STRLEN foldlen1, foldlen2;
- bool match;
-
- PERL_ARGS_ASSERT_IBCMP_UTF8;
-
- if (pe1)
- e1 = *(U8**)pe1;
- /* assert(e1 || l1); */
- if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
- f1 = (const U8*)s1 + l1;
- if (pe2)
- e2 = *(U8**)pe2;
- /* assert(e2 || l2); */
- if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
- f2 = (const U8*)s2 + l2;
-
- /* This shouldn't happen. However, putting an assert() there makes some
- * tests fail. */
- /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
- if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
- return 1; /* mismatch; possible infinite loop or false positive */
-
- if (!u1 || !u2)
- natbuf[1] = 0; /* Need to terminate the buffer. */
-
- while ((e1 == 0 || p1 < e1) &&
- (f1 == 0 || p1 < f1) &&
- (e2 == 0 || p2 < e2) &&
- (f2 == 0 || p2 < f2)) {
- if (n1 == 0) {
- if (u1)
- to_utf8_fold(p1, foldbuf1, &foldlen1);
- else {
- uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
- to_utf8_fold(natbuf, foldbuf1, &foldlen1);
- }
- q1 = foldbuf1;
- n1 = foldlen1;
- }
- if (n2 == 0) {
- if (u2)
- to_utf8_fold(p2, foldbuf2, &foldlen2);
- else {
- uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
- to_utf8_fold(natbuf, foldbuf2, &foldlen2);
- }
- q2 = foldbuf2;
- n2 = foldlen2;
- }
- while (n1 && n2) {
- if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
- (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
- memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
- return 1; /* mismatch */
- n1 -= UTF8SKIP(q1);
- q1 += UTF8SKIP(q1);
- n2 -= UTF8SKIP(q2);
- q2 += UTF8SKIP(q2);
- }
- if (n1 == 0)
- p1 += u1 ? UTF8SKIP(p1) : 1;
- if (n2 == 0)
- p2 += u2 ? UTF8SKIP(p2) : 1;
-
- }
-
- /* A match is defined by all the scans that specified
- * an explicit length reaching their final goals. */
- match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
-
- if (match) {
- if (pe1)
- *pe1 = (char*)p1;
- if (pe2)
- *pe2 = (char*)p2;
- }
-
- return match ? 0 : 1; /* 0 match, 1 mismatch */
+Perl_foldEQ_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
+{
+ dVAR;
+ register const U8 *p1 = (const U8*)s1; /* Point to current char */
+ register const U8 *p2 = (const U8*)s2;
+ register const U8 *g1 = NULL; /* goal for s1 */
+ register const U8 *g2 = NULL;
+ register const U8 *e1 = NULL; /* Don't scan s1 past this */
+ register U8 *f1 = NULL; /* Point to current folded */
+ register const U8 *e2 = NULL;
+ register U8 *f2 = NULL;
+ STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
+ U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
+ U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
+ U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
+ these always fit in 2 bytes */
+
+ PERL_ARGS_ASSERT_FOLDEQ_UTF8;
+
+ if (pe1) {
+ e1 = *(U8**)pe1;
+ }
+
+ if (l1) {
+ g1 = (const U8*)s1 + l1;
+ }
+
+ if (pe2) {
+ e2 = *(U8**)pe2;
+ }
+
+ if (l2) {
+ g2 = (const U8*)s2 + l2;
+ }
+
+ /* Must have at least one goal */
+ assert(g1 || g2);
+
+ if (g1) {
+
+ /* Will never match if goal is out-of-bounds */
+ assert(! e1 || e1 >= g1);
+
+ /* Here, there isn't an end pointer, or it is beyond the goal. We
+ * only go as far as the goal */
+ e1 = g1;
+ }
+ else {
+ assert(e1); /* Must have an end for looking at s1 */
+ }
+
+ /* Same for goal for s2 */
+ if (g2) {
+ assert(! e2 || e2 >= g2);
+ e2 = g2;
+ }
+ else {
+ assert(e2);
+ }
+
+ /* Look through both strings, a character at a time */
+ while (p1 < e1 && p2 < e2) {
+
+ /* If at the beginning of a new character in s1, get its fold to use
+ * and the length of the fold */
+ if (n1 == 0) {
+ if (u1) {
+ to_utf8_fold(p1, foldbuf1, &n1);
+ }
+ else { /* Not utf8, convert to it first and then get fold */
+ uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
+ to_utf8_fold(natbuf, foldbuf1, &n1);
+ }
+ f1 = foldbuf1;
+ }
+
+ if (n2 == 0) { /* Same for s2 */
+ if (u2) {
+ to_utf8_fold(p2, foldbuf2, &n2);
+ }
+ else {
+ uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
+ to_utf8_fold(natbuf, foldbuf2, &n2);
+ }
+ f2 = foldbuf2;
+ }
+
+ /* While there is more to look for in both folds, see if they
+ * continue to match */
+ while (n1 && n2) {
+ U8 fold_length = UTF8SKIP(f1);
+ if (fold_length != UTF8SKIP(f2)
+ || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
+ function call for single
+ character */
+ || memNE((char*)f1, (char*)f2, fold_length))
+ {
+ return 0; /* mismatch */
+ }
+
+ /* Here, they matched, advance past them */
+ n1 -= fold_length;
+ f1 += fold_length;
+ n2 -= fold_length;
+ f2 += fold_length;
+ }
+
+ /* When reach the end of any fold, advance the input past it */
+ if (n1 == 0) {
+ p1 += u1 ? UTF8SKIP(p1) : 1;
+ }
+ if (n2 == 0) {
+ p2 += u2 ? UTF8SKIP(p2) : 1;
+ }
+ } /* End of loop through both strings */
+
+ /* A match is defined by each scan that specified an explicit length
+ * reaching its final goal, and the other not having matched a partial
+ * character (which can happen when the fold of a character is more than one
+ * character). */
+ if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
+ return 0;
+ }
+
+ /* Successful match. Set output pointers */
+ if (pe1) {
+ *pe1 = (char*)p1;
+ }
+ if (pe2) {
+ *pe2 = (char*)p2;
+ }
+ return 1;
}
/*