X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=1f4192f9adbb9c609a0fa229fa80806e1fc6171f;hb=c33ef3ac654cbe35caea1d36f34c68f0e4a134ba;hp=040b27371daf29a939b71dee945c3aa3c2ea838d;hpb=5b3114678323b284f88ba8d2da3cad315a53ed6e;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 040b273..1f4192f 100644 --- a/utf8.c +++ b/utf8.c @@ -33,7 +33,7 @@ #include "perl.h" #ifndef EBCDIC -/* Separate prototypes needed because in ASCII systems these +/* Separate prototypes needed because in ASCII systems these are * usually macros but they still are compiled as code, too. */ PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags); PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); @@ -42,7 +42,7 @@ PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; -/* +/* =head1 Unicode Support This file contains various utility functions for manipulating UTF8-encoded @@ -57,8 +57,10 @@ within non-zero characters. /* =for apidoc is_ascii_string -Returns true if first C<len> bytes of the given string are ASCII (i.e. none -of them even raise the question of UTF-8-ness). +Returns true if the first C<len> bytes of the given string are the same whether +or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That +is, if they are invariant. On ASCII-ish machines, only ASCII characters +fit this definition, hence the function's name. See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). @@ -262,7 +264,7 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len) if (!UTF8_IS_CONTINUATION(*s)) return 0; uv = UTF8_ACCUMULATE(uv, *s); - if (uv < ouv) + if (uv < ouv) return 0; ouv = uv; s++; @@ -1653,8 +1655,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, } }*/ - /* The 0xDF is the only special casing Unicode code point below 0x100. 
*/ - if (special && (uv1 == 0xDF || uv1 > 0xFF)) { + if (special) { /* It might be "special" (sometimes, but not always, * a multicharacter mapping) */ HV * const hv = get_hv(special, 0); @@ -1842,8 +1843,7 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits PUSHSTACKi(PERLSI_MAGIC); ENTER; - SAVEI32(PL_hints); - PL_hints = 0; + SAVEHINTS(); save_re_context(); if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */ ENTER; @@ -2377,7 +2377,7 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) =for apidoc utf8n_to_uvchr flags -Returns the native character value of the first character in the string +Returns the native character value of the first character in the string C<s> which is assumed to be in UTF-8 encoding; C<retlen> will be set to the length, in bytes, of that character. @@ -2390,7 +2390,7 @@ Allows length and flags to be passed to low level routine. a real function in case XS code wants it */ UV -Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, +Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) { const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); @@ -2475,7 +2475,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f } if (truncated) sv_catpvs(dsv, "..."); - + return SvPVX(dsv); } @@ -2502,24 +2502,38 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) } /* -=for apidoc ibcmp_utf8 - -Return true if the strings s1 and s2 differ case-insensitively, false -if not (if they are equal case-insensitively). If u1 is true, the -string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true, -the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2 -are false, the respective string is assumed to be in native 8-bit -encoding. - -If the pe1 and pe2 are non-NULL, the scanning pointers will be copied -in there (they will point at the beginning of the I<next> character). 
-If the pointers behind pe1 or pe2 are non-NULL, they are the end -pointers beyond which scanning will not continue under any -circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and -s2+l2 will be used as goal end pointers that will also stop the scan, -and which qualify towards defining a successful match: all the scans -that define an explicit length must reach their goal pointers for -a match to succeed). +=for apidoc foldEQ_utf8 + +Returns true if the leading portions of the strings s1 and s2 (either or both +of which may be in UTF-8) are the same case-insensitively; false otherwise. +How far into the strings to compare is determined by other input parameters. + +If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode; +otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2 +with respect to s2. + +If the byte length l1 is non-zero, it says how far into s1 to check for fold +equality. In other words, s1+l1 will be used as a goal to reach. The +scan will not be considered to be a match unless the goal is reached, and +scanning won't continue past that goal. Correspondingly for l2 with respect to +s2. + +If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is +considered an end pointer beyond which scanning of s1 will not continue under +any circumstances. This means that if both l1 and pe1 are specified, and pe1 +is less than s1+l1, the match will never be successful because it can never +get as far as its goal (and in fact is asserted against). Correspondingly for +pe2 with respect to s2. + +At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be +non-zero), and if both do, both have to be +reached for a successful match. Also, if the fold of a character is multiple +characters, all of them must be matched (see tr21 reference below for +'folding'). 
+ +Upon a successful match, if pe1 is non-NULL, +it will be set to point to the beginning of the I<next> character of s1 beyond +what was matched. Correspondingly for pe2 and s2. For case-insensitiveness, the "casefolding" of Unicode is used instead of upper/lowercasing both the characters, see @@ -2527,99 +2541,134 @@ http://www.unicode.org/unicode/reports/tr21/ (Case Mappings). =cut */ I32 -Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2) -{ - dVAR; - register const U8 *p1 = (const U8*)s1; - register const U8 *p2 = (const U8*)s2; - register const U8 *f1 = NULL; - register const U8 *f2 = NULL; - register U8 *e1 = NULL; - register U8 *q1 = NULL; - register U8 *e2 = NULL; - register U8 *q2 = NULL; - STRLEN n1 = 0, n2 = 0; - U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; - U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; - U8 natbuf[1+1]; - STRLEN foldlen1, foldlen2; - bool match; - - PERL_ARGS_ASSERT_IBCMP_UTF8; - - if (pe1) - e1 = *(U8**)pe1; - /* assert(e1 || l1); */ - if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1))) - f1 = (const U8*)s1 + l1; - if (pe2) - e2 = *(U8**)pe2; - /* assert(e2 || l2); */ - if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2))) - f2 = (const U8*)s2 + l2; - - /* This shouldn't happen. However, putting an assert() there makes some - * tests fail. */ - /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */ - if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)) - return 1; /* mismatch; possible infinite loop or false positive */ - - if (!u1 || !u2) - natbuf[1] = 0; /* Need to terminate the buffer. 
*/ - - while ((e1 == 0 || p1 < e1) && - (f1 == 0 || p1 < f1) && - (e2 == 0 || p2 < e2) && - (f2 == 0 || p2 < f2)) { - if (n1 == 0) { - if (u1) - to_utf8_fold(p1, foldbuf1, &foldlen1); - else { - uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1))); - to_utf8_fold(natbuf, foldbuf1, &foldlen1); - } - q1 = foldbuf1; - n1 = foldlen1; - } - if (n2 == 0) { - if (u2) - to_utf8_fold(p2, foldbuf2, &foldlen2); - else { - uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2))); - to_utf8_fold(natbuf, foldbuf2, &foldlen2); - } - q2 = foldbuf2; - n2 = foldlen2; - } - while (n1 && n2) { - if ( UTF8SKIP(q1) != UTF8SKIP(q2) || - (UTF8SKIP(q1) == 1 && *q1 != *q2) || - memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) ) - return 1; /* mismatch */ - n1 -= UTF8SKIP(q1); - q1 += UTF8SKIP(q1); - n2 -= UTF8SKIP(q2); - q2 += UTF8SKIP(q2); - } - if (n1 == 0) - p1 += u1 ? UTF8SKIP(p1) : 1; - if (n2 == 0) - p2 += u2 ? UTF8SKIP(p2) : 1; - - } - - /* A match is defined by all the scans that specified - * an explicit length reaching their final goals. */ - match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2); - - if (match) { - if (pe1) - *pe1 = (char*)p1; - if (pe2) - *pe2 = (char*)p2; - } - - return match ? 
0 : 1; /* 0 match, 1 mismatch */ +Perl_foldEQ_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2) +{ + dVAR; + register const U8 *p1 = (const U8*)s1; /* Point to current char */ + register const U8 *p2 = (const U8*)s2; + register const U8 *g1 = NULL; /* goal for s1 */ + register const U8 *g2 = NULL; + register const U8 *e1 = NULL; /* Don't scan s1 past this */ + register U8 *f1 = NULL; /* Point to current folded */ + register const U8 *e2 = NULL; + register U8 *f2 = NULL; + STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */ + U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; + U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; + U8 natbuf[2]; /* Holds native 8-bit char converted to utf8; + these always fit in 2 bytes */ + + PERL_ARGS_ASSERT_FOLDEQ_UTF8; + + if (pe1) { + e1 = *(U8**)pe1; + } + + if (l1) { + g1 = (const U8*)s1 + l1; + } + + if (pe2) { + e2 = *(U8**)pe2; + } + + if (l2) { + g2 = (const U8*)s2 + l2; + } + + /* Must have at least one goal */ + assert(g1 || g2); + + if (g1) { + + /* Will never match if goal is out-of-bounds */ + assert(! e1 || e1 >= g1); + + /* Here, there isn't an end pointer, or it is beyond the goal. We + * only go as far as the goal */ + e1 = g1; + } + else assert(e1); /* Must have an end for looking at s1 */ + + /* Same for goal for s2 */ + if (g2) { + assert(! 
e2 || e2 >= g2); + e2 = g2; + } + else assert(e2); + + /* Look through both strings, a character at a time */ + while (p1 < e1 && p2 < e2) { + + /* If at the beginning of a new character in s1, get its fold to use + * and the length of the fold */ + if (n1 == 0) { + if (u1) { + to_utf8_fold(p1, foldbuf1, &n1); + } + else { /* Not utf8, convert to it first and then get fold */ + uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1))); + to_utf8_fold(natbuf, foldbuf1, &n1); + } + f1 = foldbuf1; + } + + if (n2 == 0) { /* Same for s2 */ + if (u2) { + to_utf8_fold(p2, foldbuf2, &n2); + } + else { + uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2))); + to_utf8_fold(natbuf, foldbuf2, &n2); + } + f2 = foldbuf2; + } + + /* While there is more to look for in both folds, see if they + * continue to match */ + while (n1 && n2) { + U8 fold_length = UTF8SKIP(f1); + if (fold_length != UTF8SKIP(f2) + || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE + function call for single + character */ + || memNE((char*)f1, (char*)f2, fold_length)) + { + return 0; /* mismatch */ + } + + /* Here, they matched, advance past them */ + n1 -= fold_length; + f1 += fold_length; + n2 -= fold_length; + f2 += fold_length; + } + + /* When reach the end of any fold, advance the input past it */ + if (n1 == 0) { + p1 += u1 ? UTF8SKIP(p1) : 1; + } + if (n2 == 0) { + p2 += u2 ? UTF8SKIP(p2) : 1; + } + } /* End of loop through both strings */ + + /* A match is defined by each scan that specified an explicit length + * reaching its final goal, and the other not having matched a partial + * character (which can happen when the fold of a character is more than one + * character). */ + if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) { + return 0; + } + + /* Successful match. Set output pointers */ + if (pe1) { + *pe1 = (char*)p1; + } + if (pe2) { + *pe2 = (char*)p2; + } + return 1; } /*