Revamp ibcmp_utf8 for performance and clarity

diff --git a/utf8.c b/utf8.c

index 6c522d0..8fd5db9 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2504,22 +2504,33 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
 /*
 =for apidoc ibcmp_utf8
 
-Return true if the strings s1 and s2 differ case-insensitively, false
-if not (if they are equal case-insensitively).  If u1 is true, the
-string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
-the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
-are false, the respective string is assumed to be in native 8-bit
-encoding.
-
-If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
-in there (they will point at the beginning of the I<next> character).
-If the pointers behind pe1 or pe2 are non-NULL, they are the end
-pointers beyond which scanning will not continue under any
-circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
-s2+l2 will be used as goal end pointers that will also stop the scan,
-and which qualify towards defining a successful match: all the scans
-that define an explicit length must reach their goal pointers for
-a match to succeed).
+Returns true if the strings s1 and s2 differ case-insensitively, false
+if they are equal case-insensitively.  Note that this is the complement of what
+you might expect (perhaps it would have been better to name it C<ibncmp_utf8>).
+
+If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
+otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for u2
+with respect to s2.
+
+If the byte length l1 is non-zero, s1+l1 will be used as a goal to reach.  The
+scan will not be considered to be a match unless the goal is reached, and
+scanning won't continue past that goal.  Correspondingly for l2 with respect to
+s2.
+
+If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
+considered an end pointer beyond which scanning of s1 will not continue under
+any circumstances.  This means that if both l1 and pe1 are specified, and pe1
+is less than s1+l1, the match will never be successful because it can never
+get as far as its goal.  Correspondingly for pe2 with respect to s2.
+
+At least one of s1 and s2 must have a goal, and if both do, both have to be
+reached for a successful match.   Also, if the fold of a character is multiple
+characters, all of them must be matched (see tr21 reference below for
+'folding').
+
+Upon a successful match (when the routine returns false), if pe1 is non-NULL,
+it will be set to point to the beginning of the I<next> character of s1 beyond
+what was matched.  Correspondingly for pe2 and s2.
 
 For case-insensitiveness, the "casefolding" of Unicode is used
 instead of upper/lowercasing both the characters, see
@@ -2529,98 +2540,131 @@ http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
 I32
 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
 {
-     dVAR;
-     register const U8 *p1  = (const U8*)s1;
-     register const U8 *p2  = (const U8*)s2;
-     register const U8 *f1 = NULL;
-     register const U8 *f2 = NULL;
-     register U8 *e1 = NULL;
-     register U8 *q1 = NULL;
-     register U8 *e2 = NULL;
-     register U8 *q2 = NULL;
-     STRLEN n1 = 0, n2 = 0;
-     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
-     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
-     U8 natbuf[1+1];
-     STRLEN foldlen1, foldlen2;
-     bool match;
-
-     PERL_ARGS_ASSERT_IBCMP_UTF8;
-     
-     if (pe1)
-         e1 = *(U8**)pe1;
-     /* assert(e1 || l1); */
-     if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
-         f1 = (const U8*)s1 + l1;
-     if (pe2)
-         e2 = *(U8**)pe2;
-     /* assert(e2 || l2); */
-     if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
-         f2 = (const U8*)s2 + l2;
-
-     /* This shouldn't happen. However, putting an assert() there makes some
-      * tests fail. */
-     /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
-     if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
-         return 1; /* mismatch; possible infinite loop or false positive */
-
-     if (!u1 || !u2)
-         natbuf[1] = 0; /* Need to terminate the buffer. */
-
-     while ((e1 == 0 || p1 < e1) &&
-           (f1 == 0 || p1 < f1) &&
-           (e2 == 0 || p2 < e2) &&
-           (f2 == 0 || p2 < f2)) {
-         if (n1 == 0) {
-              if (u1)
-                   to_utf8_fold(p1, foldbuf1, &foldlen1);
-              else {
-                   uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
-                   to_utf8_fold(natbuf, foldbuf1, &foldlen1);
-              }
-              q1 = foldbuf1;
-              n1 = foldlen1;
-         }
-         if (n2 == 0) {
-              if (u2)
-                   to_utf8_fold(p2, foldbuf2, &foldlen2);
-              else {
-                   uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
-                   to_utf8_fold(natbuf, foldbuf2, &foldlen2);
-              }
-              q2 = foldbuf2;
-              n2 = foldlen2;
-         }
-         while (n1 && n2) {
-              if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
-                  (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
-                   memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
-                  return 1; /* mismatch */
-              n1 -= UTF8SKIP(q1);
-              q1 += UTF8SKIP(q1);
-              n2 -= UTF8SKIP(q2);
-              q2 += UTF8SKIP(q2);
-         }
-         if (n1 == 0)
-              p1 += u1 ? UTF8SKIP(p1) : 1;
-         if (n2 == 0)
-              p2 += u2 ? UTF8SKIP(p2) : 1;
-
-     }
-
-     /* A match is defined by all the scans that specified
-      * an explicit length reaching their final goals. */
-     match = (n1 == 0 && n2 == 0    /* Must not match partial char; Bug #72998 */
-            && (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2));
-
-     if (match) {
-         if (pe1)
-              *pe1 = (char*)p1;
-         if (pe2)
-              *pe2 = (char*)p2;
-     }
-
-     return match ? 0 : 1; /* 0 match, 1 mismatch */
+    dVAR;
+    register const U8 *p1  = (const U8*)s1; /* Point to current char */
+    register const U8 *p2  = (const U8*)s2;
+    register const U8 *g1 = NULL;      /* goal for s1 */
+    register const U8 *g2 = NULL;
+    register const U8 *e1 = NULL;      /* Don't scan s1 past this */
+    register U8 *f1 = NULL;            /* Point to current folded */
+    register const U8 *e2 = NULL;
+    register U8 *f2 = NULL;
+    STRLEN n1 = 0, n2 = 0;             /* Number of bytes in current char */
+    U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
+    U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
+    U8 natbuf[2];              /* Holds native 8-bit char converted to utf8;
+                                  these always fit in 2 bytes */
+
+    PERL_ARGS_ASSERT_IBCMP_UTF8;
+
+    if (pe1) {
+       e1 = *(U8**)pe1;
+    }
+
+    if (l1) {
+       g1 = (const U8*)s1 + l1;
+    }
+
+    if (pe2) {
+       e2 = *(U8**)pe2;
+    }
+
+    if (l2) {
+       g2 = (const U8*)s2 + l2;
+    }
+
+    /* Must have at least one goal */
+    assert(g1 || g2);
+
+    if (g1) {
+
+       /* Will never match if goal is out-of-bounds */
+       assert(! e1  || e1 >= g1);
+
+       /* Here, there isn't an end pointer, or it is beyond the goal.  We
+       * only go as far as the goal */
+       e1 = g1;
+    }
+    else assert(e1);   /* Must have an end for looking at s1 */
+
+    /* Same for goal for s2 */
+    if (g2) {
+       assert(! e2  || e2 >= g2);
+       e2 = g2;
+    }
+    else assert(e2);
+
+    /* Look through both strings, a character at a time */
+    while (p1 < e1 && p2 < e2) {
+
+       /* If at the beginning of a new character in s1, get its fold to use */
+       if (n1 == 0) {
+           if (u1) {
+               to_utf8_fold(p1, foldbuf1, &n1);
+           }
+           else {  /* Not utf8, convert to it first and then get fold */
+               uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
+               to_utf8_fold(natbuf, foldbuf1, &n1);
+           }
+           f1 = foldbuf1;
+       }
+
+       if (n2 == 0) {    /* Same for s2 */
+           if (u2) {
+               to_utf8_fold(p2, foldbuf2, &n2);
+           }
+           else {
+               uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
+               to_utf8_fold(natbuf, foldbuf2, &n2);
+           }
+           f2 = foldbuf2;
+       }
+
+       /* While there is more to look for in both folds, see if they
+       * continue to match */
+       while (n1 && n2) {
+           U8 fold_length = UTF8SKIP(f1);
+           if (fold_length != UTF8SKIP(f2)
+               || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
+                                                      function call for single
+                                                      character */
+               || memNE((char*)f1, (char*)f2, fold_length))
+           {
+               return 1; /* mismatch */
+           }
+
+           /* Here, they matched, advance past them */
+           n1 -= fold_length;
+           f1 += fold_length;
+           n2 -= fold_length;
+           f2 += fold_length;
+       }
+
+       /* When reach the end of any fold, advance the input past it */
+       if (n1 == 0) {
+           p1 += u1 ? UTF8SKIP(p1) : 1;
+       }
+       if (n2 == 0) {
+           p2 += u2 ? UTF8SKIP(p2) : 1;
+       }
+    } /* End of loop through both strings */
+
+    /* A match is defined by each scan that specified an explicit length
+    * reaching its final goal, and the other not having matched a partial
+    * character (which can happen when the fold of a character is more than one
+    * character). */
+    if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
+       return 1;
+    }
+
+    /* Successful match.  Set output pointers */
+    if (pe1) {
+       *pe1 = (char*)p1;
+    }
+    if (pe2) {
+       *pe2 = (char*)p2;
+    }
+    return 0;
 }
 
 /*