Add some comments to the recent Unicode case-folding saga.
Jarkko Hietaniemi [Mon, 31 Dec 2001 16:33:08 +0000 (16:33 +0000)]
p4raw-id: //depot/perl@13985

regcomp.c
regexec.c

index cac14bf..b442f2c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2987,7 +2987,7 @@ tryagain:
            STRLEN numlen;
            STRLEN ulen;
            STRLEN foldlen;
-           U8 tmpbuf[UTF8_MAXLEN_UCLC+1], *foldbuf;
+           U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
 
             parse_start = RExC_parse - 1;
 
@@ -3131,13 +3131,18 @@ tryagain:
                }
                if (RExC_flags16 & PMf_EXTENDED)
                    p = regwhite(p, RExC_end);
-               if (UTF && FOLD)
+               if (UTF && FOLD) {
+                   /* Prime the casefolded buffer. */
                    toFOLD_uni(ender, tmpbuf, &foldlen);
+                   /* Need to peek at the first character. */
+                   ender = utf8_to_uvchr(tmpbuf, 0);
+               }
                if (ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
                        p = oldp;
                    else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
                         if (FOLD) {
+                             /* Emit all the Unicode characters. */
                              for (foldbuf = tmpbuf;
                                   foldlen;
                                   foldlen -= numlen) {
@@ -3162,6 +3167,7 @@ tryagain:
                }
                if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
                     if (FOLD) {
+                         /* Emit all the Unicode characters. */
                          for (foldbuf = tmpbuf;
                               foldlen;
                               foldlen -= numlen) {
@@ -3206,6 +3212,8 @@ tryagain:
        break;
     }
 
+    /* If the encoding pragma is in effect recode the text of
+     * any EXACT-kind nodes. */
     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
         STRLEN oldlen = STR_LEN(ret);
         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
@@ -4020,9 +4028,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                         to_utf8_fold(tmpbuf, foldbuf, &foldlen);
                         f = utf8_to_uvchr(foldbuf, 0);
 
+                        /* If folding and foldable, insert also
+                         * the folded version to the charclass. */
                         if (f != value)
                              Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f);
 
+                        /* If folding and the value is one of the Greek
+                         * sigmas, insert a few more sigmas to make the
+                         * folding rules of the sigmas work right.
+                         * Note that not all the possible combinations
+                         * are handled here: some of them are
+                         * handled by the standard folding rules, and
+                         * some of them (literal or EXACTF cases) are
+                         * handled during runtime in
+                         * regexec.c:S_find_byclass(). */
                         if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
                              Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
                                             (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
index e67774d..cf33abb 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -965,6 +965,16 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            if (norun && e < s)
                e = s;                  /* Due to minlen logic of intuit() */
 
+           /* The idea in the EXACTF* cases is to first find the
+            * first character of the EXACTF* node and then, if
+            * necessary, case-insensitively compare the full
+            * text of the node.  The c1 and c2 are the first
+            * characters (though in Unicode it gets a bit
+            * more complicated because there are more cases
+            * than just upper and lower: one is really supposed
+            * to use the so-called folding case for case-insensitive
+            * matching (called "loose matching" in Unicode)).  */
+
            if (do_utf8) {
                UV c, f;
                U8 tmpbuf [UTF8_MAXLEN+1];
@@ -1009,6 +1019,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                    while (s <= e) {
                        c = utf8_to_uvchr((U8*)s, &len);
 
+                       /* Handle some of the three Greek sigmas cases.
+                         * Note that not all the possible combinations
+                         * are handled here: some of them are
+                         * handled by the standard folding rules, and
+                         * some of them (the character class or ANYOF
+                         * cases) are handled during compile time in
+                         * regcomp.c:S_regclass(). */
                        if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
                            c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
                            c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
@@ -2396,12 +2413,16 @@ S_regmatch(pTHX_ regnode *prog)
                               U8 lfoldbuf[UTF8_MAXLEN_FOLD+1];
                               STRLEN lfoldlen;
 
+                              /* Try one of them folded. */
+
                               to_utf8_fold((U8*)l, lfoldbuf, &lfoldlen);
                               if (UTF8SKIP(s) != lfoldlen ||
                                   memNE(s, (char*)lfoldbuf, lfoldlen)) {
                                    U8 sfoldbuf[UTF8_MAXLEN_FOLD+1];
                                    STRLEN sfoldlen;
 
+                                   /* Try both of them folded. */
+
                                    to_utf8_fold((U8*)s, sfoldbuf, &sfoldlen);
                                    if (sfoldlen != lfoldlen ||
                                        memNE((char*)sfoldbuf,