From: Jarkko Hietaniemi Date: Mon, 31 Dec 2001 16:33:08 +0000 (+0000) Subject: Add some comments to the recent Unicode case-folding saga. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=60a8b682cede796bc3c248d2778db979d6f9b9ff;p=p5sagit%2Fp5-mst-13.2.git Add some comments to the recent Unicode case-folding saga. p4raw-id: //depot/perl@13985 --- diff --git a/regcomp.c b/regcomp.c index cac14bf..b442f2c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2987,7 +2987,7 @@ tryagain: STRLEN numlen; STRLEN ulen; STRLEN foldlen; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1], *foldbuf; + U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf; parse_start = RExC_parse - 1; @@ -3131,13 +3131,18 @@ tryagain: } if (RExC_flags16 & PMf_EXTENDED) p = regwhite(p, RExC_end); - if (UTF && FOLD) + if (UTF && FOLD) { + /* Prime the casefolded buffer. */ toFOLD_uni(ender, tmpbuf, &foldlen); + /* Need to peek at the first character. */ + ender = utf8_to_uvchr(tmpbuf, 0); + } if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { if (FOLD) { + /* Emit all the Unicode characters. */ for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { @@ -3162,6 +3167,7 @@ tryagain: } if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { if (FOLD) { + /* Emit all the Unicode characters. */ for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { @@ -3206,6 +3212,8 @@ tryagain: break; } + /* If the encoding pragma is in effect recode the text of + * any EXACT-kind nodes. */ if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) { STRLEN oldlen = STR_LEN(ret); SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); @@ -4020,9 +4028,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) to_utf8_fold(tmpbuf, foldbuf, &foldlen); f = utf8_to_uvchr(foldbuf, 0); + /* If folding and foldable, insert also + * the folded version to the charclass. 
*/ if (f != value) Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f); + /* If folding and the value is one of the Greek + * sigmas insert a few more sigmas to make the + * folding rules of the sigmas work right. + * Note that not all the possible combinations + * are handled here: some of them are handled + * by the standard folding rules, and + * some of them (literal or EXACTF cases) are + * handled during runtime in + * regexec.c:S_find_byclass(). */ if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); diff --git a/regexec.c b/regexec.c index e67774d..cf33abb 100644 --- a/regexec.c +++ b/regexec.c @@ -965,6 +965,16 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (norun && e < s) e = s; /* Due to minlen logic of intuit() */ + /* The idea in the EXACTF* cases is to first find the + * first character of the EXACTF* node and then, if + * necessary, case-insensitively compare the full + * text of the node. The c1 and c2 are the first + * characters (though in Unicode it gets a bit + * more complicated because there are more cases + * than just upper and lower: one is really supposed + * to use the so-called folding case for case-insensitive + * matching (called "loose matching" in Unicode)). */ + if (do_utf8) { UV c, f; U8 tmpbuf [UTF8_MAXLEN+1]; @@ -1009,6 +1019,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta while (s <= e) { c = utf8_to_uvchr((U8*)s, &len); + /* Handle some of the three Greek sigmas cases. + * Note that not all the possible combinations + * are handled here: some of them are handled + * by the standard folding rules, and + * some of them (the character class or ANYOF + * cases) are handled during compile time in + * regcomp.c:S_regclass(). 
*/ if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA || c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA; @@ -2396,12 +2413,16 @@ S_regmatch(pTHX_ regnode *prog) U8 lfoldbuf[UTF8_MAXLEN_FOLD+1]; STRLEN lfoldlen; + /* Try one of them folded. */ + to_utf8_fold((U8*)l, lfoldbuf, &lfoldlen); if (UTF8SKIP(s) != lfoldlen || memNE(s, (char*)lfoldbuf, lfoldlen)) { U8 sfoldbuf[UTF8_MAXLEN_FOLD+1]; STRLEN sfoldlen; + /* Try both of them folded. */ + to_utf8_fold((U8*)s, sfoldbuf, &sfoldlen); if (sfoldlen != lfoldlen || memNE((char*)sfoldbuf,