From: Jarkko Hietaniemi Date: Sat, 22 Dec 2001 20:10:01 +0000 (+0000) Subject: Unicode casefolding continues. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=bc517b45fdfb539b223ef3bf8d22272436835518;p=p5sagit%2Fp5-mst-13.2.git Unicode casefolding continues. (lib/encoding.t still failing.) p4raw-id: //depot/perl@13855 --- diff --git a/regexec.c b/regexec.c index 1ad4003..4fe7889 100644 --- a/regexec.c +++ b/regexec.c @@ -970,7 +970,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (c1 == c2) while (s <= e) { if ( utf8_to_uvchr((U8*)s, &len) == c1 - && (ln == 1 || + && (ln == len || ibcmp_utf8(s, do_utf8, strend - s, m, UTF, ln)) && (norun || regtry(prog, s)) ) @@ -981,7 +981,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta while (s <= e) { UV c = utf8_to_uvchr((U8*)s, &len); if ( (c == c1 || c == c2) - && (ln == 1 || + && (ln == len || ibcmp_utf8(s, do_utf8, strend - s, m, UTF, ln)) && (norun || regtry(prog, s)) ) @@ -2232,10 +2232,10 @@ S_regmatch(pTHX_ regnode *prog) s = STRING(scan); ln = STR_LEN(scan); if (do_utf8 != (UTF!=0)) { - /* The target and the pattern have differing "utf8ness". */ + /* The target and the pattern have differing utf8ness. */ char *l = locinput; char *e = s + ln; - STRLEN len; + STRLEN ulen; if (do_utf8) { /* The target is utf8, the pattern is not utf8. */ @@ -2243,9 +2243,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*(U8*)s) != - utf8_to_uvchr((U8*)l, &len)) + utf8_to_uvchr((U8*)l, &ulen)) sayNO; - l += len; + l += ulen; s ++; } } @@ -2255,9 +2255,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*((U8*)l)) != - utf8_to_uvchr((U8*)s, &len)) + utf8_to_uvchr((U8*)s, &ulen)) sayNO; - s += len; + s += ulen; l ++; } } @@ -2265,7 +2265,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - /* The target and the pattern have the same "utf8ness". */ + /* The target and the pattern have the same utf8ness. */ /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchr) sayNO; @@ -2283,26 +2283,85 @@ S_regmatch(pTHX_ regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (do_utf8) { + { char *l = locinput; - char *e; - STRLEN ulen; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; - e = s + ln; - while (s < e) { - if (l >= PL_regeol) - sayNO; - toLOWER_utf8((U8*)l, tmpbuf, &ulen); - if (memNE(s, (char*)tmpbuf, ulen)) - sayNO; - s += UTF8SKIP(s); - l += ulen; + char *e = s + ln; + U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; + + if (do_utf8 != (UTF!=0)) { + /* The target and the pattern have differing utf8ness. */ + STRLEN ulen1, ulen2; + UV cs, cl; + + if (do_utf8) { + /* The target is utf8, the pattern is not utf8. */ + while (s < e) { + if (l >= PL_regeol) + sayNO; + + cs = to_uni_fold(NATIVE_TO_UNI(*(U8*)s), + (U8*)s, &ulen1); + cl = utf8_to_uvchr((U8*)l, &ulen2); + + if (cs != cl) { + cl = to_uni_fold(cl, (U8*)l, &ulen2); + if (ulen1 != ulen2 || cs != cl) + sayNO; + } + l += ulen1; + s ++; + } + } + else { + /* The target is not utf8, the pattern is utf8. */ + while (s < e) { + if (l >= PL_regeol) + sayNO; + + cs = utf8_to_uvchr((U8*)s, &ulen1); + + cl = to_uni_fold(NATIVE_TO_UNI(*(U8*)l), + (U8*)l, &ulen2); + + if (cs != cl) { + cs = to_uni_fold(cs, (U8*)s, &ulen1); + if (ulen1 != ulen2 || cs != cl) + sayNO; + } + l ++; + s += ulen1; + } + } + locinput = l; + nextchr = UCHARAT(locinput); + break; + } + + if (do_utf8 && UTF) { + /* Both the target and the pattern are utf8. */ + STRLEN ulen; + + while (s < e) { + if (l >= PL_regeol) + sayNO; + if (UTF8SKIP(s) != UTF8SKIP(l) || + memNE(s, (char*)l, UTF8SKIP(s))) { + to_utf8_fold((U8*)l, tmpbuf, &ulen); + if (UTF8SKIP(s) != ulen || + memNE(s, (char*)tmpbuf, ulen)) + sayNO; + } + l += UTF8SKIP(l); + s += UTF8SKIP(s); + } + locinput = l; + nextchr = UCHARAT(locinput); + break; } - locinput = l; - nextchr = UCHARAT(locinput); - break; } + /* Neither the target and the pattern are utf8. */ + /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchr && UCHARAT(s) != ((OP(scan) == EXACTF) diff --git a/t/op/pat.t b/t/op/pat.t index e4556ee..03eec49 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..770\n"; +print "1..786\n"; BEGIN { chdir 't' if -d 't'; @@ -2290,6 +2290,55 @@ print "# some Unicode properties\n"; print "not " unless "A\x{100}" =~ /A/i; print "ok 757\n"; + + print "not " unless "\x{101}a" =~ /\x{100}/i; + print "ok 758\n"; + + print "not " unless "\x{100}a" =~ /\x{100}/i; + print "ok 759\n"; + + print "not " unless "\x{101}a" =~ /\x{101}/i; + print "ok 760\n"; + + print "not " unless "\x{100}a" =~ /\x{101}/i; + print "ok 761\n"; + + print "not " unless "a\x{100}" =~ /A\x{100}/i; + print "ok 762\n"; + + print "not " unless "A\x{100}" =~ /A\x{100}/i; + print "ok 763\n"; + + print "not " unless "a\x{100}" =~ /a\x{100}/i; + print "ok 764\n"; + + print "not " unless "A\x{100}" =~ /A\x{100}/i; + print "ok 765\n"; + + print "not " unless "a\x{100}" =~ /[A]/i; + print "ok 766\n"; + + print "not " unless "A\x{100}" =~ /[A]/i; + print "ok 767\n"; + + print "not " unless "a\x{100}" =~ /[a]/i; + print "ok 768\n"; + + print "not " unless "A\x{100}" =~ /[A]/i; + print "ok 769\n"; + + print "not " unless "\x{101}a" =~ /[\x{100}]/i; + print "ok 770\n"; + + print "not " unless "\x{100}a" =~ /[\x{100}]/i; + print "ok 771\n"; + + print "not " unless "\x{101}a" =~ /[\x{101}]/i; + print "ok 772\n"; + + print "not " unless "\x{100}a" =~ /[\x{101}]/i; + print "ok 773\n"; + } { @@ -2299,30 +2348,29 @@ print "# some Unicode properties\n"; my $lower = "\N{LATIN SMALL LETTER A WITH GRAVE}"; my $UPPER = "\N{LATIN CAPITAL LETTER A WITH GRAVE}"; - print $lower =~ m/$UPPER/i ? "ok 758\n" : "not ok 758\n"; - print $UPPER =~ m/$lower/i ? "ok 759\n" : "not ok 759\n"; - print $lower =~ m/[$UPPER]/i ? "ok 760\n" : "not ok 760\n"; - print $UPPER =~ m/[$lower]/i ? "ok 761\n" : "not ok 761\n"; + print $lower =~ m/$UPPER/i ? "ok 774\n" : "not ok 774\n"; + print $UPPER =~ m/$lower/i ? "ok 775\n" : "not ok 775\n"; + print $lower =~ m/[$UPPER]/i ? "ok 776\n" : "not ok 776\n"; + print $UPPER =~ m/[$lower]/i ? "ok 777\n" : "not ok 777\n"; print "# GREEK LETTER ALPHA WITH VRACHY\n"; $lower = "\N{GREEK CAPITAL LETTER ALPHA WITH VRACHY}"; $UPPER = "\N{GREEK SMALL LETTER ALPHA WITH VRACHY}"; - print $lower =~ m/$UPPER/i ? "ok 762\n" : "not ok 762\n"; - print $UPPER =~ m/$lower/i ? "ok 763\n" : "not ok 763\n"; - print $lower =~ m/[$UPPER]/i ? "ok 764\n" : "not ok 764\n"; - print $UPPER =~ m/[$lower]/i ? "ok 765\n" : "not ok 765\n"; + print $lower =~ m/$UPPER/i ? "ok 778\n" : "not ok 778\n"; + print $UPPER =~ m/$lower/i ? "ok 779\n" : "not ok 779\n"; + print $lower =~ m/[$UPPER]/i ? "ok 780\n" : "not ok 780\n"; + print $UPPER =~ m/[$lower]/i ? "ok 781\n" : "not ok 781\n"; print "# LATIN LETTER Y WITH DIAERESIS\n"; $lower = "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"; $UPPER = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; - - print $lower =~ m/$UPPER/i ? "ok 766\n" : "not ok 766\n"; - print $UPPER =~ m/$lower/i ? "ok 767\n" : "not ok 767\n"; - print $lower =~ m/[$UPPER]/i ? "ok 768\n" : "not ok 768\n"; - print $UPPER =~ m/[$lower]/i ? "ok 769\n" : "not ok 769\n"; + print $lower =~ m/$UPPER/i ? "ok 782\n" : "not ok 782\n"; + print $UPPER =~ m/$lower/i ? "ok 783\n" : "not ok 783\n"; + print $lower =~ m/[$UPPER]/i ? "ok 784\n" : "not ok 784\n"; + print $UPPER =~ m/[$lower]/i ? "ok 785\n" : "not ok 785\n"; } { @@ -2338,6 +2386,6 @@ print "# some Unicode properties\n"; my $char = "\N{COMBINING GREEK PERISPOMENI}"; my $code = sprintf "%04x", ord($char); - # Before #13843 this was failing. - print "_:$char:_" =~ m/_:$SIGMA:_/i ? "not ok 770\n" : "ok 770\n"; + # Before #13843 this was failing by matching falsely. + print "_:$char:_" =~ m/_:$SIGMA:_/i ? "not ok 786\n" : "ok 786\n"; }