From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Sat, 22 Dec 2001 20:10:01 +0000 (+0000)
Subject: Unicode casefolding continues.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=bc517b45fdfb539b223ef3bf8d22272436835518;p=p5sagit%2Fp5-mst-13.2.git

Unicode casefolding continues.
(lib/encoding.t still failing.)

p4raw-id: //depot/perl@13855
---

diff --git a/regexec.c b/regexec.c
index 1ad4003..4fe7889 100644
--- a/regexec.c
+++ b/regexec.c
@@ -970,7 +970,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		if (c1 == c2)
 		    while (s <= e) {
 			if ( utf8_to_uvchr((U8*)s, &len) == c1
-			     && (ln == 1 ||
+			     && (ln == len ||
 				 ibcmp_utf8(s, do_utf8,  strend - s,
 					    m, UTF, ln))
 			     && (norun || regtry(prog, s)) )
@@ -981,7 +981,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    while (s <= e) {
 			UV c = utf8_to_uvchr((U8*)s, &len);
 			if ( (c == c1 || c == c2)
-			     && (ln == 1 ||
+			     && (ln == len ||
 				 ibcmp_utf8(s, do_utf8, strend - s,
 					    m, UTF, ln))
 			     && (norun || regtry(prog, s)) )
@@ -2232,10 +2232,10 @@ S_regmatch(pTHX_ regnode *prog)
 	    s = STRING(scan);
 	    ln = STR_LEN(scan);
 	    if (do_utf8 != (UTF!=0)) {
-		/* The target and the pattern have differing "utf8ness". */
+		/* The target and the pattern have differing utf8ness. */
 		char *l = locinput;
 		char *e = s + ln;
-		STRLEN len;
+		STRLEN ulen;
 
 		if (do_utf8) {
 		    /* The target is utf8, the pattern is not utf8. */
@@ -2243,9 +2243,9 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			     sayNO;
 			if (NATIVE_TO_UNI(*(U8*)s) !=
-			    utf8_to_uvchr((U8*)l, &len))
+			    utf8_to_uvchr((U8*)l, &ulen))
 			     sayNO;
-			l += len;
+			l += ulen;
 			s ++;
 		    }
 		}
@@ -2255,9 +2255,9 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			    sayNO;
 			if (NATIVE_TO_UNI(*((U8*)l)) !=
-			    utf8_to_uvchr((U8*)s, &len))
+			    utf8_to_uvchr((U8*)s, &ulen))
 			    sayNO;
-			s += len;
+			s += ulen;
 			l ++;
 		    }
 		}
@@ -2265,7 +2265,7 @@ S_regmatch(pTHX_ regnode *prog)
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
-	    /* The target and the pattern have the same "utf8ness". */
+	    /* The target and the pattern have the same utf8ness. */
 	    /* Inline the first character, for speed. */
 	    if (UCHARAT(s) != nextchr)
 		sayNO;
@@ -2283,26 +2283,85 @@ S_regmatch(pTHX_ regnode *prog)
 	    s = STRING(scan);
 	    ln = STR_LEN(scan);
 
-	    if (do_utf8) {
+	    {
 		char *l = locinput;
-		char *e;
-		STRLEN ulen;
-		U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
-		e = s + ln;
-		while (s < e) {
-		    if (l >= PL_regeol)
-			sayNO;
-		    toLOWER_utf8((U8*)l, tmpbuf, &ulen);
-		    if (memNE(s, (char*)tmpbuf, ulen))
-		        sayNO;
-		    s += UTF8SKIP(s);
-		    l += ulen;
+		char *e = s + ln;
+		U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+
+		if (do_utf8 != (UTF!=0)) {
+		     /* The target and the pattern have differing utf8ness. */
+		     STRLEN ulen1, ulen2;
+		     UV cs, cl;
+
+		     if (do_utf8) {
+			  /* The target is utf8, the pattern is not utf8. */
+			  while (s < e) {
+			       if (l >= PL_regeol)
+				    sayNO;
+
+			       cs = to_uni_fold(NATIVE_TO_UNI(*(U8*)s),
+						(U8*)s, &ulen1);
+			       cl = utf8_to_uvchr((U8*)l, &ulen2);
+
+			       if (cs != cl) {
+				    cl = to_uni_fold(cl, (U8*)l, &ulen2);
+				    if (ulen1 != ulen2 || cs != cl)
+					 sayNO;
+			       }
+			       l += ulen1;
+			       s ++;
+			  }
+		     }
+		     else {
+			  /* The target is not utf8, the pattern is utf8. */
+			  while (s < e) {
+			       if (l >= PL_regeol)
+				    sayNO;
+
+			       cs = utf8_to_uvchr((U8*)s, &ulen1);
+
+			       cl = to_uni_fold(NATIVE_TO_UNI(*(U8*)l),
+						(U8*)l, &ulen2);
+
+			       if (cs != cl) {
+				    cs = to_uni_fold(cs, (U8*)s, &ulen1);
+				    if (ulen1 != ulen2 || cs != cl)
+					 sayNO;
+			       }
+			       l ++;
+			       s += ulen1;
+			  }
+		     }
+		     locinput = l;
+		     nextchr = UCHARAT(locinput);
+		     break;
+		}
+
+		if (do_utf8 && UTF) {
+		     /* Both the target and the pattern are utf8. */
+		     STRLEN ulen;
+		     
+		     while (s < e) {
+			  if (l >= PL_regeol)
+			       sayNO;
+			  if (UTF8SKIP(s) != UTF8SKIP(l) ||
+			      memNE(s, (char*)l, UTF8SKIP(s))) {
+			       to_utf8_fold((U8*)l, tmpbuf, &ulen);
+			       if (UTF8SKIP(s) != ulen ||
+				   memNE(s, (char*)tmpbuf, ulen))
+				    sayNO;
+			  }
+			  l += UTF8SKIP(l);
+			  s += UTF8SKIP(s);
+		     }
+		     locinput = l;
+		     nextchr = UCHARAT(locinput);
+		     break;
 		}
-		locinput = l;
-		nextchr = UCHARAT(locinput);
-		break;
 	    }
 
+	    /* Neither the target and the pattern are utf8. */
+
 	    /* Inline the first character, for speed. */
 	    if (UCHARAT(s) != nextchr &&
 		UCHARAT(s) != ((OP(scan) == EXACTF)
diff --git a/t/op/pat.t b/t/op/pat.t
index e4556ee..03eec49 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..770\n";
+print "1..786\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -2290,6 +2290,55 @@ print "# some Unicode properties\n";
 
     print "not " unless "A\x{100}" =~ /A/i;
     print "ok 757\n";
+
+    print "not " unless "\x{101}a" =~ /\x{100}/i;
+    print "ok 758\n";
+
+    print "not " unless "\x{100}a" =~ /\x{100}/i;
+    print "ok 759\n";
+
+    print "not " unless "\x{101}a" =~ /\x{101}/i;
+    print "ok 760\n";
+
+    print "not " unless "\x{100}a" =~ /\x{101}/i;
+    print "ok 761\n";
+
+    print "not " unless "a\x{100}" =~ /A\x{100}/i;
+    print "ok 762\n";
+
+    print "not " unless "A\x{100}" =~ /A\x{100}/i;
+    print "ok 763\n";
+
+    print "not " unless "a\x{100}" =~ /a\x{100}/i;
+    print "ok 764\n";
+
+    print "not " unless "A\x{100}" =~ /A\x{100}/i;
+    print "ok 765\n";
+
+    print "not " unless "a\x{100}" =~ /[A]/i;
+    print "ok 766\n";
+
+    print "not " unless "A\x{100}" =~ /[A]/i;
+    print "ok 767\n";
+
+    print "not " unless "a\x{100}" =~ /[a]/i;
+    print "ok 768\n";
+
+    print "not " unless "A\x{100}" =~ /[A]/i;
+    print "ok 769\n";
+
+    print "not " unless "\x{101}a" =~ /[\x{100}]/i;
+    print "ok 770\n";
+
+    print "not " unless "\x{100}a" =~ /[\x{100}]/i;
+    print "ok 771\n";
+
+    print "not " unless "\x{101}a" =~ /[\x{101}]/i;
+    print "ok 772\n";
+
+    print "not " unless "\x{100}a" =~ /[\x{101}]/i;
+    print "ok 773\n";
+
 }
 
 {
@@ -2299,30 +2348,29 @@ print "# some Unicode properties\n";
     my $lower = "\N{LATIN SMALL LETTER A WITH GRAVE}";
     my $UPPER = "\N{LATIN CAPITAL LETTER A WITH GRAVE}";
 
-    print $lower =~ m/$UPPER/i   ? "ok 758\n" : "not ok 758\n";
-    print $UPPER =~ m/$lower/i   ? "ok 759\n" : "not ok 759\n";
-    print $lower =~ m/[$UPPER]/i ? "ok 760\n" : "not ok 760\n";
-    print $UPPER =~ m/[$lower]/i ? "ok 761\n" : "not ok 761\n";
+    print $lower =~ m/$UPPER/i   ? "ok 774\n" : "not ok 774\n";
+    print $UPPER =~ m/$lower/i   ? "ok 775\n" : "not ok 775\n";
+    print $lower =~ m/[$UPPER]/i ? "ok 776\n" : "not ok 776\n";
+    print $UPPER =~ m/[$lower]/i ? "ok 777\n" : "not ok 777\n";
 
     print "# GREEK LETTER ALPHA WITH VRACHY\n";
 
     $lower = "\N{GREEK CAPITAL LETTER ALPHA WITH VRACHY}";
     $UPPER = "\N{GREEK SMALL LETTER ALPHA WITH VRACHY}";
 
-    print $lower =~ m/$UPPER/i   ? "ok 762\n" : "not ok 762\n";
-    print $UPPER =~ m/$lower/i   ? "ok 763\n" : "not ok 763\n";
-    print $lower =~ m/[$UPPER]/i ? "ok 764\n" : "not ok 764\n";
-    print $UPPER =~ m/[$lower]/i ? "ok 765\n" : "not ok 765\n";
+    print $lower =~ m/$UPPER/i   ? "ok 778\n" : "not ok 778\n";
+    print $UPPER =~ m/$lower/i   ? "ok 779\n" : "not ok 779\n";
+    print $lower =~ m/[$UPPER]/i ? "ok 780\n" : "not ok 780\n";
+    print $UPPER =~ m/[$lower]/i ? "ok 781\n" : "not ok 781\n";
 
     print "# LATIN LETTER Y WITH DIAERESIS\n";
 
     $lower = "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}";
     $UPPER = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
-
-    print $lower =~ m/$UPPER/i   ? "ok 766\n" : "not ok 766\n";
-    print $UPPER =~ m/$lower/i   ? "ok 767\n" : "not ok 767\n";
-    print $lower =~ m/[$UPPER]/i ? "ok 768\n" : "not ok 768\n";
-    print $UPPER =~ m/[$lower]/i ? "ok 769\n" : "not ok 769\n";
+    print $lower =~ m/$UPPER/i   ? "ok 782\n" : "not ok 782\n";
+    print $UPPER =~ m/$lower/i   ? "ok 783\n" : "not ok 783\n";
+    print $lower =~ m/[$UPPER]/i ? "ok 784\n" : "not ok 784\n";
+    print $UPPER =~ m/[$lower]/i ? "ok 785\n" : "not ok 785\n";
 }
 
 {
@@ -2338,6 +2386,6 @@ print "# some Unicode properties\n";
     my $char = "\N{COMBINING GREEK PERISPOMENI}";
     my $code = sprintf "%04x", ord($char);
 
-    # Before #13843 this was failing.
-    print "_:$char:_" =~ m/_:$SIGMA:_/i ? "not ok 770\n" : "ok 770\n";
+    # Before #13843 this was failing by matching falsely.
+    print "_:$char:_" =~ m/_:$SIGMA:_/i ? "not ok 786\n" : "ok 786\n";
 }