From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Sat, 12 Jan 2002 20:05:29 +0000 (+0000)
Subject: Sharp S as a special treat for our German UTF-8 testers :-)
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e0f9d4a8607e11c8ec6d3c60018f5ccbfea703e6;p=p5sagit%2Fp5-mst-13.2.git

Sharp S as a special treat for our German UTF-8 testers :-)

p4raw-id: //depot/perl@14222
---

diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 23d6ff1..beb742e 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -639,8 +639,12 @@ Level 1 - Basic Unicode Support
         [ 5] have negation
         [ 6] can use look-ahead to emulate subtraction (*)
         [ 7] include Letters in word characters
-        [ 8] some cases of "ss"/"SS" matching U+00DF in a character
-	     class are missing, but that is allowed according to the TR18.
+        [ 8] note that perl does Full casefolding in matching, not Simple:
+             for example U+1F88 is equivalent with U+1F000 U+03B9,
+             not with 1F80.  This difference matters for certain Greek
+             capital letters with certain modifiers: the Full casefolding
+             decomposes the letter, while the Simple casefolding would map
+             it to a single character.
         [ 9] see UTR#13 Unicode Newline Guidelines
         [10] should do ^ and $ also on \x{85}, \x{2028} and \x{2029})
              (should also affect <>, $., and script line numbers)
diff --git a/regexec.c b/regexec.c
index 78c4e24..df4a31b 100644
--- a/regexec.c
+++ b/regexec.c
@@ -916,15 +916,19 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	switch (OP(c)) {
 	case ANYOF:
 	    while (s < strend) {
-		if (reginclass(c, (U8*)s, do_utf8)) {
+	        STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
+
+		if (reginclass(c, (U8*)s, do_utf8) ||
+		    (ANYOF_UNICODE_FOLD_SHARP_S(c, s, strend) &&
+		     (skip = 2))) {
 		    if (tmp && (norun || regtry(prog, s)))
 			goto got_it;
 		    else
 			tmp = doevery;
 		}
-		else
-		    tmp = 1;
-		s += do_utf8 ? UTF8SKIP(s) : 1;
+		else 
+		     tmp = 1;
+		s += skip;
 	    }
 	    break;
 	case CANY:
@@ -2108,6 +2112,7 @@ typedef union re_unwind_t {
 
 #define sayYES goto yes
 #define sayNO goto no
+#define sayNO_ANYOF goto no_anyof
 #define sayYES_FINAL goto yes_final
 #define sayYES_LOUD  goto yes_loud
 #define sayNO_FINAL  goto no_final
@@ -2396,21 +2401,33 @@ S_regmatch(pTHX_ regnode *prog)
 	        STRLEN inclasslen = PL_regeol - locinput;
 
 	        if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8))
-		    sayNO;
+		    sayNO_ANYOF;
 		if (locinput >= PL_regeol)
 		    sayNO;
 		locinput += inclasslen;
 		nextchr = UCHARAT(locinput);
+		break;
 	    }
 	    else {
 		if (nextchr < 0)
 		    nextchr = UCHARAT(locinput);
 		if (!reginclass(scan, (U8*)locinput, do_utf8))
-		    sayNO;
+		    sayNO_ANYOF;
 		if (!nextchr && locinput >= PL_regeol)
 		    sayNO;
 		nextchr = UCHARAT(++locinput);
+		break;
+	    }
+	no_anyof:
+	    /* If we might have the case of the German sharp s
+	     * in a casefolding Unicode character class. */
+
+	    if (ANYOF_UNICODE_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
+		 locinput += 2;
+		 nextchr = UCHARAT(locinput);
 	    }
+	    else
+		 sayNO;
 	    break;
 	case ALNUML:
 	    PL_reg_flags |= RF_tainted;
diff --git a/t/op/pat.t b/t/op/pat.t
index 19ec634..edd34b7 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..848\n";
+print "1..850\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -2602,22 +2602,20 @@ print "# some Unicode properties\n";
     print "SS" =~
 	/\N{LATIN SMALL LETTER SHARP S}/i   ? "ok 840\n" : "not ok 840\n";
 
-# These are a bit tricky.  Since the LATIN SMALL LETTER SHARP S is U+00DF,
-# the ANYOF reduces to a byte.  The Unicodeness needs to be caught earlier.
-#    print "ss" =~
-#	/[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n";
-#
-#    print "SS" =~
-#	/[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 842\n" : "not ok 842\n";
+    print "ss" =~
+	/[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n";
+
+    print "SS" =~
+	/[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 842\n" : "not ok 842\n";
 }
 
 {
     print "# more whitespace: U+0085, U+2028, U+2029\n";
 
     # U+0085 needs to be forced to be Unicode, the \x{100} does that.
-    print "<\x{100}\x{0085}>" =~ /<\x{100}\s>/ ? "ok 841\n" : "not ok 841\n";
-    print "<\x{2028}>" =~ /<\s>/ ? "ok 842\n" : "not ok 842\n";
-    print "<\x{2029}>" =~ /<\s>/ ? "ok 843\n" : "not ok 843\n";
+    print "<\x{100}\x{0085}>" =~ /<\x{100}\s>/ ? "ok 843\n" : "not ok 843\n";
+    print "<\x{2028}>" =~ /<\s>/ ? "ok 844\n" : "not ok 844\n";
+    print "<\x{2029}>" =~ /<\s>/ ? "ok 845\n" : "not ok 845\n";
 }
 
 {
@@ -2628,7 +2626,7 @@ print "# some Unicode properties\n";
     # This is not expected to match: the point is that
     # neither should we get "Malformed UTF-8" warnings.
     print $s =~ /\G(.+?)\n/gcs ?
-	"not ok 844\n" : "ok 844\n";
+	"not ok 846\n" : "ok 846\n";
 
     my @c;
 
@@ -2636,7 +2634,7 @@ print "# some Unicode properties\n";
 	push @c, $1;
     }
 
-    print join("", @c) eq $s ? "ok 845\n" : "not ok 845\n";
+    print join("", @c) eq $s ? "ok 847\n" : "not ok 847\n";
 
     my $t1 = "Q003\n\n\x{e4}\x{f6}\n\nQ004\n\n\x{e7}"; # test only chars < 256
     my $r1 = "";
@@ -2650,12 +2648,12 @@ print "# some Unicode properties\n";
 	$r2 .= $1 . $2;
     }
     $r2 =~ s/\x{100}//;
-    print $r1 eq $r2 ? "ok 846\n" : "not ok 846\n";
+    print $r1 eq $r2 ? "ok 848\n" : "not ok 848\n";
 }
 
 {
     print "# Unicode lookbehind\n";
 
-    print "A\x{100}B"        =~ /(?<=A.)B/  ? "ok 847\n" : "not ok 847\n";
-    print "A\x{200}\x{300}B" =~ /(?<=A..)B/ ? "ok 848\n" : "not ok 848\n";
+    print "A\x{100}B"        =~ /(?<=A.)B/  ? "ok 849\n" : "not ok 849\n";
+    print "A\x{200}\x{300}B" =~ /(?<=A..)B/ ? "ok 850\n" : "not ok 850\n";
 }
diff --git a/utf8.h b/utf8.h
index 8c27afa..2ac5f91 100644
--- a/utf8.h
+++ b/utf8.h
@@ -189,6 +189,7 @@ END_EXTERN_C
 
 #define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)
 
+#define UNICODE_LATIN_SMALL_LETTER_SHARP_S	0x00DF
 #define UNICODE_GREEK_CAPITAL_LETTER_SIGMA	0x03A3
 #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA	0x03C2
 #define UNICODE_GREEK_SMALL_LETTER_SIGMA	0x03C3
@@ -198,3 +199,10 @@ END_EXTERN_C
 #define UNI_DISPLAY_QQ		(UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
 #define UNI_DISPLAY_REGEX	(UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
 
+#define ANYOF_UNICODE_FOLD_SHARP_S(n, s, e)	\
+	(ANYOF_BITMAP_TEST(n, UNICODE_LATIN_SMALL_LETTER_SHARP_S) && \
+	 ANYOF_FLAGS(n) & ANYOF_UNICODE && \
+	 ANYOF_FLAGS(n) & ANYOF_FOLD && \
+	 ((e) > (s) + 1) && \
+	 toLOWER((s)[0]) == 's' && \
+	 toLOWER((s)[1]) == 's')