From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Sun, 30 Dec 2001 20:04:32 +0000 (+0000)
Subject: Fix all the C(ommon) case foldings as per CaseFold.txt.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=254ba52af7d39101b8ee76681bf95a1a95ebd042;p=p5sagit%2Fp5-mst-13.2.git

Fix all the C(ommon) case foldings as per CaseFold.txt.

p4raw-id: //depot/perl@13963
---

diff --git a/lib/unicore/To/Fold.pl b/lib/unicore/To/Fold.pl
index 1502690..6b0c2e9 100644
--- a/lib/unicore/To/Fold.pl
+++ b/lib/unicore/To/Fold.pl
@@ -266,9 +266,12 @@ return <<'END';
 01B7		0292
 01B8		01B9
 01BC		01BD
-01C4	01C5	01C6
-01C7	01C8	01C9
-01CA	01CB	01CC
+01C4		01C6
+01C5		01C6
+01C7		01C9
+01C8		01C9
+01CA		01CC
+01CB		01CC
 01CD		01CE
 01CF		01D0
 01D1		01D2
@@ -286,7 +289,8 @@ return <<'END';
 01EA		01EB
 01EC		01ED
 01EE		01EF
-01F1	01F2	01F3
+01F1		01F3
+01F2		01F3
 01F4		01F5
 01F6		0195
 01F7		01BF
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 546b3cf..34d1388 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -833,12 +833,14 @@ if (open(my $CaseFold, "CaseFold.txt")) {
     my %Fold;
 
     while (<$CaseFold>) {
+	# Skip status 'S', simple case folding
 	next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
 
 	my ($code, $status, $fold) = ($1, $2, $3);
 
 	if ($status eq 'C') { # Common: one-to-one folding
-	    append(\@Fold, $code, $fold);
+	    # No append() since several codes may fold into one.
+	    push @Fold, [ $code, $code, $fold ];
 	} else { # F: full, or I: dotted uppercase I -> dotless lowercase I
 	    $Fold{hex($code)} = $fold;
 	}
diff --git a/regcomp.c b/regcomp.c
index b5d9860..c537eaa 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3985,6 +3985,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
 				   (UV)value);
 		    if (FOLD) {
+			 U8 tmpbuf [UTF8_MAXLEN+1];
+			 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+			 STRLEN foldlen;
+			 UV f;
+
+			 uvchr_to_utf8(tmpbuf, value);
+			 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+
+			 if (f != value)
+			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f);
+
 			 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
 					     (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
diff --git a/regexec.c b/regexec.c
index 7b459e2..3f1449d 100644
--- a/regexec.c
+++ b/regexec.c
@@ -979,8 +979,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		    while (s <= e) {
 			if ( utf8_to_uvchr((U8*)s, &len) == c1
 			     && (ln == len ||
-				 ibcmp_utf8(s, do_utf8,  strend - s,
-					    m, UTF, ln))
+				 ibcmp_utf8(s, do_utf8, (I32)(strend - s),
+					    m, UTF, (I32)ln))
 			     && (norun || regtry(prog, s)) )
 			    goto got_it;
 			s += len;
@@ -988,14 +988,21 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		}
 		else {
 		    while (s <= e) {
+			U8 tmpbuf [UTF8_MAXLEN+1];
+			U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+			STRLEN foldlen;
 			UV c = utf8_to_uvchr((U8*)s, &len);
+			UV f;
+
+			uvchr_to_utf8(tmpbuf, c);
+			f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+
 			if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
 			    c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
 			    c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
-			if ( (c == c1 || c == c2)
-			     && (ln == len ||
-				 ibcmp_utf8(s, do_utf8, strend - s,
-					    m, UTF, ln))
+			if ( (c == c1 || c == c2 || f == c1 || f == c2)
+			     && ibcmp_utf8(s, do_utf8, (I32)(strend - s),
+					   m, UTF, (I32)ln)
 			     && (norun || regtry(prog, s)) )
 			    goto got_it;
 			s += len;
diff --git a/utf8.c b/utf8.c
index debfb9c..27f86b6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1651,7 +1651,7 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
 }
 
 /*
-=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
+=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|register I32 len1|const char *s2|bool u2|register I32 len2
 
 Return true if the strings s1 and s2 differ case-insensitively, false
 if not (if they are equal case-insensitively).  If u1 is true, the