Fix all the C(ommon) case foldings as per CaseFold.txt.
Jarkko Hietaniemi [Sun, 30 Dec 2001 20:04:32 +0000 (20:04 +0000)]
p4raw-id: //depot/perl@13963

lib/unicore/To/Fold.pl
lib/unicore/mktables
regcomp.c
regexec.c
utf8.c

index 1502690..6b0c2e9 100644 (file)
@@ -266,9 +266,12 @@ return <<'END';
 01B7           0292
 01B8           01B9
 01BC           01BD
-01C4   01C5    01C6
-01C7   01C8    01C9
-01CA   01CB    01CC
+01C4           01C6
+01C5           01C6
+01C7           01C9
+01C8           01C9
+01CA           01CC
+01CB           01CC
 01CD           01CE
 01CF           01D0
 01D1           01D2
@@ -286,7 +289,8 @@ return <<'END';
 01EA           01EB
 01EC           01ED
 01EE           01EF
-01F1   01F2    01F3
+01F1           01F3
+01F2           01F3
 01F4           01F5
 01F6           0195
 01F7           01BF
index 546b3cf..34d1388 100644 (file)
@@ -833,12 +833,14 @@ if (open(my $CaseFold, "CaseFold.txt")) {
     my %Fold;
 
     while (<$CaseFold>) {
+       # Skip status 'S', simple case folding
        next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
 
        my ($code, $status, $fold) = ($1, $2, $3);
 
        if ($status eq 'C') { # Common: one-to-one folding
-           append(\@Fold, $code, $fold);
+           # No append() since several codes may fold into one.
+           push @Fold, [ $code, $code, $fold ];
        } else { # F: full, or I: dotted uppercase I -> dotless lowercase I
            $Fold{hex($code)} = $fold;
        }
index b5d9860..c537eaa 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3985,6 +3985,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
                                   (UV)value);
                    if (FOLD) {
+                        U8 tmpbuf [UTF8_MAXLEN+1];
+                        U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+                        STRLEN foldlen;
+                        UV f;
+
+                        uvchr_to_utf8(tmpbuf, value);
+                        f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+
+                        if (f != value)
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f);
+
                         if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
                              Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
                                             (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
index 7b459e2..3f1449d 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -979,8 +979,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                    while (s <= e) {
                        if ( utf8_to_uvchr((U8*)s, &len) == c1
                             && (ln == len ||
-                                ibcmp_utf8(s, do_utf8,  strend - s,
-                                           m, UTF, ln))
+                                ibcmp_utf8(s, do_utf8, (I32)(strend - s),
+                                           m, UTF, (I32)ln))
                             && (norun || regtry(prog, s)) )
                            goto got_it;
                        s += len;
@@ -988,14 +988,21 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                }
                else {
                    while (s <= e) {
+                       U8 tmpbuf [UTF8_MAXLEN+1];
+                       U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+                       STRLEN foldlen;
                        UV c = utf8_to_uvchr((U8*)s, &len);
+                       UV f;
+
+                       uvchr_to_utf8(tmpbuf, c);
+                       f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+
                        if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
                            c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
                            c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
-                       if ( (c == c1 || c == c2)
-                            && (ln == len ||
-                                ibcmp_utf8(s, do_utf8, strend - s,
-                                           m, UTF, ln))
+                       if ( (c == c1 || c == c2 || f == c1 || f == c2)
+                            && ibcmp_utf8(s, do_utf8, (I32)(strend - s),
+                                          m, UTF, (I32)ln)
                             && (norun || regtry(prog, s)) )
                            goto got_it;
                        s += len;
diff --git a/utf8.c b/utf8.c
index debfb9c..27f86b6 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1651,7 +1651,7 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
 }
 
 /*
-=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
+=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|register I32 len1|const char *s2|bool u2|register I32 len2
 
 Return true if the strings s1 and s2 differ case-insensitively, false
 if not (if they are equal case-insensitively).  If u1 is true, the