From: Jarkko Hietaniemi Date: Sun, 30 Dec 2001 20:04:32 +0000 (+0000) Subject: Fix all the C(ommon) case foldings as per CaseFold.txt. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=254ba52af7d39101b8ee76681bf95a1a95ebd042;p=p5sagit%2Fp5-mst-13.2.git Fix all the C(ommon) case foldings as per CaseFold.txt. p4raw-id: //depot/perl@13963 --- diff --git a/lib/unicore/To/Fold.pl b/lib/unicore/To/Fold.pl index 1502690..6b0c2e9 100644 --- a/lib/unicore/To/Fold.pl +++ b/lib/unicore/To/Fold.pl @@ -266,9 +266,12 @@ return <<'END'; 01B7 0292 01B8 01B9 01BC 01BD -01C4 01C5 01C6 -01C7 01C8 01C9 -01CA 01CB 01CC +01C4 01C6 +01C5 01C6 +01C7 01C9 +01C8 01C9 +01CA 01CC +01CB 01CC 01CD 01CE 01CF 01D0 01D1 01D2 @@ -286,7 +289,8 @@ return <<'END'; 01EA 01EB 01EC 01ED 01EE 01EF -01F1 01F2 01F3 +01F1 01F3 +01F2 01F3 01F4 01F5 01F6 0195 01F7 01BF diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 546b3cf..34d1388 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -833,12 +833,14 @@ if (open(my $CaseFold, "CaseFold.txt")) { my %Fold; while (<$CaseFold>) { + # Skip status 'S', simple case folding next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/; my ($code, $status, $fold) = ($1, $2, $3); if ($status eq 'C') { # Common: one-to-one folding - append(\@Fold, $code, $fold); + # No append() since several codes may fold into one. + push @Fold, [ $code, $code, $fold ]; } else { # F: full, or I: dotted uppercase I -> dotless lowercase I $Fold{hex($code)} = $fold; } diff --git a/regcomp.c b/regcomp.c index b5d9860..c537eaa 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3985,6 +3985,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", (UV)value); if (FOLD) { + U8 tmpbuf [UTF8_MAXLEN+1]; + U8 foldbuf[UTF8_MAXLEN_FOLD+1]; + STRLEN foldlen; + UV f; + + uvchr_to_utf8(tmpbuf, value); + f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); + + if (f != value) + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f); + if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); diff --git a/regexec.c b/regexec.c index 7b459e2..3f1449d 100644 --- a/regexec.c +++ b/regexec.c @@ -979,8 +979,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta while (s <= e) { if ( utf8_to_uvchr((U8*)s, &len) == c1 && (ln == len || - ibcmp_utf8(s, do_utf8, strend - s, - m, UTF, ln)) + ibcmp_utf8(s, do_utf8, (I32)(strend - s), + m, UTF, (I32)ln)) && (norun || regtry(prog, s)) ) goto got_it; s += len; @@ -988,14 +988,21 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else { while (s <= e) { + U8 tmpbuf [UTF8_MAXLEN+1]; + U8 foldbuf[UTF8_MAXLEN_FOLD+1]; + STRLEN foldlen; UV c = utf8_to_uvchr((U8*)s, &len); + UV f; + + uvchr_to_utf8(tmpbuf, c); + f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); + if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA || c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA; - if ( (c == c1 || c == c2) - && (ln == len || - ibcmp_utf8(s, do_utf8, strend - s, - m, UTF, ln)) + if ( (c == c1 || c == c2 || f == c1 || f == c2) + && ibcmp_utf8(s, do_utf8, (I32)(strend - s), + m, UTF, (I32)ln) && (norun || regtry(prog, s)) ) goto got_it; s += len; diff --git a/utf8.c b/utf8.c index debfb9c..27f86b6 100644 --- a/utf8.c +++ b/utf8.c @@ -1651,7 +1651,7 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) } /* -=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len +=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|register I32 len1|const char *s2|bool u2|register I32 len2 Return true if the strings s1 and s2 differ case-insensitively, false if not (if they are equal case-insensitively). If u1 is true, the