From: Jarkko Hietaniemi Date: Sat, 12 Jan 2002 20:05:29 +0000 (+0000) Subject: Sharp S as a special treat for our German UTF-8 testers :-) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e0f9d4a8607e11c8ec6d3c60018f5ccbfea703e6;p=p5sagit%2Fp5-mst-13.2.git Sharp S as a special treat for our German UTF-8 testers :-) p4raw-id: //depot/perl@14222 --- diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 23d6ff1..beb742e 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -639,8 +639,12 @@ Level 1 - Basic Unicode Support [ 5] have negation [ 6] can use look-ahead to emulate subtraction (*) [ 7] include Letters in word characters - [ 8] some cases of "ss"/"SS" matching U+00DF in a character - class are missing, but that is allowed according to the TR18. + [ 8] note that perl does Full casefolding in matching, not Simple: + for example U+1F88 is equivalent with U+1F00 U+03B9, + not with U+1F80. This difference matters for certain Greek + capital letters with certain modifiers: the Full casefolding + decomposes the letter, while the Simple casefolding would map + it to a single character. [ 9] see UTR#13 Unicode Newline Guidelines [10] should do ^ and $ also on \x{85}, \x{2028} and \x{2029}) (should also affect <>, $., and script line numbers) diff --git a/regexec.c b/regexec.c index 78c4e24..df4a31b 100644 --- a/regexec.c +++ b/regexec.c @@ -916,15 +916,19 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta switch (OP(c)) { case ANYOF: while (s < strend) { - if (reginclass(c, (U8*)s, do_utf8)) { + STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1; + + if (reginclass(c, (U8*)s, do_utf8) || + (ANYOF_UNICODE_FOLD_SHARP_S(c, s, strend) && + (skip = 2))) { if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; } - else - tmp = 1; - s += do_utf8 ? 
UTF8SKIP(s) : 1; + else + tmp = 1; + s += skip; } break; case CANY: @@ -2108,6 +2112,7 @@ typedef union re_unwind_t { #define sayYES goto yes #define sayNO goto no +#define sayNO_ANYOF goto no_anyof #define sayYES_FINAL goto yes_final #define sayYES_LOUD goto yes_loud #define sayNO_FINAL goto no_final @@ -2396,21 +2401,33 @@ S_regmatch(pTHX_ regnode *prog) STRLEN inclasslen = PL_regeol - locinput; if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8)) - sayNO; + sayNO_ANYOF; if (locinput >= PL_regeol) sayNO; locinput += inclasslen; nextchr = UCHARAT(locinput); + break; } else { if (nextchr < 0) nextchr = UCHARAT(locinput); if (!reginclass(scan, (U8*)locinput, do_utf8)) - sayNO; + sayNO_ANYOF; if (!nextchr && locinput >= PL_regeol) sayNO; nextchr = UCHARAT(++locinput); + break; + } + no_anyof: + /* If we might have the case of the German sharp s + * in a casefolding Unicode character class. */ + + if (ANYOF_UNICODE_FOLD_SHARP_S(scan, locinput, PL_regeol)) { + locinput += 2; + nextchr = UCHARAT(locinput); } + else + sayNO; break; case ALNUML: PL_reg_flags |= RF_tainted; diff --git a/t/op/pat.t b/t/op/pat.t index 19ec634..edd34b7 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..848\n"; +print "1..850\n"; BEGIN { chdir 't' if -d 't'; @@ -2602,22 +2602,20 @@ print "# some Unicode properties\n"; print "SS" =~ /\N{LATIN SMALL LETTER SHARP S}/i ? "ok 840\n" : "not ok 840\n"; -# These are a bit tricky. Since the LATIN SMALL LETTER SHARP S is U+00DF, -# the ANYOF reduces to a byte. The Unicodeness needs to be caught earlier. -# print "ss" =~ -# /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n"; -# -# print "SS" =~ -# /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 842\n" : "not ok 842\n"; + print "ss" =~ + /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n"; + + print "SS" =~ + /[\N{LATIN SMALL LETTER SHARP S}]/i ? 
"ok 842\n" : "not ok 842\n"; } { print "# more whitespace: U+0085, U+2028, U+2029\n"; # U+0085 needs to be forced to be Unicode, the \x{100} does that. - print "<\x{100}\x{0085}>" =~ /<\x{100}\s>/ ? "ok 841\n" : "not ok 841\n"; - print "<\x{2028}>" =~ /<\s>/ ? "ok 842\n" : "not ok 842\n"; - print "<\x{2029}>" =~ /<\s>/ ? "ok 843\n" : "not ok 843\n"; + print "<\x{100}\x{0085}>" =~ /<\x{100}\s>/ ? "ok 843\n" : "not ok 843\n"; + print "<\x{2028}>" =~ /<\s>/ ? "ok 844\n" : "not ok 844\n"; + print "<\x{2029}>" =~ /<\s>/ ? "ok 845\n" : "not ok 845\n"; } { @@ -2628,7 +2626,7 @@ print "# some Unicode properties\n"; # This is not expected to match: the point is that # neither should we get "Malformed UTF-8" warnings. print $s =~ /\G(.+?)\n/gcs ? - "not ok 844\n" : "ok 844\n"; + "not ok 846\n" : "ok 846\n"; my @c; @@ -2636,7 +2634,7 @@ print "# some Unicode properties\n"; push @c, $1; } - print join("", @c) eq $s ? "ok 845\n" : "not ok 845\n"; + print join("", @c) eq $s ? "ok 847\n" : "not ok 847\n"; my $t1 = "Q003\n\n\x{e4}\x{f6}\n\nQ004\n\n\x{e7}"; # test only chars < 256 my $r1 = ""; @@ -2650,12 +2648,12 @@ print "# some Unicode properties\n"; $r2 .= $1 . $2; } $r2 =~ s/\x{100}//; - print $r1 eq $r2 ? "ok 846\n" : "not ok 846\n"; + print $r1 eq $r2 ? "ok 848\n" : "not ok 848\n"; } { print "# Unicode lookbehind\n"; - print "A\x{100}B" =~ /(?<=A.)B/ ? "ok 847\n" : "not ok 847\n"; - print "A\x{200}\x{300}B" =~ /(?<=A..)B/ ? "ok 848\n" : "not ok 848\n"; + print "A\x{100}B" =~ /(?<=A.)B/ ? "ok 849\n" : "not ok 849\n"; + print "A\x{200}\x{300}B" =~ /(?<=A..)B/ ? 
"ok 850\n" : "not ok 850\n"; } diff --git a/utf8.h b/utf8.h index 8c27afa..2ac5f91 100644 --- a/utf8.h +++ b/utf8.h @@ -189,6 +189,7 @@ END_EXTERN_C #define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c) +#define UNICODE_LATIN_SMALL_LETTER_SHARP_S 0x00DF #define UNICODE_GREEK_CAPITAL_LETTER_SIGMA 0x03A3 #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2 #define UNICODE_GREEK_SMALL_LETTER_SIGMA 0x03C3 @@ -198,3 +199,10 @@ END_EXTERN_C #define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH) #define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH) +#define ANYOF_UNICODE_FOLD_SHARP_S(n, s, e) \ + (ANYOF_BITMAP_TEST(n, UNICODE_LATIN_SMALL_LETTER_SHARP_S) && \ + ANYOF_FLAGS(n) & ANYOF_UNICODE && \ + ANYOF_FLAGS(n) & ANYOF_FOLD && \ + ((e) > (s) + 1) && \ + toLOWER((s)[0]) == 's' && \ + toLOWER((s)[1]) == 's')