[ 5] have negation
[ 6] can use look-ahead to emulate subtraction (*)
[ 7] include Letters in word characters
- [ 8] some cases of "ss"/"SS" matching U+00DF in a character
- class are missing, but that is allowed according to the TR18.
+ [ 8] note that perl does Full casefolding in matching, not Simple:
+ for example U+1F88 is equivalent with U+1F000 U+03B9,
+ not with 1F80. This difference matters for certain Greek
+ capital letters with certain modifiers: the Full casefolding
+ decomposes the letter, while the Simple casefolding would map
+ it to a single character.
[ 9] see UTR#13 Unicode Newline Guidelines
[10] should do ^ and $ also on \x{85}, \x{2028} and \x{2029})
(should also affect <>, $., and script line numbers)
switch (OP(c)) {
case ANYOF:
while (s < strend) {
- if (reginclass(c, (U8*)s, do_utf8)) {
+ STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
+
+ if (reginclass(c, (U8*)s, do_utf8) ||
+ (ANYOF_UNICODE_FOLD_SHARP_S(c, s, strend) &&
+ (skip = 2))) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
else
tmp = doevery;
}
- else
- tmp = 1;
- s += do_utf8 ? UTF8SKIP(s) : 1;
+ else
+ tmp = 1;
+ s += skip;
}
break;
case CANY:
#define sayYES goto yes
#define sayNO goto no
+#define sayNO_ANYOF goto no_anyof
#define sayYES_FINAL goto yes_final
#define sayYES_LOUD goto yes_loud
#define sayNO_FINAL goto no_final
STRLEN inclasslen = PL_regeol - locinput;
if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8))
- sayNO;
+ sayNO_ANYOF;
if (locinput >= PL_regeol)
sayNO;
locinput += inclasslen;
nextchr = UCHARAT(locinput);
+ break;
}
else {
if (nextchr < 0)
nextchr = UCHARAT(locinput);
if (!reginclass(scan, (U8*)locinput, do_utf8))
- sayNO;
+ sayNO_ANYOF;
if (!nextchr && locinput >= PL_regeol)
sayNO;
nextchr = UCHARAT(++locinput);
+ break;
+ }
+ no_anyof:
+ /* If we might have the case of the German sharp s
+ * in a casefolding Unicode character class. */
+
+ if (ANYOF_UNICODE_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
+ locinput += 2;
+ nextchr = UCHARAT(locinput);
}
+ else
+ sayNO;
break;
case ALNUML:
PL_reg_flags |= RF_tainted;
$| = 1;
-print "1..848\n";
+print "1..850\n";
BEGIN {
chdir 't' if -d 't';
print "SS" =~
/\N{LATIN SMALL LETTER SHARP S}/i ? "ok 840\n" : "not ok 840\n";
-# These are a bit tricky. Since the LATIN SMALL LETTER SHARP S is U+00DF,
-# the ANYOF reduces to a byte. The Unicodeness needs to be caught earlier.
-# print "ss" =~
-# /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n";
-#
-# print "SS" =~
-# /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 842\n" : "not ok 842\n";
+ print "ss" =~
+ /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n";
+
+ print "SS" =~
+ /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 842\n" : "not ok 842\n";
}
{
print "# more whitespace: U+0085, U+2028, U+2029\n";
# U+0085 needs to be forced to be Unicode, the \x{100} does that.
- print "<\x{100}\x{0085}>" =~ /<\x{100}\s>/ ? "ok 841\n" : "not ok 841\n";
- print "<\x{2028}>" =~ /<\s>/ ? "ok 842\n" : "not ok 842\n";
- print "<\x{2029}>" =~ /<\s>/ ? "ok 843\n" : "not ok 843\n";
+ print "<\x{100}\x{0085}>" =~ /<\x{100}\s>/ ? "ok 843\n" : "not ok 843\n";
+ print "<\x{2028}>" =~ /<\s>/ ? "ok 844\n" : "not ok 844\n";
+ print "<\x{2029}>" =~ /<\s>/ ? "ok 845\n" : "not ok 845\n";
}
{
# This is not expected to match: the point is that
# neither should we get "Malformed UTF-8" warnings.
print $s =~ /\G(.+?)\n/gcs ?
- "not ok 844\n" : "ok 844\n";
+ "not ok 846\n" : "ok 846\n";
my @c;
push @c, $1;
}
- print join("", @c) eq $s ? "ok 845\n" : "not ok 845\n";
+ print join("", @c) eq $s ? "ok 847\n" : "not ok 847\n";
my $t1 = "Q003\n\n\x{e4}\x{f6}\n\nQ004\n\n\x{e7}"; # test only chars < 256
my $r1 = "";
$r2 .= $1 . $2;
}
$r2 =~ s/\x{100}//;
- print $r1 eq $r2 ? "ok 846\n" : "not ok 846\n";
+ print $r1 eq $r2 ? "ok 848\n" : "not ok 848\n";
}
{
print "# Unicode lookbehind\n";
- print "A\x{100}B" =~ /(?<=A.)B/ ? "ok 847\n" : "not ok 847\n";
- print "A\x{200}\x{300}B" =~ /(?<=A..)B/ ? "ok 848\n" : "not ok 848\n";
+ print "A\x{100}B" =~ /(?<=A.)B/ ? "ok 849\n" : "not ok 849\n";
+ print "A\x{200}\x{300}B" =~ /(?<=A..)B/ ? "ok 850\n" : "not ok 850\n";
}
#define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)
+#define UNICODE_LATIN_SMALL_LETTER_SHARP_S 0x00DF
#define UNICODE_GREEK_CAPITAL_LETTER_SIGMA 0x03A3
#define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
#define UNICODE_GREEK_SMALL_LETTER_SIGMA 0x03C3
#define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
#define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
+#define ANYOF_UNICODE_FOLD_SHARP_S(n, s, e) \
+ (ANYOF_BITMAP_TEST(n, UNICODE_LATIN_SMALL_LETTER_SHARP_S) && \
+ ANYOF_FLAGS(n) & ANYOF_UNICODE && \
+ ANYOF_FLAGS(n) & ANYOF_FOLD && \
+ ((e) > (s) + 1) && \
+ toLOWER((s)[0]) == 's' && \
+ toLOWER((s)[1]) == 's')