Fixes
20001230.002.
What still remains broken is that the submatches that
have \C in them get their UTF8 flag on because their
parent SV has it on. This will result in malformed
UTF8 if a \C happened to match a non-ASCII byte.
p4raw-id: //depot/perl@8836
r->reganch |= ROPT_LOOKBEHIND_SEEN;
if (RExC_seen & REG_SEEN_EVAL)
r->reganch |= ROPT_EVAL_SEEN;
+ if (RExC_seen & REG_SEEN_SANY)
+ r->reganch |= ROPT_SANY_SEEN;
Newz(1002, r->startp, RExC_npar, I32);
Newz(1002, r->endp, RExC_npar, I32);
PL_regdata = r->data; /* for regprop() */
break;
case 'C':
ret = reg_node(pRExC_state, SANY);
+ RExC_seen |= REG_SEEN_SANY;
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
break;
#define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
-#define REG_SEEN_ZERO_LEN 1
-#define REG_SEEN_LOOKBEHIND 2
-#define REG_SEEN_GPOS 4
-#define REG_SEEN_EVAL 8
+#define REG_SEEN_ZERO_LEN 1
+#define REG_SEEN_LOOKBEHIND 2
+#define REG_SEEN_GPOS 4
+#define REG_SEEN_EVAL 8
+#define REG_SEEN_SANY 16
START_EXTERN_C
DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
goto fail;
}
- if (prog->check_offset_min == prog->check_offset_max) {
+ if (prog->check_offset_min == prog->check_offset_max &&
+ !(prog->reganch & ROPT_SANY_SEEN)) {
/* Substring at constant offset from beg-of-str... */
I32 slen;
if (data)
*data->scream_olds = s;
}
+ else if (prog->reganch & ROPT_SANY_SEEN)
+ s = fbm_instr((U8*)(s + start_shift),
+ (U8*)(strend - end_shift),
+ check, PL_multiline ? FBMrf_MULTILINE : 0);
else
s = fbm_instr(HOP3(s, start_shift, strend),
HOP3(strend, -end_shift, strbeg),
minlen = prog->minlen;
if (do_utf8) {
- if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
+ if (!(prog->reganch & ROPT_SANY_SEEN))
+ if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
}
else {
if (strend - startpos < minlen) goto phooey;
sayNO;
break;
case SANY:
- if (do_utf8) {
- locinput += PL_utf8skip[nextchr];
- if (locinput > PL_regeol)
- sayNO;
- nextchr = UCHARAT(locinput);
- break;
- }
if (!nextchr && locinput >= PL_regeol)
sayNO;
nextchr = UCHARAT(++locinput);
}
break;
case SANY:
- if (do_utf8) {
- loceol = PL_regeol;
- while (hardcount < max && scan < loceol) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- scan = loceol;
- }
+ scan = loceol;
break;
case EXACT: /* length of string is 1 */
c = (U8)*STRING(p);
#define ROPT_CHECK_ALL 0x00100
#define ROPT_LOOKBEHIND_SEEN 0x00200
#define ROPT_EVAL_SEEN 0x00400
+#define ROPT_SANY_SEEN 0x00800
/* 0xf800 of reganch is used by PMf_COMPILETIME */
# the format supported by op/regexp.t. If you want to add a test
# that does fit that format, add it to op/re_tests, not here.
-print "1..231\n";
+print "1..240\n";
BEGIN {
chdir 't' if -d 't';
print "ok $test\n";
$test++;
+$_ = "a\x{100}b";
+if (/(.)(\C)(\C)(.)/) {
+ print "ok 232\n";
+ if ($1 eq "a") {
+ print "ok 233\n";
+ } else {
+ print "not ok 233\n";
+ }
+ if ($2 eq "\xC4") {
+ print "ok 234\n";
+ } else {
+ print "not ok 234\n";
+ }
+ if ($3 eq "\x80") {
+ print "ok 235\n";
+ } else {
+ print "not ok 235\n";
+ }
+ if ($4 eq "b") {
+ print "ok 236\n";
+ } else {
+ print "not ok 236\n";
+ }
+} else {
+ for (232..236) {
+ print "not ok $_\n";
+ }
+}
+$_ = "\x{100}";
+if (/(\C)/g) {
+ print "ok 237\n";
+ if ($1 eq "\xC4") {
+ print "ok 238\n";
+ } else {
+ print "not ok 238\n";
+ }
+} else {
+ for (237..238) {
+ print "not ok $_\n";
+ }
+}
+if (/(\C)/g) {
+ print "ok 239\n";
+ if ($1 eq "\x80") {
+ print "ok 240\n";
+ } else {
+ print "not ok 240\n";
+ }
+} else {
+ for (239..240) {
+ print "not ok $_\n";
+ }
+}
}
}
-print "1..107\n";
+print "1..109\n";
my $test = 1;
$test++; # 107
}
+{
+ # bug id 20001230.002
+
+ use utf8;
+
+ print "not " unless "École" =~ /^\C\C(.)/ && $1 eq 'c';
+ print "ok $test\n";
+ $test++; # 108
+
+ print "not " unless "École" =~ /^\C\C(c)/;
+ print "ok $test\n";
+ $test++; # 109
+}