From: George Greer Date: Sun, 12 Jul 2009 18:53:29 +0000 (-0400) Subject: much better swap logic to support reentrancy and fix assert failure X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e9105d30ed;p=p5sagit%2Fp5-mst-13.2.git much better swap logic to support reentrancy and fix assert failure Commit c74340f9 added backreferences as well as the idea of a ->swap regex pointer to keep track of the match offsets in case of backtracking. The problem is that when Perl re-enters the regex engine to handle utf8::SWASHNEW, the ->swap is not saved/restored/cleared so any capture from the utf8 (Perl) code could inadvertently modify the regex match data that caused the utf8 swash to get built. This change should close out RT #60508 --- diff --git a/embed.fnc b/embed.fnc index 0001d1f..4dc2706 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1678,7 +1678,6 @@ ERsn |U8* |reghop4 |NN U8 *s|I32 off|NN const U8 *llim \ #endif ERsn |U8* |reghopmaybe3 |NN U8 *s|I32 off|NN const U8 *lim ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN const char *strend|NULLOK regmatch_info *reginfo -Es |void |swap_match_buff|NN regexp * prog Es |void |to_utf8_substr |NN regexp * prog Es |void |to_byte_substr |NN regexp * prog ERs |I32 |reg_check_named_buff_matched |NN const regexp *rex \ diff --git a/embed.h b/embed.h index e702d32..0ef5ff2 100644 --- a/embed.h +++ b/embed.h @@ -1468,7 +1468,6 @@ #if defined(PERL_CORE) || defined(PERL_EXT) #define reghopmaybe3 S_reghopmaybe3 #define find_byclass S_find_byclass -#define swap_match_buff S_swap_match_buff #define to_utf8_substr S_to_utf8_substr #define to_byte_substr S_to_byte_substr #define reg_check_named_buff_matched S_reg_check_named_buff_matched @@ -3814,7 +3813,6 @@ #if defined(PERL_CORE) || defined(PERL_EXT) #define reghopmaybe3 S_reghopmaybe3 #define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e) -#define swap_match_buff(a) S_swap_match_buff(aTHX_ a) #define to_utf8_substr(a) S_to_utf8_substr(aTHX_ a) #define to_byte_substr(a) S_to_byte_substr(aTHX_ a) #define reg_check_named_buff_matched(a,b) S_reg_check_named_buff_matched(aTHX_ a,b) diff --git a/ext/Devel-PPPort/parts/embed.fnc b/ext/Devel-PPPort/parts/embed.fnc index 68f3817..48cb9f3 100644 --- a/ext/Devel-PPPort/parts/embed.fnc +++ b/ext/Devel-PPPort/parts/embed.fnc @@ -1677,7 +1677,6 @@ ERsn |U8* |reghop4 |NN U8 *s|I32 off|NN const U8 *llim \ #endif ERsn |U8* |reghopmaybe3 |NN U8 *s|I32 off|NN const U8 *lim ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN const char *strend|NULLOK regmatch_info *reginfo -Es |void |swap_match_buff|NN regexp * prog Es |void |to_utf8_substr |NN regexp * prog Es |void |to_byte_substr |NN regexp * prog ERs |I32 |reg_check_named_buff_matched |NN const regexp *rex \ diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod index b0d6275..03996fd 100644 --- a/pod/perlreapi.pod +++ b/pod/perlreapi.pod @@ -598,7 +598,7 @@ engine should use something else. =head2 C -TODO: document +Unused. Left in for compatibility with perl 5.10.0. =head2 C diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index 2049931..9c54ec4 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -810,13 +810,12 @@ value to other engine implementations. =item C -C is an extra set of startp/endp stored in a C -struct. This is used when the last successful match was from the same pattern -as the current pattern, so that a partial match doesn't overwrite the -previous match's results. When this field is data filled the matching -engine will swap buffers before every match attempt. If the match fails, -then it swaps them back. If it's successful it leaves them. This field -is populated on demand and is by default null. +C formerly was an extra set of startp/endp stored in a +C struct. This was used when the last successful match +was from the same pattern as the current pattern, so that a partial +match didn't overwrite the previous match's results, but it caused a +problem with re-entrant code such as trying to build the UTF-8 swashes. +Currently unused and left for backward compatibility with 5.10.0. =item C diff --git a/proto.h b/proto.h index b2e9e90..5a08e3e 100644 --- a/proto.h +++ b/proto.h @@ -5434,11 +5434,6 @@ STATIC char* S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, cons #define PERL_ARGS_ASSERT_FIND_BYCLASS \ assert(prog); assert(c); assert(s); assert(strend) -STATIC void S_swap_match_buff(pTHX_ regexp * prog) - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT_SWAP_MATCH_BUFF \ - assert(prog) - STATIC void S_to_utf8_substr(pTHX_ regexp * prog) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_TO_UTF8_SUBSTR \ diff --git a/regcomp.c b/regcomp.c index 22c02c8..696796b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -9414,7 +9414,6 @@ Perl_pregfree2(pTHX_ REGEXP *rx) if (r->saved_copy) SvREFCNT_dec(r->saved_copy); #endif - Safefree(r->swap); Safefree(r->offs); } @@ -9474,7 +9473,6 @@ Perl_reg_temp_copy (pTHX_ REGEXP *rx) ret->saved_copy = NULL; #endif ret->mother_re = rx; - ret->swap = NULL; return ret_x; } diff --git a/regexec.c b/regexec.c index d3dd612..5d31d73 100644 --- a/regexec.c +++ b/regexec.c @@ -1734,28 +1734,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, return s; } -static void -S_swap_match_buff (pTHX_ regexp *prog) -{ - regexp_paren_pair *t; - - PERL_ARGS_ASSERT_SWAP_MATCH_BUFF; - - if (!prog->swap) { - /* We have to be careful. If the previous successful match - was from this regex we don't want a subsequent paritally - successful match to clobber the old results. - So when we detect this possibility we add a swap buffer - to the re, and switch the buffer each match. If we fail - we switch it back, otherwise we leave it swapped. - */ - Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair); - } - t = prog->swap; - prog->swap = prog->offs; - prog->offs = t; -} - /* - regexec_flags - match a regexp against a string @@ -1785,7 +1763,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre I32 multiline; RXi_GET_DECL(prog,progi); regmatch_info reginfo; /* create some info to pass to regtry etc */ - bool swap_on_fail = 0; + regexp_paren_pair *swap = NULL; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGEXEC_FLAGS; @@ -1863,9 +1841,16 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre reginfo.ganch = strbeg; } if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { - swap_on_fail = 1; - swap_match_buff(prog); /* do we need a save destructor here for - eval dies? */ + /* We have to be careful. If the previous successful match + was from this regex we don't want a subsequent partially + successful match to clobber the old results. + So when we detect this possibility we add a swap buffer + to the re, and switch the buffer each match. If we fail + we switch it back, otherwise we leave it swapped. + */ + swap = prog->offs; + /* do we need a save destructor here for eval dies? */ + Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); } if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { re_scream_pos_data d; @@ -2164,6 +2149,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre goto phooey; got_it: + Safefree(swap); RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted); if (PL_reg_eval_set) @@ -2209,10 +2195,12 @@ phooey: PL_colors[4], PL_colors[5])); if (PL_reg_eval_set) restore_pos(aTHX_ prog); - if (swap_on_fail) + if (swap) { /* we failed :-( roll it back */ - swap_match_buff(prog); - + Safefree(prog->offs); + prog->offs = swap; + } + return 0; } diff --git a/regexp.h b/regexp.h index fe67142..90e3406 100644 --- a/regexp.h +++ b/regexp.h @@ -88,7 +88,7 @@ typedef struct regexp_paren_pair { /* during matching */ \ U32 lastparen; /* last open paren matched */ \ U32 lastcloseparen; /* last close paren matched */ \ - regexp_paren_pair *swap; /* Swap copy of *offs */ \ + regexp_paren_pair *swap; /* Unused: 5.10.1 and later */ \ /* Array of offsets for (@-) and (@+) */ \ regexp_paren_pair *offs; \ /* saved or original string so \digit works forever. */ \ diff --git a/t/op/pat.t b/t/op/pat.t index 87a05dc..00c5490 100644 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -13,7 +13,7 @@ sub run_tests; $| = 1; -my $EXPECTED_TESTS = 4065; # Update this when adding/deleting tests. +my $EXPECTED_TESTS = 4066; # Update this when adding/deleting tests. BEGIN { chdir 't' if -d 't'; @@ -4347,6 +4347,24 @@ sub run_tests { } } + # This only works under -DEBUGGING because it relies on an assert(). + { + local $BugId = '60508'; + local $Message = "Check capture offset re-entrancy of utf8 code."; + + sub fswash { $_[0] =~ s/([>X])//g; } + + my $k1 = "." x 4 . ">>"; + fswash($k1); + + my $k2 = "\x{f1}\x{2022}"; + $k2 =~ s/([\360-\362])/>/g; + fswash($k2); + + iseq($k2, "\x{2022}", "utf8::SWASHNEW doesn't cause capture leaks"); + } + + { local $BugId = 65372; # minimal CURLYM limited to 32767 matches my @pat = (