From: Yves Orton Date: Sun, 12 Nov 2006 23:29:41 +0000 (+0100) Subject: Regex Utility Functions and Substituion Fix (XML::Twig core dump) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=de8c53012b7e614137ab875e0d58a92474b317ce;p=p5sagit%2Fp5-mst-13.2.git Regex Utility Functions and Substituion Fix (XML::Twig core dump) Message-ID: <9b18b3110611121429g1fc9d6c1t4007dc711f9e8396@mail.gmail.com> Plus a couple tweaks to ext/re/re.pm and t/op/pat.t to those patches to apply cleanly. p4raw-id: //depot/perl@29252 --- diff --git a/MANIFEST b/MANIFEST index 9987b37..7326273 100644 --- a/MANIFEST +++ b/MANIFEST @@ -987,6 +987,7 @@ ext/re/t/lexical_debug.t test that lexical re 'debug' works ext/re/t/regop.pl generate debug output for various patterns ext/re/t/regop.t test RE optimizations by scraping debug output ext/re/t/re.t see if re pragma works +ext/re/t/re_funcs.t see if exportable funcs from re.pm work ext/Safe/t/safe1.t See if Safe works ext/Safe/t/safe2.t See if Safe works ext/Safe/t/safe3.t See if Safe works diff --git a/embed.fnc b/embed.fnc index 5b254b5..3ac4bc4 100644 --- a/embed.fnc +++ b/embed.fnc @@ -668,6 +668,7 @@ Ap |I32 |pregexec |NN regexp* prog|NN char* stringarg \ |NN char* strend|NN char* strbeg|I32 minend \ |NN SV* screamer|U32 nosave Ap |void |pregfree |NULLOK struct regexp* r +p |char * |reg_stringify |NN MAGIC *mg|NULLOK STRLEN *lp|NULLOK U32 *flags|NULLOK I32 *haseval #if defined(USE_ITHREADS) Ap |regexp*|regdupe |NN const regexp* r|NN CLONE_PARAMS* param #endif @@ -1429,7 +1430,6 @@ s |void |utf8_mg_pos_cache_update|NN SV *sv|NN MAGIC **mgp \ |STRLEN byte|STRLEN utf8|STRLEN blen s |STRLEN |sv_pos_b2u_midway|NN const U8 *s|NN const U8 *const target \ |NN const U8 *end|STRLEN endu -s |char * |stringify_regexp|NN SV *sv|NN MAGIC *mg|NULLOK STRLEN *lp sn |char * |F0convert |NV nv|NN char *endbuf|NN STRLEN *len # if defined(PERL_OLD_COPY_ON_WRITE) sM |void |sv_release_COW |NN SV *sv|NN const char *pvx|STRLEN len|NN SV *after diff --git a/embed.h b/embed.h index fea5b27..c4bf329 100644 --- a/embed.h +++ b/embed.h @@ -680,6 +680,9 @@ #define regclass_swash Perl_regclass_swash #define pregexec Perl_pregexec #define pregfree Perl_pregfree +#ifdef PERL_CORE +#define reg_stringify Perl_reg_stringify +#endif #if defined(USE_ITHREADS) #define regdupe Perl_regdupe #endif @@ -1429,7 +1432,6 @@ #define sv_pos_u2b_cached S_sv_pos_u2b_cached #define utf8_mg_pos_cache_update S_utf8_mg_pos_cache_update #define sv_pos_b2u_midway S_sv_pos_b2u_midway -#define stringify_regexp S_stringify_regexp #define F0convert S_F0convert #endif # if defined(PERL_OLD_COPY_ON_WRITE) @@ -2887,6 +2889,9 @@ #define regclass_swash(a,b,c,d,e) Perl_regclass_swash(aTHX_ a,b,c,d,e) #define pregexec(a,b,c,d,e,f,g) Perl_pregexec(aTHX_ a,b,c,d,e,f,g) #define pregfree(a) Perl_pregfree(aTHX_ a) +#ifdef PERL_CORE +#define reg_stringify(a,b,c,d) Perl_reg_stringify(aTHX_ a,b,c,d) +#endif #if defined(USE_ITHREADS) #define regdupe(a,b) Perl_regdupe(aTHX_ a,b) #endif @@ -3632,7 +3637,6 @@ #define sv_pos_u2b_cached(a,b,c,d,e,f,g) S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g) #define utf8_mg_pos_cache_update(a,b,c,d,e) S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e) #define sv_pos_b2u_midway(a,b,c,d) S_sv_pos_b2u_midway(aTHX_ a,b,c,d) -#define stringify_regexp(a,b,c) S_stringify_regexp(aTHX_ a,b,c) #define F0convert S_F0convert #endif # if defined(PERL_OLD_COPY_ON_WRITE) diff --git a/ext/re/re.pm b/ext/re/re.pm index dfdfe86..e9d710f 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -1,6 +1,170 @@ package re; -our $VERSION = 0.06_03; +# pragma for controlling the regex engine +use strict; +use warnings; + +our $VERSION = "0.06_03"; +our @ISA = qw(Exporter); +our @EXPORT_OK = qw(is_regexp regexp_pattern); +our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK; + +# *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** +# +# If you modify these values see comment below! + +my %bitmask = ( + taint => 0x00100000, # HINT_RE_TAINT + eval => 0x00200000, # HINT_RE_EVAL +); + +# - File::Basename contains a literal for 'taint' as a fallback. If +# taint is changed here, File::Basename must be updated as well. +# +# - ExtUtils::ParseXS uses a hardcoded +# BEGIN { $^H |= 0x00200000 } +# in it to allow re.xs to be built. So if 'eval' is changed here then +# ExtUtils::ParseXS must be changed as well. +# +# *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** + +sub setcolor { + eval { # Ignore errors + require Term::Cap; + + my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning. + my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue'; + my @props = split /,/, $props; + my $colors = join "\t", map {$terminal->Tputs($_,1)} @props; + + $colors =~ s/\0//g; + $ENV{PERL_RE_COLORS} = $colors; + }; + if ($@) { + $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t'; + } + +} + +my %flags = ( + COMPILE => 0x0000FF, + PARSE => 0x000001, + OPTIMISE => 0x000002, + TRIEC => 0x000004, + DUMP => 0x000008, + + EXECUTE => 0x00FF00, + INTUIT => 0x000100, + MATCH => 0x000200, + TRIEE => 0x000400, + + EXTRA => 0xFF0000, + TRIEM => 0x010000, + OFFSETS => 0x020000, + OFFSETSDBG => 0x040000, + STATE => 0x080000, + OPTIMISEM => 0x100000, + STACK => 0x280000, +); +$flags{ALL} = -1; +$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; +$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE}; +$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE}; +$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE}; +$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC}; + +my $installed; +my $installed_error; + +sub _do_install { + if ( ! defined($installed) ) { + require XSLoader; + $installed = eval { XSLoader::load('re', $VERSION) } || 0; + $installed_error = $@; + } +} + +sub _load_unload { + my ($on)= @_; + if ($on) { + _do_install(); + if ( ! $installed ) { + die "'re' not installed!? ($installed_error)"; + } else { + # We call install() every time, as if we didn't, we wouldn't + # "see" any changes to the color environment var since + # the last time it was called. + + # install() returns an integer, which if casted properly + # in C resolves to a structure containing the regex + # hooks. Setting it to a random integer will guarantee + # segfaults. + $^H{regcomp} = install(); + } + } else { + delete $^H{regcomp}; + } +} + +sub bits { + my $on = shift; + my $bits = 0; + unless (@_) { + require Carp; + Carp::carp("Useless use of \"re\" pragma"); + } + foreach my $idx (0..$#_){ + my $s=$_[$idx]; + if ($s eq 'Debug' or $s eq 'Debugcolor') { + setcolor() if $s =~/color/i; + ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS}; + for my $idx ($idx+1..$#_) { + if ($flags{$_[$idx]}) { + if ($on) { + ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]}; + } else { + ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]}; + } + } else { + require Carp; + Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ", + join(", ",sort keys %flags ) ); + } + } + _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS}); + last; + } elsif ($s eq 'debug' or $s eq 'debugcolor') { + setcolor() if $s =~/color/i; + _load_unload($on); + } elsif (exists $bitmask{$s}) { + $bits |= $bitmask{$s}; + } elsif ($EXPORT_OK{$s}) { + _do_install(); + require Exporter; + re->export_to_level(2, 're', $s); + } else { + require Carp; + Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ", + join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask), + ")"); + } + } + $bits; +} + +sub import { + shift; + $^H |= bits(1, @_); +} + +sub unimport { + shift; + $^H &= ~ bits(0, @_); +} + +1; + +__END__ =head1 NAME @@ -33,17 +197,29 @@ re - Perl pragma to alter regular expression behaviour use re qw(Debug All); # Finer tuned debugging options. use re qw(Debug More); no re qw(Debug ALL); # Turn of all re debugging in this scope + + use re qw(is_regexp regexp_pattern); # import utility functions + my ($pat,$mods)=regexp_pattern(qr/foo/i); + if (is_regexp($obj)) { + print "Got regexp: ", + scalar regexp_pattern($obj); # just as perl would stringify it + } # but no hassle with blessed re's. + (We use $^X in these examples because it's tainted by default.) =head1 DESCRIPTION +=head2 'taint' mode + When C is in effect, and a tainted string is the target of a regex, the regex memories (or values returned by the m// operator in list context) are tainted. This feature is useful when regex operations on tainted data aren't meant to extract safe substrings, but to perform other transformations. +=head2 'eval' mode + When C is in effect, a regex is allowed to contain C<(?{ ... })> zero-width assertions even if regular expression contains variable interpolation. That is normally disallowed, since it is a @@ -60,6 +236,8 @@ interpolation. Thus: I allowed if $pat is a precompiled regular expression, even if $pat contains C<(?{ ... })> assertions. +=head2 'debug' mode + When C is in effect, perl emits debugging messages when compiling and using regular expressions. The output is the same as that obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the @@ -71,6 +249,14 @@ comma-separated list of C properties to use for highlighting strings on/off, pre-point part on/off. See L for additional info. +As of 5.9.5 the directive C and its equivalents are +lexically scoped, as the other directives are. However they have both +compile-time and run-time effects. + +See L. + +=head2 'Debug' mode + Similarly C produces debugging output, the difference being that it allows the fine tuning of what debugging output will be emitted. Options are divided into three groups, those related to @@ -208,141 +394,50 @@ As of 5.9.5 the directive C and its equivalents are lexically scoped, as the other directives are. However they have both compile-time and run-time effects. -See L. +=head2 Exportable Functions -=cut +As of perl 5.9.5 're' debug contains a number of utility functions that +may be optionally exported into the callers namespace. They are listed +below. -# N.B. File::Basename contains a literal for 'taint' as a fallback. If -# taint is changed here, File::Basename must be updated as well. -my %bitmask = ( -taint => 0x00100000, # HINT_RE_TAINT -eval => 0x00200000, # HINT_RE_EVAL -); +=over 4 -sub setcolor { - eval { # Ignore errors - require Term::Cap; +=item is_regexp($ref) - my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning. - my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue'; - my @props = split /,/, $props; - my $colors = join "\t", map {$terminal->Tputs($_,1)} @props; +Returns true if the argument is a compiled regular expression as returned +by C, false if it is not. - $colors =~ s/\0//g; - $ENV{PERL_RE_COLORS} = $colors; - }; - if ($@) { - $ENV{PERL_RE_COLORS}||=qq'\t\t> <\t> <\t\t' - } +This function will not be confused by overloading or blessing. In +internals terms this extracts the regexp pointer out of the +PERL_MAGIC_qr structure so it it cannot be fooled. -} +=item regexp_pattern($ref) -my %flags = ( - COMPILE => 0x0000FF, - PARSE => 0x000001, - OPTIMISE => 0x000002, - TRIEC => 0x000004, - DUMP => 0x000008, +If the argument is a compiled regular expression as returned by C +then this function returns the pattern. - EXECUTE => 0x00FF00, - INTUIT => 0x000100, - MATCH => 0x000200, - TRIEE => 0x000400, +In list context it returns a two element list, the first element +containing the pattern and the second containing the modifiers used when +the pattern was compiled. - EXTRA => 0xFF0000, - TRIEM => 0x010000, - OFFSETS => 0x020000, - OFFSETSDBG => 0x040000, - STATE => 0x080000, - OPTIMISEM => 0x100000, - STACK => 0x280000, -); -$flags{ALL} = -1; -$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; -$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE}; -$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE}; -$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE}; -$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC}; + my ($pat,$mods)=regexp_pattern($ref); -my $installed; -my $installed_error; +In scalar context it returns the same as perl would when strigifying a +raw qr// with the same pattern inside. If the argument is not a +compiled reference then this routine returns false but defined in scalar +context, and the empty list in list context. Thus the following -sub _load_unload { - my ($on)= @_; - if ($on) { - if ( ! defined($installed) ) { - require XSLoader; - $installed = eval { XSLoader::load('re') } || 0; - $installed_error = $@; - } - if ( ! $installed ) { - die "'re' not installed!? ($installed_error)"; - } else { - # We call install() every time, as if we didn't, we wouldn't - # "see" any changes to the color environment var since - # the last time it was called. + if (regexp_pattern($ref) eq '(?i-xsm:foo)') - # install() returns an integer, which if casted properly - # in C resolves to a structure containing the regex - # hooks. Setting it to a random integer will guarantee - # segfaults. - $^H{regcomp} = install(); - } - } else { - delete $^H{regcomp}; - } -} +will be warning free regardless of what $ref actually is. -sub bits { - my $on = shift; - my $bits = 0; - unless (@_) { - require Carp; - Carp::carp("Useless use of \"re\" pragma"); - } - foreach my $idx (0..$#_){ - my $s=$_[$idx]; - if ($s eq 'Debug' or $s eq 'Debugcolor') { - setcolor() if $s =~/color/i; - ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS}; - for my $idx ($idx+1..$#_) { - if ($flags{$_[$idx]}) { - if ($on) { - ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]}; - } else { - ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]}; - } - } else { - require Carp; - Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ", - join(", ",sort keys %flags ) ); - } - } - _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS}); - last; - } elsif ($s eq 'debug' or $s eq 'debugcolor') { - setcolor() if $s =~/color/i; - _load_unload($on); - } elsif (exists $bitmask{$s}) { - $bits |= $bitmask{$s}; - } else { - require Carp; - Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ", - join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask), - ")"); - } - } - $bits; -} +Like c this function will not be confused by overloading +or blessing of the object. -sub import { - shift; - $^H |= bits(1, @_); -} +=back -sub unimport { - shift; - $^H &= ~ bits(0, @_); -} +=head1 SEE ALSO -1; +L. + +=cut diff --git a/ext/re/re.xs b/ext/re/re.xs index 58fb124..b82062a 100644 --- a/ext/re/re.xs +++ b/ext/re/re.xs @@ -19,6 +19,7 @@ extern char* my_re_intuit_start (pTHX_ regexp *prog, SV *sv, char *strpos, char *strend, U32 flags, struct re_scream_pos_data_s *data); extern SV* my_re_intuit_string (pTHX_ regexp *prog); +extern char* my_reg_stringify (pTHX_ MAGIC *mg, U32 *flags, STRLEN *lp, I32 *haseval); #if defined(USE_ITHREADS) extern regexp* my_regdupe (pTHX_ const regexp *r, CLONE_PARAMS *param); @@ -30,6 +31,7 @@ EXTERN_C const struct regexp_engine my_reg_engine = { my_re_intuit_start, my_re_intuit_string, my_regfree, + my_reg_stringify, #if defined(USE_ITHREADS) my_regdupe #endif @@ -46,3 +48,119 @@ install() /* PL_debug |= DEBUG_r_FLAG; */ XPUSHs(sv_2mortal(newSViv(PTR2IV(&my_reg_engine)))); + +void +is_regexp(sv) + SV * sv +PROTOTYPE: $ +PREINIT: + MAGIC *mg; +PPCODE: +{ + if (SvMAGICAL(sv)) + mg_get(sv); + if (SvROK(sv) && + (sv = (SV*)SvRV(sv)) && /* assign deliberate */ + SvTYPE(sv) == SVt_PVMG && + (mg = mg_find(sv, PERL_MAGIC_qr))) /* assign deliberate */ + { + XSRETURN_YES; + } else { + XSRETURN_NO; + } + /* NOTREACHED */ +} + +void +regexp_pattern(sv) + SV * sv +PROTOTYPE: $ +PREINIT: + MAGIC *mg; +PPCODE: +{ + /* + Checks if a reference is a regex or not. If the parameter is + not a ref, or is not the result of a qr// then returns false + in scalar context and an empty list in list context. + Otherwise in list context it returns the pattern and the + modifiers, in scalar context it returns the pattern just as it + would if the qr// was stringified normally, regardless as + to the class of the variable and any strigification overloads + on the object. + */ + + if (SvMAGICAL(sv)) + mg_get(sv); + if (SvROK(sv) && + (sv = (SV*)SvRV(sv)) && /* assign deliberate */ + SvTYPE(sv) == SVt_PVMG && + (mg = mg_find(sv, PERL_MAGIC_qr))) /* assign deliberate */ + { + + /* Housten, we have a regex! */ + SV *pattern; + regexp *re = (regexp *)mg->mg_obj; + STRLEN patlen = 0; + STRLEN left = 0; + char reflags[6]; + + if ( GIMME_V == G_ARRAY ) { + /* + we are in list context so stringify + the modifiers that apply. We ignore "negative + modifiers" in this scenario. + */ + + char *fptr = "msix"; + char ch; + U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12); + + while((ch = *fptr++)) { + if(reganch & 1) { + reflags[left++] = ch; + } + reganch >>= 1; + } + + pattern = sv_2mortal(newSVpvn(re->precomp,re->prelen)); + if (re->reganch & ROPT_UTF8) SvUTF8_on(pattern); + + /* return the pattern and the modifiers */ + XPUSHs(pattern); + XPUSHs(sv_2mortal(newSVpvn(reflags,left))); + XSRETURN(2); + } else { + /* Scalar, so use the string that Perl would return */ + if (!mg->mg_ptr) + CALLREG_STRINGIFY(mg,0,0); + + /* return the pattern in (?msix:..) format */ + pattern = sv_2mortal(newSVpvn(mg->mg_ptr,mg->mg_len)); + if (re->reganch & ROPT_UTF8) + SvUTF8_on(pattern); + XPUSHs(pattern); + XSRETURN(1); + } + } else { + /* It ain't a regexp folks */ + if ( GIMME_V == G_ARRAY ) { + /* return the empty list */ + XSRETURN_UNDEF; + } else { + /* Because of the (?:..) wrapping involved in a + stringified pattern it is impossible to get a + result for a real regexp that would evaluate to + false. Therefore we can return PL_sv_no to signify + that the object is not a regex, this means that one + can say + + if (regex($might_be_a_regex) eq '(?:foo)') { } + + and not worry about undefined values. + */ + XSRETURN_NO; + } + } + /* NOT-REACHED */ +} \ No newline at end of file diff --git a/ext/re/re_top.h b/ext/re/re_top.h index af729ae..39b7fd1 100644 --- a/ext/re/re_top.h +++ b/ext/re/re_top.h @@ -16,6 +16,7 @@ #define Perl_pregfree my_regfree #define Perl_re_intuit_string my_re_intuit_string #define Perl_regdupe my_regdupe +#define Perl_reg_stringify my_reg_stringify #define PERL_NO_GET_CONTEXT diff --git a/ext/re/t/re_funcs.t b/ext/re/t/re_funcs.t new file mode 100644 index 0000000..16ab864 --- /dev/null +++ b/ext/re/t/re_funcs.t @@ -0,0 +1,24 @@ +#!./perl + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; + require Config; + if (($Config::Config{'extensions'} !~ /\bre\b/) ){ + print "1..0 # Skip -- Perl configured without re module\n"; + exit 0; + } +} + +use strict; + +use Test::More tests => 6; +use re qw(is_regexp regexp_pattern); +my $qr=qr/foo/i; + +ok(is_regexp($qr),'is_regexp($qr)'); +ok(!is_regexp(''),'is_regexp("")'); +is((regexp_pattern($qr))[0],'foo','regexp_pattern[0]'); +is((regexp_pattern($qr))[1],'i','regexp_pattern[1]'); +is(regexp_pattern($qr),'(?i-xsm:foo)','scalar regexp_pattern'); +ok(!regexp_pattern(''),'!regexp_pattern("")'); diff --git a/lib/ExtUtils/ParseXS.pm b/lib/ExtUtils/ParseXS.pm index b6f4220..0729397 100644 --- a/lib/ExtUtils/ParseXS.pm +++ b/lib/ExtUtils/ParseXS.pm @@ -203,7 +203,8 @@ sub process_file { $size = qr[,\s* (??{ $bal }) ]x; # Third arg (to setpvn) foreach my $key (keys %output_expr) { - use re 'eval'; + #use re 'eval'; + BEGIN { $^H |= 0x00200000}; my ($t, $with_size, $arg, $sarg) = ($output_expr{$key} =~ diff --git a/perl.h b/perl.h index d708f81..51f26e4 100644 --- a/perl.h +++ b/perl.h @@ -210,6 +210,9 @@ CALL_FPTR((prog)->engine->checkstr)(aTHX_ (prog)) #define CALLREGFREE(prog) \ if(prog) CALL_FPTR((prog)->engine->free)(aTHX_ (prog)) +#define CALLREG_AS_STR(mg,lp,flags,haseval) \ + CALL_FPTR(((regexp *)((mg)->mg_obj))->engine->as_str)(aTHX_ (mg), (lp), (flags), (haseval)) +#define CALLREG_STRINGIFY(mg,lp,flags) CALLREG_AS_STR(mg,lp,flags,0) #if defined(USE_ITHREADS) #define CALLREGDUPE(prog,param) \ (prog ? CALL_FPTR((prog)->engine->dupe)(aTHX_ (prog),(param)) \ diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index 4ee2be1..9375657 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -759,7 +759,8 @@ F contains the base structure definition: U32 *offsets; /* offset annotations 20001228 MJD */ I32 sublen; /* Length of string pointed by subbeg */ I32 refcnt; - I32 minlen; /* mininum possible length of $& */ + I32 minlen; /* mininum length of string to match */ + I32 minlenret; /* mininum possible length of $& */ I32 prelen; /* length of precomp */ U32 nparens; /* number of parentheses */ U32 lastparen; /* last paren matched */ @@ -838,13 +839,28 @@ that handles this is called C. Sometimes this field points at a regop embedded in the program, and sometimes it points at an independent synthetic regop that has been constructed by the optimiser. -=item C +=item C C -The minimum possible length of the final matching string. This is used -to prune the search space by not bothering to match any closer to the -end of a string than would allow a match. For instance there is no point -in even starting the regex engine if the minlen is 10 but the string -is only 5 characters long. There is no way that the pattern can match. +C is the minimum string length required for the pattern to match. +This is used to prune the search space by not bothering to match any +closer to the end of a string than would allow a match. For instance +there is no point in even starting the regex engine if the minlen is +10 but the string is only 5 characters long. There is no way that the +pattern can match. + +C is the minimum length of the string that would be found +in $& after a match. + +The difference between C and C can be seen in the +following pattern: + + /ns(?=\d)/ + +where the C would be 3 but the minlen ret would only be 2 as +the \d is required to match but is not actually included in the matched +content. This distinction is particularly important as the substitution +logic uses the C to tell whether it can do in-place substition +which can result in considerable speedup. =item C diff --git a/pp.c b/pp.c index a9ca236..0ec54bf 100644 --- a/pp.c +++ b/pp.c @@ -4652,7 +4652,7 @@ PP(pp_split) const int tail = (rx->reganch & RE_INTUIT_TAIL); SV * const csv = CALLREG_INTUIT_STRING(rx); - len = rx->minlen; + len = rx->minlenret; if (len == 1 && !(rx->reganch & ROPT_UTF8) && !tail) { const char c = *SvPV_nolen_const(csv); while (--limit) { diff --git a/pp_hot.c b/pp_hot.c index d2e8e87..025e957 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -1497,11 +1497,11 @@ yup: /* Confirmed by INTUIT */ rx->subbeg = (char *) truebase; rx->startp[0] = s - truebase; if (RX_MATCH_UTF8(rx)) { - char * const t = (char*)utf8_hop((U8*)s, rx->minlen); + char * const t = (char*)utf8_hop((U8*)s, rx->minlenret); rx->endp[0] = t - truebase; } else { - rx->endp[0] = s - truebase + rx->minlen; + rx->endp[0] = s - truebase + rx->minlenret; } rx->sublen = strend - truebase; goto gotcha; @@ -1531,11 +1531,11 @@ yup: /* Confirmed by INTUIT */ rx->sublen = strend - t; RX_MATCH_COPIED_on(rx); off = rx->startp[0] = s - t; - rx->endp[0] = off + rx->minlen; + rx->endp[0] = off + rx->minlenret; } else { /* startp/endp are used by @- @+. */ rx->startp[0] = s - truebase; - rx->endp[0] = s - truebase + rx->minlen; + rx->endp[0] = s - truebase + rx->minlenret; } rx->nparens = rx->lastparen = rx->lastcloseparen = 0; /* used by @-, @+, and $^N */ LEAVE_SCOPE(oldsave); @@ -2188,7 +2188,7 @@ PP(pp_subst) #ifdef PERL_OLD_COPY_ON_WRITE && !is_cow #endif - && (I32)clen <= rx->minlen && (once || !(r_flags & REXEC_COPY_STR)) + && (I32)clen <= rx->minlenret && (once || !(r_flags & REXEC_COPY_STR)) && !(rx->reganch & ROPT_LOOKBEHIND_SEEN) && (!doutf8 || SvUTF8(TARG))) { if (!CALLREGEXEC(rx, s, strend, orig, 0, TARG, NULL, diff --git a/proto.h b/proto.h index ce25ca0..11a5fc4 100644 --- a/proto.h +++ b/proto.h @@ -1830,6 +1830,9 @@ PERL_CALLCONV I32 Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* stren __attribute__nonnull__(pTHX_6); PERL_CALLCONV void Perl_pregfree(pTHX_ struct regexp* r); +PERL_CALLCONV char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval) + __attribute__nonnull__(pTHX_1); + #if defined(USE_ITHREADS) PERL_CALLCONV regexp* Perl_regdupe(pTHX_ const regexp* r, CLONE_PARAMS* param) __attribute__nonnull__(pTHX_1) @@ -3859,10 +3862,6 @@ STATIC STRLEN S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, con __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3); -STATIC char * S_stringify_regexp(pTHX_ SV *sv, MAGIC *mg, STRLEN *lp) - __attribute__nonnull__(pTHX_1) - __attribute__nonnull__(pTHX_2); - STATIC char * S_F0convert(NV nv, char *endbuf, STRLEN *len) __attribute__nonnull__(2) __attribute__nonnull__(3); diff --git a/regcomp.c b/regcomp.c index 46851dd..4a2e52b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4216,8 +4216,7 @@ reStudy: * it happens that c_offset_min has been invalidated, since the * earlier string may buy us something the later one won't.] */ - minlen = 0; - + data.longest_fixed = newSVpvs(""); data.longest_float = newSVpvs(""); data.last_found = newSVpvs(""); @@ -4230,7 +4229,7 @@ reStudy: } else /* XXXX Check for BOUND? */ stclass_flag = 0; data.last_closep = &last_close; - + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ &data, -1, NULL, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); @@ -4408,9 +4407,10 @@ reStudy: data.start_class = &ch_class; data.last_closep = &last_close; + minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); - + CHECK_RESTUDY_GOTO; r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 @@ -4437,6 +4437,11 @@ reStudy: /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ + DEBUG_OPTIMISE_r({ + PerlIO_printf(Perl_debug_log,"minlen: %d r->minlen:%d\n", + minlen, r->minlen); + }); + r->minlenret = minlen; if (r->minlen < minlen) r->minlen = minlen; @@ -8561,6 +8566,7 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) ret->precomp = SAVEPVN(r->precomp, r->prelen); ret->refcnt = r->refcnt; ret->minlen = r->minlen; + ret->minlenret = r->minlenret; ret->prelen = r->prelen; ret->nparens = r->nparens; ret->lastparen = r->lastparen; @@ -8586,6 +8592,111 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) } #endif +/* + reg_stringify() + + converts a regexp embedded in a MAGIC struct to its stringified form, + caching the converted form in the struct and returns the cached + string. + + If lp is nonnull then it is used to return the length of the + resulting string + + If flags is nonnull and the returned string contains UTF8 then + (flags & 1) will be true. + + If haseval is nonnull then it is used to return whether the pattern + contains evals. + + Normally called via macro: + + CALLREG_STRINGIFY(mg,0,0); + + And internally with + + CALLREG_AS_STR(mg,lp,flags,haseval) + + See sv_2pv_flags() in sv.c for an example of internal usage. + + */ + +char * +Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { + dVAR; + const regexp * const re = (regexp *)mg->mg_obj; + + if (!mg->mg_ptr) { + const char *fptr = "msix"; + char reflags[6]; + char ch; + int left = 0; + int right = 4; + bool need_newline = 0; + U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12); + + while((ch = *fptr++)) { + if(reganch & 1) { + reflags[left++] = ch; + } + else { + reflags[right--] = ch; + } + reganch >>= 1; + } + if(left != 4) { + reflags[left] = '-'; + left = 5; + } + + mg->mg_len = re->prelen + 4 + left; + /* + * If /x was used, we have to worry about a regex ending with a + * comment later being embedded within another regex. If so, we don't + * want this regex's "commentization" to leak out to the right part of + * the enclosing regex, we must cap it with a newline. + * + * So, if /x was used, we scan backwards from the end of the regex. If + * we find a '#' before we find a newline, we need to add a newline + * ourself. If we find a '\n' first (or if we don't find '#' or '\n'), + * we don't need to add anything. -jfriedl + */ + if (PMf_EXTENDED & re->reganch) { + const char *endptr = re->precomp + re->prelen; + while (endptr >= re->precomp) { + const char c = *(endptr--); + if (c == '\n') + break; /* don't need another */ + if (c == '#') { + /* we end while in a comment, so we need a newline */ + mg->mg_len++; /* save space for it */ + need_newline = 1; /* note to add it */ + break; + } + } + } + + Newx(mg->mg_ptr, mg->mg_len + 1 + left, char); + mg->mg_ptr[0] = '('; + mg->mg_ptr[1] = '?'; + Copy(reflags, mg->mg_ptr+2, left, char); + *(mg->mg_ptr+left+2) = ':'; + Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char); + if (need_newline) + mg->mg_ptr[mg->mg_len - 2] = '\n'; + mg->mg_ptr[mg->mg_len - 1] = ')'; + mg->mg_ptr[mg->mg_len] = 0; + } + if (haseval) + *haseval = re->program[0].next_off; + if (flags) + *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0); + + if (lp) + *lp = mg->mg_len; + return mg->mg_ptr; +} + + #ifndef PERL_IN_XSUB_RE /* - regnext - dig the "next" pointer out of a node diff --git a/regcomp.h b/regcomp.h index 2774a27..e3d671d 100644 --- a/regcomp.h +++ b/regcomp.h @@ -398,6 +398,7 @@ EXTCONST regexp_engine PL_core_reg_engine = { Perl_re_intuit_start, Perl_re_intuit_string, Perl_pregfree, + Perl_reg_stringify, #if defined(USE_ITHREADS) Perl_regdupe #endif diff --git a/regexp.h b/regexp.h index f74f2af..5e3e947 100644 --- a/regexp.h +++ b/regexp.h @@ -47,7 +47,8 @@ typedef struct regexp { U32 *offsets; /* offset annotations 20001228 MJD */ I32 sublen; /* Length of string pointed by subbeg */ I32 refcnt; - I32 minlen; /* mininum possible length of $& */ + I32 minlen; /* mininum possible length of string to match */ + I32 minlenret; /* mininum possible length of $& */ I32 prelen; /* length of precomp */ U32 nparens; /* number of parentheses */ U32 lastparen; /* last paren matched */ @@ -76,6 +77,7 @@ typedef struct regexp_engine { struct re_scream_pos_data_s *data); SV* (*checkstr) (pTHX_ regexp *prog); void (*free) (pTHX_ struct regexp* r); + char* (*as_str) (pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval); #ifdef USE_ITHREADS regexp* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param); #endif diff --git a/sv.c b/sv.c index ad31ce1..6696e9a 100644 --- a/sv.c +++ b/sv.c @@ -2542,87 +2542,6 @@ S_uiv_2buf(char *buf, IV iv, UV uv, int is_uv, char **peob) return ptr; } -/* stringify_regexp(): private routine for use by sv_2pv_flags(): converts - * a regexp to its stringified form. - */ - -static char * -S_stringify_regexp(pTHX_ SV *sv, MAGIC *mg, STRLEN *lp) { - dVAR; - const regexp * const re = (regexp *)mg->mg_obj; - - if (!mg->mg_ptr) { - const char *fptr = "msix"; - char reflags[6]; - char ch; - int left = 0; - int right = 4; - bool need_newline = 0; - U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12); - - while((ch = *fptr++)) { - if(reganch & 1) { - reflags[left++] = ch; - } - else { - reflags[right--] = ch; - } - reganch >>= 1; - } - if(left != 4) { - reflags[left] = '-'; - left = 5; - } - - mg->mg_len = re->prelen + 4 + left; - /* - * If /x was used, we have to worry about a regex ending with a - * comment later being embedded within another regex. If so, we don't - * want this regex's "commentization" to leak out to the right part of - * the enclosing regex, we must cap it with a newline. - * - * So, if /x was used, we scan backwards from the end of the regex. If - * we find a '#' before we find a newline, we need to add a newline - * ourself. If we find a '\n' first (or if we don't find '#' or '\n'), - * we don't need to add anything. -jfriedl - */ - if (PMf_EXTENDED & re->reganch) { - const char *endptr = re->precomp + re->prelen; - while (endptr >= re->precomp) { - const char c = *(endptr--); - if (c == '\n') - break; /* don't need another */ - if (c == '#') { - /* we end while in a comment, so we need a newline */ - mg->mg_len++; /* save space for it */ - need_newline = 1; /* note to add it */ - break; - } - } - } - - Newx(mg->mg_ptr, mg->mg_len + 1 + left, char); - mg->mg_ptr[0] = '('; - mg->mg_ptr[1] = '?'; - Copy(reflags, mg->mg_ptr+2, left, char); - *(mg->mg_ptr+left+2) = ':'; - Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char); - if (need_newline) - mg->mg_ptr[mg->mg_len - 2] = '\n'; - mg->mg_ptr[mg->mg_len - 1] = ')'; - mg->mg_ptr[mg->mg_len] = 0; - } - PL_reginterp_cnt += re->program[0].next_off; - - if (re->reganch & ROPT_UTF8) - SvUTF8_on(sv); - else - SvUTF8_off(sv); - if (lp) - *lp = mg->mg_len; - return mg->mg_ptr; -} - /* =for apidoc sv_2pv_flags @@ -2740,8 +2659,18 @@ Perl_sv_2pv_flags(pTHX_ register SV *sv, STRLEN *lp, I32 flags) && ((SvFLAGS(referent) & (SVs_OBJECT|SVf_OK|SVs_GMG|SVs_SMG|SVs_RMG)) == (SVs_OBJECT|SVs_SMG)) - && (mg = mg_find(referent, PERL_MAGIC_qr))) { - return stringify_regexp(sv, mg, lp); + && (mg = mg_find(referent, PERL_MAGIC_qr))) + { + char *str = NULL; + I32 haseval = 0; + I32 flags = 0; + (str) = CALLREG_AS_STR(mg,lp,&flags,&haseval); + if (flags & 1) + SvUTF8_on(sv); + else + SvUTF8_off(sv); + PL_reginterp_cnt += haseval; + return str; } else { const char *const typestr = sv_reftype(referent, 0); const STRLEN typelen = strlen(typestr); diff --git a/t/op/pat.t b/t/op/pat.t index 5405cf6..0de3b14 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3993,8 +3993,20 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, "Regexp /^(??{'(.)'x 100})/ crashes older perls") or print "# Unexpected outcome: should pass or crash perl\n"; +{ + $_="ns1ns1ns1"; + s/ns(?=\d)/ns_/g; + iseq($_,"ns_1ns_1ns_1"); + $_="ns1"; + s/ns(?=\d)/ns_/; + iseq($_,"ns_1"); + $_="123"; + s/(?=\d+)|(?<=\d)/!Bang!/g; + iseq($_,"!Bang!1!Bang!2!Bang!3!Bang!"); +} + # Put new tests above the line, not here. # Don't forget to update this! -BEGIN{print "1..1344\n"}; +BEGIN{print "1..1347\n"};