ext/re/t/regop.pl generate debug output for various patterns
ext/re/t/regop.t test RE optimizations by scraping debug output
ext/re/t/re.t see if re pragma works
+ext/re/t/re_funcs.t see if exportable funcs from re.pm work
ext/Safe/t/safe1.t See if Safe works
ext/Safe/t/safe2.t See if Safe works
ext/Safe/t/safe3.t See if Safe works
|NN char* strend|NN char* strbeg|I32 minend \
|NN SV* screamer|U32 nosave
Ap |void |pregfree |NULLOK struct regexp* r
+p |char * |reg_stringify |NN MAGIC *mg|NULLOK STRLEN *lp|NULLOK U32 *flags|NULLOK I32 *haseval
#if defined(USE_ITHREADS)
Ap |regexp*|regdupe |NN const regexp* r|NN CLONE_PARAMS* param
#endif
|STRLEN byte|STRLEN utf8|STRLEN blen
s |STRLEN |sv_pos_b2u_midway|NN const U8 *s|NN const U8 *const target \
|NN const U8 *end|STRLEN endu
-s |char * |stringify_regexp|NN SV *sv|NN MAGIC *mg|NULLOK STRLEN *lp
sn |char * |F0convert |NV nv|NN char *endbuf|NN STRLEN *len
# if defined(PERL_OLD_COPY_ON_WRITE)
sM |void |sv_release_COW |NN SV *sv|NN const char *pvx|STRLEN len|NN SV *after
#define regclass_swash Perl_regclass_swash
#define pregexec Perl_pregexec
#define pregfree Perl_pregfree
+#ifdef PERL_CORE
+#define reg_stringify Perl_reg_stringify
+#endif
#if defined(USE_ITHREADS)
#define regdupe Perl_regdupe
#endif
#define sv_pos_u2b_cached S_sv_pos_u2b_cached
#define utf8_mg_pos_cache_update S_utf8_mg_pos_cache_update
#define sv_pos_b2u_midway S_sv_pos_b2u_midway
-#define stringify_regexp S_stringify_regexp
#define F0convert S_F0convert
#endif
# if defined(PERL_OLD_COPY_ON_WRITE)
#define regclass_swash(a,b,c,d,e) Perl_regclass_swash(aTHX_ a,b,c,d,e)
#define pregexec(a,b,c,d,e,f,g) Perl_pregexec(aTHX_ a,b,c,d,e,f,g)
#define pregfree(a) Perl_pregfree(aTHX_ a)
+#ifdef PERL_CORE
+#define reg_stringify(a,b,c,d) Perl_reg_stringify(aTHX_ a,b,c,d)
+#endif
#if defined(USE_ITHREADS)
#define regdupe(a,b) Perl_regdupe(aTHX_ a,b)
#endif
#define sv_pos_u2b_cached(a,b,c,d,e,f,g) S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g)
#define utf8_mg_pos_cache_update(a,b,c,d,e) S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e)
#define sv_pos_b2u_midway(a,b,c,d) S_sv_pos_b2u_midway(aTHX_ a,b,c,d)
-#define stringify_regexp(a,b,c) S_stringify_regexp(aTHX_ a,b,c)
#define F0convert S_F0convert
#endif
# if defined(PERL_OLD_COPY_ON_WRITE)
package re;
-our $VERSION = 0.06_03;
+# pragma for controlling the regex engine
+use strict;
+use warnings;
+
+our $VERSION = "0.06_03";
+our @ISA = qw(Exporter);
+our @EXPORT_OK = qw(is_regexp regexp_pattern);
+our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;
+
+# *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING ***
+#
+# If you modify these values see comment below!
+
+my %bitmask = (
+ taint => 0x00100000, # HINT_RE_TAINT
+ eval => 0x00200000, # HINT_RE_EVAL
+);
+
+# - File::Basename contains a literal for 'taint' as a fallback. If
+# taint is changed here, File::Basename must be updated as well.
+#
+# - ExtUtils::ParseXS uses a hardcoded
+# BEGIN { $^H |= 0x00200000 }
+# in it to allow re.xs to be built. So if 'eval' is changed here then
+# ExtUtils::ParseXS must be changed as well.
+#
+# *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING ***
+
+sub setcolor {
+ eval { # Ignore errors
+ require Term::Cap;
+
+ my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
+ my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
+ my @props = split /,/, $props;
+ my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
+
+ $colors =~ s/\0//g;
+ $ENV{PERL_RE_COLORS} = $colors;
+ };
+ if ($@) {
+ $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t';
+ }
+
+}
+
+my %flags = (
+ COMPILE => 0x0000FF,
+ PARSE => 0x000001,
+ OPTIMISE => 0x000002,
+ TRIEC => 0x000004,
+ DUMP => 0x000008,
+
+ EXECUTE => 0x00FF00,
+ INTUIT => 0x000100,
+ MATCH => 0x000200,
+ TRIEE => 0x000400,
+
+ EXTRA => 0xFF0000,
+ TRIEM => 0x010000,
+ OFFSETS => 0x020000,
+ OFFSETSDBG => 0x040000,
+ STATE => 0x080000,
+ OPTIMISEM => 0x100000,
+ STACK => 0x280000,
+);
+$flags{ALL} = -1;
+$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
+$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE};
+$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
+$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
+$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
+
+my $installed;
+my $installed_error;
+
+sub _do_install {
+ if ( ! defined($installed) ) {
+ require XSLoader;
+ $installed = eval { XSLoader::load('re', $VERSION) } || 0;
+ $installed_error = $@;
+ }
+}
+
+sub _load_unload {
+ my ($on)= @_;
+ if ($on) {
+ _do_install();
+ if ( ! $installed ) {
+ die "'re' not installed!? ($installed_error)";
+ } else {
+ # We call install() every time, as if we didn't, we wouldn't
+ # "see" any changes to the color environment var since
+ # the last time it was called.
+
+ # install() returns an integer, which if casted properly
+ # in C resolves to a structure containing the regex
+ # hooks. Setting it to a random integer will guarantee
+ # segfaults.
+ $^H{regcomp} = install();
+ }
+ } else {
+ delete $^H{regcomp};
+ }
+}
+
+sub bits {
+ my $on = shift;
+ my $bits = 0;
+ unless (@_) {
+ require Carp;
+ Carp::carp("Useless use of \"re\" pragma");
+ }
+ foreach my $idx (0..$#_){
+ my $s=$_[$idx];
+ if ($s eq 'Debug' or $s eq 'Debugcolor') {
+ setcolor() if $s =~/color/i;
+ ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
+ for my $idx ($idx+1..$#_) {
+ if ($flags{$_[$idx]}) {
+ if ($on) {
+ ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
+ } else {
+ ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
+ }
+ } else {
+ require Carp;
+ Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
+ join(", ",sort keys %flags ) );
+ }
+ }
+ _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
+ last;
+ } elsif ($s eq 'debug' or $s eq 'debugcolor') {
+ setcolor() if $s =~/color/i;
+ _load_unload($on);
+ } elsif (exists $bitmask{$s}) {
+ $bits |= $bitmask{$s};
+ } elsif ($EXPORT_OK{$s}) {
+ _do_install();
+ require Exporter;
+ re->export_to_level(2, 're', $s);
+ } else {
+ require Carp;
+ Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
+ join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
+ ")");
+ }
+ }
+ $bits;
+}
+
+sub import {
+ shift;
+ $^H |= bits(1, @_);
+}
+
+sub unimport {
+ shift;
+ $^H &= ~ bits(0, @_);
+}
+
+1;
+
+__END__
=head1 NAME
use re qw(Debug All); # Finer tuned debugging options.
use re qw(Debug More);
no re qw(Debug ALL); # Turn of all re debugging in this scope
+
+ use re qw(is_regexp regexp_pattern); # import utility functions
+ my ($pat,$mods)=regexp_pattern(qr/foo/i);
+ if (is_regexp($obj)) {
+ print "Got regexp: ",
+ scalar regexp_pattern($obj); # just as perl would stringify it
+ } # but no hassle with blessed re's.
+
(We use $^X in these examples because it's tainted by default.)
=head1 DESCRIPTION
+=head2 'taint' mode
+
When C<use re 'taint'> is in effect, and a tainted string is the target
of a regex, the regex memories (or values returned by the m// operator
in list context) are tainted. This feature is useful when regex operations
on tainted data aren't meant to extract safe substrings, but to perform
other transformations.
+=head2 'eval' mode
+
When C<use re 'eval'> is in effect, a regex is allowed to contain
C<(?{ ... })> zero-width assertions even if regular expression contains
variable interpolation. That is normally disallowed, since it is a
I<is> allowed if $pat is a precompiled regular expression, even
if $pat contains C<(?{ ... })> assertions.
+=head2 'debug' mode
+
When C<use re 'debug'> is in effect, perl emits debugging messages when
compiling and using regular expressions. The output is the same as that
obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
strings on/off, pre-point part on/off.
See L<perldebug/"Debugging regular expressions"> for additional info.
+As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
+lexically scoped, as the other directives are. However they have both
+compile-time and run-time effects.
+
+See L<perlmodlib/Pragmatic Modules>.
+
+=head2 'Debug' mode
+
Similarly C<use re 'Debug'> produces debugging output, the difference
being that it allows the fine tuning of what debugging output will be
emitted. Options are divided into three groups, those related to
lexically scoped, as the other directives are. However they have both
compile-time and run-time effects.
-See L<perlmodlib/Pragmatic Modules>.
+=head2 Exportable Functions
-=cut
+As of perl 5.9.5 're' debug contains a number of utility functions that
+may be optionally exported into the callers namespace. They are listed
+below.
-# N.B. File::Basename contains a literal for 'taint' as a fallback. If
-# taint is changed here, File::Basename must be updated as well.
-my %bitmask = (
-taint => 0x00100000, # HINT_RE_TAINT
-eval => 0x00200000, # HINT_RE_EVAL
-);
+=over 4
-sub setcolor {
- eval { # Ignore errors
- require Term::Cap;
+=item is_regexp($ref)
- my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
- my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
- my @props = split /,/, $props;
- my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
+Returns true if the argument is a compiled regular expression as returned
+by C<qr//>, false if it is not.
- $colors =~ s/\0//g;
- $ENV{PERL_RE_COLORS} = $colors;
- };
- if ($@) {
- $ENV{PERL_RE_COLORS}||=qq'\t\t> <\t> <\t\t'
- }
+This function will not be confused by overloading or blessing. In
+internals terms this extracts the regexp pointer out of the
+PERL_MAGIC_qr structure so it it cannot be fooled.
-}
+=item regexp_pattern($ref)
-my %flags = (
- COMPILE => 0x0000FF,
- PARSE => 0x000001,
- OPTIMISE => 0x000002,
- TRIEC => 0x000004,
- DUMP => 0x000008,
+If the argument is a compiled regular expression as returned by C<qr//>
+then this function returns the pattern.
- EXECUTE => 0x00FF00,
- INTUIT => 0x000100,
- MATCH => 0x000200,
- TRIEE => 0x000400,
+In list context it returns a two element list, the first element
+containing the pattern and the second containing the modifiers used when
+the pattern was compiled.
- EXTRA => 0xFF0000,
- TRIEM => 0x010000,
- OFFSETS => 0x020000,
- OFFSETSDBG => 0x040000,
- STATE => 0x080000,
- OPTIMISEM => 0x100000,
- STACK => 0x280000,
-);
-$flags{ALL} = -1;
-$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
-$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE};
-$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
-$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
-$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
+ my ($pat,$mods)=regexp_pattern($ref);
-my $installed;
-my $installed_error;
+In scalar context it returns the same as perl would when strigifying a
+raw qr// with the same pattern inside. If the argument is not a
+compiled reference then this routine returns false but defined in scalar
+context, and the empty list in list context. Thus the following
-sub _load_unload {
- my ($on)= @_;
- if ($on) {
- if ( ! defined($installed) ) {
- require XSLoader;
- $installed = eval { XSLoader::load('re') } || 0;
- $installed_error = $@;
- }
- if ( ! $installed ) {
- die "'re' not installed!? ($installed_error)";
- } else {
- # We call install() every time, as if we didn't, we wouldn't
- # "see" any changes to the color environment var since
- # the last time it was called.
+ if (regexp_pattern($ref) eq '(?i-xsm:foo)')
- # install() returns an integer, which if casted properly
- # in C resolves to a structure containing the regex
- # hooks. Setting it to a random integer will guarantee
- # segfaults.
- $^H{regcomp} = install();
- }
- } else {
- delete $^H{regcomp};
- }
-}
+will be warning free regardless of what $ref actually is.
-sub bits {
- my $on = shift;
- my $bits = 0;
- unless (@_) {
- require Carp;
- Carp::carp("Useless use of \"re\" pragma");
- }
- foreach my $idx (0..$#_){
- my $s=$_[$idx];
- if ($s eq 'Debug' or $s eq 'Debugcolor') {
- setcolor() if $s =~/color/i;
- ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
- for my $idx ($idx+1..$#_) {
- if ($flags{$_[$idx]}) {
- if ($on) {
- ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
- } else {
- ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
- }
- } else {
- require Carp;
- Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
- join(", ",sort keys %flags ) );
- }
- }
- _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
- last;
- } elsif ($s eq 'debug' or $s eq 'debugcolor') {
- setcolor() if $s =~/color/i;
- _load_unload($on);
- } elsif (exists $bitmask{$s}) {
- $bits |= $bitmask{$s};
- } else {
- require Carp;
- Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
- join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
- ")");
- }
- }
- $bits;
-}
+Like c<is_regexp> this function will not be confused by overloading
+or blessing of the object.
-sub import {
- shift;
- $^H |= bits(1, @_);
-}
+=back
-sub unimport {
- shift;
- $^H &= ~ bits(0, @_);
-}
+=head1 SEE ALSO
-1;
+L<perlmodlib/Pragmatic Modules>.
+
+=cut
char *strend, U32 flags,
struct re_scream_pos_data_s *data);
extern SV* my_re_intuit_string (pTHX_ regexp *prog);
+extern char* my_reg_stringify (pTHX_ MAGIC *mg, U32 *flags, STRLEN *lp, I32 *haseval);
#if defined(USE_ITHREADS)
extern regexp* my_regdupe (pTHX_ const regexp *r, CLONE_PARAMS *param);
my_re_intuit_start,
my_re_intuit_string,
my_regfree,
+ my_reg_stringify,
#if defined(USE_ITHREADS)
my_regdupe
#endif
/* PL_debug |= DEBUG_r_FLAG; */
XPUSHs(sv_2mortal(newSViv(PTR2IV(&my_reg_engine))));
+
+void
+is_regexp(sv)
+ SV * sv
+PROTOTYPE: $
+PREINIT:
+ MAGIC *mg;
+PPCODE:
+{
+ if (SvMAGICAL(sv))
+ mg_get(sv);
+ if (SvROK(sv) &&
+ (sv = (SV*)SvRV(sv)) && /* assign deliberate */
+ SvTYPE(sv) == SVt_PVMG &&
+ (mg = mg_find(sv, PERL_MAGIC_qr))) /* assign deliberate */
+ {
+ XSRETURN_YES;
+ } else {
+ XSRETURN_NO;
+ }
+ /* NOTREACHED */
+}
+
+void
+regexp_pattern(sv)
+ SV * sv
+PROTOTYPE: $
+PREINIT:
+ MAGIC *mg;
+PPCODE:
+{
+ /*
+ Checks if a reference is a regex or not. If the parameter is
+ not a ref, or is not the result of a qr// then returns false
+ in scalar context and an empty list in list context.
+ Otherwise in list context it returns the pattern and the
+ modifiers, in scalar context it returns the pattern just as it
+ would if the qr// was stringified normally, regardless as
+ to the class of the variable and any strigification overloads
+ on the object.
+ */
+
+ if (SvMAGICAL(sv))
+ mg_get(sv);
+ if (SvROK(sv) &&
+ (sv = (SV*)SvRV(sv)) && /* assign deliberate */
+ SvTYPE(sv) == SVt_PVMG &&
+ (mg = mg_find(sv, PERL_MAGIC_qr))) /* assign deliberate */
+ {
+
+ /* Housten, we have a regex! */
+ SV *pattern;
+ regexp *re = (regexp *)mg->mg_obj;
+ STRLEN patlen = 0;
+ STRLEN left = 0;
+ char reflags[6];
+
+ if ( GIMME_V == G_ARRAY ) {
+ /*
+ we are in list context so stringify
+ the modifiers that apply. We ignore "negative
+ modifiers" in this scenario.
+ */
+
+ char *fptr = "msix";
+ char ch;
+ U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
+
+ while((ch = *fptr++)) {
+ if(reganch & 1) {
+ reflags[left++] = ch;
+ }
+ reganch >>= 1;
+ }
+
+ pattern = sv_2mortal(newSVpvn(re->precomp,re->prelen));
+ if (re->reganch & ROPT_UTF8) SvUTF8_on(pattern);
+
+ /* return the pattern and the modifiers */
+ XPUSHs(pattern);
+ XPUSHs(sv_2mortal(newSVpvn(reflags,left)));
+ XSRETURN(2);
+ } else {
+ /* Scalar, so use the string that Perl would return */
+ if (!mg->mg_ptr)
+ CALLREG_STRINGIFY(mg,0,0);
+
+ /* return the pattern in (?msix:..) format */
+ pattern = sv_2mortal(newSVpvn(mg->mg_ptr,mg->mg_len));
+ if (re->reganch & ROPT_UTF8)
+ SvUTF8_on(pattern);
+ XPUSHs(pattern);
+ XSRETURN(1);
+ }
+ } else {
+ /* It ain't a regexp folks */
+ if ( GIMME_V == G_ARRAY ) {
+ /* return the empty list */
+ XSRETURN_UNDEF;
+ } else {
+ /* Because of the (?:..) wrapping involved in a
+ stringified pattern it is impossible to get a
+ result for a real regexp that would evaluate to
+ false. Therefore we can return PL_sv_no to signify
+ that the object is not a regex, this means that one
+ can say
+
+ if (regex($might_be_a_regex) eq '(?:foo)') { }
+
+ and not worry about undefined values.
+ */
+ XSRETURN_NO;
+ }
+ }
+ /* NOT-REACHED */
+}
\ No newline at end of file
#define Perl_pregfree my_regfree
#define Perl_re_intuit_string my_re_intuit_string
#define Perl_regdupe my_regdupe
+#define Perl_reg_stringify my_reg_stringify
#define PERL_NO_GET_CONTEXT
--- /dev/null
+#!./perl
+
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '../lib';
+ require Config;
+ if (($Config::Config{'extensions'} !~ /\bre\b/) ){
+ print "1..0 # Skip -- Perl configured without re module\n";
+ exit 0;
+ }
+}
+
+use strict;
+
+use Test::More tests => 6;
+use re qw(is_regexp regexp_pattern);
+my $qr=qr/foo/i;
+
+ok(is_regexp($qr),'is_regexp($qr)');
+ok(!is_regexp(''),'is_regexp("")');
+is((regexp_pattern($qr))[0],'foo','regexp_pattern[0]');
+is((regexp_pattern($qr))[1],'i','regexp_pattern[1]');
+is(regexp_pattern($qr),'(?i-xsm:foo)','scalar regexp_pattern');
+ok(!regexp_pattern(''),'!regexp_pattern("")');
$size = qr[,\s* (??{ $bal }) ]x; # Third arg (to setpvn)
foreach my $key (keys %output_expr) {
- use re 'eval';
+ #use re 'eval';
+ BEGIN { $^H |= 0x00200000};
my ($t, $with_size, $arg, $sarg) =
($output_expr{$key} =~
CALL_FPTR((prog)->engine->checkstr)(aTHX_ (prog))
#define CALLREGFREE(prog) \
if(prog) CALL_FPTR((prog)->engine->free)(aTHX_ (prog))
+#define CALLREG_AS_STR(mg,lp,flags,haseval) \
+ CALL_FPTR(((regexp *)((mg)->mg_obj))->engine->as_str)(aTHX_ (mg), (lp), (flags), (haseval))
+#define CALLREG_STRINGIFY(mg,lp,flags) CALLREG_AS_STR(mg,lp,flags,0)
#if defined(USE_ITHREADS)
#define CALLREGDUPE(prog,param) \
(prog ? CALL_FPTR((prog)->engine->dupe)(aTHX_ (prog),(param)) \
U32 *offsets; /* offset annotations 20001228 MJD */
I32 sublen; /* Length of string pointed by subbeg */
I32 refcnt;
- I32 minlen; /* mininum possible length of $& */
+ I32 minlen; /* mininum length of string to match */
+ I32 minlenret; /* mininum possible length of $& */
I32 prelen; /* length of precomp */
U32 nparens; /* number of parentheses */
U32 lastparen; /* last paren matched */
points at a regop embedded in the program, and sometimes it points at
an independent synthetic regop that has been constructed by the optimiser.
-=item C<minlen>
+=item C<minlen> C<minlenret>
-The minimum possible length of the final matching string. This is used
-to prune the search space by not bothering to match any closer to the
-end of a string than would allow a match. For instance there is no point
-in even starting the regex engine if the minlen is 10 but the string
-is only 5 characters long. There is no way that the pattern can match.
+C<minlen> is the minimum string length required for the pattern to match.
+This is used to prune the search space by not bothering to match any
+closer to the end of a string than would allow a match. For instance
+there is no point in even starting the regex engine if the minlen is
+10 but the string is only 5 characters long. There is no way that the
+pattern can match.
+
+C<minlenret> is the minimum length of the string that would be found
+in $& after a match.
+
+The difference between C<minlen> and C<minlenret> can be seen in the
+following pattern:
+
+ /ns(?=\d)/
+
+where the C<minlen> would be 3 but the minlen ret would only be 2 as
+the \d is required to match but is not actually included in the matched
+content. This distinction is particularly important as the substitution
+logic uses the C<minlenret> to tell whether it can do in-place substition
+which can result in considerable speedup.
=item C<reganch>
const int tail = (rx->reganch & RE_INTUIT_TAIL);
SV * const csv = CALLREG_INTUIT_STRING(rx);
- len = rx->minlen;
+ len = rx->minlenret;
if (len == 1 && !(rx->reganch & ROPT_UTF8) && !tail) {
const char c = *SvPV_nolen_const(csv);
while (--limit) {
rx->subbeg = (char *) truebase;
rx->startp[0] = s - truebase;
if (RX_MATCH_UTF8(rx)) {
- char * const t = (char*)utf8_hop((U8*)s, rx->minlen);
+ char * const t = (char*)utf8_hop((U8*)s, rx->minlenret);
rx->endp[0] = t - truebase;
}
else {
- rx->endp[0] = s - truebase + rx->minlen;
+ rx->endp[0] = s - truebase + rx->minlenret;
}
rx->sublen = strend - truebase;
goto gotcha;
rx->sublen = strend - t;
RX_MATCH_COPIED_on(rx);
off = rx->startp[0] = s - t;
- rx->endp[0] = off + rx->minlen;
+ rx->endp[0] = off + rx->minlenret;
}
else { /* startp/endp are used by @- @+. */
rx->startp[0] = s - truebase;
- rx->endp[0] = s - truebase + rx->minlen;
+ rx->endp[0] = s - truebase + rx->minlenret;
}
rx->nparens = rx->lastparen = rx->lastcloseparen = 0; /* used by @-, @+, and $^N */
LEAVE_SCOPE(oldsave);
#ifdef PERL_OLD_COPY_ON_WRITE
&& !is_cow
#endif
- && (I32)clen <= rx->minlen && (once || !(r_flags & REXEC_COPY_STR))
+ && (I32)clen <= rx->minlenret && (once || !(r_flags & REXEC_COPY_STR))
&& !(rx->reganch & ROPT_LOOKBEHIND_SEEN)
&& (!doutf8 || SvUTF8(TARG))) {
if (!CALLREGEXEC(rx, s, strend, orig, 0, TARG, NULL,
__attribute__nonnull__(pTHX_6);
PERL_CALLCONV void Perl_pregfree(pTHX_ struct regexp* r);
+PERL_CALLCONV char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval)
+ __attribute__nonnull__(pTHX_1);
+
#if defined(USE_ITHREADS)
PERL_CALLCONV regexp* Perl_regdupe(pTHX_ const regexp* r, CLONE_PARAMS* param)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3);
-STATIC char * S_stringify_regexp(pTHX_ SV *sv, MAGIC *mg, STRLEN *lp)
- __attribute__nonnull__(pTHX_1)
- __attribute__nonnull__(pTHX_2);
-
STATIC char * S_F0convert(NV nv, char *endbuf, STRLEN *len)
__attribute__nonnull__(2)
__attribute__nonnull__(3);
* it happens that c_offset_min has been invalidated, since the
* earlier string may buy us something the later one won't.]
*/
- minlen = 0;
-
+
data.longest_fixed = newSVpvs("");
data.longest_float = newSVpvs("");
data.last_found = newSVpvs("");
} else /* XXXX Check for BOUND? */
stclass_flag = 0;
data.last_closep = &last_close;
-
+
minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
&data, -1, NULL, NULL,
SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
data.start_class = &ch_class;
data.last_closep = &last_close;
+
minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
&data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
-
+
CHECK_RESTUDY_GOTO;
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
/* Guard against an embedded (?=) or (?<=) with a longer minlen than
the "real" pattern. */
+ DEBUG_OPTIMISE_r({
+ PerlIO_printf(Perl_debug_log,"minlen: %d r->minlen:%d\n",
+ minlen, r->minlen);
+ });
+ r->minlenret = minlen;
if (r->minlen < minlen)
r->minlen = minlen;
ret->precomp = SAVEPVN(r->precomp, r->prelen);
ret->refcnt = r->refcnt;
ret->minlen = r->minlen;
+ ret->minlenret = r->minlenret;
ret->prelen = r->prelen;
ret->nparens = r->nparens;
ret->lastparen = r->lastparen;
}
#endif
+/*
+ reg_stringify()
+
+ converts a regexp embedded in a MAGIC struct to its stringified form,
+ caching the converted form in the struct and returns the cached
+ string.
+
+ If lp is nonnull then it is used to return the length of the
+ resulting string
+
+ If flags is nonnull and the returned string contains UTF8 then
+ (flags & 1) will be true.
+
+ If haseval is nonnull then it is used to return whether the pattern
+ contains evals.
+
+ Normally called via macro:
+
+ CALLREG_STRINGIFY(mg,0,0);
+
+ And internally with
+
+ CALLREG_AS_STR(mg,lp,flags,haseval)
+
+ See sv_2pv_flags() in sv.c for an example of internal usage.
+
+ */
+
+char *
+Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
+ dVAR;
+ const regexp * const re = (regexp *)mg->mg_obj;
+
+ if (!mg->mg_ptr) {
+ const char *fptr = "msix";
+ char reflags[6];
+ char ch;
+ int left = 0;
+ int right = 4;
+ bool need_newline = 0;
+ U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
+
+ while((ch = *fptr++)) {
+ if(reganch & 1) {
+ reflags[left++] = ch;
+ }
+ else {
+ reflags[right--] = ch;
+ }
+ reganch >>= 1;
+ }
+ if(left != 4) {
+ reflags[left] = '-';
+ left = 5;
+ }
+
+ mg->mg_len = re->prelen + 4 + left;
+ /*
+ * If /x was used, we have to worry about a regex ending with a
+ * comment later being embedded within another regex. If so, we don't
+ * want this regex's "commentization" to leak out to the right part of
+ * the enclosing regex, we must cap it with a newline.
+ *
+ * So, if /x was used, we scan backwards from the end of the regex. If
+ * we find a '#' before we find a newline, we need to add a newline
+ * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
+ * we don't need to add anything. -jfriedl
+ */
+ if (PMf_EXTENDED & re->reganch) {
+ const char *endptr = re->precomp + re->prelen;
+ while (endptr >= re->precomp) {
+ const char c = *(endptr--);
+ if (c == '\n')
+ break; /* don't need another */
+ if (c == '#') {
+ /* we end while in a comment, so we need a newline */
+ mg->mg_len++; /* save space for it */
+ need_newline = 1; /* note to add it */
+ break;
+ }
+ }
+ }
+
+ Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
+ mg->mg_ptr[0] = '(';
+ mg->mg_ptr[1] = '?';
+ Copy(reflags, mg->mg_ptr+2, left, char);
+ *(mg->mg_ptr+left+2) = ':';
+ Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
+ if (need_newline)
+ mg->mg_ptr[mg->mg_len - 2] = '\n';
+ mg->mg_ptr[mg->mg_len - 1] = ')';
+ mg->mg_ptr[mg->mg_len] = 0;
+ }
+ if (haseval)
+ *haseval = re->program[0].next_off;
+ if (flags)
+ *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
+
+ if (lp)
+ *lp = mg->mg_len;
+ return mg->mg_ptr;
+}
+
+
#ifndef PERL_IN_XSUB_RE
/*
- regnext - dig the "next" pointer out of a node
Perl_re_intuit_start,
Perl_re_intuit_string,
Perl_pregfree,
+ Perl_reg_stringify,
#if defined(USE_ITHREADS)
Perl_regdupe
#endif
U32 *offsets; /* offset annotations 20001228 MJD */
I32 sublen; /* Length of string pointed by subbeg */
I32 refcnt;
- I32 minlen; /* mininum possible length of $& */
+ I32 minlen; /* mininum possible length of string to match */
+ I32 minlenret; /* mininum possible length of $& */
I32 prelen; /* length of precomp */
U32 nparens; /* number of parentheses */
U32 lastparen; /* last paren matched */
struct re_scream_pos_data_s *data);
SV* (*checkstr) (pTHX_ regexp *prog);
void (*free) (pTHX_ struct regexp* r);
+ char* (*as_str) (pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval);
#ifdef USE_ITHREADS
regexp* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param);
#endif
return ptr;
}
-/* stringify_regexp(): private routine for use by sv_2pv_flags(): converts
- * a regexp to its stringified form.
- */
-
-static char *
-S_stringify_regexp(pTHX_ SV *sv, MAGIC *mg, STRLEN *lp) {
- dVAR;
- const regexp * const re = (regexp *)mg->mg_obj;
-
- if (!mg->mg_ptr) {
- const char *fptr = "msix";
- char reflags[6];
- char ch;
- int left = 0;
- int right = 4;
- bool need_newline = 0;
- U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
-
- while((ch = *fptr++)) {
- if(reganch & 1) {
- reflags[left++] = ch;
- }
- else {
- reflags[right--] = ch;
- }
- reganch >>= 1;
- }
- if(left != 4) {
- reflags[left] = '-';
- left = 5;
- }
-
- mg->mg_len = re->prelen + 4 + left;
- /*
- * If /x was used, we have to worry about a regex ending with a
- * comment later being embedded within another regex. If so, we don't
- * want this regex's "commentization" to leak out to the right part of
- * the enclosing regex, we must cap it with a newline.
- *
- * So, if /x was used, we scan backwards from the end of the regex. If
- * we find a '#' before we find a newline, we need to add a newline
- * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
- * we don't need to add anything. -jfriedl
- */
- if (PMf_EXTENDED & re->reganch) {
- const char *endptr = re->precomp + re->prelen;
- while (endptr >= re->precomp) {
- const char c = *(endptr--);
- if (c == '\n')
- break; /* don't need another */
- if (c == '#') {
- /* we end while in a comment, so we need a newline */
- mg->mg_len++; /* save space for it */
- need_newline = 1; /* note to add it */
- break;
- }
- }
- }
-
- Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
- mg->mg_ptr[0] = '(';
- mg->mg_ptr[1] = '?';
- Copy(reflags, mg->mg_ptr+2, left, char);
- *(mg->mg_ptr+left+2) = ':';
- Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
- if (need_newline)
- mg->mg_ptr[mg->mg_len - 2] = '\n';
- mg->mg_ptr[mg->mg_len - 1] = ')';
- mg->mg_ptr[mg->mg_len] = 0;
- }
- PL_reginterp_cnt += re->program[0].next_off;
-
- if (re->reganch & ROPT_UTF8)
- SvUTF8_on(sv);
- else
- SvUTF8_off(sv);
- if (lp)
- *lp = mg->mg_len;
- return mg->mg_ptr;
-}
-
/*
=for apidoc sv_2pv_flags
&& ((SvFLAGS(referent) &
(SVs_OBJECT|SVf_OK|SVs_GMG|SVs_SMG|SVs_RMG))
== (SVs_OBJECT|SVs_SMG))
- && (mg = mg_find(referent, PERL_MAGIC_qr))) {
- return stringify_regexp(sv, mg, lp);
+ && (mg = mg_find(referent, PERL_MAGIC_qr)))
+ {
+ char *str = NULL;
+ I32 haseval = 0;
+ I32 flags = 0;
+ (str) = CALLREG_AS_STR(mg,lp,&flags,&haseval);
+ if (flags & 1)
+ SvUTF8_on(sv);
+ else
+ SvUTF8_off(sv);
+ PL_reginterp_cnt += haseval;
+ return str;
} else {
const char *const typestr = sv_reftype(referent, 0);
const STRLEN typelen = strlen(typestr);
"Regexp /^(??{'(.)'x 100})/ crashes older perls")
or print "# Unexpected outcome: should pass or crash perl\n";
+{
+ $_="ns1ns1ns1";
+ s/ns(?=\d)/ns_/g;
+ iseq($_,"ns_1ns_1ns_1");
+ $_="ns1";
+ s/ns(?=\d)/ns_/;
+ iseq($_,"ns_1");
+ $_="123";
+ s/(?=\d+)|(?<=\d)/!Bang!/g;
+ iseq($_,"!Bang!1!Bang!2!Bang!3!Bang!");
+}
+
# Put new tests above the line, not here.
# Don't forget to update this!
-BEGIN{print "1..1344\n"};
+BEGIN{print "1..1347\n"};