X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2Fre%2Fre.pm;h=4f8d4105a8aca3b0f7f0a61a32401220498db5e9;hb=ded05c2a789e70bb7204e21b2aa98c6d1ac776c2;hp=b763fef7eb3debec5866889c45f2846d66afa593;hpb=894be9b73be0493c898492f5cfad130c681ee44d;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/re/re.pm b/ext/re/re.pm index b763fef..4f8d410 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -1,6 +1,173 @@ package re; -our $VERSION = 0.06_02; +# pragma for controlling the regex engine +use strict; +use warnings; + +our $VERSION = "0.08"; +our @ISA = qw(Exporter); +our @EXPORT_OK = qw(is_regexp regexp_pattern regmust + regname regnames + regnames_count regnames_iterinit regnames_iternext); +our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK; + +# *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** +# +# If you modify these values see comment below! + +my %bitmask = ( + taint => 0x00100000, # HINT_RE_TAINT + eval => 0x00200000, # HINT_RE_EVAL +); + +# - File::Basename contains a literal for 'taint' as a fallback. If +# taint is changed here, File::Basename must be updated as well. +# +# - ExtUtils::ParseXS uses a hardcoded +# BEGIN { $^H |= 0x00200000 } +# in it to allow re.xs to be built. So if 'eval' is changed here then +# ExtUtils::ParseXS must be changed as well. +# +# *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** + +sub setcolor { + eval { # Ignore errors + require Term::Cap; + + my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning. + my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue'; + my @props = split /,/, $props; + my $colors = join "\t", map {$terminal->Tputs($_,1)} @props; + + $colors =~ s/\0//g; + $ENV{PERL_RE_COLORS} = $colors; + }; + if ($@) { + $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t'; + } + +} + +my %flags = ( + COMPILE => 0x0000FF, + PARSE => 0x000001, + OPTIMISE => 0x000002, + TRIEC => 0x000004, + DUMP => 0x000008, + + EXECUTE => 0x00FF00, + INTUIT => 0x000100, + MATCH => 0x000200, + TRIEE => 0x000400, + + EXTRA => 0xFF0000, + TRIEM => 0x010000, + OFFSETS => 0x020000, + OFFSETSDBG => 0x040000, + STATE => 0x080000, + OPTIMISEM => 0x100000, + STACK => 0x280000, +); +$flags{ALL} = -1; +$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; +$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE}; +$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE}; +$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE}; +$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC}; + +my $installed; +my $installed_error; + +sub _do_install { + if ( ! defined($installed) ) { + require XSLoader; + $installed = eval { XSLoader::load('re', $VERSION) } || 0; + $installed_error = $@; + } +} + +sub _load_unload { + my ($on)= @_; + if ($on) { + _do_install(); + if ( ! $installed ) { + die "'re' not installed!? ($installed_error)"; + } else { + # We call install() every time, as if we didn't, we wouldn't + # "see" any changes to the color environment var since + # the last time it was called. + + # install() returns an integer, which if casted properly + # in C resolves to a structure containing the regex + # hooks. Setting it to a random integer will guarantee + # segfaults. + $^H{regcomp} = install(); + } + } else { + delete $^H{regcomp}; + } +} + +sub bits { + my $on = shift; + my $bits = 0; + unless (@_) { + require Carp; + Carp::carp("Useless use of \"re\" pragma"); + } + foreach my $idx (0..$#_){ + my $s=$_[$idx]; + if ($s eq 'Debug' or $s eq 'Debugcolor') { + setcolor() if $s =~/color/i; + ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS}; + for my $idx ($idx+1..$#_) { + if ($flags{$_[$idx]}) { + if ($on) { + ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]}; + } else { + ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]}; + } + } else { + require Carp; + Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ", + join(", ",sort keys %flags ) ); + } + } + _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS}); + last; + } elsif ($s eq 'debug' or $s eq 'debugcolor') { + setcolor() if $s =~/color/i; + _load_unload($on); + last; + } elsif (exists $bitmask{$s}) { + $bits |= $bitmask{$s}; + } elsif ($EXPORT_OK{$s}) { + _do_install(); + require Exporter; + re->export_to_level(2, 're', $s); + } else { + require Carp; + Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ", + join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask), + ")"); + } + } + $bits; +} + +sub import { + shift; + $^H |= bits(1, @_); +} + +sub unimport { + shift; + $^H &= ~ bits(0, @_); +} + +1; + +__END__ =head1 NAME @@ -23,27 +190,38 @@ re - Perl pragma to alter regular expression behaviour /foo${pat}bar/; # disallowed (with or without -T switch) } - use re 'debug'; # NOT lexically scoped (as others are) - /^(.*)$/s; # output debugging info during - # compile and run time + use re 'debug'; # output debugging info during + /^(.*)$/s; # compile and run time + use re 'debugcolor'; # same as 'debug', but with colored output ... use re qw(Debug All); # Finer tuned debugging options. - use re qw(Debug More); # Similarly not lexically scoped. - no re qw(Debug ALL); # Turn of all re dugging and unload the module. + use re qw(Debug More); + no re qw(Debug ALL); # Turn of all re debugging in this scope + + use re qw(is_regexp regexp_pattern); # import utility functions + my ($pat,$mods)=regexp_pattern(qr/foo/i); + if (is_regexp($obj)) { + print "Got regexp: ", + scalar regexp_pattern($obj); # just as perl would stringify it + } # but no hassle with blessed re's. (We use $^X in these examples because it's tainted by default.) =head1 DESCRIPTION +=head2 'taint' mode + When C is in effect, and a tainted string is the target of a regex, the regex memories (or values returned by the m// operator in list context) are tainted. This feature is useful when regex operations on tainted data aren't meant to extract safe substrings, but to perform other transformations. +=head2 'eval' mode + When C is in effect, a regex is allowed to contain C<(?{ ... })> zero-width assertions even if regular expression contains variable interpolation. That is normally disallowed, since it is a @@ -60,6 +238,8 @@ interpolation. Thus: I allowed if $pat is a precompiled regular expression, even if $pat contains C<(?{ ... })> assertions. +=head2 'debug' mode + When C is in effect, perl emits debugging messages when compiling and using regular expressions. The output is the same as that obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the @@ -71,6 +251,14 @@ comma-separated list of C properties to use for highlighting strings on/off, pre-point part on/off. See L for additional info. +As of 5.9.5 the directive C and its equivalents are +lexically scoped, as the other directives are. However they have both +compile-time and run-time effects. + +See L. + +=head2 'Debug' mode + Similarly C produces debugging output, the difference being that it allows the fine tuning of what debugging output will be emitted. Options are divided into three groups, those related to @@ -95,7 +283,7 @@ Turns on debug output related to the process of parsing the pattern. Enables output related to the optimisation phase of compilation. -=item TRIE_COMPILE +=item TRIEC Detailed info about trie compilation. @@ -103,17 +291,6 @@ Detailed info about trie compilation. Dump the final program out after it is compiled and optimised. -=item OFFSETS - -Dump offset information. This can be used to see how regops correlate -to the pattern. Output format is - - NODENUM:POSITION[LENGTH] - -Where 1 is the position of the first char in the string. Note that position -can be 0, or larger than the actual length of the pattern, likewise length -can be zero. - =back =item Execute related options @@ -128,7 +305,7 @@ Turns on all execute related debug options. Turns on debugging of the main matching loop. -=item TRIE_EXECUTE +=item TRIEE Extra debugging of how tries execute. @@ -146,18 +323,44 @@ Enable debugging of start point optimisations. Turns on all "extra" debugging options. -=item TRIE_MORE +=item TRIEM + +Enable enhanced TRIE debugging. Enhances both TRIEE +and TRIEC. + +=item STATE -Enable enhanced TRIE debugging. Enhances both TRIE_EXECUTE -and TRIE_COMPILE. +Enable debugging of states in the engine. -=item OFFSETS_DEBUG +=item STACK + +Enable debugging of the recursion stack in the engine. Enabling +or disabling this option automatically does the same for debugging +states as well. This output from this can be quite large. + +=item OPTIMISEM + +Enable enhanced optimisation debugging and start point optimisations. +Probably not useful except when debugging the regex engine itself. + +=item OFFSETS + +Dump offset information. This can be used to see how regops correlate +to the pattern. Output format is + + NODENUM:POSITION[LENGTH] + +Where 1 is the position of the first char in the string. Note that position +can be 0, or larger than the actual length of the pattern, likewise length +can be zero. + +=item OFFSETSDBG Enable debugging of offsets information. This emits copious -amounts of trace information and doesnt mesh well with other +amounts of trace information and doesn't mesh well with other debug options. -Almost definately only useful to people hacking +Almost definitely only useful to people hacking on the offsets part of the debug engine. =back @@ -174,7 +377,7 @@ Enable all compile and execute options at once. =item All -Enable DUMP and all execute options. Equivelent to: +Enable DUMP and all execute options. Equivalent to: use re 'debug'; @@ -182,147 +385,132 @@ Enable DUMP and all execute options. Equivelent to: =item More -Enable TRIE_MORE and all execute compile and execute options. +Enable TRIEM and all execute compile and execute options. -=back 4 +=back -=back 4 +=back -The directive C and its equivalents are I lexically -scoped, as the other directives are. They have both compile-time and run-time -effects. +As of 5.9.5 the directive C and its equivalents are +lexically scoped, as the other directives are. However they have both +compile-time and run-time effects. -See L. +=head2 Exportable Functions -=cut +As of perl 5.9.5 're' debug contains a number of utility functions that +may be optionally exported into the caller's namespace. They are listed +below. -# N.B. File::Basename contains a literal for 'taint' as a fallback. If -# taint is changed here, File::Basename must be updated as well. -my %bitmask = ( -taint => 0x00100000, # HINT_RE_TAINT -eval => 0x00200000, # HINT_RE_EVAL -); +=over 4 -sub setcolor { - eval { # Ignore errors - require Term::Cap; +=item is_regexp($ref) - my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning. - my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue'; - my @props = split /,/, $props; - my $colors = join "\t", map {$terminal->Tputs($_,1)} @props; +Returns true if the argument is a compiled regular expression as returned +by C, false if it is not. - $colors =~ s/\0//g; - $ENV{PERL_RE_COLORS} = $colors; - }; - if ($@) { - $ENV{PERL_RE_COLORS}||=qq'\t\t> <\t> <\t\t' - } +This function will not be confused by overloading or blessing. In +internals terms, this extracts the regexp pointer out of the +PERL_MAGIC_qr structure so it it cannot be fooled. -} +=item regexp_pattern($ref) -my %flags = ( - COMPILE => 0x0000FF, - PARSE => 0x000001, - OPTIMISE => 0x000002, - TRIEC => 0x000004, - DUMP => 0x000008, +If the argument is a compiled regular expression as returned by C, +then this function returns the pattern. - EXECUTE => 0x00FF00, - INTUIT => 0x000100, - MATCH => 0x000200, - TRIEE => 0x000400, +In list context it returns a two element list, the first element +containing the pattern and the second containing the modifiers used when +the pattern was compiled. - EXTRA => 0xFF0000, - TRIEM => 0x010000, - OFFSETS => 0x020000, - OFFSETSDBG => 0x040000, - STATE => 0x080000, - OPTIMISEM => 0x100000, -); -$flags{ALL} = -1; -$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; -$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE}; -$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE}; -$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE}; -$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC}; + my ($pat, $mods) = regexp_pattern($ref); -my $installed; +In scalar context it returns the same as perl would when strigifying a raw +C with the same pattern inside. If the argument is not a compiled +reference then this routine returns false but defined in scalar context, +and the empty list in list context. Thus the following -sub _load_unload { - my ($on)= @_; - if ($on) { - if ( ! defined($installed) ) { - require XSLoader; - XSLoader::load('re'); - $installed = install() || 0; - } - if ( ! $installed ) { - die "'re' not installed!?"; - } else { - # We could just say = $installed; but then we wouldn't - # "see" any changes to the color environment var. - - # install() returns an integer, which if casted properly - # in C resolves to a structure containing the regex - # hooks. Setting it to a random integer will guarantee - # segfaults. - $^H{regcomp} = install(); - } - } else { - delete $^H{regcomp}; - } -} + if (regexp_pattern($ref) eq '(?i-xsm:foo)') -sub bits { - my $on = shift; - my $bits = 0; - unless (@_) { - return; - } - foreach my $idx (0..$#_){ - my $s=$_[$idx]; - if ($s eq 'Debug' or $s eq 'Debugcolor') { - setcolor() if $s =~/color/i; - ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS}; - for my $idx ($idx+1..$#_) { - if ($flags{$_[$idx]}) { - if ($on) { - ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]}; - } else { - ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]}; - } - } else { - require Carp; - Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ", - join(", ",sort { $flags{$a} <=> $flags{$b} } keys %flags ) ); - } - } - _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS}); - last; - } elsif ($s eq 'debug' or $s eq 'debugcolor') { - setcolor() if $s =~/color/i; - _load_unload($on); - } elsif (exists $bitmask{$s}) { - $bits |= $bitmask{$s}; - } else { - require Carp; - Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ", - join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask), - ")"); - } - } - $bits; -} +will be warning free regardless of what $ref actually is. -sub import { - shift; - $^H |= bits(1, @_); -} +Like C this function will not be confused by overloading +or blessing of the object. -sub unimport { - shift; - $^H &= ~ bits(0, @_); -} +=item regmust($ref) -1; +If the argument is a compiled regular expression as returned by C, +then this function returns what the optimiser consiers to be the longest +anchored fixed string and longest floating fixed string in the pattern. + +A I is defined as being a substring that must appear for the +pattern to match. An I is a fixed string that must +appear at a particular offset from the beginning of the match. A I is defined as a fixed string that can appear at any point in +a range of positions relative to the start of the match. For example, + + my $qr = qr/here .* there/x; + my ($anchored, $floating) = regmust($qr); + print "anchored:'$anchored'\nfloating:'$floating'\n"; + +results in + + anchored:'here' + floating:'there' + +Because the C is before the C<.*> in the pattern, its position +can be determined exactly. That's not true, however, for the C; +it could appear at any point after where the anchored string appeared. +Perl uses both for its optimisations, prefering the longer, or, if they are +equal, the floating. + +B This may not necessarily be the definitive longest anchored and +floating string. This will be what the optimiser of the Perl that you +are using thinks is the longest. If you believe that the result is wrong +please report it via the L utility. + +=item regname($name,$qr,$all) + +Returns the contents of a named buffer. If $qr is missing, or is not the +result of a qr// then returns the result of the last successful match. If +$all is true then returns an array ref containing one entry per buffer, +otherwise returns the first defined buffer. + +=item regnames($qr,$all) + +Returns a list of all of the named buffers defined in a pattern. If +$all is true then it returns all names defined, if not returns only +names which were involved in the last successful match. If $qr is omitted +or is not the result of a qr// then returns the details for the last +successful match. + +=item regnames_iterinit($qr) + +Initializes the internal hash iterator associated to a regexps named capture +buffers. If $qr is omitted resets the iterator associated with the regexp used +in the last successful match. + +=item regnames_iternext($qr,$all) + +Gets the next key from the hash associated with a regexp. If $qr +is omitted resets the iterator associated with the regexp used in the +last successful match. If $all is true returns the keys of all of the +distinct named buffers in the pattern, if not returns only those names +used in the last successful match. + +=item regnames_count($qr) + +Returns the number of distinct names defined in the regexp $qr. If +$qr is omitted or not a regexp returns the count of names in the +last successful match. + +B that this result is always the actual number of distinct +named buffers defined, it may not actually match that which is +returned by C and related routines when those routines +have not been called with the $all parameter set.. + +=back + +=head1 SEE ALSO + +L. + +=cut