Commit | Line | Data |
b3eb6a9b |
1 | package re; |
2 | |
de8c5301 |
3 | # pragma for controlling the regex engine |
4 | use strict; |
5 | use warnings; |
6 | |
192c1e27 |
7 | our $VERSION = "0.09"; |
de8c5301 |
8 | our @ISA = qw(Exporter); |
192c1e27 |
9 | my @XS_FUNCTIONS = qw(regmust); |
10 | my %XS_FUNCTIONS = map { $_ => 1 } @XS_FUNCTIONS; |
11 | our @EXPORT_OK = (@XS_FUNCTIONS, |
12 | qw(is_regexp regexp_pattern |
13 | regname regnames regnames_count)); |
de8c5301 |
14 | our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK; |
15 | |
16 | # *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** |
17 | # |
18 | # If you modify these values see comment below! |
19 | |
20 | my %bitmask = ( |
21 | taint => 0x00100000, # HINT_RE_TAINT |
22 | eval => 0x00200000, # HINT_RE_EVAL |
23 | ); |
24 | |
25 | # - File::Basename contains a literal for 'taint' as a fallback. If |
26 | # taint is changed here, File::Basename must be updated as well. |
27 | # |
28 | # - ExtUtils::ParseXS uses a hardcoded |
29 | # BEGIN { $^H |= 0x00200000 } |
30 | # in it to allow re.xs to be built. So if 'eval' is changed here then |
31 | # ExtUtils::ParseXS must be changed as well. |
32 | # |
33 | # *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** |
34 | |
35 | sub setcolor { |
36 | eval { # Ignore errors |
37 | require Term::Cap; |
38 | |
39 | my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning. |
40 | my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue'; |
41 | my @props = split /,/, $props; |
42 | my $colors = join "\t", map {$terminal->Tputs($_,1)} @props; |
43 | |
44 | $colors =~ s/\0//g; |
45 | $ENV{PERL_RE_COLORS} = $colors; |
46 | }; |
47 | if ($@) { |
48 | $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t'; |
49 | } |
50 | |
51 | } |
52 | |
53 | my %flags = ( |
54 | COMPILE => 0x0000FF, |
55 | PARSE => 0x000001, |
56 | OPTIMISE => 0x000002, |
57 | TRIEC => 0x000004, |
58 | DUMP => 0x000008, |
f7819f85 |
59 | FLAGS => 0x000010, |
de8c5301 |
60 | |
61 | EXECUTE => 0x00FF00, |
62 | INTUIT => 0x000100, |
63 | MATCH => 0x000200, |
64 | TRIEE => 0x000400, |
65 | |
66 | EXTRA => 0xFF0000, |
67 | TRIEM => 0x010000, |
68 | OFFSETS => 0x020000, |
69 | OFFSETSDBG => 0x040000, |
70 | STATE => 0x080000, |
71 | OPTIMISEM => 0x100000, |
72 | STACK => 0x280000, |
e7707071 |
73 | BUFFERS => 0x400000, |
2c296965 |
74 | GPOS => 0x800000, |
de8c5301 |
75 | ); |
e7707071 |
76 | $flags{ALL} = -1 & ~($flags{OFFSETS}|$flags{OFFSETSDBG}|$flags{BUFFERS}); |
de8c5301 |
77 | $flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; |
2c296965 |
78 | $flags{Extra} = $flags{EXECUTE} | $flags{COMPILE} | $flags{GPOS}; |
de8c5301 |
79 | $flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE}; |
80 | $flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE}; |
81 | $flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC}; |
82 | |
83 | my $installed; |
84 | my $installed_error; |
85 | |
86 | sub _do_install { |
87 | if ( ! defined($installed) ) { |
88 | require XSLoader; |
89 | $installed = eval { XSLoader::load('re', $VERSION) } || 0; |
90 | $installed_error = $@; |
91 | } |
92 | } |
93 | |
94 | sub _load_unload { |
95 | my ($on)= @_; |
96 | if ($on) { |
97 | _do_install(); |
98 | if ( ! $installed ) { |
99 | die "'re' not installed!? ($installed_error)"; |
100 | } else { |
101 | # We call install() every time, as if we didn't, we wouldn't |
102 | # "see" any changes to the color environment var since |
103 | # the last time it was called. |
104 | |
105 | # install() returns an integer, which if casted properly |
106 | # in C resolves to a structure containing the regex |
107 | # hooks. Setting it to a random integer will guarantee |
108 | # segfaults. |
109 | $^H{regcomp} = install(); |
110 | } |
111 | } else { |
112 | delete $^H{regcomp}; |
113 | } |
114 | } |
115 | |
116 | sub bits { |
117 | my $on = shift; |
118 | my $bits = 0; |
119 | unless (@_) { |
120 | require Carp; |
121 | Carp::carp("Useless use of \"re\" pragma"); |
122 | } |
123 | foreach my $idx (0..$#_){ |
124 | my $s=$_[$idx]; |
125 | if ($s eq 'Debug' or $s eq 'Debugcolor') { |
126 | setcolor() if $s =~/color/i; |
127 | ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS}; |
128 | for my $idx ($idx+1..$#_) { |
129 | if ($flags{$_[$idx]}) { |
130 | if ($on) { |
131 | ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]}; |
132 | } else { |
133 | ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]}; |
134 | } |
135 | } else { |
136 | require Carp; |
137 | Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ", |
138 | join(", ",sort keys %flags ) ); |
139 | } |
140 | } |
141 | _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS}); |
142 | last; |
143 | } elsif ($s eq 'debug' or $s eq 'debugcolor') { |
144 | setcolor() if $s =~/color/i; |
145 | _load_unload($on); |
66e6b4c5 |
146 | last; |
de8c5301 |
147 | } elsif (exists $bitmask{$s}) { |
148 | $bits |= $bitmask{$s}; |
192c1e27 |
149 | } elsif ($XS_FUNCTIONS{$s}) { |
150 | _do_install(); |
151 | if (! $installed) { |
152 | require Carp; |
153 | Carp::croak("\"re\" function '$s' not available"); |
154 | } |
155 | require Exporter; |
156 | re->export_to_level(2, 're', $s); |
de8c5301 |
157 | } elsif ($EXPORT_OK{$s}) { |
de8c5301 |
158 | require Exporter; |
159 | re->export_to_level(2, 're', $s); |
160 | } else { |
161 | require Carp; |
162 | Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ", |
163 | join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask), |
164 | ")"); |
165 | } |
166 | } |
167 | $bits; |
168 | } |
169 | |
170 | sub import { |
171 | shift; |
172 | $^H |= bits(1, @_); |
173 | } |
174 | |
175 | sub unimport { |
176 | shift; |
177 | $^H &= ~ bits(0, @_); |
178 | } |
179 | |
180 | 1; |
181 | |
182 | __END__ |
56953603 |
183 | |
b3eb6a9b |
184 | =head1 NAME |
185 | |
186 | re - Perl pragma to alter regular expression behaviour |
187 | |
188 | =head1 SYNOPSIS |
189 | |
e4d48cc9 |
190 | use re 'taint'; |
191 | ($x) = ($^X =~ /^(.*)$/s); # $x is tainted here |
b3eb6a9b |
192 | |
2cd61cdb |
193 | $pat = '(?{ $foo = 1 })'; |
e4d48cc9 |
194 | use re 'eval'; |
2cd61cdb |
195 | /foo${pat}bar/; # won't fail (when not under -T switch) |
e4d48cc9 |
196 | |
197 | { |
198 | no re 'taint'; # the default |
199 | ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here |
200 | |
201 | no re 'eval'; # the default |
2cd61cdb |
202 | /foo${pat}bar/; # disallowed (with or without -T switch) |
e4d48cc9 |
203 | } |
b3eb6a9b |
204 | |
1e2e3d02 |
205 | use re 'debug'; # output debugging info during |
206 | /^(.*)$/s; # compile and run time |
207 | |
2cd61cdb |
208 | |
02ea72ae |
209 | use re 'debugcolor'; # same as 'debug', but with colored output |
210 | ... |
211 | |
a3621e74 |
212 | use re qw(Debug All); # Finer tuned debugging options. |
4ee9a43f |
213 | use re qw(Debug More); |
fe759410 |
214 | no re qw(Debug ALL); # Turn of all re debugging in this scope |
4ee9a43f |
215 | |
de8c5301 |
216 | use re qw(is_regexp regexp_pattern); # import utility functions |
217 | my ($pat,$mods)=regexp_pattern(qr/foo/i); |
218 | if (is_regexp($obj)) { |
219 | print "Got regexp: ", |
220 | scalar regexp_pattern($obj); # just as perl would stringify it |
221 | } # but no hassle with blessed re's. |
a3621e74 |
222 | |
3ffabb8c |
223 | (We use $^X in these examples because it's tainted by default.) |
224 | |
b3eb6a9b |
225 | =head1 DESCRIPTION |
226 | |
de8c5301 |
227 | =head2 'taint' mode |
228 | |
b3eb6a9b |
229 | When C<use re 'taint'> is in effect, and a tainted string is the target |
230 | of a regex, the regex memories (or values returned by the m// operator |
e4d48cc9 |
231 | in list context) are tainted. This feature is useful when regex operations |
232 | on tainted data aren't meant to extract safe substrings, but to perform |
233 | other transformations. |
b3eb6a9b |
234 | |
de8c5301 |
235 | =head2 'eval' mode |
236 | |
e4d48cc9 |
237 | When C<use re 'eval'> is in effect, a regex is allowed to contain |
2cd61cdb |
238 | C<(?{ ... })> zero-width assertions even if regular expression contains |
ffbc6a93 |
239 | variable interpolation. That is normally disallowed, since it is a |
2cd61cdb |
240 | potential security risk. Note that this pragma is ignored when the regular |
241 | expression is obtained from tainted data, i.e. evaluation is always |
3c4b39be |
242 | disallowed with tainted regular expressions. See L<perlre/(?{ code })>. |
2cd61cdb |
243 | |
ffbc6a93 |
244 | For the purpose of this pragma, interpolation of precompiled regular |
0a92e3a8 |
245 | expressions (i.e., the result of C<qr//>) is I<not> considered variable |
246 | interpolation. Thus: |
2cd61cdb |
247 | |
248 | /foo${pat}bar/ |
249 | |
ffbc6a93 |
250 | I<is> allowed if $pat is a precompiled regular expression, even |
2cd61cdb |
251 | if $pat contains C<(?{ ... })> assertions. |
252 | |
de8c5301 |
253 | =head2 'debug' mode |
254 | |
ffbc6a93 |
255 | When C<use re 'debug'> is in effect, perl emits debugging messages when |
2cd61cdb |
256 | compiling and using regular expressions. The output is the same as that |
257 | obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the |
258 | B<-Dr> switch. It may be quite voluminous depending on the complexity |
02ea72ae |
259 | of the match. Using C<debugcolor> instead of C<debug> enables a |
260 | form of output that can be used to get a colorful display on terminals |
261 | that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a |
262 | comma-separated list of C<termcap> properties to use for highlighting |
ffbc6a93 |
263 | strings on/off, pre-point part on/off. |
2cd61cdb |
264 | See L<perldebug/"Debugging regular expressions"> for additional info. |
265 | |
de8c5301 |
266 | As of 5.9.5 the directive C<use re 'debug'> and its equivalents are |
267 | lexically scoped, as the other directives are. However they have both |
268 | compile-time and run-time effects. |
269 | |
270 | See L<perlmodlib/Pragmatic Modules>. |
271 | |
272 | =head2 'Debug' mode |
273 | |
a3621e74 |
274 | Similarly C<use re 'Debug'> produces debugging output, the difference |
275 | being that it allows the fine tuning of what debugging output will be |
be8e71aa |
276 | emitted. Options are divided into three groups, those related to |
277 | compilation, those related to execution and those related to special |
278 | purposes. The options are as follows: |
279 | |
280 | =over 4 |
281 | |
282 | =item Compile related options |
283 | |
284 | =over 4 |
285 | |
286 | =item COMPILE |
287 | |
288 | Turns on all compile related debug options. |
289 | |
290 | =item PARSE |
291 | |
292 | Turns on debug output related to the process of parsing the pattern. |
293 | |
294 | =item OPTIMISE |
295 | |
296 | Enables output related to the optimisation phase of compilation. |
297 | |
24b23f37 |
298 | =item TRIEC |
be8e71aa |
299 | |
300 | Detailed info about trie compilation. |
301 | |
302 | =item DUMP |
303 | |
304 | Dump the final program out after it is compiled and optimised. |
305 | |
be8e71aa |
306 | =back |
307 | |
308 | =item Execute related options |
309 | |
310 | =over 4 |
311 | |
312 | =item EXECUTE |
313 | |
314 | Turns on all execute related debug options. |
315 | |
316 | =item MATCH |
317 | |
318 | Turns on debugging of the main matching loop. |
319 | |
24b23f37 |
320 | =item TRIEE |
be8e71aa |
321 | |
322 | Extra debugging of how tries execute. |
323 | |
324 | =item INTUIT |
325 | |
326 | Enable debugging of start point optimisations. |
327 | |
328 | =back |
329 | |
330 | =item Extra debugging options |
331 | |
332 | =over 4 |
333 | |
334 | =item EXTRA |
335 | |
336 | Turns on all "extra" debugging options. |
337 | |
e7707071 |
338 | =item BUFFERS |
339 | |
340 | Enable debugging the capture buffer storage during match. Warning, |
341 | this can potentially produce extremely large output. |
342 | |
24b23f37 |
343 | =item TRIEM |
344 | |
345 | Enable enhanced TRIE debugging. Enhances both TRIEE |
346 | and TRIEC. |
347 | |
348 | =item STATE |
349 | |
4ee9a43f |
350 | Enable debugging of states in the engine. |
24b23f37 |
351 | |
352 | =item STACK |
be8e71aa |
353 | |
24b23f37 |
354 | Enable debugging of the recursion stack in the engine. Enabling |
355 | or disabling this option automatically does the same for debugging |
356 | states as well. This output from this can be quite large. |
357 | |
358 | =item OPTIMISEM |
359 | |
360 | Enable enhanced optimisation debugging and start point optimisations. |
361 | Probably not useful except when debugging the regex engine itself. |
362 | |
363 | =item OFFSETS |
364 | |
365 | Dump offset information. This can be used to see how regops correlate |
366 | to the pattern. Output format is |
367 | |
368 | NODENUM:POSITION[LENGTH] |
369 | |
370 | Where 1 is the position of the first char in the string. Note that position |
371 | can be 0, or larger than the actual length of the pattern, likewise length |
372 | can be zero. |
be8e71aa |
373 | |
24b23f37 |
374 | =item OFFSETSDBG |
be8e71aa |
375 | |
376 | Enable debugging of offsets information. This emits copious |
fe759410 |
377 | amounts of trace information and doesn't mesh well with other |
be8e71aa |
378 | debug options. |
379 | |
fe759410 |
380 | Almost definitely only useful to people hacking |
be8e71aa |
381 | on the offsets part of the debug engine. |
382 | |
383 | =back |
384 | |
385 | =item Other useful flags |
386 | |
387 | These are useful shortcuts to save on the typing. |
388 | |
389 | =over 4 |
390 | |
391 | =item ALL |
392 | |
e7707071 |
393 | Enable all options at once except OFFSETS, OFFSETSDBG and BUFFERS |
be8e71aa |
394 | |
395 | =item All |
396 | |
fe759410 |
397 | Enable DUMP and all execute options. Equivalent to: |
be8e71aa |
398 | |
399 | use re 'debug'; |
400 | |
401 | =item MORE |
402 | |
403 | =item More |
404 | |
24b23f37 |
405 | Enable TRIEM and all execute compile and execute options. |
be8e71aa |
406 | |
dba3f186 |
407 | =back |
be8e71aa |
408 | |
dba3f186 |
409 | =back |
a3621e74 |
410 | |
1e2e3d02 |
411 | As of 5.9.5 the directive C<use re 'debug'> and its equivalents are |
4ee9a43f |
412 | lexically scoped, as the other directives are. However they have both |
1e2e3d02 |
413 | compile-time and run-time effects. |
b3eb6a9b |
414 | |
de8c5301 |
415 | =head2 Exportable Functions |
b3eb6a9b |
416 | |
de8c5301 |
417 | As of perl 5.9.5 're' debug contains a number of utility functions that |
4ee9a43f |
418 | may be optionally exported into the caller's namespace. They are listed |
de8c5301 |
419 | below. |
b3eb6a9b |
420 | |
de8c5301 |
421 | =over 4 |
b3eb6a9b |
422 | |
de8c5301 |
423 | =item is_regexp($ref) |
02ea72ae |
424 | |
de8c5301 |
425 | Returns true if the argument is a compiled regular expression as returned |
4ee9a43f |
426 | by C<qr//>, false if it is not. |
02ea72ae |
427 | |
4ee9a43f |
428 | This function will not be confused by overloading or blessing. In |
429 | internals terms, this extracts the regexp pointer out of the |
de8c5301 |
430 | PERL_MAGIC_qr structure so it it cannot be fooled. |
894be9b7 |
431 | |
de8c5301 |
432 | =item regexp_pattern($ref) |
02ea72ae |
433 | |
4ee9a43f |
434 | If the argument is a compiled regular expression as returned by C<qr//>, |
435 | then this function returns the pattern. |
be8e71aa |
436 | |
4ee9a43f |
437 | In list context it returns a two element list, the first element |
438 | containing the pattern and the second containing the modifiers used when |
439 | the pattern was compiled. |
be8e71aa |
440 | |
4ee9a43f |
441 | my ($pat, $mods) = regexp_pattern($ref); |
a3621e74 |
442 | |
4ee9a43f |
443 | In scalar context it returns the same as perl would when strigifying a raw |
444 | C<qr//> with the same pattern inside. If the argument is not a compiled |
445 | reference then this routine returns false but defined in scalar context, |
446 | and the empty list in list context. Thus the following |
f9f4320a |
447 | |
de8c5301 |
448 | if (regexp_pattern($ref) eq '(?i-xsm:foo)') |
dba3f186 |
449 | |
de8c5301 |
450 | will be warning free regardless of what $ref actually is. |
380e0b81 |
451 | |
4ee9a43f |
452 | Like C<is_regexp> this function will not be confused by overloading |
453 | or blessing of the object. |
b3eb6a9b |
454 | |
256ddcd0 |
455 | =item regmust($ref) |
456 | |
432acd5f |
457 | If the argument is a compiled regular expression as returned by C<qr//>, |
458 | then this function returns what the optimiser consiers to be the longest |
459 | anchored fixed string and longest floating fixed string in the pattern. |
460 | |
461 | A I<fixed string> is defined as being a substring that must appear for the |
462 | pattern to match. An I<anchored fixed string> is a fixed string that must |
463 | appear at a particular offset from the beginning of the match. A I<floating |
464 | fixed string> is defined as a fixed string that can appear at any point in |
465 | a range of positions relative to the start of the match. For example, |
466 | |
467 | my $qr = qr/here .* there/x; |
468 | my ($anchored, $floating) = regmust($qr); |
256ddcd0 |
469 | print "anchored:'$anchored'\nfloating:'$floating'\n"; |
432acd5f |
470 | |
256ddcd0 |
471 | results in |
472 | |
473 | anchored:'here' |
474 | floating:'there' |
475 | |
432acd5f |
476 | Because the C<here> is before the C<.*> in the pattern, its position |
477 | can be determined exactly. That's not true, however, for the C<there>; |
478 | it could appear at any point after where the anchored string appeared. |
256ddcd0 |
479 | Perl uses both for its optimisations, prefering the longer, or, if they are |
480 | equal, the floating. |
481 | |
482 | B<NOTE:> This may not necessarily be the definitive longest anchored and |
432acd5f |
483 | floating string. This will be what the optimiser of the Perl that you |
256ddcd0 |
484 | are using thinks is the longest. If you believe that the result is wrong |
485 | please report it via the L<perlbug> utility. |
486 | |
28d8d7f4 |
487 | =item regname($name,$all) |
44a2ac75 |
488 | |
28d8d7f4 |
489 | Returns the contents of a named buffer of the last successful match. If |
490 | $all is true, then returns an array ref containing one entry per buffer, |
44a2ac75 |
491 | otherwise returns the first defined buffer. |
492 | |
28d8d7f4 |
493 | =item regnames($all) |
44a2ac75 |
494 | |
28d8d7f4 |
495 | Returns a list of all of the named buffers defined in the last successful |
496 | match. If $all is true, then it returns all names defined, if not it returns |
497 | only names which were involved in the match. |
44a2ac75 |
498 | |
28d8d7f4 |
499 | =item regnames_count() |
44a2ac75 |
500 | |
28d8d7f4 |
501 | Returns the number of distinct names defined in the pattern used |
502 | for the last successful match. |
44a2ac75 |
503 | |
28d8d7f4 |
504 | B<Note:> this result is always the actual number of distinct |
505 | named buffers defined, it may not actually match that which is |
506 | returned by C<regnames()> and related routines when those routines |
507 | have not been called with the $all parameter set. |
44a2ac75 |
508 | |
de8c5301 |
509 | =back |
b3eb6a9b |
510 | |
de8c5301 |
511 | =head1 SEE ALSO |
b3eb6a9b |
512 | |
de8c5301 |
513 | L<perlmodlib/Pragmatic Modules>. |
514 | |
515 | =cut |