Commit | Line | Data |
6a28abbc |
1 | #!/usr/bin/perl |
2 | |
3 | # Copyright (C) 2005, Larry Wall |
4 | # This software may be copied under the same terms as Perl. |
5 | |
6 | package P5re; |
7 | |
8 | use strict; |
9 | use warnings; |
10 | |
11 | our @EXPORT_OK = qw(re re2xml qr2xml); |
12 | |
13 | my $indent = 0; |
14 | my $in = ""; |
15 | my $delim = 1; |
16 | my $debug = 0; |
17 | my $maxbrack; |
18 | |
19 | our $extended; |
20 | our $insensitive; |
21 | our $singleline; |
22 | our $multiline; |
23 | |
24 | my %xmlish = ( |
25 | chr(0x00) => "STUPIDXML(#x00)", |
26 | chr(0x01) => "STUPIDXML(#x01)", |
27 | chr(0x02) => "STUPIDXML(#x02)", |
28 | chr(0x03) => "STUPIDXML(#x03)", |
29 | chr(0x04) => "STUPIDXML(#x04)", |
30 | chr(0x05) => "STUPIDXML(#x05)", |
31 | chr(0x06) => "STUPIDXML(#x06)", |
32 | chr(0x07) => "STUPIDXML(#x07)", |
33 | chr(0x08) => "STUPIDXML(#x08)", |
34 | chr(0x09) => "	", |
35 | chr(0x0a) => " ", |
36 | chr(0x0b) => "STUPIDXML(#x0b)", |
37 | chr(0x0c) => "STUPIDXML(#x0c)", |
38 | chr(0x0d) => " ", |
39 | chr(0x0e) => "STUPIDXML(#x0e)", |
40 | chr(0x0f) => "STUPIDXML(#x0f)", |
41 | chr(0x10) => "STUPIDXML(#x10)", |
42 | chr(0x11) => "STUPIDXML(#x11)", |
43 | chr(0x12) => "STUPIDXML(#x12)", |
44 | chr(0x13) => "STUPIDXML(#x13)", |
45 | chr(0x14) => "STUPIDXML(#x14)", |
46 | chr(0x15) => "STUPIDXML(#x15)", |
47 | chr(0x16) => "STUPIDXML(#x16)", |
48 | chr(0x17) => "STUPIDXML(#x17)", |
49 | chr(0x18) => "STUPIDXML(#x18)", |
50 | chr(0x19) => "STUPIDXML(#x19)", |
51 | chr(0x1a) => "STUPIDXML(#x1a)", |
52 | chr(0x1b) => "STUPIDXML(#x1b)", |
53 | chr(0x1c) => "STUPIDXML(#x1c)", |
54 | chr(0x1d) => "STUPIDXML(#x1d)", |
55 | chr(0x1e) => "STUPIDXML(#x1e)", |
56 | chr(0x1f) => "STUPIDXML(#x1f)", |
57 | chr(0x7f) => "STUPIDXML(#x7f)", |
58 | chr(0x80) => "STUPIDXML(#x80)", |
59 | chr(0x81) => "STUPIDXML(#x81)", |
60 | chr(0x82) => "STUPIDXML(#x82)", |
61 | chr(0x83) => "STUPIDXML(#x83)", |
62 | chr(0x84) => "STUPIDXML(#x84)", |
63 | chr(0x86) => "STUPIDXML(#x86)", |
64 | chr(0x87) => "STUPIDXML(#x87)", |
65 | chr(0x88) => "STUPIDXML(#x88)", |
66 | chr(0x89) => "STUPIDXML(#x89)", |
67 | chr(0x90) => "STUPIDXML(#x90)", |
68 | chr(0x91) => "STUPIDXML(#x91)", |
69 | chr(0x92) => "STUPIDXML(#x92)", |
70 | chr(0x93) => "STUPIDXML(#x93)", |
71 | chr(0x94) => "STUPIDXML(#x94)", |
72 | chr(0x95) => "STUPIDXML(#x95)", |
73 | chr(0x96) => "STUPIDXML(#x96)", |
74 | chr(0x97) => "STUPIDXML(#x97)", |
75 | chr(0x98) => "STUPIDXML(#x98)", |
76 | chr(0x99) => "STUPIDXML(#x99)", |
77 | chr(0x9a) => "STUPIDXML(#x9a)", |
78 | chr(0x9b) => "STUPIDXML(#x9b)", |
79 | chr(0x9c) => "STUPIDXML(#x9c)", |
80 | chr(0x9d) => "STUPIDXML(#x9d)", |
81 | chr(0x9e) => "STUPIDXML(#x9e)", |
82 | chr(0x9f) => "STUPIDXML(#x9f)", |
83 | '<' => "<", |
84 | '>' => ">", |
85 | '&' => "&", |
86 | '"' => """, # XML idiocy |
87 | ); |
88 | |
89 | sub xmlquote { |
90 | my $text = shift; |
91 | $text =~ s/(.)/$xmlish{$1} || $1/seg; |
92 | return $text; |
93 | } |
94 | |
95 | sub text { |
96 | my $self = shift; |
97 | return xmlquote($self->{text}); |
98 | } |
99 | |
100 | sub rep { |
101 | my $self = shift; |
102 | return xmlquote($self->{rep}); |
103 | } |
104 | |
105 | sub xmlkids { |
106 | my $self = shift; |
107 | my $array = $self->{Kids}; |
108 | my $ret = ""; |
109 | $indent += 2; |
110 | $in = ' ' x $indent; |
111 | foreach my $chunk (@$array) { |
112 | if (ref $chunk eq "ARRAY") { |
113 | die; |
114 | } |
115 | elsif (ref $chunk) { |
116 | $ret .= $chunk->xml(); |
117 | } |
118 | else { |
119 | warn $chunk; |
120 | } |
121 | } |
122 | $indent -= 2; |
123 | $in = ' ' x $indent; |
124 | return $ret; |
125 | }; |
126 | |
127 | package P5re::RE; our @ISA = 'P5re'; |
128 | |
129 | sub xml { |
130 | my $self = shift; |
131 | my %flags = @_; |
132 | if ($flags{indent}) { |
133 | $indent = delete $flags{indent} || 0; |
134 | $in = ' ' x $indent; |
135 | } |
136 | |
137 | my $kind = $self->{kind}; |
138 | |
139 | my $first = $self->{Kids}[0]; |
140 | if ($first and ref $first eq 'P5re::Mod') { |
141 | for my $c (qw(i m s x)) { |
142 | next unless defined $first->{$c}; |
143 | $self->{$c} = $first->{$c}; |
144 | delete $first->{$c}; |
145 | } |
146 | } |
147 | |
148 | my $modifiers = ""; |
149 | foreach my $k (sort keys %$self) { |
150 | next if $k eq 'kind' or $k eq "Kids"; |
151 | my $v = $self->{$k}; |
152 | $k =~ s/^[A-Z]//; |
153 | $modifiers .= " $k=\"$v\""; |
154 | } |
155 | my $text = "$in<$kind$modifiers>\n"; |
156 | $text .= $self->xmlkids(); |
157 | $text .= "$in</$kind>\n"; |
158 | return $text; |
159 | } |
160 | |
161 | package P5re::Alt; our @ISA = 'P5re'; |
162 | |
163 | sub xml { |
164 | my $self = shift; |
165 | my $text = "$in<alt>\n"; |
166 | $text .= $self->xmlkids(); |
167 | $text .= "$in</alt>\n"; |
168 | return $text; |
169 | } |
170 | |
171 | #package P5re::Atom; our @ISA = 'P5re'; |
172 | # |
173 | #sub xml { |
174 | # my $self = shift; |
175 | # my $text = "$in<atom>\n"; |
176 | # $text .= $self->xmlkids(); |
177 | # $text .= "$in</atom>\n"; |
178 | # return $text; |
179 | #} |
180 | |
181 | package P5re::Quant; our @ISA = 'P5re'; |
182 | |
183 | sub xml { |
184 | my $self = shift; |
185 | my $q = $self->{rep}; |
186 | my $min = $self->{min}; |
187 | my $max = $self->{max}; |
188 | my $greedy = $self->{greedy}; |
189 | my $text = "$in<quant rep=\"$q\" min=\"$min\" max=\"$max\" greedy=\"$greedy\">\n"; |
190 | $text .= $self->xmlkids(); |
191 | $text .= "$in</quant>\n"; |
192 | return $text; |
193 | } |
194 | |
195 | package P5re::White; our @ISA = 'P5re'; |
196 | |
197 | sub xml { |
198 | my $self = shift; |
199 | return "$in<white text=\"" . $self->text() . "\" />\n"; |
200 | } |
201 | |
202 | package P5re::Char; our @ISA = 'P5re'; |
203 | |
204 | sub xml { |
205 | my $self = shift; |
206 | return "$in<char text=\"" . $self->text() . "\" />\n"; |
207 | } |
208 | |
209 | package P5re::Comment; our @ISA = 'P5re'; |
210 | |
211 | sub xml { |
212 | my $self = shift; |
213 | return "$in<comment rep=\"" . $self->rep() . "\" />\n"; |
214 | } |
215 | |
216 | package P5re::Mod; our @ISA = 'P5re'; |
217 | |
218 | sub xml { |
219 | my $self = shift; |
220 | my $modifiers = ""; |
221 | foreach my $k (sort keys %$self) { |
222 | next if $k eq 'kind' or $k eq "Kids"; |
223 | my $v = $self->{$k}; |
224 | $k =~ s/^[A-Z]//; |
225 | $modifiers .= " $k=\"$v\""; |
226 | } |
227 | return "$in<mod$modifiers />\n"; |
228 | } |
229 | |
230 | package P5re::Meta; our @ISA = 'P5re'; |
231 | |
232 | sub xml { |
233 | my $self = shift; |
234 | my $sem = ""; |
235 | if ($self->{sem}) { |
236 | $sem = 'sem="' . $self->{sem} . '" ' |
237 | } |
238 | return "$in<meta rep=\"" . $self->rep() . "\" $sem/>\n"; |
239 | } |
240 | |
241 | package P5re::Back; our @ISA = 'P5re'; |
242 | |
243 | sub xml { |
244 | my $self = shift; |
245 | return "$in<backref to=\"" . P5re::xmlquote($self->{to}) . "\"/>\n"; |
246 | } |
247 | |
248 | package P5re::Var; our @ISA = 'P5re'; |
249 | |
250 | sub xml { |
251 | my $self = shift; |
252 | return "$in<var name=\"" . $self->{name} . "\" />\n"; |
253 | } |
254 | |
255 | package P5re::Closure; our @ISA = 'P5re'; |
256 | |
257 | sub xml { |
258 | my $self = shift; |
259 | return "$in<closure rep=\"" . P5re::xmlquote($self->{rep}) . "\" />\n"; |
260 | } |
261 | |
262 | package P5re::CClass; our @ISA = 'P5re'; |
263 | |
264 | sub xml { |
265 | my $self = shift; |
266 | my $neg = $self->{neg} ? "negated" : "normal"; |
267 | my $text = "$in<cclass match=\"$neg\">\n"; |
268 | $text .= $self->xmlkids(); |
269 | $text .= "$in</cclass>\n"; |
270 | return $text; |
271 | } |
272 | |
273 | package P5re::Range; our @ISA = 'P5re'; |
274 | |
275 | sub xml { |
276 | my $self = shift; |
277 | my $text = "$in<range>\n"; |
278 | $text .= $self->xmlkids(); |
279 | $text .= "$in</range>\n"; |
280 | return $text; |
281 | } |
282 | |
283 | package P5re; |
284 | |
285 | unless (caller) { |
286 | while (<>) { |
287 | chomp; |
288 | print qr2xml($_); |
289 | print "#######################################\n"; |
290 | } |
291 | } |
292 | |
293 | sub qrparse { |
294 | my $qr = shift; |
295 | my $mod; |
296 | if ($qr =~ /^s/) { |
297 | $qr =~ s/^(?:\w*)(\W)((?:\\.|.)*?)\1(.*)\1(\w*)$/$2/; |
298 | $mod = $4; |
299 | } |
300 | else { |
301 | $qr =~ s/^(?:\w*)(\W)(.*)\1(\w*)$/$2/; |
302 | $mod = $3; |
303 | } |
304 | substr($qr,0,0) = "(?$mod)" if defined $mod and $mod ne ""; |
305 | return parse($qr,@_); |
306 | } |
307 | |
308 | sub qr2xml { |
309 | return qrparse(@_)->xml(); |
310 | } |
311 | |
312 | sub re2xml { |
313 | my $re = shift; |
314 | return parse($re,@_)->xml(); |
315 | } |
316 | |
317 | sub parse { |
318 | local($_) = shift; |
319 | my %flags = @_; |
320 | $maxbrack = 0; |
321 | $indent = delete $flags{indent} || 0; |
322 | $in = ' ' x $indent; |
323 | warn "$_\n" if $debug; |
324 | my $re = re('re'); |
325 | @$re{keys %flags} = values %flags; |
326 | return $re; |
327 | } |
328 | |
329 | sub re { |
330 | my $kind = shift; |
331 | |
332 | my $oldextended = $extended; |
333 | my $oldinsensitive = $insensitive; |
334 | my $oldmultiline = $multiline; |
335 | my $oldsingleline = $singleline; |
336 | |
337 | local $extended = $extended; |
338 | local $insensitive = $insensitive; |
339 | local $multiline = $multiline; |
340 | local $singleline = $singleline; |
341 | |
342 | my $first = alt(); |
343 | |
344 | my $re; |
345 | if (not /^\|/) { |
346 | $first->{kind} = $kind; |
347 | $re = bless $first, "P5re::RE"; # rebless to remove single alt |
348 | } |
349 | else { |
350 | my @alts = ($first); |
351 | |
352 | while (s/^\|//) { |
353 | push(@alts, alt()); |
354 | } |
355 | $re = bless { Kids => [@alts], kind => $kind }, "P5re::RE"; |
356 | } |
357 | |
358 | $re->{x} = $oldextended || 0; |
359 | $re->{i} = $oldinsensitive || 0; |
360 | $re->{m} = $oldmultiline || 0; |
361 | $re->{s} = $oldsingleline || 0; |
362 | return $re; |
363 | } |
364 | |
365 | sub alt { |
366 | my @quants; |
367 | |
368 | my $quant; |
369 | while ($quant = quant()) { |
370 | if (@quants and |
371 | ref $quant eq ref $quants[-1] and |
372 | exists $quants[-1]{text} and |
373 | exists $quant->{text} ) |
374 | { |
375 | $quants[-1]{text} .= $quant->{text}; |
376 | } |
377 | else { |
378 | push(@quants, $quant); |
379 | } |
380 | } |
381 | return bless { Kids => [@quants] }, "P5re::Alt"; |
382 | } |
383 | |
384 | sub quant { |
385 | my $atom = atom(); |
386 | return 0 unless $atom; |
387 | # $atom = bless { Kids => [$atom] }, "P5re::Atom"; |
388 | if (s/^(([*+?])(\??)|\{(\d+)(?:(,)(\d*))?\}(\??))//) { |
389 | my $min = 0; |
390 | my $max = "Inf"; |
391 | my $greed = 1; |
392 | if ($2) { |
393 | if ($2 eq '+') { |
394 | $min = 1; |
395 | } |
396 | elsif ($2 eq '?') { |
397 | $max = 1; |
398 | } |
399 | $greed = 0 if $3; |
400 | } |
401 | elsif (defined $4) { |
402 | $min = $4; |
403 | if ($5) { |
404 | $max = $6 if $6; |
405 | } |
406 | else { |
407 | $max = $min; |
408 | } |
409 | $greed = 0 if $7; |
410 | } |
411 | $greed = "na" if $min == $max; |
412 | return bless { Kids => [$atom], |
413 | rep => $1, |
414 | min => $min, |
415 | max => $max, |
416 | greedy => $greed |
417 | }, "P5re::Quant"; |
418 | } |
419 | return $atom; |
420 | } |
421 | |
422 | sub atom { |
423 | my $re; |
424 | if ($_ eq "") { return 0 } |
425 | if (/^[)|]/) { return 0 } |
426 | |
427 | # whitespace is special because we don't know if /x is in effect |
428 | if ($extended) { |
429 | if (s/^(?=\s|#)(\s*(?:#.*)?)//) { return bless { text => $1 }, "P5re::White"; } |
430 | } |
431 | |
432 | # all the parenthesized forms |
433 | if (s/^\(//) { |
434 | if (s/^\?://) { |
435 | $re = re('bracket'); |
436 | } |
437 | elsif (s/^(\?#.*?)\)/)/) { |
438 | $re = bless { rep => "($1)" }, "P5re::Comment"; |
439 | } |
440 | elsif (s/^\?=//) { |
441 | $re = re('lookahead'); |
442 | } |
443 | elsif (s/^\?!//) { |
444 | $re = re('neglookahead'); |
445 | } |
446 | elsif (s/^\?<=//) { |
447 | $re = re('lookbehind'); |
448 | } |
449 | elsif (s/^\?<!//) { |
450 | $re = re('neglookbehind'); |
451 | } |
452 | elsif (s/^\?>//) { |
453 | $re = re('nobacktrack'); |
454 | } |
455 | elsif (s/^(\?\??\{.*?\})\)/)/) { |
456 | $re = bless { rep => "($1)" }, "P5re::Closure"; |
457 | } |
458 | elsif (s/^(\?\(\d+\))//) { |
459 | my $mods = $1; |
460 | $re = re('conditional'); |
461 | $re->{Arep} = "$mods"; |
462 | } |
463 | elsif (s/^\?(?=\(\?)//) { |
464 | my $mods = $1; |
465 | my $cond = atom(); |
466 | $re = re('conditional'); |
467 | unshift(@{$re->{Kids}}, $cond); |
468 | } |
469 | elsif (s/^(\?[-\w]+)://) { |
470 | my $mods = $1; |
471 | local $extended = $extended; |
472 | local $insensitive = $insensitive; |
473 | local $multiline = $multiline; |
474 | local $singleline = $singleline; |
475 | setmods($mods); |
476 | $re = re('bracket'); |
477 | $re->{Arep} = "($mods)"; |
478 | $re->{x} = $extended || 0; |
479 | $re->{i} = $insensitive || 0; |
480 | $re->{m} = $multiline || 0; |
481 | $re->{s} = $singleline || 0; |
482 | } |
483 | elsif (s/^(\?[-\w]+)//) { |
484 | my $mods = $1; |
485 | $re = bless { Arep => "($mods)" }, "P5re::Mod"; |
486 | setmods($mods); |
487 | $re->{x} = $extended || 0; |
488 | $re->{i} = $insensitive || 0; |
489 | $re->{m} = $multiline || 0; |
490 | $re->{s} = $singleline || 0; |
491 | } |
492 | elsif (s/^\?//) { |
493 | $re = re('UNRECOGNIZED'); |
494 | } |
495 | else { |
496 | my $brack = ++$maxbrack; |
497 | $re = re('capture'); |
498 | $re->{Ato} = $brack; |
499 | } |
500 | |
501 | if (not s/^\)//) { warn "Expected right paren at: '$_'" } |
502 | return $re; |
503 | } |
504 | |
505 | # special meta |
506 | if (s/^\.//) { |
507 | my $s = $singleline ? '.' : '\N'; |
508 | return bless { rep => '.', sem => $s }, "P5re::Meta"; |
509 | } |
510 | if (s/^\^//) { |
511 | my $s = $multiline ? '^^' : '^'; |
512 | return bless { rep => '^', sem => $s }, "P5re::Meta"; |
513 | } |
514 | if (s/^\$(?:$|(?=[|)]))//) { |
515 | my $s = $multiline ? '$$' : '$'; |
516 | return bless { rep => '$', sem => $s }, "P5re::Meta"; |
517 | } |
518 | if (s/^([\$\@](\w+|.))//) { # XXX need to handle subscripts here |
519 | return bless { name => $1 }, "P5re::Var"; |
520 | } |
521 | |
522 | # character classes |
523 | if (s/^\[//) { |
524 | my $re = cclass(); |
525 | if (not s/^\]//) { warn "Expected right bracket at: '$_'" } |
526 | return $re; |
527 | } |
528 | |
529 | # backwhacks |
530 | if (/^\\([1-9]\d*)/ and $1 <= $maxbrack) { |
531 | my $to = $1; |
532 | onechar(); |
533 | return bless { to => $to }, "P5re::Back"; |
534 | } |
535 | |
536 | # backwhacks |
537 | if (/^\\(?=\w)/) { |
538 | return bless { rep => onechar() }, "P5re::Meta"; |
539 | } |
540 | |
541 | # backwhacks |
542 | if (s/^\\(.)//) { |
543 | return bless { text => $1 }, "P5re::Char"; |
544 | } |
545 | |
546 | # optimization, would happen anyway |
547 | if (s/^(\w+)//) { return bless { text => $1 }, "P5re::Char"; } |
548 | |
549 | # random character |
550 | if (s/^(.)//) { return bless { text => $1 }, "P5re::Char"; } |
551 | } |
552 | |
553 | sub cclass { |
554 | my @cclass; |
555 | my $cclass = ""; |
556 | my $neg = 0; |
557 | if (s/^\^//) { $neg = 1 } |
558 | if (s/^([\]\-])//) { $cclass .= $1 } |
559 | |
560 | while ($_ ne "" and not /^\]/) { |
561 | # backwhacks |
562 | if (/^\\(?=.)|.-/) { |
563 | my $o1 = onecharobj(); |
564 | if ($cclass ne "") { |
565 | push @cclass, bless { text => $cclass }, "P5re::Char"; |
566 | $cclass = ""; |
567 | } |
568 | |
569 | if (s/^-(?=[^]])//) { |
570 | my $o2 = onecharobj(); |
571 | push @cclass, bless { Kids => [$o1, $o2] }, "P5re::Range"; |
572 | } |
573 | else { |
574 | push @cclass, $o1; |
575 | } |
576 | } |
577 | elsif (s/^(\[([:=.])\^?\w*\2\])//) { |
578 | if ($cclass ne "") { |
579 | push @cclass, bless { text => $cclass }, "P5re::Char"; |
580 | $cclass = ""; |
581 | } |
582 | push @cclass, bless { rep => $1 }, "P5re::Meta"; |
583 | } |
584 | else { |
585 | $cclass .= onechar(); |
586 | } |
587 | } |
588 | |
589 | if ($cclass ne "") { |
590 | push @cclass, bless { text => $cclass }, "P5re::Char"; |
591 | } |
592 | return bless { Kids => [@cclass], neg => $neg }, "P5re::CClass"; |
593 | } |
594 | |
595 | sub onecharobj { |
596 | my $ch = onechar(); |
597 | if ($ch =~ /^\\/) { |
598 | $ch = bless { rep => $ch }, "P5re::Meta"; |
599 | } |
600 | else { |
601 | $ch = bless { text => $ch }, "P5re::Char"; |
602 | } |
603 | } |
604 | |
605 | sub onechar { |
606 | die "Oops, short cclass" unless s/^(.)//; |
607 | my $ch = $1; |
608 | if ($ch eq '\\') { |
609 | if (s/^([rntf]|[0-7]{1,4})//) { $ch .= $1 } |
610 | elsif (s/^(x[0-9a-fA-f]{1,2})//) { $ch .= $1 } |
611 | elsif (s/^(x\{[0-9a-fA-f]+\})//) { $ch .= $1 } |
612 | elsif (s/^([NpP]\{.*?\})//) { $ch .= $1 } |
613 | elsif (s/^([cpP].)//) { $ch .= $1 } |
614 | elsif (s/^(.)//) { $ch .= $1 } |
615 | else { |
616 | die "Oops, short backwhack"; |
617 | } |
618 | } |
619 | return $ch; |
620 | } |
621 | |
622 | sub setmods { |
623 | my $mods = shift; |
624 | if ($mods =~ /\-.*x/) { |
625 | $extended = 0; |
626 | } |
627 | elsif ($mods =~ /x/) { |
628 | $extended = 1; |
629 | } |
630 | if ($mods =~ /\-.*i/) { |
631 | $insensitive = 0; |
632 | } |
633 | elsif ($mods =~ /i/) { |
634 | $insensitive = 1; |
635 | } |
636 | if ($mods =~ /\-.*m/) { |
637 | $multiline = 0; |
638 | } |
639 | elsif ($mods =~ /m/) { |
640 | $multiline = 1; |
641 | } |
642 | if ($mods =~ /\-.*s/) { |
643 | $singleline = 0; |
644 | } |
645 | elsif ($mods =~ /s/) { |
646 | $singleline = 1; |
647 | } |
648 | } |
649 | |
650 | 1; |