3 # Copyright (C) 2005, Larry Wall
4 # This software may be copied under the same terms as Perl.
11 our @EXPORT_OK = qw(re re2xml qr2xml);
25 chr(0x00) => "STUPIDXML(#x00)",
26 chr(0x01) => "STUPIDXML(#x01)",
27 chr(0x02) => "STUPIDXML(#x02)",
28 chr(0x03) => "STUPIDXML(#x03)",
29 chr(0x04) => "STUPIDXML(#x04)",
30 chr(0x05) => "STUPIDXML(#x05)",
31 chr(0x06) => "STUPIDXML(#x06)",
32 chr(0x07) => "STUPIDXML(#x07)",
33 chr(0x08) => "STUPIDXML(#x08)",
36 chr(0x0b) => "STUPIDXML(#x0b)",
37 chr(0x0c) => "STUPIDXML(#x0c)",
39 chr(0x0e) => "STUPIDXML(#x0e)",
40 chr(0x0f) => "STUPIDXML(#x0f)",
41 chr(0x10) => "STUPIDXML(#x10)",
42 chr(0x11) => "STUPIDXML(#x11)",
43 chr(0x12) => "STUPIDXML(#x12)",
44 chr(0x13) => "STUPIDXML(#x13)",
45 chr(0x14) => "STUPIDXML(#x14)",
46 chr(0x15) => "STUPIDXML(#x15)",
47 chr(0x16) => "STUPIDXML(#x16)",
48 chr(0x17) => "STUPIDXML(#x17)",
49 chr(0x18) => "STUPIDXML(#x18)",
50 chr(0x19) => "STUPIDXML(#x19)",
51 chr(0x1a) => "STUPIDXML(#x1a)",
52 chr(0x1b) => "STUPIDXML(#x1b)",
53 chr(0x1c) => "STUPIDXML(#x1c)",
54 chr(0x1d) => "STUPIDXML(#x1d)",
55 chr(0x1e) => "STUPIDXML(#x1e)",
56 chr(0x1f) => "STUPIDXML(#x1f)",
57 chr(0x7f) => "STUPIDXML(#x7f)",
58 chr(0x80) => "STUPIDXML(#x80)",
59 chr(0x81) => "STUPIDXML(#x81)",
60 chr(0x82) => "STUPIDXML(#x82)",
61 chr(0x83) => "STUPIDXML(#x83)",
62 chr(0x84) => "STUPIDXML(#x84)",
63 chr(0x86) => "STUPIDXML(#x86)",
64 chr(0x87) => "STUPIDXML(#x87)",
65 chr(0x88) => "STUPIDXML(#x88)",
66 chr(0x89) => "STUPIDXML(#x89)",
67 chr(0x90) => "STUPIDXML(#x90)",
68 chr(0x91) => "STUPIDXML(#x91)",
69 chr(0x92) => "STUPIDXML(#x92)",
70 chr(0x93) => "STUPIDXML(#x93)",
71 chr(0x94) => "STUPIDXML(#x94)",
72 chr(0x95) => "STUPIDXML(#x95)",
73 chr(0x96) => "STUPIDXML(#x96)",
74 chr(0x97) => "STUPIDXML(#x97)",
75 chr(0x98) => "STUPIDXML(#x98)",
76 chr(0x99) => "STUPIDXML(#x99)",
77 chr(0x9a) => "STUPIDXML(#x9a)",
78 chr(0x9b) => "STUPIDXML(#x9b)",
79 chr(0x9c) => "STUPIDXML(#x9c)",
80 chr(0x9d) => "STUPIDXML(#x9d)",
81 chr(0x9e) => "STUPIDXML(#x9e)",
82 chr(0x9f) => "STUPIDXML(#x9f)",
86 '"' => """, # XML idiocy
91 $text =~ s/(.)/$xmlish{$1} || $1/seg;
97 return xmlquote($self->{text});
102 return xmlquote($self->{rep});
107 my $array = $self->{Kids};
111 foreach my $chunk (@$array) {
112 if (ref $chunk eq "ARRAY") {
116 $ret .= $chunk->xml();
127 package P5re::RE; our @ISA = 'P5re';
132 if ($flags{indent}) {
133 $indent = delete $flags{indent} || 0;
137 my $kind = $self->{kind};
139 my $first = $self->{Kids}[0];
140 if ($first and ref $first eq 'P5re::Mod') {
141 for my $c (qw(i m s x)) {
142 next unless defined $first->{$c};
143 $self->{$c} = $first->{$c};
149 foreach my $k (sort keys %$self) {
150 next if $k eq 'kind' or $k eq "Kids";
153 $modifiers .= " $k=\"$v\"";
155 my $text = "$in<$kind$modifiers>\n";
156 $text .= $self->xmlkids();
157 $text .= "$in</$kind>\n";
161 package P5re::Alt; our @ISA = 'P5re';
165 my $text = "$in<alt>\n";
166 $text .= $self->xmlkids();
167 $text .= "$in</alt>\n";
171 #package P5re::Atom; our @ISA = 'P5re';
175 # my $text = "$in<atom>\n";
176 # $text .= $self->xmlkids();
177 # $text .= "$in</atom>\n";
181 package P5re::Quant; our @ISA = 'P5re';
185 my $q = $self->{rep};
186 my $min = $self->{min};
187 my $max = $self->{max};
188 my $greedy = $self->{greedy};
189 my $text = "$in<quant rep=\"$q\" min=\"$min\" max=\"$max\" greedy=\"$greedy\">\n";
190 $text .= $self->xmlkids();
191 $text .= "$in</quant>\n";
195 package P5re::White; our @ISA = 'P5re';
199 return "$in<white text=\"" . $self->text() . "\" />\n";
202 package P5re::Char; our @ISA = 'P5re';
206 return "$in<char text=\"" . $self->text() . "\" />\n";
209 package P5re::Comment; our @ISA = 'P5re';
213 return "$in<comment rep=\"" . $self->rep() . "\" />\n";
216 package P5re::Mod; our @ISA = 'P5re';
221 foreach my $k (sort keys %$self) {
222 next if $k eq 'kind' or $k eq "Kids";
225 $modifiers .= " $k=\"$v\"";
227 return "$in<mod$modifiers />\n";
230 package P5re::Meta; our @ISA = 'P5re';
236 $sem = 'sem="' . $self->{sem} . '" '
238 return "$in<meta rep=\"" . $self->rep() . "\" $sem/>\n";
241 package P5re::Back; our @ISA = 'P5re';
245 return "$in<backref to=\"" . P5re::xmlquote($self->{to}) . "\"/>\n";
248 package P5re::Var; our @ISA = 'P5re';
252 return "$in<var name=\"" . $self->{name} . "\" />\n";
255 package P5re::Closure; our @ISA = 'P5re';
259 return "$in<closure rep=\"" . P5re::xmlquote($self->{rep}) . "\" />\n";
262 package P5re::CClass; our @ISA = 'P5re';
266 my $neg = $self->{neg} ? "negated" : "normal";
267 my $text = "$in<cclass match=\"$neg\">\n";
268 $text .= $self->xmlkids();
269 $text .= "$in</cclass>\n";
273 package P5re::Range; our @ISA = 'P5re';
277 my $text = "$in<range>\n";
278 $text .= $self->xmlkids();
279 $text .= "$in</range>\n";
289 print "#######################################\n";
297 $qr =~ s/^(?:\w*)(\W)((?:\\.|.)*?)\1(.*)\1(\w*)$/$2/;
301 $qr =~ s/^(?:\w*)(\W)(.*)\1(\w*)$/$2/;
304 substr($qr,0,0) = "(?$mod)" if defined $mod and $mod ne "";
305 return parse($qr,@_);
309 return qrparse(@_)->xml();
314 return parse($re,@_)->xml();
321 $indent = delete $flags{indent} || 0;
323 warn "$_\n" if $debug;
325 @$re{keys %flags} = values %flags;
332 my $oldextended = $extended;
333 my $oldinsensitive = $insensitive;
334 my $oldmultiline = $multiline;
335 my $oldsingleline = $singleline;
337 local $extended = $extended;
338 local $insensitive = $insensitive;
339 local $multiline = $multiline;
340 local $singleline = $singleline;
346 $first->{kind} = $kind;
347 $re = bless $first, "P5re::RE"; # rebless to remove single alt
355 $re = bless { Kids => [@alts], kind => $kind }, "P5re::RE";
358 $re->{x} = $oldextended || 0;
359 $re->{i} = $oldinsensitive || 0;
360 $re->{m} = $oldmultiline || 0;
361 $re->{s} = $oldsingleline || 0;
369 while ($quant = quant()) {
371 ref $quant eq ref $quants[-1] and
372 exists $quants[-1]{text} and
373 exists $quant->{text} )
375 $quants[-1]{text} .= $quant->{text};
378 push(@quants, $quant);
381 return bless { Kids => [@quants] }, "P5re::Alt";
386 return 0 unless $atom;
387 # $atom = bless { Kids => [$atom] }, "P5re::Atom";
388 if (s/^(([*+?])(\??)|\{(\d+)(?:(,)(\d*))?\}(\??))//) {
411 $greed = "na" if $min == $max;
412 return bless { Kids => [$atom],
424 if ($_ eq "") { return 0 }
425 if (/^[)|]/) { return 0 }
427 # whitespace is special because we don't know if /x is in effect
429 if (s/^(?=\s|#)(\s*(?:#.*)?)//) { return bless { text => $1 }, "P5re::White"; }
432 # all the parenthesized forms
437 elsif (s/^(\?#.*?)\)/)/) {
438 $re = bless { rep => "($1)" }, "P5re::Comment";
441 $re = re('lookahead');
444 $re = re('neglookahead');
447 $re = re('lookbehind');
450 $re = re('neglookbehind');
453 $re = re('nobacktrack');
455 elsif (s/^(\?\??\{.*?\})\)/)/) {
456 $re = bless { rep => "($1)" }, "P5re::Closure";
458 elsif (s/^(\?\(\d+\))//) {
460 $re = re('conditional');
461 $re->{Arep} = "$mods";
463 elsif (s/^\?(?=\(\?)//) {
466 $re = re('conditional');
467 unshift(@{$re->{Kids}}, $cond);
469 elsif (s/^(\?[-\w]+)://) {
471 local $extended = $extended;
472 local $insensitive = $insensitive;
473 local $multiline = $multiline;
474 local $singleline = $singleline;
477 $re->{Arep} = "($mods)";
478 $re->{x} = $extended || 0;
479 $re->{i} = $insensitive || 0;
480 $re->{m} = $multiline || 0;
481 $re->{s} = $singleline || 0;
483 elsif (s/^(\?[-\w]+)//) {
485 $re = bless { Arep => "($mods)" }, "P5re::Mod";
487 $re->{x} = $extended || 0;
488 $re->{i} = $insensitive || 0;
489 $re->{m} = $multiline || 0;
490 $re->{s} = $singleline || 0;
493 $re = re('UNRECOGNIZED');
496 my $brack = ++$maxbrack;
501 if (not s/^\)//) { warn "Expected right paren at: '$_'" }
507 my $s = $singleline ? '.' : '\N';
508 return bless { rep => '.', sem => $s }, "P5re::Meta";
511 my $s = $multiline ? '^^' : '^';
512 return bless { rep => '^', sem => $s }, "P5re::Meta";
514 if (s/^\$(?:$|(?=[|)]))//) {
515 my $s = $multiline ? '$$' : '$';
516 return bless { rep => '$', sem => $s }, "P5re::Meta";
518 if (s/^([\$\@](\w+|.))//) { # XXX need to handle subscripts here
519 return bless { name => $1 }, "P5re::Var";
525 if (not s/^\]//) { warn "Expected right bracket at: '$_'" }
530 if (/^\\([1-9]\d*)/ and $1 <= $maxbrack) {
533 return bless { to => $to }, "P5re::Back";
538 return bless { rep => onechar() }, "P5re::Meta";
543 return bless { text => $1 }, "P5re::Char";
546 # optimization, would happen anyway
547 if (s/^(\w+)//) { return bless { text => $1 }, "P5re::Char"; }
550 if (s/^(.)//) { return bless { text => $1 }, "P5re::Char"; }
557 if (s/^\^//) { $neg = 1 }
558 if (s/^([\]\-])//) { $cclass .= $1 }
560 while ($_ ne "" and not /^\]/) {
563 my $o1 = onecharobj();
565 push @cclass, bless { text => $cclass }, "P5re::Char";
569 if (s/^-(?=[^]])//) {
570 my $o2 = onecharobj();
571 push @cclass, bless { Kids => [$o1, $o2] }, "P5re::Range";
577 elsif (s/^(\[([:=.])\^?\w*\2\])//) {
579 push @cclass, bless { text => $cclass }, "P5re::Char";
582 push @cclass, bless { rep => $1 }, "P5re::Meta";
585 $cclass .= onechar();
590 push @cclass, bless { text => $cclass }, "P5re::Char";
592 return bless { Kids => [@cclass], neg => $neg }, "P5re::CClass";
598 $ch = bless { rep => $ch }, "P5re::Meta";
601 $ch = bless { text => $ch }, "P5re::Char";
606 die "Oops, short cclass" unless s/^(.)//;
609 if (s/^([rntf]|[0-7]{1,4})//) { $ch .= $1 }
610 elsif (s/^(x[0-9a-fA-f]{1,2})//) { $ch .= $1 }
611 elsif (s/^(x\{[0-9a-fA-f]+\})//) { $ch .= $1 }
612 elsif (s/^([NpP]\{.*?\})//) { $ch .= $1 }
613 elsif (s/^([cpP].)//) { $ch .= $1 }
614 elsif (s/^(.)//) { $ch .= $1 }
616 die "Oops, short backwhack";
624 if ($mods =~ /\-.*x/) {
627 elsif ($mods =~ /x/) {
630 if ($mods =~ /\-.*i/) {
633 elsif ($mods =~ /i/) {
636 if ($mods =~ /\-.*m/) {
639 elsif ($mods =~ /m/) {
642 if ($mods =~ /\-.*s/) {
645 elsif ($mods =~ /s/) {