merge
[catagits/HTML-Zoom.git] / lib / HTML / Zoom / Parser / BuiltIn.pm
CommitLineData
456a815d 1package HTML::Zoom::Parser::BuiltIn;
2
1cf03540 3use strictures 1;
d80786d0 4use base qw(HTML::Zoom::SubObject);
456a815d 5
6sub html_to_events {
d80786d0 7 my ($self, $text) = @_;
456a815d 8 my @events;
9 _hacky_tag_parser($text => sub { push @events, $_[0] });
10 return \@events;
11}
12
13sub html_to_stream {
d80786d0 14 my ($self, $text) = @_;
15 return $self->_zconfig->stream_utils
16 ->stream_from_array(@{$self->html_to_events($text)});
456a815d 17}
18
af6300be 19# DO NOT BE AFRAID.
20#
21# Well, ok. Be afraid. A little. But this is lexing HTML with a regexp,
22# not really parsing (since the structure nesting isn't handled here) so
23# it's relatively not dangerous.
24#
25# Certainly it's not really any more or any less heinous than anything else
26# I could do in a handful of lines of pure perl.
27
456a815d 28sub _hacky_tag_parser {
29 my ($text, $handler) = @_;
4c6b4429 30 $text =~ m{^([^<]*)}g;
31 if ( length $1 ) { # leading PCDATA
32 $handler->({ type => 'TEXT', raw => $1 });
33 }
456a815d 34 while (
35 $text =~ m{
36 (
37 (?:[^<]*) < (?:
38 ( / )? ( [^/!<>\s"'=]+ )
abc91e12 39 ( (?:"[^"]*"|'[^']*'|[^/"'<>])+? )?
456a815d 40 |
41 (!-- .*? -- | ![^\-] .*? )
42 ) (\s*/\s*)? >
43 )
44 ([^<]*)
45 }sxg
46 ) {
e32e7b90 47 my ($whole, $is_close, $tag_name, $attributes, $is_special,
456a815d 48 $in_place_close, $content)
49 = ($1, $2, $3, $4, $5, $6, $7, $8);
e32e7b90 50 if ($is_special) {
51 $handler->({ type => 'SPECIAL', raw => $whole });
456a815d 52 } else {
e32e7b90 53 $tag_name =~ tr/A-Z/a-z/;
54 if ($is_close) {
55 $handler->({ type => 'CLOSE', name => $tag_name, raw => $whole });
56 } else {
57 $attributes = '' if !defined($attributes) or $attributes =~ /^ +$/;
456a815d 58 $handler->({
e32e7b90 59 type => 'OPEN',
60 name => $tag_name,
61 is_in_place_close => $in_place_close,
62 _hacky_attribute_parser($attributes),
63 raw_attrs => $attributes||'',
64 raw => $whole,
456a815d 65 });
e32e7b90 66 if ($in_place_close) {
67 $handler->({
68 type => 'CLOSE', name => $tag_name, raw => '',
69 is_in_place_close => 1
70 });
71 }
456a815d 72 }
73 }
74 if (length $content) {
75 $handler->({ type => 'TEXT', raw => $content });
76 }
77 }
78}
79
80sub _hacky_attribute_parser {
81 my ($attr_text) = @_;
82 my (%attrs, @attr_names);
83 while (
84 $attr_text =~ m{
85 ([^\s\=\"\']+)(\s*=\s*(?:(")(.*?)"|(')(.*?)'|([^'"\s=]+)['"]*))?
86 }sgx
87 ) {
88 my $key = $1;
89 my $test = $2;
90 my $val = ( $3 ? $4 : ( $5 ? $6 : $7 ));
91 my $lckey = lc($key);
92 if ($test) {
93 $attrs{$lckey} = _simple_unescape($val);
94 } else {
95 $attrs{$lckey} = $lckey;
96 }
97 push(@attr_names, $lckey);
98 }
99 (attrs => \%attrs, attr_names => \@attr_names);
100}
101
102sub _simple_unescape {
103 my $str = shift;
104 $str =~ s/&quot;/"/g;
105 $str =~ s/&lt;/</g;
106 $str =~ s/&gt;/>/g;
107 $str =~ s/&amp;/&/g;
108 $str;
109}
110
111sub _simple_escape {
112 my $str = shift;
113 $str =~ s/&/&amp;/g;
114 $str =~ s/"/&quot;/g;
115 $str =~ s/</&lt;/g;
116 $str =~ s/>/&gt;/g;
117 $str;
118}
119
f8ed299b 120sub html_escape { _simple_escape($_[1]) }
121
122sub html_unescape { _simple_unescape($_[1]) }
123
456a815d 1241;