Commit | Line | Data |
456a815d |
1 | package HTML::Zoom::Parser::BuiltIn; |
2 | |
1cf03540 |
3 | use strictures 1; |
d80786d0 |
4 | use base qw(HTML::Zoom::SubObject); |
456a815d |
5 | |
6 | sub html_to_events { |
d80786d0 |
7 | my ($self, $text) = @_; |
456a815d |
8 | my @events; |
9 | _hacky_tag_parser($text => sub { push @events, $_[0] }); |
10 | return \@events; |
11 | } |
12 | |
13 | sub html_to_stream { |
d80786d0 |
14 | my ($self, $text) = @_; |
15 | return $self->_zconfig->stream_utils |
16 | ->stream_from_array(@{$self->html_to_events($text)}); |
456a815d |
17 | } |
18 | |
af6300be |
19 | # DO NOT BE AFRAID. |
20 | # |
21 | # Well, ok. Be afraid. A little. But this is lexing HTML with a regexp, |
22 | # not really parsing (since the structure nesting isn't handled here) so |
23 | # it's relatively not dangerous. |
24 | # |
25 | # Certainly it's not really any more or any less heinous than anything else |
26 | # I could do in a handful of lines of pure perl. |
27 | |
456a815d |
28 | sub _hacky_tag_parser { |
29 | my ($text, $handler) = @_; |
4c6b4429 |
30 | $text =~ m{^([^<]*)}g; |
31 | if ( length $1 ) { # leading PCDATA |
32 | $handler->({ type => 'TEXT', raw => $1 }); |
33 | } |
456a815d |
34 | while ( |
35 | $text =~ m{ |
36 | ( |
37 | (?:[^<]*) < (?: |
38 | ( / )? ( [^/!<>\s"'=]+ ) |
abc91e12 |
39 | ( (?:"[^"]*"|'[^']*'|[^/"'<>])+? )? |
456a815d |
40 | | |
41 | (!-- .*? -- | ![^\-] .*? ) |
42 | ) (\s*/\s*)? > |
43 | ) |
44 | ([^<]*) |
45 | }sxg |
46 | ) { |
e32e7b90 |
47 | my ($whole, $is_close, $tag_name, $attributes, $is_special, |
456a815d |
48 | $in_place_close, $content) |
49 | = ($1, $2, $3, $4, $5, $6, $7, $8); |
e32e7b90 |
50 | if ($is_special) { |
51 | $handler->({ type => 'SPECIAL', raw => $whole }); |
456a815d |
52 | } else { |
e32e7b90 |
53 | $tag_name =~ tr/A-Z/a-z/; |
54 | if ($is_close) { |
55 | $handler->({ type => 'CLOSE', name => $tag_name, raw => $whole }); |
56 | } else { |
57 | $attributes = '' if !defined($attributes) or $attributes =~ /^ +$/; |
456a815d |
58 | $handler->({ |
e32e7b90 |
59 | type => 'OPEN', |
60 | name => $tag_name, |
61 | is_in_place_close => $in_place_close, |
62 | _hacky_attribute_parser($attributes), |
63 | raw_attrs => $attributes||'', |
64 | raw => $whole, |
456a815d |
65 | }); |
e32e7b90 |
66 | if ($in_place_close) { |
67 | $handler->({ |
68 | type => 'CLOSE', name => $tag_name, raw => '', |
69 | is_in_place_close => 1 |
70 | }); |
71 | } |
456a815d |
72 | } |
73 | } |
74 | if (length $content) { |
75 | $handler->({ type => 'TEXT', raw => $content }); |
76 | } |
77 | } |
78 | } |
79 | |
80 | sub _hacky_attribute_parser { |
81 | my ($attr_text) = @_; |
82 | my (%attrs, @attr_names); |
83 | while ( |
84 | $attr_text =~ m{ |
85 | ([^\s\=\"\']+)(\s*=\s*(?:(")(.*?)"|(')(.*?)'|([^'"\s=]+)['"]*))? |
86 | }sgx |
87 | ) { |
88 | my $key = $1; |
89 | my $test = $2; |
90 | my $val = ( $3 ? $4 : ( $5 ? $6 : $7 )); |
91 | my $lckey = lc($key); |
92 | if ($test) { |
93 | $attrs{$lckey} = _simple_unescape($val); |
94 | } else { |
95 | $attrs{$lckey} = $lckey; |
96 | } |
97 | push(@attr_names, $lckey); |
98 | } |
99 | (attrs => \%attrs, attr_names => \@attr_names); |
100 | } |
101 | |
102 | sub _simple_unescape { |
103 | my $str = shift; |
104 | $str =~ s/"/"/g; |
105 | $str =~ s/</</g; |
106 | $str =~ s/>/>/g; |
107 | $str =~ s/&/&/g; |
108 | $str; |
109 | } |
110 | |
111 | sub _simple_escape { |
112 | my $str = shift; |
113 | $str =~ s/&/&/g; |
114 | $str =~ s/"/"/g; |
115 | $str =~ s/</</g; |
116 | $str =~ s/>/>/g; |
117 | $str; |
118 | } |
119 | |
f8ed299b |
120 | sub html_escape { _simple_escape($_[1]) } |
121 | |
122 | sub html_unescape { _simple_unescape($_[1]) } |
123 | |
456a815d |
124 | 1; |