Commit | Line | Data |
456a815d |
1 | package HTML::Zoom::Parser::BuiltIn; |
2 | |
3 | use strict; |
4 | use warnings FATAL => 'all'; |
d80786d0 |
5 | use base qw(HTML::Zoom::SubObject); |
456a815d |
6 | |
7 | sub html_to_events { |
d80786d0 |
8 | my ($self, $text) = @_; |
456a815d |
9 | my @events; |
10 | _hacky_tag_parser($text => sub { push @events, $_[0] }); |
11 | return \@events; |
12 | } |
13 | |
14 | sub html_to_stream { |
d80786d0 |
15 | my ($self, $text) = @_; |
16 | return $self->_zconfig->stream_utils |
17 | ->stream_from_array(@{$self->html_to_events($text)}); |
456a815d |
18 | } |
19 | |
20 | sub _hacky_tag_parser { |
21 | my ($text, $handler) = @_; |
22 | while ( |
23 | $text =~ m{ |
24 | ( |
25 | (?:[^<]*) < (?: |
26 | ( / )? ( [^/!<>\s"'=]+ ) |
27 | ( (?:"[^"]*"|'[^']*'|[^"'<>])+? )? |
28 | | |
29 | (!-- .*? -- | ![^\-] .*? ) |
30 | ) (\s*/\s*)? > |
31 | ) |
32 | ([^<]*) |
33 | }sxg |
34 | ) { |
35 | my ($whole, $is_close, $tag_name, $attributes, $is_comment, |
36 | $in_place_close, $content) |
37 | = ($1, $2, $3, $4, $5, $6, $7, $8); |
38 | next if defined $is_comment; |
39 | $tag_name =~ tr/A-Z/a-z/; |
40 | if ($is_close) { |
41 | $handler->({ type => 'CLOSE', name => $tag_name, raw => $whole }); |
42 | } else { |
43 | $attributes = '' if !defined($attributes) or $attributes =~ /^ +$/; |
44 | $handler->({ |
45 | type => 'OPEN', |
46 | name => $tag_name, |
47 | is_in_place_close => $in_place_close, |
48 | _hacky_attribute_parser($attributes), |
49 | raw_attrs => $attributes||'', |
50 | raw => $whole, |
51 | }); |
52 | if ($in_place_close) { |
53 | $handler->({ |
54 | type => 'CLOSE', name => $tag_name, raw => '', |
55 | is_in_place_close => 1 |
56 | }); |
57 | } |
58 | } |
59 | if (length $content) { |
60 | $handler->({ type => 'TEXT', raw => $content }); |
61 | } |
62 | } |
63 | } |
64 | |
65 | sub _hacky_attribute_parser { |
66 | my ($attr_text) = @_; |
67 | my (%attrs, @attr_names); |
68 | while ( |
69 | $attr_text =~ m{ |
70 | ([^\s\=\"\']+)(\s*=\s*(?:(")(.*?)"|(')(.*?)'|([^'"\s=]+)['"]*))? |
71 | }sgx |
72 | ) { |
73 | my $key = $1; |
74 | my $test = $2; |
75 | my $val = ( $3 ? $4 : ( $5 ? $6 : $7 )); |
76 | my $lckey = lc($key); |
77 | if ($test) { |
78 | $attrs{$lckey} = _simple_unescape($val); |
79 | } else { |
80 | $attrs{$lckey} = $lckey; |
81 | } |
82 | push(@attr_names, $lckey); |
83 | } |
84 | (attrs => \%attrs, attr_names => \@attr_names); |
85 | } |
86 | |
87 | sub _simple_unescape { |
88 | my $str = shift; |
89 | $str =~ s/"/"/g; |
90 | $str =~ s/</</g; |
91 | $str =~ s/>/>/g; |
92 | $str =~ s/&/&/g; |
93 | $str; |
94 | } |
95 | |
96 | sub _simple_escape { |
97 | my $str = shift; |
98 | $str =~ s/&/&/g; |
99 | $str =~ s/"/"/g; |
100 | $str =~ s/</</g; |
101 | $str =~ s/>/>/g; |
102 | $str; |
103 | } |
104 | |
f8ed299b |
105 | sub html_escape { _simple_escape($_[1]) } |
106 | |
107 | sub html_unescape { _simple_unescape($_[1]) } |
108 | |
456a815d |
109 | 1; |