Commit | Line | Data |
9a5f1e3f |
1 | package DOM::Tiny::_HTML; |
d6512b50 |
2 | |
3 | use strict; |
4 | use warnings; |
e085469f |
5 | use DOM::Tiny::Entities qw(html_escape html_unescape); |
d6512b50 |
6 | use Scalar::Util 'weaken'; |
d6512b50 |
7 | |
8398aa8a |
8 | our $VERSION = '0.002'; |
927f1351 |
9 | |
d6512b50 |
10 | my $ATTR_RE = qr/ |
2d9f5165 |
11 | ([^<>=\s\/]+|\/) # Key |
d6512b50 |
12 | (?: |
13 | \s*=\s* |
2d9f5165 |
14 | (?s:(?:"(.*?)")|(?:'(.*?)')|([^>\s]*)) # Value |
d6512b50 |
15 | )? |
16 | \s* |
17 | /x; |
18 | my $TOKEN_RE = qr/ |
19 | ([^<]+)? # Text |
20 | (?: |
21 | <(?: |
22 | !(?: |
23 | DOCTYPE( |
24 | \s+\w+ # Doctype |
25 | (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID |
26 | (?:\s+\[.+?\])? # Int Subset |
27 | \s*) |
28 | | |
29 | --(.*?)--\s* # Comment |
30 | | |
31 | \[CDATA\[(.*?)\]\] # CDATA |
32 | ) |
33 | | |
34 | \?(.*?)\? # Processing Instruction |
35 | | |
2d9f5165 |
36 | \s*([^<>\s]+\s*(?>(?:$ATTR_RE){0,32766})*) # Tag |
37 | # Workaround for perl's limit of * to {0,32767} |
d6512b50 |
38 | )> |
39 | | |
40 | (<) # Runaway "<" |
41 | )?? |
42 | /xis; |
43 | |
44 | # HTML elements that only contain raw text |
45 | my %RAW = map { $_ => 1 } qw(script style); |
46 | |
47 | # HTML elements that only contain raw text and entities |
48 | my %RCDATA = map { $_ => 1 } qw(title textarea); |
49 | |
28d93999 |
50 | # HTML elements with optional end tags |
51 | my %END = (body => 'head', optgroup => 'optgroup', option => 'option'); |
52 | |
53 | # HTML elements that break paragraphs |
54 | $END{$_} = 'p' for |
55 | qw(address article aside blockquote dir div dl fieldset footer form h1 h2), |
56 | qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul); |
d6512b50 |
57 | |
58 | # HTML table elements with optional end tags |
59 | my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr); |
60 | |
61 | # HTML elements with optional end tags and scoping rules |
62 | my %CLOSE |
63 | = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]); |
64 | $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead); |
65 | $CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt); |
66 | $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt); |
67 | $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th); |
68 | |
69 | # HTML elements without end tags |
70 | my %EMPTY = map { $_ => 1 } ( |
71 | qw(area base br col embed hr img input keygen link menuitem meta param), |
72 | qw(source track wbr) |
73 | ); |
74 | |
75 | # HTML elements categorized as phrasing content (and obsolete inline elements) |
76 | my @PHRASING = ( |
77 | qw(a abbr area audio b bdi bdo br button canvas cite code data datalist), |
78 | qw(del dfn em embed i iframe img input ins kbd keygen label link map mark), |
79 | qw(math meta meter noscript object output picture progress q ruby s samp), |
80 | qw(script select small span strong sub sup svg template textarea time u), |
81 | qw(var video wbr) |
82 | ); |
83 | my @OBSOLETE = qw(acronym applet basefont big font strike tt); |
84 | my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING; |
85 | |
86 | # HTML elements that don't get their self-closing flag acknowledged |
87 | my %BLOCK = map { $_ => 1 } ( |
88 | qw(a address applet article aside b big blockquote body button caption), |
89 | qw(center code col colgroup dd details dialog dir div dl dt em fieldset), |
90 | qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head), |
91 | qw(header hgroup html i iframe li listing main marquee menu nav nobr), |
92 | qw(noembed noframes noscript object ol optgroup option p plaintext pre rp), |
93 | qw(rt s script section select small strike strong style summary table), |
94 | qw(tbody td template textarea tfoot th thead title tr tt u ul xmp) |
95 | ); |
96 | |
d066f9b8 |
97 | sub new { |
98 | my $class = shift; |
99 | bless @_ ? @_ > 1 ? {@_} : {%{$_[0]}} : {}, ref $class || $class; |
100 | } |
101 | |
102 | sub tree { |
103 | my $self = shift; |
104 | return exists $self->{tree} ? $self->{tree} : ($self->{tree} = ['root']) unless @_; |
105 | $self->{tree} = shift; |
106 | return $self; |
107 | } |
108 | |
109 | sub xml { |
110 | my $self = shift; |
111 | return $self->{xml} unless @_; |
112 | $self->{xml} = shift; |
113 | return $self; |
114 | } |
115 | |
d6512b50 |
116 | sub parse { |
117 | my ($self, $html) = (shift, "$_[0]"); |
118 | |
119 | my $xml = $self->xml; |
120 | my $current = my $tree = ['root']; |
121 | while ($html =~ /\G$TOKEN_RE/gcso) { |
122 | my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway) |
123 | = ($1, $2, $3, $4, $5, $6, $11); |
124 | |
125 | # Text (and runaway "<") |
126 | $text .= '<' if defined $runaway; |
127 | _node($current, 'text', html_unescape $text) if defined $text; |
128 | |
129 | # Tag |
130 | if (defined $tag) { |
131 | |
132 | # End |
133 | if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) } |
134 | |
135 | # Start |
136 | elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) { |
137 | my ($start, $attr) = ($xml ? $1 : lc $1, $2); |
138 | |
139 | # Attributes |
140 | my (%attrs, $closing); |
141 | while ($attr =~ /$ATTR_RE/go) { |
2d9f5165 |
142 | my $key = $xml ? $1 : lc $1; |
143 | my $value = defined($2) ? $2 : defined($3) ? $3 : $4; |
d6512b50 |
144 | |
145 | # Empty tag |
146 | ++$closing and next if $key eq '/'; |
147 | |
148 | $attrs{$key} = defined $value ? html_unescape $value : $value; |
149 | } |
150 | |
151 | # "image" is an alias for "img" |
152 | $start = 'img' if !$xml && $start eq 'image'; |
153 | _start($start, \%attrs, $xml, \$current); |
154 | |
155 | # Element without end tag (self-closing) |
156 | _end($start, $xml, \$current) |
157 | if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing; |
158 | |
159 | # Raw text elements |
160 | next if $xml || !$RAW{$start} && !$RCDATA{$start}; |
161 | next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi; |
162 | _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1); |
163 | _end($start, 0, \$current); |
164 | } |
165 | } |
166 | |
167 | # DOCTYPE |
168 | elsif (defined $doctype) { _node($current, 'doctype', $doctype) } |
169 | |
170 | # Comment |
171 | elsif (defined $comment) { _node($current, 'comment', $comment) } |
172 | |
173 | # CDATA |
174 | elsif (defined $cdata) { _node($current, 'cdata', $cdata) } |
175 | |
176 | # Processing instruction (try to detect XML) |
177 | elsif (defined $pi) { |
178 | $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i; |
179 | _node($current, 'pi', $pi); |
180 | } |
181 | } |
182 | |
183 | return $self->tree($tree); |
184 | } |
185 | |
186 | sub render { _render($_[0]->tree, $_[0]->xml) } |
187 | |
188 | sub _end { |
189 | my ($end, $xml, $current) = @_; |
190 | |
191 | # Search stack for start tag |
192 | my $next = $$current; |
193 | do { |
194 | |
195 | # Ignore useless end tag |
196 | return if $next->[0] eq 'root'; |
197 | |
198 | # Right tag |
199 | return $$current = $next->[3] if $next->[1] eq $end; |
200 | |
201 | # Phrasing content can only cross phrasing content |
202 | return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]}; |
203 | |
204 | } while $next = $next->[3]; |
205 | } |
206 | |
207 | sub _node { |
208 | my ($current, $type, $content) = @_; |
209 | push @$current, my $new = [$type, $content, $current]; |
210 | weaken $new->[2]; |
211 | } |
212 | |
213 | sub _render { |
214 | my ($tree, $xml) = @_; |
215 | |
216 | # Text (escaped) |
217 | my $type = $tree->[0]; |
e085469f |
218 | return html_escape($tree->[1]) if $type eq 'text'; |
d6512b50 |
219 | |
220 | # Raw text |
221 | return $tree->[1] if $type eq 'raw'; |
222 | |
223 | # DOCTYPE |
224 | return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype'; |
225 | |
226 | # Comment |
227 | return '<!--' . $tree->[1] . '-->' if $type eq 'comment'; |
228 | |
229 | # CDATA |
230 | return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata'; |
231 | |
232 | # Processing instruction |
233 | return '<?' . $tree->[1] . '?>' if $type eq 'pi'; |
234 | |
235 | # Root |
236 | return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree] |
237 | if $type eq 'root'; |
238 | |
239 | # Start tag |
240 | my $tag = $tree->[1]; |
241 | my $result = "<$tag"; |
242 | |
243 | # Attributes |
244 | for my $key (sort keys %{$tree->[2]}) { |
245 | my $value = $tree->[2]{$key}; |
246 | $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value; |
e085469f |
247 | $result .= qq{ $key="} . html_escape($value) . '"'; |
d6512b50 |
248 | } |
249 | |
250 | # No children |
251 | return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>" |
252 | unless $tree->[4]; |
253 | |
254 | # Children |
255 | no warnings 'recursion'; |
256 | $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree]; |
257 | |
258 | # End tag |
259 | return "$result</$tag>"; |
260 | } |
261 | |
262 | sub _start { |
263 | my ($start, $attrs, $xml, $current) = @_; |
264 | |
265 | # Autoclose optional HTML elements |
266 | if (!$xml && $$current->[0] ne 'root') { |
267 | if (my $end = $END{$start}) { _end($end, 0, $current) } |
268 | |
269 | elsif (my $close = $CLOSE{$start}) { |
270 | my ($allowed, $scope) = @$close; |
271 | |
272 | # Close allowed parent elements in scope |
273 | my $parent = $$current; |
274 | while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) { |
275 | _end($parent->[1], 0, $current) if $allowed->{$parent->[1]}; |
276 | $parent = $parent->[3]; |
277 | } |
278 | } |
279 | } |
280 | |
281 | # New tag |
282 | push @$$current, my $new = ['tag', $start, $attrs, $$current]; |
283 | weaken $new->[3]; |
284 | $$current = $new; |
285 | } |
286 | |
287 | 1; |
288 | |
9a5f1e3f |
289 | =for Pod::Coverage *EVERYTHING* |
d6512b50 |
290 | |
9a5f1e3f |
291 | =cut |