Commit | Line | Data |
d6512b50 |
1 | package DOM::Tiny::HTML; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use DOM::Tiny::Entities qw(html_unescape xml_escape); |
6 | use Scalar::Util 'weaken'; |
7 | use Class::Tiny::Chained 'xml', { tree => sub { ['root'] } }; |
8 | |
927f1351 |
9 | our $VERSION = '0.001'; |
10 | |
d6512b50 |
11 | my $ATTR_RE = qr/ |
927f1351 |
12 | ([^<>=\s\/]+|\/) # Key |
d6512b50 |
13 | (?: |
14 | \s*=\s* |
15 | (?s:(["'])(.*?)\g{-2}|([^>\s]*)) # Value |
16 | )? |
17 | \s* |
18 | /x; |
19 | my $TOKEN_RE = qr/ |
20 | ([^<]+)? # Text |
21 | (?: |
22 | <(?: |
23 | !(?: |
24 | DOCTYPE( |
25 | \s+\w+ # Doctype |
26 | (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID |
27 | (?:\s+\[.+?\])? # Int Subset |
28 | \s*) |
29 | | |
30 | --(.*?)--\s* # Comment |
31 | | |
32 | \[CDATA\[(.*?)\]\] # CDATA |
33 | ) |
34 | | |
35 | \?(.*?)\? # Processing Instruction |
36 | | |
37 | \s*([^<>\s]+\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag |
38 | )> |
39 | | |
40 | (<) # Runaway "<" |
41 | )?? |
42 | /xis; |
43 | |
44 | # HTML elements that only contain raw text |
45 | my %RAW = map { $_ => 1 } qw(script style); |
46 | |
47 | # HTML elements that only contain raw text and entities |
48 | my %RCDATA = map { $_ => 1 } qw(title textarea); |
49 | |
50 | # HTML elements with optional end tags |
51 | my %END = (body => 'head', optgroup => 'optgroup', option => 'option'); |
52 | |
53 | # HTML elements that break paragraphs |
54 | map { $END{$_} = 'p' } ( |
55 | qw(address article aside blockquote dir div dl fieldset footer form h1 h2), |
56 | qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul) |
57 | ); |
58 | |
59 | # HTML table elements with optional end tags |
60 | my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr); |
61 | |
62 | # HTML elements with optional end tags and scoping rules |
63 | my %CLOSE |
64 | = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]); |
65 | $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead); |
66 | $CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt); |
67 | $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt); |
68 | $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th); |
69 | |
70 | # HTML elements without end tags |
71 | my %EMPTY = map { $_ => 1 } ( |
72 | qw(area base br col embed hr img input keygen link menuitem meta param), |
73 | qw(source track wbr) |
74 | ); |
75 | |
76 | # HTML elements categorized as phrasing content (and obsolete inline elements) |
77 | my @PHRASING = ( |
78 | qw(a abbr area audio b bdi bdo br button canvas cite code data datalist), |
79 | qw(del dfn em embed i iframe img input ins kbd keygen label link map mark), |
80 | qw(math meta meter noscript object output picture progress q ruby s samp), |
81 | qw(script select small span strong sub sup svg template textarea time u), |
82 | qw(var video wbr) |
83 | ); |
84 | my @OBSOLETE = qw(acronym applet basefont big font strike tt); |
85 | my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING; |
86 | |
87 | # HTML elements that don't get their self-closing flag acknowledged |
88 | my %BLOCK = map { $_ => 1 } ( |
89 | qw(a address applet article aside b big blockquote body button caption), |
90 | qw(center code col colgroup dd details dialog dir div dl dt em fieldset), |
91 | qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head), |
92 | qw(header hgroup html i iframe li listing main marquee menu nav nobr), |
93 | qw(noembed noframes noscript object ol optgroup option p plaintext pre rp), |
94 | qw(rt s script section select small strike strong style summary table), |
95 | qw(tbody td template textarea tfoot th thead title tr tt u ul xmp) |
96 | ); |
97 | |
98 | sub parse { |
99 | my ($self, $html) = (shift, "$_[0]"); |
100 | |
101 | my $xml = $self->xml; |
102 | my $current = my $tree = ['root']; |
103 | while ($html =~ /\G$TOKEN_RE/gcso) { |
104 | my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway) |
105 | = ($1, $2, $3, $4, $5, $6, $11); |
106 | |
107 | # Text (and runaway "<") |
108 | $text .= '<' if defined $runaway; |
109 | _node($current, 'text', html_unescape $text) if defined $text; |
110 | |
111 | # Tag |
112 | if (defined $tag) { |
113 | |
114 | # End |
115 | if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) } |
116 | |
117 | # Start |
118 | elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) { |
119 | my ($start, $attr) = ($xml ? $1 : lc $1, $2); |
120 | |
121 | # Attributes |
122 | my (%attrs, $closing); |
123 | while ($attr =~ /$ATTR_RE/go) { |
927f1351 |
124 | my ($key, $value) = ($xml ? $1 : lc $1, defined $3 ? $3 : $4); |
d6512b50 |
125 | |
126 | # Empty tag |
127 | ++$closing and next if $key eq '/'; |
128 | |
129 | $attrs{$key} = defined $value ? html_unescape $value : $value; |
130 | } |
131 | |
132 | # "image" is an alias for "img" |
133 | $start = 'img' if !$xml && $start eq 'image'; |
134 | _start($start, \%attrs, $xml, \$current); |
135 | |
136 | # Element without end tag (self-closing) |
137 | _end($start, $xml, \$current) |
138 | if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing; |
139 | |
140 | # Raw text elements |
141 | next if $xml || !$RAW{$start} && !$RCDATA{$start}; |
142 | next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi; |
143 | _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1); |
144 | _end($start, 0, \$current); |
145 | } |
146 | } |
147 | |
148 | # DOCTYPE |
149 | elsif (defined $doctype) { _node($current, 'doctype', $doctype) } |
150 | |
151 | # Comment |
152 | elsif (defined $comment) { _node($current, 'comment', $comment) } |
153 | |
154 | # CDATA |
155 | elsif (defined $cdata) { _node($current, 'cdata', $cdata) } |
156 | |
157 | # Processing instruction (try to detect XML) |
158 | elsif (defined $pi) { |
159 | $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i; |
160 | _node($current, 'pi', $pi); |
161 | } |
162 | } |
163 | |
164 | return $self->tree($tree); |
165 | } |
166 | |
167 | sub render { _render($_[0]->tree, $_[0]->xml) } |
168 | |
169 | sub _end { |
170 | my ($end, $xml, $current) = @_; |
171 | |
172 | # Search stack for start tag |
173 | my $next = $$current; |
174 | do { |
175 | |
176 | # Ignore useless end tag |
177 | return if $next->[0] eq 'root'; |
178 | |
179 | # Right tag |
180 | return $$current = $next->[3] if $next->[1] eq $end; |
181 | |
182 | # Phrasing content can only cross phrasing content |
183 | return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]}; |
184 | |
185 | } while $next = $next->[3]; |
186 | } |
187 | |
188 | sub _node { |
189 | my ($current, $type, $content) = @_; |
190 | push @$current, my $new = [$type, $content, $current]; |
191 | weaken $new->[2]; |
192 | } |
193 | |
194 | sub _render { |
195 | my ($tree, $xml) = @_; |
196 | |
197 | # Text (escaped) |
198 | my $type = $tree->[0]; |
199 | return xml_escape($tree->[1]) if $type eq 'text'; |
200 | |
201 | # Raw text |
202 | return $tree->[1] if $type eq 'raw'; |
203 | |
204 | # DOCTYPE |
205 | return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype'; |
206 | |
207 | # Comment |
208 | return '<!--' . $tree->[1] . '-->' if $type eq 'comment'; |
209 | |
210 | # CDATA |
211 | return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata'; |
212 | |
213 | # Processing instruction |
214 | return '<?' . $tree->[1] . '?>' if $type eq 'pi'; |
215 | |
216 | # Root |
217 | return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree] |
218 | if $type eq 'root'; |
219 | |
220 | # Start tag |
221 | my $tag = $tree->[1]; |
222 | my $result = "<$tag"; |
223 | |
224 | # Attributes |
225 | for my $key (sort keys %{$tree->[2]}) { |
226 | my $value = $tree->[2]{$key}; |
227 | $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value; |
228 | $result .= qq{ $key="} . xml_escape($value) . '"'; |
229 | } |
230 | |
231 | # No children |
232 | return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>" |
233 | unless $tree->[4]; |
234 | |
235 | # Children |
236 | no warnings 'recursion'; |
237 | $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree]; |
238 | |
239 | # End tag |
240 | return "$result</$tag>"; |
241 | } |
242 | |
243 | sub _start { |
244 | my ($start, $attrs, $xml, $current) = @_; |
245 | |
246 | # Autoclose optional HTML elements |
247 | if (!$xml && $$current->[0] ne 'root') { |
248 | if (my $end = $END{$start}) { _end($end, 0, $current) } |
249 | |
250 | elsif (my $close = $CLOSE{$start}) { |
251 | my ($allowed, $scope) = @$close; |
252 | |
253 | # Close allowed parent elements in scope |
254 | my $parent = $$current; |
255 | while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) { |
256 | _end($parent->[1], 0, $current) if $allowed->{$parent->[1]}; |
257 | $parent = $parent->[3]; |
258 | } |
259 | } |
260 | } |
261 | |
262 | # New tag |
263 | push @$$current, my $new = ['tag', $start, $attrs, $$current]; |
264 | weaken $new->[3]; |
265 | $$current = $new; |
266 | } |
267 | |
268 | 1; |
269 | |
270 | =encoding utf8 |
271 | |
272 | =head1 NAME |
273 | |
274 | DOM::Tiny::HTML - HTML/XML engine |
275 | |
276 | =head1 SYNOPSIS |
277 | |
278 | use DOM::Tiny::HTML; |
279 | |
280 | # Turn HTML into DOM tree |
281 | my $html = DOM::Tiny::HTML->new; |
282 | $html->parse('<div><p id="a">Test</p><p id="b">123</p></div>'); |
283 | my $tree = $html->tree; |
284 | |
285 | =head1 DESCRIPTION |
286 | |
287 | L<DOM::Tiny::HTML> is the HTML/XML engine used by L<DOM::Tiny> based on |
288 | L<Mojo::DOM::HTML>, which is based on the |
289 | L<HTML Living Standard|https://html.spec.whatwg.org> as well as the |
290 | L<Extensible Markup Language (XML) 1.0|http://www.w3.org/TR/xml/>. |
291 | |
292 | =head1 ATTRIBUTES |
293 | |
294 | L<DOM::Tiny::HTML> implements the following attributes. |
295 | |
296 | =head2 tree |
297 | |
298 | my $tree = $html->tree; |
299 | $html = $html->tree(['root']); |
300 | |
301 | Document Object Model. Note that this structure should only be used very |
302 | carefully since it is very dynamic. |
303 | |
304 | =head2 xml |
305 | |
306 | my $bool = $html->xml; |
307 | $html = $html->xml($bool); |
308 | |
309 | Disable HTML semantics in parser and activate case-sensitivity, defaults to |
310 | auto detection based on processing instructions. |
311 | |
312 | =head1 METHODS |
313 | |
314 | L<DOM::Tiny::HTML> implements the following methods. |
315 | |
316 | =head2 parse |
317 | |
318 | $html = $html->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>'); |
319 | |
320 | Parse HTML/XML fragment. |
321 | |
322 | =head2 render |
323 | |
324 | my $str = $html->render; |
325 | |
326 | Render DOM to HTML/XML. |
327 | |
328 | =head1 BUGS |
329 | |
330 | Report any issues on the public bugtracker. |
331 | |
332 | =head1 AUTHOR |
333 | |
334 | Dan Book <dbook@cpan.org> |
335 | |
336 | =head1 COPYRIGHT AND LICENSE |
337 | |
338 | This software is Copyright (c) 2015 by Dan Book. |
339 | |
340 | This is free software, licensed under: |
341 | |
342 | The Artistic License 2.0 (GPL Compatible) |
343 | |
344 | =head1 SEE ALSO |
345 | |
346 | L<Mojo::DOM::HTML> |