drop perl requirement to 5.8
[catagits/DOM-Tiny.git] / lib / DOM / Tiny / HTML.pm
CommitLineData
d6512b50 1package DOM::Tiny::HTML;
2
3use strict;
4use warnings;
5use DOM::Tiny::Entities qw(html_unescape xml_escape);
6use Scalar::Util 'weaken';
7use Class::Tiny::Chained 'xml', { tree => sub { ['root'] } };
8
927f1351 9our $VERSION = '0.001';
10
d6512b50 11my $ATTR_RE = qr/
927f1351 12 ([^<>=\s\/]+|\/) # Key
d6512b50 13 (?:
14 \s*=\s*
15 (?s:(["'])(.*?)\g{-2}|([^>\s]*)) # Value
16 )?
17 \s*
18/x;
19my $TOKEN_RE = qr/
20 ([^<]+)? # Text
21 (?:
22 <(?:
23 !(?:
24 DOCTYPE(
25 \s+\w+ # Doctype
26 (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
27 (?:\s+\[.+?\])? # Int Subset
28 \s*)
29 |
30 --(.*?)--\s* # Comment
31 |
32 \[CDATA\[(.*?)\]\] # CDATA
33 )
34 |
35 \?(.*?)\? # Processing Instruction
36 |
37 \s*([^<>\s]+\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag
38 )>
39 |
40 (<) # Runaway "<"
41 )??
42/xis;
43
44# HTML elements that only contain raw text
45my %RAW = map { $_ => 1 } qw(script style);
46
47# HTML elements that only contain raw text and entities
48my %RCDATA = map { $_ => 1 } qw(title textarea);
49
50# HTML elements with optional end tags
51my %END = (body => 'head', optgroup => 'optgroup', option => 'option');
52
53# HTML elements that break paragraphs
54map { $END{$_} = 'p' } (
55 qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
56 qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
57);
58
59# HTML table elements with optional end tags
60my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
61
62# HTML elements with optional end tags and scoping rules
63my %CLOSE
64 = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
65$CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
66$CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt);
67$CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt);
68$CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
69
70# HTML elements without end tags
71my %EMPTY = map { $_ => 1 } (
72 qw(area base br col embed hr img input keygen link menuitem meta param),
73 qw(source track wbr)
74);
75
76# HTML elements categorized as phrasing content (and obsolete inline elements)
77my @PHRASING = (
78 qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
79 qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
80 qw(math meta meter noscript object output picture progress q ruby s samp),
81 qw(script select small span strong sub sup svg template textarea time u),
82 qw(var video wbr)
83);
84my @OBSOLETE = qw(acronym applet basefont big font strike tt);
85my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
86
87# HTML elements that don't get their self-closing flag acknowledged
88my %BLOCK = map { $_ => 1 } (
89 qw(a address applet article aside b big blockquote body button caption),
90 qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
91 qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
92 qw(header hgroup html i iframe li listing main marquee menu nav nobr),
93 qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
94 qw(rt s script section select small strike strong style summary table),
95 qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
96);
97
98sub parse {
99 my ($self, $html) = (shift, "$_[0]");
100
101 my $xml = $self->xml;
102 my $current = my $tree = ['root'];
103 while ($html =~ /\G$TOKEN_RE/gcso) {
104 my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
105 = ($1, $2, $3, $4, $5, $6, $11);
106
107 # Text (and runaway "<")
108 $text .= '<' if defined $runaway;
109 _node($current, 'text', html_unescape $text) if defined $text;
110
111 # Tag
112 if (defined $tag) {
113
114 # End
115 if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
116
117 # Start
118 elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
119 my ($start, $attr) = ($xml ? $1 : lc $1, $2);
120
121 # Attributes
122 my (%attrs, $closing);
123 while ($attr =~ /$ATTR_RE/go) {
927f1351 124 my ($key, $value) = ($xml ? $1 : lc $1, defined $3 ? $3 : $4);
d6512b50 125
126 # Empty tag
127 ++$closing and next if $key eq '/';
128
129 $attrs{$key} = defined $value ? html_unescape $value : $value;
130 }
131
132 # "image" is an alias for "img"
133 $start = 'img' if !$xml && $start eq 'image';
134 _start($start, \%attrs, $xml, \$current);
135
136 # Element without end tag (self-closing)
137 _end($start, $xml, \$current)
138 if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
139
140 # Raw text elements
141 next if $xml || !$RAW{$start} && !$RCDATA{$start};
142 next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
143 _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
144 _end($start, 0, \$current);
145 }
146 }
147
148 # DOCTYPE
149 elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
150
151 # Comment
152 elsif (defined $comment) { _node($current, 'comment', $comment) }
153
154 # CDATA
155 elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
156
157 # Processing instruction (try to detect XML)
158 elsif (defined $pi) {
159 $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
160 _node($current, 'pi', $pi);
161 }
162 }
163
164 return $self->tree($tree);
165}
166
167sub render { _render($_[0]->tree, $_[0]->xml) }
168
169sub _end {
170 my ($end, $xml, $current) = @_;
171
172 # Search stack for start tag
173 my $next = $$current;
174 do {
175
176 # Ignore useless end tag
177 return if $next->[0] eq 'root';
178
179 # Right tag
180 return $$current = $next->[3] if $next->[1] eq $end;
181
182 # Phrasing content can only cross phrasing content
183 return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
184
185 } while $next = $next->[3];
186}
187
188sub _node {
189 my ($current, $type, $content) = @_;
190 push @$current, my $new = [$type, $content, $current];
191 weaken $new->[2];
192}
193
194sub _render {
195 my ($tree, $xml) = @_;
196
197 # Text (escaped)
198 my $type = $tree->[0];
199 return xml_escape($tree->[1]) if $type eq 'text';
200
201 # Raw text
202 return $tree->[1] if $type eq 'raw';
203
204 # DOCTYPE
205 return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
206
207 # Comment
208 return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
209
210 # CDATA
211 return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
212
213 # Processing instruction
214 return '<?' . $tree->[1] . '?>' if $type eq 'pi';
215
216 # Root
217 return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
218 if $type eq 'root';
219
220 # Start tag
221 my $tag = $tree->[1];
222 my $result = "<$tag";
223
224 # Attributes
225 for my $key (sort keys %{$tree->[2]}) {
226 my $value = $tree->[2]{$key};
227 $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
228 $result .= qq{ $key="} . xml_escape($value) . '"';
229 }
230
231 # No children
232 return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
233 unless $tree->[4];
234
235 # Children
236 no warnings 'recursion';
237 $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
238
239 # End tag
240 return "$result</$tag>";
241}
242
243sub _start {
244 my ($start, $attrs, $xml, $current) = @_;
245
246 # Autoclose optional HTML elements
247 if (!$xml && $$current->[0] ne 'root') {
248 if (my $end = $END{$start}) { _end($end, 0, $current) }
249
250 elsif (my $close = $CLOSE{$start}) {
251 my ($allowed, $scope) = @$close;
252
253 # Close allowed parent elements in scope
254 my $parent = $$current;
255 while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
256 _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
257 $parent = $parent->[3];
258 }
259 }
260 }
261
262 # New tag
263 push @$$current, my $new = ['tag', $start, $attrs, $$current];
264 weaken $new->[3];
265 $$current = $new;
266}
267
2681;
269
270=encoding utf8
271
272=head1 NAME
273
274DOM::Tiny::HTML - HTML/XML engine
275
276=head1 SYNOPSIS
277
278 use DOM::Tiny::HTML;
279
280 # Turn HTML into DOM tree
281 my $html = DOM::Tiny::HTML->new;
282 $html->parse('<div><p id="a">Test</p><p id="b">123</p></div>');
283 my $tree = $html->tree;
284
285=head1 DESCRIPTION
286
287L<DOM::Tiny::HTML> is the HTML/XML engine used by L<DOM::Tiny> based on
288L<Mojo::DOM::HTML>, which is based on the
289L<HTML Living Standard|https://html.spec.whatwg.org> as well as the
290L<Extensible Markup Language (XML) 1.0|http://www.w3.org/TR/xml/>.
291
292=head1 ATTRIBUTES
293
294L<DOM::Tiny::HTML> implements the following attributes.
295
296=head2 tree
297
298 my $tree = $html->tree;
299 $html = $html->tree(['root']);
300
301Document Object Model. Note that this structure should only be used very
302carefully since it is very dynamic.
303
304=head2 xml
305
306 my $bool = $html->xml;
307 $html = $html->xml($bool);
308
309Disable HTML semantics in parser and activate case-sensitivity, defaults to
310auto detection based on processing instructions.
311
312=head1 METHODS
313
314L<DOM::Tiny::HTML> implements the following methods.
315
316=head2 parse
317
318 $html = $html->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
319
320Parse HTML/XML fragment.
321
322=head2 render
323
324 my $str = $html->render;
325
326Render DOM to HTML/XML.
327
328=head1 BUGS
329
330Report any issues on the public bugtracker.
331
332=head1 AUTHOR
333
334Dan Book <dbook@cpan.org>
335
336=head1 COPYRIGHT AND LICENSE
337
338This software is Copyright (c) 2015 by Dan Book.
339
340This is free software, licensed under:
341
342 The Artistic License 2.0 (GPL Compatible)
343
344=head1 SEE ALSO
345
346L<Mojo::DOM::HTML>