1 package DOM::Tiny::HTML;
5 use DOM::Tiny::Entities qw(html_unescape xml_escape);
6 use Scalar::Util 'weaken';
7 use Class::Tiny::Chained 'xml', { tree => sub { ['root'] } };
10 ([^<>=\s\/]+|\/) # Key
13 (?s:(["'])(.*?)\g{-2}|([^>\s]*)) # Value
24 (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
25 (?:\s+\[.+?\])? # Int Subset
28 --(.*?)--\s* # Comment
30 \[CDATA\[(.*?)\]\] # CDATA
33 \?(.*?)\? # Processing Instruction
35 \s*([^<>\s]+\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag
42 # HTML elements that only contain raw text
43 my %RAW = map { $_ => 1 } qw(script style);
45 # HTML elements that only contain raw text and entities
46 my %RCDATA = map { $_ => 1 } qw(title textarea);
48 # HTML elements with optional end tags
49 my %END = (body => 'head', optgroup => 'optgroup', option => 'option');
51 # HTML elements that break paragraphs
52 map { $END{$_} = 'p' } (
53 qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
54 qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
57 # HTML table elements with optional end tags
58 my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
60 # HTML elements with optional end tags and scoping rules
62 = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
63 $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
64 $CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt);
65 $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt);
66 $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
68 # HTML elements without end tags
69 my %EMPTY = map { $_ => 1 } (
70 qw(area base br col embed hr img input keygen link menuitem meta param),
74 # HTML elements categorized as phrasing content (and obsolete inline elements)
76 qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
77 qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
78 qw(math meta meter noscript object output picture progress q ruby s samp),
79 qw(script select small span strong sub sup svg template textarea time u),
82 my @OBSOLETE = qw(acronym applet basefont big font strike tt);
83 my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
85 # HTML elements that don't get their self-closing flag acknowledged
86 my %BLOCK = map { $_ => 1 } (
87 qw(a address applet article aside b big blockquote body button caption),
88 qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
89 qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
90 qw(header hgroup html i iframe li listing main marquee menu nav nobr),
91 qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
92 qw(rt s script section select small strike strong style summary table),
93 qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
97 my ($self, $html) = (shift, "$_[0]");
100 my $current = my $tree = ['root'];
101 while ($html =~ /\G$TOKEN_RE/gcso) {
102 my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
103 = ($1, $2, $3, $4, $5, $6, $11);
105 # Text (and runaway "<")
106 $text .= '<' if defined $runaway;
107 _node($current, 'text', html_unescape $text) if defined $text;
113 if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
116 elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
117 my ($start, $attr) = ($xml ? $1 : lc $1, $2);
120 my (%attrs, $closing);
121 while ($attr =~ /$ATTR_RE/go) {
122 my ($key, $value) = ($xml ? $1 : lc $1, $3 // $4);
125 ++$closing and next if $key eq '/';
127 $attrs{$key} = defined $value ? html_unescape $value : $value;
130 # "image" is an alias for "img"
131 $start = 'img' if !$xml && $start eq 'image';
132 _start($start, \%attrs, $xml, \$current);
134 # Element without end tag (self-closing)
135 _end($start, $xml, \$current)
136 if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
139 next if $xml || !$RAW{$start} && !$RCDATA{$start};
140 next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
141 _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
142 _end($start, 0, \$current);
147 elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
150 elsif (defined $comment) { _node($current, 'comment', $comment) }
153 elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
155 # Processing instruction (try to detect XML)
156 elsif (defined $pi) {
157 $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
158 _node($current, 'pi', $pi);
162 return $self->tree($tree);
165 sub render { _render($_[0]->tree, $_[0]->xml) }
168 my ($end, $xml, $current) = @_;
170 # Search stack for start tag
171 my $next = $$current;
174 # Ignore useless end tag
175 return if $next->[0] eq 'root';
178 return $$current = $next->[3] if $next->[1] eq $end;
180 # Phrasing content can only cross phrasing content
181 return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
183 } while $next = $next->[3];
187 my ($current, $type, $content) = @_;
188 push @$current, my $new = [$type, $content, $current];
193 my ($tree, $xml) = @_;
196 my $type = $tree->[0];
197 return xml_escape($tree->[1]) if $type eq 'text';
200 return $tree->[1] if $type eq 'raw';
203 return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
206 return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
209 return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
211 # Processing instruction
212 return '<?' . $tree->[1] . '?>' if $type eq 'pi';
215 return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
219 my $tag = $tree->[1];
220 my $result = "<$tag";
223 for my $key (sort keys %{$tree->[2]}) {
224 my $value = $tree->[2]{$key};
225 $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
226 $result .= qq{ $key="} . xml_escape($value) . '"';
230 return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
234 no warnings 'recursion';
235 $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
238 return "$result</$tag>";
242 my ($start, $attrs, $xml, $current) = @_;
244 # Autoclose optional HTML elements
245 if (!$xml && $$current->[0] ne 'root') {
246 if (my $end = $END{$start}) { _end($end, 0, $current) }
248 elsif (my $close = $CLOSE{$start}) {
249 my ($allowed, $scope) = @$close;
251 # Close allowed parent elements in scope
252 my $parent = $$current;
253 while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
254 _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
255 $parent = $parent->[3];
261 push @$$current, my $new = ['tag', $start, $attrs, $$current];
272 DOM::Tiny::HTML - HTML/XML engine
278 # Turn HTML into DOM tree
279 my $html = DOM::Tiny::HTML->new;
280 $html->parse('<div><p id="a">Test</p><p id="b">123</p></div>');
281 my $tree = $html->tree;
285 L<DOM::Tiny::HTML> is the HTML/XML engine used by L<DOM::Tiny> based on
286 L<Mojo::DOM::HTML>, which is based on the
287 L<HTML Living Standard|https://html.spec.whatwg.org> as well as the
288 L<Extensible Markup Language (XML) 1.0|http://www.w3.org/TR/xml/>.
292 L<DOM::Tiny::HTML> implements the following attributes.
296 my $tree = $html->tree;
297 $html = $html->tree(['root']);
299 Document Object Model. Note that this structure should only be used very
300 carefully since it is very dynamic.
304 my $bool = $html->xml;
305 $html = $html->xml($bool);
307 Disable HTML semantics in parser and activate case-sensitivity, defaults to
308 auto detection based on processing instructions.
312 L<DOM::Tiny::HTML> implements the following methods.
316 $html = $html->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
318 Parse HTML/XML fragment.
322 my $str = $html->render;
324 Render DOM to HTML/XML.
328 Report any issues on the public bugtracker.
332 Dan Book <dbook@cpan.org>
334 =head1 COPYRIGHT AND LICENSE
336 This software is Copyright (c) 2015 by Dan Book.
338 This is free software, licensed under:
340 The Artistic License 2.0 (GPL Compatible)