1 package DOM::Tiny::HTML;
5 use DOM::Tiny::Entities qw(html_escape html_unescape);
6 use Scalar::Util 'weaken';
7 use Class::Tiny::Chained 'xml', { tree => sub { ['root'] } };
9 our $VERSION = '0.001';
12 ([^<>=\s\/]+|\/) # Key
15 (?s:(["'])(.*?)\g{-2}|([^>\s]*)) # Value
26 (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
27 (?:\s+\[.+?\])? # Int Subset
30 --(.*?)--\s* # Comment
32 \[CDATA\[(.*?)\]\] # CDATA
35 \?(.*?)\? # Processing Instruction
37 \s*([^<>\s]+\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag
44 # HTML elements that only contain raw text
45 my %RAW = map { $_ => 1 } qw(script style);
47 # HTML elements that only contain raw text and entities
48 my %RCDATA = map { $_ => 1 } qw(title textarea);
50 # HTML elements with optional end tags
51 my %END = (body => 'head', optgroup => 'optgroup', option => 'option');
53 # HTML elements that break paragraphs
54 map { $END{$_} = 'p' } (
55 qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
56 qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
59 # HTML table elements with optional end tags
60 my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
62 # HTML elements with optional end tags and scoping rules
64 = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
65 $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
66 $CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt);
67 $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt);
68 $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
70 # HTML elements without end tags
71 my %EMPTY = map { $_ => 1 } (
72 qw(area base br col embed hr img input keygen link menuitem meta param),
76 # HTML elements categorized as phrasing content (and obsolete inline elements)
78 qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
79 qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
80 qw(math meta meter noscript object output picture progress q ruby s samp),
81 qw(script select small span strong sub sup svg template textarea time u),
84 my @OBSOLETE = qw(acronym applet basefont big font strike tt);
85 my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
87 # HTML elements that don't get their self-closing flag acknowledged
88 my %BLOCK = map { $_ => 1 } (
89 qw(a address applet article aside b big blockquote body button caption),
90 qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
91 qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
92 qw(header hgroup html i iframe li listing main marquee menu nav nobr),
93 qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
94 qw(rt s script section select small strike strong style summary table),
95 qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
99 my ($self, $html) = (shift, "$_[0]");
101 my $xml = $self->xml;
102 my $current = my $tree = ['root'];
103 while ($html =~ /\G$TOKEN_RE/gcso) {
104 my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
105 = ($1, $2, $3, $4, $5, $6, $11);
107 # Text (and runaway "<")
108 $text .= '<' if defined $runaway;
109 _node($current, 'text', html_unescape $text) if defined $text;
115 if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
118 elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
119 my ($start, $attr) = ($xml ? $1 : lc $1, $2);
122 my (%attrs, $closing);
123 while ($attr =~ /$ATTR_RE/go) {
124 my ($key, $value) = ($xml ? $1 : lc $1, $3 // $4);
127 ++$closing and next if $key eq '/';
129 $attrs{$key} = defined $value ? html_unescape $value : $value;
132 # "image" is an alias for "img"
133 $start = 'img' if !$xml && $start eq 'image';
134 _start($start, \%attrs, $xml, \$current);
136 # Element without end tag (self-closing)
137 _end($start, $xml, \$current)
138 if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
141 next if $xml || !$RAW{$start} && !$RCDATA{$start};
142 next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
143 _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
144 _end($start, 0, \$current);
149 elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
152 elsif (defined $comment) { _node($current, 'comment', $comment) }
155 elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
157 # Processing instruction (try to detect XML)
158 elsif (defined $pi) {
159 $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
160 _node($current, 'pi', $pi);
164 return $self->tree($tree);
167 sub render { _render($_[0]->tree, $_[0]->xml) }
170 my ($end, $xml, $current) = @_;
172 # Search stack for start tag
173 my $next = $$current;
176 # Ignore useless end tag
177 return if $next->[0] eq 'root';
180 return $$current = $next->[3] if $next->[1] eq $end;
182 # Phrasing content can only cross phrasing content
183 return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
185 } while $next = $next->[3];
189 my ($current, $type, $content) = @_;
190 push @$current, my $new = [$type, $content, $current];
195 my ($tree, $xml) = @_;
198 my $type = $tree->[0];
199 return html_escape($tree->[1]) if $type eq 'text';
202 return $tree->[1] if $type eq 'raw';
205 return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
208 return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
211 return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
213 # Processing instruction
214 return '<?' . $tree->[1] . '?>' if $type eq 'pi';
217 return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
221 my $tag = $tree->[1];
222 my $result = "<$tag";
225 for my $key (sort keys %{$tree->[2]}) {
226 my $value = $tree->[2]{$key};
227 $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
228 $result .= qq{ $key="} . html_escape($value) . '"';
232 return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
236 no warnings 'recursion';
237 $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
240 return "$result</$tag>";
244 my ($start, $attrs, $xml, $current) = @_;
246 # Autoclose optional HTML elements
247 if (!$xml && $$current->[0] ne 'root') {
248 if (my $end = $END{$start}) { _end($end, 0, $current) }
250 elsif (my $close = $CLOSE{$start}) {
251 my ($allowed, $scope) = @$close;
253 # Close allowed parent elements in scope
254 my $parent = $$current;
255 while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
256 _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
257 $parent = $parent->[3];
263 push @$$current, my $new = ['tag', $start, $attrs, $$current];
274 DOM::Tiny::HTML - HTML/XML engine
280 # Turn HTML into DOM tree
281 my $html = DOM::Tiny::HTML->new;
282 $html->parse('<div><p id="a">Test</p><p id="b">123</p></div>');
283 my $tree = $html->tree;
287 L<DOM::Tiny::HTML> is the HTML/XML engine used by L<DOM::Tiny> based on
288 L<Mojo::DOM::HTML>, which is based on the
289 L<HTML Living Standard|https://html.spec.whatwg.org> as well as the
290 L<Extensible Markup Language (XML) 1.0|http://www.w3.org/TR/xml/>.
294 L<DOM::Tiny::HTML> implements the following attributes.
298 my $tree = $html->tree;
299 $html = $html->tree(['root']);
301 Document Object Model. Note that this structure should only be used very
302 carefully since it is very dynamic.
306 my $bool = $html->xml;
307 $html = $html->xml($bool);
309 Disable HTML semantics in parser and activate case-sensitivity, defaults to
310 auto detection based on processing instructions.
314 L<DOM::Tiny::HTML> implements the following methods.
318 $html = $html->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
320 Parse HTML/XML fragment.
324 my $str = $html->render;
326 Render DOM to HTML/XML.
330 Report any issues on the public bugtracker.
334 Dan Book <dbook@cpan.org>
336 =head1 COPYRIGHT AND LICENSE
338 This software is Copyright (c) 2015 by Dan Book.
340 This is free software, licensed under:
342 The Artistic License 2.0 (GPL Compatible)