1 package DOM::Tiny::HTML;
5 use DOM::Tiny::Entities qw(html_escape html_unescape);
6 use Scalar::Util 'weaken';
8 our $VERSION = '0.001';
11 ([^<>=\s\/]+|\/) # Key
14 (?s:(["'])(.*?)\g{-2}|([^>\s]*)) # Value
25 (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
26 (?:\s+\[.+?\])? # Int Subset
29 --(.*?)--\s* # Comment
31 \[CDATA\[(.*?)\]\] # CDATA
34 \?(.*?)\? # Processing Instruction
36 \s*([^<>\s]+\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag
43 # HTML elements that only contain raw text
44 my %RAW = map { $_ => 1 } qw(script style);
46 # HTML elements that only contain raw text and entities
47 my %RCDATA = map { $_ => 1 } qw(title textarea);
49 # HTML elements with optional end tags
50 my %END = (body => 'head', optgroup => 'optgroup', option => 'option');
52 # HTML elements that break paragraphs
53 map { $END{$_} = 'p' } (
54 qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
55 qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
58 # HTML table elements with optional end tags
59 my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
61 # HTML elements with optional end tags and scoping rules
63 = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
64 $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
65 $CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt);
66 $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt);
67 $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
69 # HTML elements without end tags
70 my %EMPTY = map { $_ => 1 } (
71 qw(area base br col embed hr img input keygen link menuitem meta param),
75 # HTML elements categorized as phrasing content (and obsolete inline elements)
77 qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
78 qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
79 qw(math meta meter noscript object output picture progress q ruby s samp),
80 qw(script select small span strong sub sup svg template textarea time u),
83 my @OBSOLETE = qw(acronym applet basefont big font strike tt);
84 my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
86 # HTML elements that don't get their self-closing flag acknowledged
87 my %BLOCK = map { $_ => 1 } (
88 qw(a address applet article aside b big blockquote body button caption),
89 qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
90 qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
91 qw(header hgroup html i iframe li listing main marquee menu nav nobr),
92 qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
93 qw(rt s script section select small strike strong style summary table),
94 qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
99 bless @_ ? @_ > 1 ? {@_} : {%{$_[0]}} : {}, ref $class || $class;
104 return exists $self->{tree} ? $self->{tree} : ($self->{tree} = ['root']) unless @_;
105 $self->{tree} = shift;
111 return $self->{xml} unless @_;
112 $self->{xml} = shift;
117 my ($self, $html) = (shift, "$_[0]");
119 my $xml = $self->xml;
120 my $current = my $tree = ['root'];
121 while ($html =~ /\G$TOKEN_RE/gcso) {
122 my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
123 = ($1, $2, $3, $4, $5, $6, $11);
125 # Text (and runaway "<")
126 $text .= '<' if defined $runaway;
127 _node($current, 'text', html_unescape $text) if defined $text;
133 if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
136 elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
137 my ($start, $attr) = ($xml ? $1 : lc $1, $2);
140 my (%attrs, $closing);
141 while ($attr =~ /$ATTR_RE/go) {
142 my ($key, $value) = ($xml ? $1 : lc $1, $3 // $4);
145 ++$closing and next if $key eq '/';
147 $attrs{$key} = defined $value ? html_unescape $value : $value;
150 # "image" is an alias for "img"
151 $start = 'img' if !$xml && $start eq 'image';
152 _start($start, \%attrs, $xml, \$current);
154 # Element without end tag (self-closing)
155 _end($start, $xml, \$current)
156 if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
159 next if $xml || !$RAW{$start} && !$RCDATA{$start};
160 next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
161 _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
162 _end($start, 0, \$current);
167 elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
170 elsif (defined $comment) { _node($current, 'comment', $comment) }
173 elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
175 # Processing instruction (try to detect XML)
176 elsif (defined $pi) {
177 $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
178 _node($current, 'pi', $pi);
182 return $self->tree($tree);
185 sub render { _render($_[0]->tree, $_[0]->xml) }
188 my ($end, $xml, $current) = @_;
190 # Search stack for start tag
191 my $next = $$current;
194 # Ignore useless end tag
195 return if $next->[0] eq 'root';
198 return $$current = $next->[3] if $next->[1] eq $end;
200 # Phrasing content can only cross phrasing content
201 return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
203 } while $next = $next->[3];
207 my ($current, $type, $content) = @_;
208 push @$current, my $new = [$type, $content, $current];
213 my ($tree, $xml) = @_;
216 my $type = $tree->[0];
217 return html_escape($tree->[1]) if $type eq 'text';
220 return $tree->[1] if $type eq 'raw';
223 return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
226 return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
229 return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
231 # Processing instruction
232 return '<?' . $tree->[1] . '?>' if $type eq 'pi';
235 return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
239 my $tag = $tree->[1];
240 my $result = "<$tag";
243 for my $key (sort keys %{$tree->[2]}) {
244 my $value = $tree->[2]{$key};
245 $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
246 $result .= qq{ $key="} . html_escape($value) . '"';
250 return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
254 no warnings 'recursion';
255 $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
258 return "$result</$tag>";
262 my ($start, $attrs, $xml, $current) = @_;
264 # Autoclose optional HTML elements
265 if (!$xml && $$current->[0] ne 'root') {
266 if (my $end = $END{$start}) { _end($end, 0, $current) }
268 elsif (my $close = $CLOSE{$start}) {
269 my ($allowed, $scope) = @$close;
271 # Close allowed parent elements in scope
272 my $parent = $$current;
273 while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
274 _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
275 $parent = $parent->[3];
281 push @$$current, my $new = ['tag', $start, $attrs, $$current];
292 DOM::Tiny::HTML - HTML/XML engine
298 # Turn HTML into DOM tree
299 my $html = DOM::Tiny::HTML->new;
300 $html->parse('<div><p id="a">Test</p><p id="b">123</p></div>');
301 my $tree = $html->tree;
305 L<DOM::Tiny::HTML> is the HTML/XML engine used by L<DOM::Tiny> based on
306 L<Mojo::DOM::HTML>, which is based on the
307 L<HTML Living Standard|https://html.spec.whatwg.org> as well as the
308 L<Extensible Markup Language (XML) 1.0|http://www.w3.org/TR/xml/>.
312 L<DOM::Tiny::HTML> implements the following attributes.
316 my $tree = $html->tree;
317 $html = $html->tree(['root']);
319 Document Object Model. Note that this structure should only be used very
320 carefully since it is very dynamic.
324 my $bool = $html->xml;
325 $html = $html->xml($bool);
327 Disable HTML semantics in parser and activate case-sensitivity, defaults to
328 auto detection based on processing instructions.
332 L<DOM::Tiny::HTML> implements the following methods.
336 my $html = DOM::Tiny::HTML->new;
337 my $html = DOM::Tiny::HTML->new(xml => 1);
338 my $html = DOM::Tiny::HTML->new({xml => 1});
340 Construct a new hash-based L<DOM::Tiny::HTML> object.
344 $html = $html->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
346 Parse HTML/XML fragment.
350 my $str = $html->render;
352 Render DOM to HTML/XML.
356 Report any issues on the public bugtracker.
360 Dan Book <dbook@cpan.org>
362 =head1 COPYRIGHT AND LICENSE
364 This software is Copyright (c) 2015 by Dan Book.
366 This is free software, licensed under:
368 The Artistic License 2.0 (GPL Compatible)