1 package DOM::Tiny::_HTML;
5 use DOM::Tiny::Entities qw(html_escape html_unescape);
6 use Scalar::Util 'weaken';
8 our $VERSION = '0.002';
11 ([^<>=\s\/]+|\/) # Key
14 (?s:(?:"(.*?)")|(?:'(.*?)')|([^>\s]*)) # Value
25 (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
26 (?:\s+\[.+?\])? # Int Subset
29 --(.*?)--\s* # Comment
31 \[CDATA\[(.*?)\]\] # CDATA
34 \?(.*?)\? # Processing Instruction
36 \s*([^<>\s]+\s*(?>(?:$ATTR_RE){0,32766})*) # Tag
37 # Workaround for perl's limit of * to {0,32767}
44 # HTML elements that only contain raw text
45 my %RAW = map { $_ => 1 } qw(script style);
47 # HTML elements that only contain raw text and entities
48 my %RCDATA = map { $_ => 1 } qw(title textarea);
50 # HTML elements with optional end tags
51 my %END = (body => 'head', optgroup => 'optgroup', option => 'option');
53 # HTML elements that break paragraphs
55 qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
56 qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul);
58 # HTML table elements with optional end tags
59 my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
61 # HTML elements with optional end tags and scoping rules
63 = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
64 $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
65 $CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt);
66 $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt);
67 $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
69 # HTML elements without end tags
70 my %EMPTY = map { $_ => 1 } (
71 qw(area base br col embed hr img input keygen link menuitem meta param),
75 # HTML elements categorized as phrasing content (and obsolete inline elements)
77 qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
78 qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
79 qw(math meta meter noscript object output picture progress q ruby s samp),
80 qw(script select small span strong sub sup svg template textarea time u),
83 my @OBSOLETE = qw(acronym applet basefont big font strike tt);
84 my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
86 # HTML elements that don't get their self-closing flag acknowledged
87 my %BLOCK = map { $_ => 1 } (
88 qw(a address applet article aside b big blockquote body button caption),
89 qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
90 qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
91 qw(header hgroup html i iframe li listing main marquee menu nav nobr),
92 qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
93 qw(rt s script section select small strike strong style summary table),
94 qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
99 bless @_ ? @_ > 1 ? {@_} : {%{$_[0]}} : {}, ref $class || $class;
104 return exists $self->{tree} ? $self->{tree} : ($self->{tree} = ['root']) unless @_;
105 $self->{tree} = shift;
111 return $self->{xml} unless @_;
112 $self->{xml} = shift;
117 my ($self, $html) = (shift, "$_[0]");
119 my $xml = $self->xml;
120 my $current = my $tree = ['root'];
121 while ($html =~ /\G$TOKEN_RE/gcso) {
122 my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
123 = ($1, $2, $3, $4, $5, $6, $11);
125 # Text (and runaway "<")
126 $text .= '<' if defined $runaway;
127 _node($current, 'text', html_unescape $text) if defined $text;
133 if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
136 elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
137 my ($start, $attr) = ($xml ? $1 : lc $1, $2);
140 my (%attrs, $closing);
141 while ($attr =~ /$ATTR_RE/go) {
142 my $key = $xml ? $1 : lc $1;
143 my $value = defined($2) ? $2 : defined($3) ? $3 : $4;
146 ++$closing and next if $key eq '/';
148 $attrs{$key} = defined $value ? html_unescape $value : $value;
151 # "image" is an alias for "img"
152 $start = 'img' if !$xml && $start eq 'image';
153 _start($start, \%attrs, $xml, \$current);
155 # Element without end tag (self-closing)
156 _end($start, $xml, \$current)
157 if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
160 next if $xml || !$RAW{$start} && !$RCDATA{$start};
161 next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
162 _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
163 _end($start, 0, \$current);
168 elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
171 elsif (defined $comment) { _node($current, 'comment', $comment) }
174 elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
176 # Processing instruction (try to detect XML)
177 elsif (defined $pi) {
178 $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
179 _node($current, 'pi', $pi);
183 return $self->tree($tree);
186 sub render { _render($_[0]->tree, $_[0]->xml) }
189 my ($end, $xml, $current) = @_;
191 # Search stack for start tag
192 my $next = $$current;
195 # Ignore useless end tag
196 return if $next->[0] eq 'root';
199 return $$current = $next->[3] if $next->[1] eq $end;
201 # Phrasing content can only cross phrasing content
202 return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
204 } while $next = $next->[3];
208 my ($current, $type, $content) = @_;
209 push @$current, my $new = [$type, $content, $current];
214 my ($tree, $xml) = @_;
217 my $type = $tree->[0];
218 return html_escape($tree->[1]) if $type eq 'text';
221 return $tree->[1] if $type eq 'raw';
224 return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
227 return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
230 return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
232 # Processing instruction
233 return '<?' . $tree->[1] . '?>' if $type eq 'pi';
236 return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
240 my $tag = $tree->[1];
241 my $result = "<$tag";
244 for my $key (sort keys %{$tree->[2]}) {
245 my $value = $tree->[2]{$key};
246 $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
247 $result .= qq{ $key="} . html_escape($value) . '"';
251 return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
255 no warnings 'recursion';
256 $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
259 return "$result</$tag>";
263 my ($start, $attrs, $xml, $current) = @_;
265 # Autoclose optional HTML elements
266 if (!$xml && $$current->[0] ne 'root') {
267 if (my $end = $END{$start}) { _end($end, 0, $current) }
269 elsif (my $close = $CLOSE{$start}) {
270 my ($allowed, $scope) = @$close;
272 # Close allowed parent elements in scope
273 my $parent = $$current;
274 while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
275 _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
276 $parent = $parent->[3];
282 push @$$current, my $new = ['tag', $start, $attrs, $$current];
289 =for Pod::Coverage *EVERYTHING*