1 package DOM::Tiny::_HTML;
5 use DOM::Tiny::Entities qw(html_escape html_unescape);
6 use Scalar::Util 'weaken';
8 our $VERSION = '0.002';
11 ([^<>=\s\/]+|\/) # Key
14 (?s:(?:"(.*?)")|(?:'(.*?)')|([^>\s]*)) # Value
25 (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
26 (?:\s+\[.+?\])? # Int Subset
29 --(.*?)--\s* # Comment
31 \[CDATA\[(.*?)\]\] # CDATA
34 \?(.*?)\? # Processing Instruction
36 \s*([^<>\s]+\s*(?>(?:$ATTR_RE){0,32766})*) # Tag
37 # Workaround for perl's limit of * to {0,32767}
44 # HTML elements that only contain raw text
45 my %RAW = map { $_ => 1 } qw(script style);
47 # HTML elements that only contain raw text and entities
48 my %RCDATA = map { $_ => 1 } qw(title textarea);
51 # HTML elements with optional end tags
52 body => 'head', optgroup => 'optgroup', option => 'option',
53 # HTML elements that break paragraphs
55 qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
56 qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
59 # HTML table elements with optional end tags
60 my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
62 # HTML elements with optional end tags and scoping rules
64 = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
65 $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
66 $CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt);
67 $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt);
68 $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
70 # HTML elements without end tags
71 my %EMPTY = map { $_ => 1 } (
72 qw(area base br col embed hr img input keygen link menuitem meta param),
76 # HTML elements categorized as phrasing content (and obsolete inline elements)
78 qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
79 qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
80 qw(math meta meter noscript object output picture progress q ruby s samp),
81 qw(script select small span strong sub sup svg template textarea time u),
84 my @OBSOLETE = qw(acronym applet basefont big font strike tt);
85 my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
87 # HTML elements that don't get their self-closing flag acknowledged
88 my %BLOCK = map { $_ => 1 } (
89 qw(a address applet article aside b big blockquote body button caption),
90 qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
91 qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
92 qw(header hgroup html i iframe li listing main marquee menu nav nobr),
93 qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
94 qw(rt s script section select small strike strong style summary table),
95 qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
100 bless @_ ? @_ > 1 ? {@_} : {%{$_[0]}} : {}, ref $class || $class;
105 return exists $self->{tree} ? $self->{tree} : ($self->{tree} = ['root']) unless @_;
106 $self->{tree} = shift;
112 return $self->{xml} unless @_;
113 $self->{xml} = shift;
118 my ($self, $html) = (shift, "$_[0]");
120 my $xml = $self->xml;
121 my $current = my $tree = ['root'];
122 while ($html =~ /\G$TOKEN_RE/gcso) {
123 my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
124 = ($1, $2, $3, $4, $5, $6, $11);
126 # Text (and runaway "<")
127 $text .= '<' if defined $runaway;
128 _node($current, 'text', html_unescape $text) if defined $text;
134 if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
137 elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
138 my ($start, $attr) = ($xml ? $1 : lc $1, $2);
141 my (%attrs, $closing);
142 while ($attr =~ /$ATTR_RE/go) {
143 my $key = $xml ? $1 : lc $1;
144 my $value = defined($2) ? $2 : defined($3) ? $3 : $4;
147 ++$closing and next if $key eq '/';
149 $attrs{$key} = defined $value ? html_unescape $value : $value;
152 # "image" is an alias for "img"
153 $start = 'img' if !$xml && $start eq 'image';
154 _start($start, \%attrs, $xml, \$current);
156 # Element without end tag (self-closing)
157 _end($start, $xml, \$current)
158 if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
161 next if $xml || !$RAW{$start} && !$RCDATA{$start};
162 next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
163 _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
164 _end($start, 0, \$current);
169 elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
172 elsif (defined $comment) { _node($current, 'comment', $comment) }
175 elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
177 # Processing instruction (try to detect XML)
178 elsif (defined $pi) {
179 $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
180 _node($current, 'pi', $pi);
184 return $self->tree($tree);
187 sub render { _render($_[0]->tree, $_[0]->xml) }
190 my ($end, $xml, $current) = @_;
192 # Search stack for start tag
193 my $next = $$current;
196 # Ignore useless end tag
197 return if $next->[0] eq 'root';
200 return $$current = $next->[3] if $next->[1] eq $end;
202 # Phrasing content can only cross phrasing content
203 return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
205 } while $next = $next->[3];
209 my ($current, $type, $content) = @_;
210 push @$current, my $new = [$type, $content, $current];
215 my ($tree, $xml) = @_;
218 my $type = $tree->[0];
219 return html_escape($tree->[1]) if $type eq 'text';
222 return $tree->[1] if $type eq 'raw';
225 return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
228 return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
231 return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
233 # Processing instruction
234 return '<?' . $tree->[1] . '?>' if $type eq 'pi';
237 return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
241 my $tag = $tree->[1];
242 my $result = "<$tag";
245 for my $key (sort keys %{$tree->[2]}) {
246 my $value = $tree->[2]{$key};
247 $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
248 $result .= qq{ $key="} . html_escape($value) . '"';
252 return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
256 no warnings 'recursion';
257 $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
260 return "$result</$tag>";
264 my ($start, $attrs, $xml, $current) = @_;
266 # Autoclose optional HTML elements
267 if (!$xml && $$current->[0] ne 'root') {
268 if (my $end = $END{$start}) { _end($end, 0, $current) }
270 elsif (my $close = $CLOSE{$start}) {
271 my ($allowed, $scope) = @$close;
273 # Close allowed parent elements in scope
274 my $parent = $$current;
275 while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
276 _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
277 $parent = $parent->[3];
283 push @$$current, my $new = ['tag', $start, $attrs, $$current];
290 =for Pod::Coverage *EVERYTHING*