Bump version
[catagits/DOM-Tiny.git] / lib / DOM / Tiny / _HTML.pm
CommitLineData
9a5f1e3f 1package DOM::Tiny::_HTML;
d6512b50 2
3use strict;
4use warnings;
e085469f 5use DOM::Tiny::Entities qw(html_escape html_unescape);
d6512b50 6use Scalar::Util 'weaken';
d6512b50 7
8398aa8a 8our $VERSION = '0.002';
927f1351 9
d6512b50 10my $ATTR_RE = qr/
927f1351 11 ([^<>=\s\/]+|\/) # Key
d6512b50 12 (?:
13 \s*=\s*
14 (?s:(["'])(.*?)\g{-2}|([^>\s]*)) # Value
15 )?
16 \s*
17/x;
18my $TOKEN_RE = qr/
19 ([^<]+)? # Text
20 (?:
21 <(?:
22 !(?:
23 DOCTYPE(
24 \s+\w+ # Doctype
25 (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
26 (?:\s+\[.+?\])? # Int Subset
27 \s*)
28 |
29 --(.*?)--\s* # Comment
30 |
31 \[CDATA\[(.*?)\]\] # CDATA
32 )
33 |
34 \?(.*?)\? # Processing Instruction
35 |
36 \s*([^<>\s]+\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag
37 )>
38 |
39 (<) # Runaway "<"
40 )??
41/xis;
42
43# HTML elements that only contain raw text
44my %RAW = map { $_ => 1 } qw(script style);
45
46# HTML elements that only contain raw text and entities
47my %RCDATA = map { $_ => 1 } qw(title textarea);
48
49# HTML elements with optional end tags
50my %END = (body => 'head', optgroup => 'optgroup', option => 'option');
51
52# HTML elements that break paragraphs
53map { $END{$_} = 'p' } (
54 qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
55 qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
56);
57
58# HTML table elements with optional end tags
59my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
60
61# HTML elements with optional end tags and scoping rules
62my %CLOSE
63 = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
64$CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
65$CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt);
66$CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt);
67$CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
68
69# HTML elements without end tags
70my %EMPTY = map { $_ => 1 } (
71 qw(area base br col embed hr img input keygen link menuitem meta param),
72 qw(source track wbr)
73);
74
75# HTML elements categorized as phrasing content (and obsolete inline elements)
76my @PHRASING = (
77 qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
78 qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
79 qw(math meta meter noscript object output picture progress q ruby s samp),
80 qw(script select small span strong sub sup svg template textarea time u),
81 qw(var video wbr)
82);
83my @OBSOLETE = qw(acronym applet basefont big font strike tt);
84my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
85
86# HTML elements that don't get their self-closing flag acknowledged
87my %BLOCK = map { $_ => 1 } (
88 qw(a address applet article aside b big blockquote body button caption),
89 qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
90 qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
91 qw(header hgroup html i iframe li listing main marquee menu nav nobr),
92 qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
93 qw(rt s script section select small strike strong style summary table),
94 qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
95);
96
d066f9b8 97sub new {
98 my $class = shift;
99 bless @_ ? @_ > 1 ? {@_} : {%{$_[0]}} : {}, ref $class || $class;
100}
101
102sub tree {
103 my $self = shift;
104 return exists $self->{tree} ? $self->{tree} : ($self->{tree} = ['root']) unless @_;
105 $self->{tree} = shift;
106 return $self;
107}
108
109sub xml {
110 my $self = shift;
111 return $self->{xml} unless @_;
112 $self->{xml} = shift;
113 return $self;
114}
115
d6512b50 116sub parse {
117 my ($self, $html) = (shift, "$_[0]");
118
119 my $xml = $self->xml;
120 my $current = my $tree = ['root'];
121 while ($html =~ /\G$TOKEN_RE/gcso) {
122 my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
123 = ($1, $2, $3, $4, $5, $6, $11);
124
125 # Text (and runaway "<")
126 $text .= '<' if defined $runaway;
127 _node($current, 'text', html_unescape $text) if defined $text;
128
129 # Tag
130 if (defined $tag) {
131
132 # End
133 if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
134
135 # Start
136 elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
137 my ($start, $attr) = ($xml ? $1 : lc $1, $2);
138
139 # Attributes
140 my (%attrs, $closing);
141 while ($attr =~ /$ATTR_RE/go) {
91880340 142 my ($key, $value) = ($xml ? $1 : lc $1, $3 // $4);
d6512b50 143
144 # Empty tag
145 ++$closing and next if $key eq '/';
146
147 $attrs{$key} = defined $value ? html_unescape $value : $value;
148 }
149
150 # "image" is an alias for "img"
151 $start = 'img' if !$xml && $start eq 'image';
152 _start($start, \%attrs, $xml, \$current);
153
154 # Element without end tag (self-closing)
155 _end($start, $xml, \$current)
156 if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
157
158 # Raw text elements
159 next if $xml || !$RAW{$start} && !$RCDATA{$start};
160 next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
161 _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
162 _end($start, 0, \$current);
163 }
164 }
165
166 # DOCTYPE
167 elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
168
169 # Comment
170 elsif (defined $comment) { _node($current, 'comment', $comment) }
171
172 # CDATA
173 elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
174
175 # Processing instruction (try to detect XML)
176 elsif (defined $pi) {
177 $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
178 _node($current, 'pi', $pi);
179 }
180 }
181
182 return $self->tree($tree);
183}
184
185sub render { _render($_[0]->tree, $_[0]->xml) }
186
187sub _end {
188 my ($end, $xml, $current) = @_;
189
190 # Search stack for start tag
191 my $next = $$current;
192 do {
193
194 # Ignore useless end tag
195 return if $next->[0] eq 'root';
196
197 # Right tag
198 return $$current = $next->[3] if $next->[1] eq $end;
199
200 # Phrasing content can only cross phrasing content
201 return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
202
203 } while $next = $next->[3];
204}
205
206sub _node {
207 my ($current, $type, $content) = @_;
208 push @$current, my $new = [$type, $content, $current];
209 weaken $new->[2];
210}
211
212sub _render {
213 my ($tree, $xml) = @_;
214
215 # Text (escaped)
216 my $type = $tree->[0];
e085469f 217 return html_escape($tree->[1]) if $type eq 'text';
d6512b50 218
219 # Raw text
220 return $tree->[1] if $type eq 'raw';
221
222 # DOCTYPE
223 return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
224
225 # Comment
226 return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
227
228 # CDATA
229 return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
230
231 # Processing instruction
232 return '<?' . $tree->[1] . '?>' if $type eq 'pi';
233
234 # Root
235 return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
236 if $type eq 'root';
237
238 # Start tag
239 my $tag = $tree->[1];
240 my $result = "<$tag";
241
242 # Attributes
243 for my $key (sort keys %{$tree->[2]}) {
244 my $value = $tree->[2]{$key};
245 $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
e085469f 246 $result .= qq{ $key="} . html_escape($value) . '"';
d6512b50 247 }
248
249 # No children
250 return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
251 unless $tree->[4];
252
253 # Children
254 no warnings 'recursion';
255 $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
256
257 # End tag
258 return "$result</$tag>";
259}
260
261sub _start {
262 my ($start, $attrs, $xml, $current) = @_;
263
264 # Autoclose optional HTML elements
265 if (!$xml && $$current->[0] ne 'root') {
266 if (my $end = $END{$start}) { _end($end, 0, $current) }
267
268 elsif (my $close = $CLOSE{$start}) {
269 my ($allowed, $scope) = @$close;
270
271 # Close allowed parent elements in scope
272 my $parent = $$current;
273 while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
274 _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
275 $parent = $parent->[3];
276 }
277 }
278 }
279
280 # New tag
281 push @$$current, my $new = ['tag', $start, $attrs, $$current];
282 weaken $new->[3];
283 $$current = $new;
284}
285
2861;
287
9a5f1e3f 288=for Pod::Coverage *EVERYTHING*
d6512b50 289
9a5f1e3f 290=cut