burninate documentation for DOM::Tiny
[catagits/DOM-Tiny.git] / lib / DOM / Tiny / _HTML.pm
CommitLineData
9a5f1e3f 1package DOM::Tiny::_HTML;
d6512b50 2
3use strict;
4use warnings;
e085469f 5use DOM::Tiny::Entities qw(html_escape html_unescape);
d6512b50 6use Scalar::Util 'weaken';
d6512b50 7
22cfa6c8 8our $VERSION = '0.004';
927f1351 9
d6512b50 10my $ATTR_RE = qr/
2d9f5165 11 ([^<>=\s\/]+|\/) # Key
d6512b50 12 (?:
13 \s*=\s*
2d9f5165 14 (?s:(?:"(.*?)")|(?:'(.*?)')|([^>\s]*)) # Value
d6512b50 15 )?
16 \s*
17/x;
18my $TOKEN_RE = qr/
19 ([^<]+)? # Text
20 (?:
21 <(?:
22 !(?:
23 DOCTYPE(
24 \s+\w+ # Doctype
25 (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
26 (?:\s+\[.+?\])? # Int Subset
27 \s*)
28 |
29 --(.*?)--\s* # Comment
30 |
31 \[CDATA\[(.*?)\]\] # CDATA
32 )
33 |
34 \?(.*?)\? # Processing Instruction
35 |
2d9f5165 36 \s*([^<>\s]+\s*(?>(?:$ATTR_RE){0,32766})*) # Tag
37 # Workaround for perl's limit of * to {0,32767}
d6512b50 38 )>
39 |
40 (<) # Runaway "<"
41 )??
42/xis;
43
44# HTML elements that only contain raw text
45my %RAW = map { $_ => 1 } qw(script style);
46
47# HTML elements that only contain raw text and entities
48my %RCDATA = map { $_ => 1 } qw(title textarea);
49
28d93999 50# HTML elements with optional end tags
51my %END = (body => 'head', optgroup => 'optgroup', option => 'option');
52
53# HTML elements that break paragraphs
54$END{$_} = 'p' for
55 qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
56 qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul);
d6512b50 57
58# HTML table elements with optional end tags
59my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
60
61# HTML elements with optional end tags and scoping rules
62my %CLOSE
63 = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
64$CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
65$CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt);
66$CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt);
67$CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
68
69# HTML elements without end tags
70my %EMPTY = map { $_ => 1 } (
71 qw(area base br col embed hr img input keygen link menuitem meta param),
72 qw(source track wbr)
73);
74
75# HTML elements categorized as phrasing content (and obsolete inline elements)
76my @PHRASING = (
77 qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
78 qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
79 qw(math meta meter noscript object output picture progress q ruby s samp),
80 qw(script select small span strong sub sup svg template textarea time u),
81 qw(var video wbr)
82);
83my @OBSOLETE = qw(acronym applet basefont big font strike tt);
84my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
85
86# HTML elements that don't get their self-closing flag acknowledged
87my %BLOCK = map { $_ => 1 } (
88 qw(a address applet article aside b big blockquote body button caption),
89 qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
90 qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
91 qw(header hgroup html i iframe li listing main marquee menu nav nobr),
92 qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
93 qw(rt s script section select small strike strong style summary table),
94 qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
95);
96
d066f9b8 97sub new {
98 my $class = shift;
99 bless @_ ? @_ > 1 ? {@_} : {%{$_[0]}} : {}, ref $class || $class;
100}
101
102sub tree {
103 my $self = shift;
104 return exists $self->{tree} ? $self->{tree} : ($self->{tree} = ['root']) unless @_;
105 $self->{tree} = shift;
106 return $self;
107}
108
109sub xml {
110 my $self = shift;
111 return $self->{xml} unless @_;
112 $self->{xml} = shift;
113 return $self;
114}
115
d6512b50 116sub parse {
117 my ($self, $html) = (shift, "$_[0]");
118
119 my $xml = $self->xml;
120 my $current = my $tree = ['root'];
121 while ($html =~ /\G$TOKEN_RE/gcso) {
122 my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
123 = ($1, $2, $3, $4, $5, $6, $11);
124
125 # Text (and runaway "<")
126 $text .= '<' if defined $runaway;
127 _node($current, 'text', html_unescape $text) if defined $text;
128
129 # Tag
130 if (defined $tag) {
131
132 # End
133 if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
134
135 # Start
136 elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
137 my ($start, $attr) = ($xml ? $1 : lc $1, $2);
138
139 # Attributes
140 my (%attrs, $closing);
141 while ($attr =~ /$ATTR_RE/go) {
2d9f5165 142 my $key = $xml ? $1 : lc $1;
143 my $value = defined($2) ? $2 : defined($3) ? $3 : $4;
d6512b50 144
145 # Empty tag
146 ++$closing and next if $key eq '/';
147
148 $attrs{$key} = defined $value ? html_unescape $value : $value;
149 }
150
151 # "image" is an alias for "img"
152 $start = 'img' if !$xml && $start eq 'image';
153 _start($start, \%attrs, $xml, \$current);
154
155 # Element without end tag (self-closing)
156 _end($start, $xml, \$current)
157 if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
158
159 # Raw text elements
160 next if $xml || !$RAW{$start} && !$RCDATA{$start};
161 next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
162 _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
163 _end($start, 0, \$current);
164 }
165 }
166
167 # DOCTYPE
168 elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
169
170 # Comment
171 elsif (defined $comment) { _node($current, 'comment', $comment) }
172
173 # CDATA
174 elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
175
176 # Processing instruction (try to detect XML)
177 elsif (defined $pi) {
178 $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
179 _node($current, 'pi', $pi);
180 }
181 }
182
183 return $self->tree($tree);
184}
185
186sub render { _render($_[0]->tree, $_[0]->xml) }
187
188sub _end {
189 my ($end, $xml, $current) = @_;
190
191 # Search stack for start tag
192 my $next = $$current;
193 do {
194
195 # Ignore useless end tag
196 return if $next->[0] eq 'root';
197
198 # Right tag
199 return $$current = $next->[3] if $next->[1] eq $end;
200
201 # Phrasing content can only cross phrasing content
202 return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
203
204 } while $next = $next->[3];
205}
206
207sub _node {
208 my ($current, $type, $content) = @_;
209 push @$current, my $new = [$type, $content, $current];
210 weaken $new->[2];
211}
212
213sub _render {
214 my ($tree, $xml) = @_;
215
216 # Text (escaped)
217 my $type = $tree->[0];
e085469f 218 return html_escape($tree->[1]) if $type eq 'text';
d6512b50 219
220 # Raw text
221 return $tree->[1] if $type eq 'raw';
222
223 # DOCTYPE
224 return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
225
226 # Comment
227 return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
228
229 # CDATA
230 return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
231
232 # Processing instruction
233 return '<?' . $tree->[1] . '?>' if $type eq 'pi';
234
235 # Root
236 return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
237 if $type eq 'root';
238
239 # Start tag
240 my $tag = $tree->[1];
241 my $result = "<$tag";
242
243 # Attributes
244 for my $key (sort keys %{$tree->[2]}) {
245 my $value = $tree->[2]{$key};
246 $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
e085469f 247 $result .= qq{ $key="} . html_escape($value) . '"';
d6512b50 248 }
249
250 # No children
251 return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
252 unless $tree->[4];
253
254 # Children
255 no warnings 'recursion';
256 $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
257
258 # End tag
259 return "$result</$tag>";
260}
261
262sub _start {
263 my ($start, $attrs, $xml, $current) = @_;
264
265 # Autoclose optional HTML elements
266 if (!$xml && $$current->[0] ne 'root') {
267 if (my $end = $END{$start}) { _end($end, 0, $current) }
268
269 elsif (my $close = $CLOSE{$start}) {
270 my ($allowed, $scope) = @$close;
271
272 # Close allowed parent elements in scope
273 my $parent = $$current;
274 while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
275 _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
276 $parent = $parent->[3];
277 }
278 }
279 }
280
281 # New tag
282 push @$$current, my $new = ['tag', $start, $attrs, $$current];
283 weaken $new->[3];
284 $$current = $new;
285}
286
2871;
288
9a5f1e3f 289=for Pod::Coverage *EVERYTHING*
d6512b50 290
9a5f1e3f 291=cut