perl 5.8 support (mst)
[catagits/DOM-Tiny.git] / lib / DOM / Tiny / _HTML.pm
CommitLineData
9a5f1e3f 1package DOM::Tiny::_HTML;
d6512b50 2
3use strict;
4use warnings;
e085469f 5use DOM::Tiny::Entities qw(html_escape html_unescape);
d6512b50 6use Scalar::Util 'weaken';
d6512b50 7
8398aa8a 8our $VERSION = '0.002';
927f1351 9
d6512b50 10my $ATTR_RE = qr/
2d9f5165 11 ([^<>=\s\/]+|\/) # Key
d6512b50 12 (?:
13 \s*=\s*
2d9f5165 14 (?s:(?:"(.*?)")|(?:'(.*?)')|([^>\s]*)) # Value
d6512b50 15 )?
16 \s*
17/x;
18my $TOKEN_RE = qr/
19 ([^<]+)? # Text
20 (?:
21 <(?:
22 !(?:
23 DOCTYPE(
24 \s+\w+ # Doctype
25 (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
26 (?:\s+\[.+?\])? # Int Subset
27 \s*)
28 |
29 --(.*?)--\s* # Comment
30 |
31 \[CDATA\[(.*?)\]\] # CDATA
32 )
33 |
34 \?(.*?)\? # Processing Instruction
35 |
2d9f5165 36 \s*([^<>\s]+\s*(?>(?:$ATTR_RE){0,32766})*) # Tag
37 # Workaround for perl's limit of * to {0,32767}
d6512b50 38 )>
39 |
40 (<) # Runaway "<"
41 )??
42/xis;
43
44# HTML elements that only contain raw text
45my %RAW = map { $_ => 1 } qw(script style);
46
47# HTML elements that only contain raw text and entities
48my %RCDATA = map { $_ => 1 } qw(title textarea);
49
2d9f5165 50my %END = (
51 # HTML elements with optional end tags
52 body => 'head', optgroup => 'optgroup', option => 'option',
53 # HTML elements that break paragraphs
54 map +($_ => 'p'),
55 qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
56 qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
d6512b50 57);
58
59# HTML table elements with optional end tags
60my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
61
62# HTML elements with optional end tags and scoping rules
63my %CLOSE
64 = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
65$CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
66$CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt);
67$CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt);
68$CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
69
70# HTML elements without end tags
71my %EMPTY = map { $_ => 1 } (
72 qw(area base br col embed hr img input keygen link menuitem meta param),
73 qw(source track wbr)
74);
75
76# HTML elements categorized as phrasing content (and obsolete inline elements)
77my @PHRASING = (
78 qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
79 qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
80 qw(math meta meter noscript object output picture progress q ruby s samp),
81 qw(script select small span strong sub sup svg template textarea time u),
82 qw(var video wbr)
83);
84my @OBSOLETE = qw(acronym applet basefont big font strike tt);
85my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
86
87# HTML elements that don't get their self-closing flag acknowledged
88my %BLOCK = map { $_ => 1 } (
89 qw(a address applet article aside b big blockquote body button caption),
90 qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
91 qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
92 qw(header hgroup html i iframe li listing main marquee menu nav nobr),
93 qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
94 qw(rt s script section select small strike strong style summary table),
95 qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
96);
97
d066f9b8 98sub new {
99 my $class = shift;
100 bless @_ ? @_ > 1 ? {@_} : {%{$_[0]}} : {}, ref $class || $class;
101}
102
103sub tree {
104 my $self = shift;
105 return exists $self->{tree} ? $self->{tree} : ($self->{tree} = ['root']) unless @_;
106 $self->{tree} = shift;
107 return $self;
108}
109
110sub xml {
111 my $self = shift;
112 return $self->{xml} unless @_;
113 $self->{xml} = shift;
114 return $self;
115}
116
d6512b50 117sub parse {
118 my ($self, $html) = (shift, "$_[0]");
119
120 my $xml = $self->xml;
121 my $current = my $tree = ['root'];
122 while ($html =~ /\G$TOKEN_RE/gcso) {
123 my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
124 = ($1, $2, $3, $4, $5, $6, $11);
125
126 # Text (and runaway "<")
127 $text .= '<' if defined $runaway;
128 _node($current, 'text', html_unescape $text) if defined $text;
129
130 # Tag
131 if (defined $tag) {
132
133 # End
134 if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
135
136 # Start
137 elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
138 my ($start, $attr) = ($xml ? $1 : lc $1, $2);
139
140 # Attributes
141 my (%attrs, $closing);
142 while ($attr =~ /$ATTR_RE/go) {
2d9f5165 143 my $key = $xml ? $1 : lc $1;
144 my $value = defined($2) ? $2 : defined($3) ? $3 : $4;
d6512b50 145
146 # Empty tag
147 ++$closing and next if $key eq '/';
148
149 $attrs{$key} = defined $value ? html_unescape $value : $value;
150 }
151
152 # "image" is an alias for "img"
153 $start = 'img' if !$xml && $start eq 'image';
154 _start($start, \%attrs, $xml, \$current);
155
156 # Element without end tag (self-closing)
157 _end($start, $xml, \$current)
158 if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
159
160 # Raw text elements
161 next if $xml || !$RAW{$start} && !$RCDATA{$start};
162 next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
163 _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
164 _end($start, 0, \$current);
165 }
166 }
167
168 # DOCTYPE
169 elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
170
171 # Comment
172 elsif (defined $comment) { _node($current, 'comment', $comment) }
173
174 # CDATA
175 elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
176
177 # Processing instruction (try to detect XML)
178 elsif (defined $pi) {
179 $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
180 _node($current, 'pi', $pi);
181 }
182 }
183
184 return $self->tree($tree);
185}
186
187sub render { _render($_[0]->tree, $_[0]->xml) }
188
189sub _end {
190 my ($end, $xml, $current) = @_;
191
192 # Search stack for start tag
193 my $next = $$current;
194 do {
195
196 # Ignore useless end tag
197 return if $next->[0] eq 'root';
198
199 # Right tag
200 return $$current = $next->[3] if $next->[1] eq $end;
201
202 # Phrasing content can only cross phrasing content
203 return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
204
205 } while $next = $next->[3];
206}
207
208sub _node {
209 my ($current, $type, $content) = @_;
210 push @$current, my $new = [$type, $content, $current];
211 weaken $new->[2];
212}
213
214sub _render {
215 my ($tree, $xml) = @_;
216
217 # Text (escaped)
218 my $type = $tree->[0];
e085469f 219 return html_escape($tree->[1]) if $type eq 'text';
d6512b50 220
221 # Raw text
222 return $tree->[1] if $type eq 'raw';
223
224 # DOCTYPE
225 return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
226
227 # Comment
228 return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
229
230 # CDATA
231 return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
232
233 # Processing instruction
234 return '<?' . $tree->[1] . '?>' if $type eq 'pi';
235
236 # Root
237 return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
238 if $type eq 'root';
239
240 # Start tag
241 my $tag = $tree->[1];
242 my $result = "<$tag";
243
244 # Attributes
245 for my $key (sort keys %{$tree->[2]}) {
246 my $value = $tree->[2]{$key};
247 $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
e085469f 248 $result .= qq{ $key="} . html_escape($value) . '"';
d6512b50 249 }
250
251 # No children
252 return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
253 unless $tree->[4];
254
255 # Children
256 no warnings 'recursion';
257 $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
258
259 # End tag
260 return "$result</$tag>";
261}
262
263sub _start {
264 my ($start, $attrs, $xml, $current) = @_;
265
266 # Autoclose optional HTML elements
267 if (!$xml && $$current->[0] ne 'root') {
268 if (my $end = $END{$start}) { _end($end, 0, $current) }
269
270 elsif (my $close = $CLOSE{$start}) {
271 my ($allowed, $scope) = @$close;
272
273 # Close allowed parent elements in scope
274 my $parent = $$current;
275 while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
276 _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
277 $parent = $parent->[3];
278 }
279 }
280 }
281
282 # New tag
283 push @$$current, my $new = ['tag', $start, $attrs, $$current];
284 weaken $new->[3];
285 $$current = $new;
286}
287
2881;
289
9a5f1e3f 290=for Pod::Coverage *EVERYTHING*
d6512b50 291
9a5f1e3f 292=cut