lib/DOM/Tiny/HTML.pm

   1 package DOM::Tiny::HTML;
   2
   3 use strict;
   4 use warnings;
   5 use DOM::Tiny::Entities qw(html_escape html_unescape);
   6 use Scalar::Util 'weaken';
   7
   8 our $VERSION = '0.001';
   9
  10 my $ATTR_RE = qr/
  11   ([^<>=\s\/]+|\/)                     # Key
  12   (?:
  13     \s*=\s*
  14     (?s:(["'])(.*?)\g{-2}|([^>\s]*))   # Value
  15   )?
  16   \s*
  17 /x;
  18 my $TOKEN_RE = qr/
  19   ([^<]+)?                                            # Text
  20   (?:
  21     <(?:
  22       !(?:
  23         DOCTYPE(
  24         \s+\w+                                        # Doctype
  25         (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)?   # External ID
  26         (?:\s+\[.+?\])?                               # Int Subset
  27         \s*)
  28       |
  29         --(.*?)--\s*                                  # Comment
  30       |
  31         \[CDATA\[(.*?)\]\]                            # CDATA
  32       )
  33     |
  34       \?(.*?)\?                                       # Processing Instruction
  35     |
  36       \s*([^<>\s]+\s*(?:(?:$ATTR_RE){0,32766})*+)     # Tag
  37     )>
  38   |
  39     (<)                                               # Runaway "<"
  40   )??
  41 /xis;
  42
  43 # HTML elements that only contain raw text
  44 my %RAW = map { $_ => 1 } qw(script style);
  45
  46 # HTML elements that only contain raw text and entities
  47 my %RCDATA = map { $_ => 1 } qw(title textarea);
  48
  49 # HTML elements with optional end tags
  50 my %END = (body => 'head', optgroup => 'optgroup', option => 'option');
  51
  52 # HTML elements that break paragraphs
  53 map { $END{$_} = 'p' } (
  54   qw(address article aside blockquote dir div dl fieldset footer form h1 h2),
  55   qw(h3 h4 h5 h6 header hr main menu nav ol p pre section table ul)
  56 );
  57
  58 # HTML table elements with optional end tags
  59 my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr);
  60
  61 # HTML elements with optional end tags and scoping rules
  62 my %CLOSE
  63   = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]);
  64 $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead);
  65 $CLOSE{$_} = [{dd => 1, dt => 1}, {dl    => 1}] for qw(dd dt);
  66 $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby  => 1}] for qw(rp rt);
  67 $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th);
  68
  69 # HTML elements without end tags
  70 my %EMPTY = map { $_ => 1 } (
  71   qw(area base br col embed hr img input keygen link menuitem meta param),
  72   qw(source track wbr)
  73 );
  74
  75 # HTML elements categorized as phrasing content (and obsolete inline elements)
  76 my @PHRASING = (
  77   qw(a abbr area audio b bdi bdo br button canvas cite code data datalist),
  78   qw(del dfn em embed i iframe img input ins kbd keygen label link map mark),
  79   qw(math meta meter noscript object output picture progress q ruby s samp),
  80   qw(script select small span strong sub sup svg template textarea time u),
  81   qw(var video wbr)
  82 );
  83 my @OBSOLETE = qw(acronym applet basefont big font strike tt);
  84 my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING;
  85
  86 # HTML elements that don't get their self-closing flag acknowledged
  87 my %BLOCK = map { $_ => 1 } (
  88   qw(a address applet article aside b big blockquote body button caption),
  89   qw(center code col colgroup dd details dialog dir div dl dt em fieldset),
  90   qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head),
  91   qw(header hgroup html i iframe li listing main marquee menu nav nobr),
  92   qw(noembed noframes noscript object ol optgroup option p plaintext pre rp),
  93   qw(rt s script section select small strike strong style summary table),
  94   qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
  95 );
  96
  97 sub new {
  98   my $class = shift;
  99   bless @_ ? @_ > 1 ? {@_} : {%{$_[0]}} : {}, ref $class || $class;
 100 }
 101
 102 sub tree {
 103   my $self = shift;
 104   return exists $self->{tree} ? $self->{tree} : ($self->{tree} = ['root']) unless @_;
 105   $self->{tree} = shift;
 106   return $self;
 107 }
 108
 109 sub xml {
 110   my $self = shift;
 111   return $self->{xml} unless @_;
 112   $self->{xml} = shift;
 113   return $self;
 114 }
 115
 116 sub parse {
 117   my ($self, $html) = (shift, "$_[0]");
 118
 119   my $xml = $self->xml;
 120   my $current = my $tree = ['root'];
 121   while ($html =~ /\G$TOKEN_RE/gcso) {
 122     my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway)
 123       = ($1, $2, $3, $4, $5, $6, $11);
 124
 125     # Text (and runaway "<")
 126     $text .= '<' if defined $runaway;
 127     _node($current, 'text', html_unescape $text) if defined $text;
 128
 129     # Tag
 130     if (defined $tag) {
 131
 132       # End
 133       if ($tag =~ /^\/\s*(\S+)/) { _end($xml ? $1 : lc $1, $xml, \$current) }
 134
 135       # Start
 136       elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) {
 137         my ($start, $attr) = ($xml ? $1 : lc $1, $2);
 138
 139         # Attributes
 140         my (%attrs, $closing);
 141         while ($attr =~ /$ATTR_RE/go) {
 142           my ($key, $value) = ($xml ? $1 : lc $1, $3 // $4);
 143
 144           # Empty tag
 145           ++$closing and next if $key eq '/';
 146
 147           $attrs{$key} = defined $value ? html_unescape $value : $value;
 148         }
 149
 150         # "image" is an alias for "img"
 151         $start = 'img' if !$xml && $start eq 'image';
 152         _start($start, \%attrs, $xml, \$current);
 153
 154         # Element without end tag (self-closing)
 155         _end($start, $xml, \$current)
 156           if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing;
 157
 158         # Raw text elements
 159         next if $xml || !$RAW{$start} && !$RCDATA{$start};
 160         next unless $html =~ m!\G(.*?)<\s*/\s*\Q$start\E\s*>!gcsi;
 161         _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1);
 162         _end($start, 0, \$current);
 163       }
 164     }
 165
 166     # DOCTYPE
 167     elsif (defined $doctype) { _node($current, 'doctype', $doctype) }
 168
 169     # Comment
 170     elsif (defined $comment) { _node($current, 'comment', $comment) }
 171
 172     # CDATA
 173     elsif (defined $cdata) { _node($current, 'cdata', $cdata) }
 174
 175     # Processing instruction (try to detect XML)
 176     elsif (defined $pi) {
 177       $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i;
 178       _node($current, 'pi', $pi);
 179     }
 180   }
 181
 182   return $self->tree($tree);
 183 }
 184
 185 sub render { _render($_[0]->tree, $_[0]->xml) }
 186
 187 sub _end {
 188   my ($end, $xml, $current) = @_;
 189
 190   # Search stack for start tag
 191   my $next = $$current;
 192   do {
 193
 194     # Ignore useless end tag
 195     return if $next->[0] eq 'root';
 196
 197     # Right tag
 198     return $$current = $next->[3] if $next->[1] eq $end;
 199
 200     # Phrasing content can only cross phrasing content
 201     return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]};
 202
 203   } while $next = $next->[3];
 204 }
 205
 206 sub _node {
 207   my ($current, $type, $content) = @_;
 208   push @$current, my $new = [$type, $content, $current];
 209   weaken $new->[2];
 210 }
 211
 212 sub _render {
 213   my ($tree, $xml) = @_;
 214
 215   # Text (escaped)
 216   my $type = $tree->[0];
 217   return html_escape($tree->[1]) if $type eq 'text';
 218
 219   # Raw text
 220   return $tree->[1] if $type eq 'raw';
 221
 222   # DOCTYPE
 223   return '<!DOCTYPE' . $tree->[1] . '>' if $type eq 'doctype';
 224
 225   # Comment
 226   return '<!--' . $tree->[1] . '-->' if $type eq 'comment';
 227
 228   # CDATA
 229   return '<![CDATA[' . $tree->[1] . ']]>' if $type eq 'cdata';
 230
 231   # Processing instruction
 232   return '<?' . $tree->[1] . '?>' if $type eq 'pi';
 233
 234   # Root
 235   return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree]
 236     if $type eq 'root';
 237
 238   # Start tag
 239   my $tag    = $tree->[1];
 240   my $result = "<$tag";
 241
 242   # Attributes
 243   for my $key (sort keys %{$tree->[2]}) {
 244     my $value = $tree->[2]{$key};
 245     $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value;
 246     $result .= qq{ $key="} . html_escape($value) . '"';
 247   }
 248
 249   # No children
 250   return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result></$tag>"
 251     unless $tree->[4];
 252
 253   # Children
 254   no warnings 'recursion';
 255   $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree];
 256
 257   # End tag
 258   return "$result</$tag>";
 259 }
 260
 261 sub _start {
 262   my ($start, $attrs, $xml, $current) = @_;
 263
 264   # Autoclose optional HTML elements
 265   if (!$xml && $$current->[0] ne 'root') {
 266     if (my $end = $END{$start}) { _end($end, 0, $current) }
 267
 268     elsif (my $close = $CLOSE{$start}) {
 269       my ($allowed, $scope) = @$close;
 270
 271       # Close allowed parent elements in scope
 272       my $parent = $$current;
 273       while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) {
 274         _end($parent->[1], 0, $current) if $allowed->{$parent->[1]};
 275         $parent = $parent->[3];
 276       }
 277     }
 278   }
 279
 280   # New tag
 281   push @$$current, my $new = ['tag', $start, $attrs, $$current];
 282   weaken $new->[3];
 283   $$current = $new;
 284 }
 285
 286 1;
 287
 288 =encoding utf8
 289
 290 =head1 NAME
 291
 292 DOM::Tiny::HTML - HTML/XML engine
 293
 294 =head1 SYNOPSIS
 295
 296   use DOM::Tiny::HTML;
 297
 298   # Turn HTML into DOM tree
 299   my $html = DOM::Tiny::HTML->new;
 300   $html->parse('<div><p id="a">Test</p><p id="b">123</p></div>');
 301   my $tree = $html->tree;
 302
 303 =head1 DESCRIPTION
 304
 305 L<DOM::Tiny::HTML> is the HTML/XML engine used by L<DOM::Tiny> based on
 306 L<Mojo::DOM::HTML>, which is based on the
 307 L<HTML Living Standard|https://html.spec.whatwg.org> as well as the
 308 L<Extensible Markup Language (XML) 1.0|http://www.w3.org/TR/xml/>.
 309
 310 =head1 ATTRIBUTES
 311
 312 L<DOM::Tiny::HTML> implements the following attributes.
 313
 314 =head2 tree
 315
 316   my $tree = $html->tree;
 317   $html    = $html->tree(['root']);
 318
 319 Document Object Model. Note that this structure should only be used very
 320 carefully since it is very dynamic.
 321
 322 =head2 xml
 323
 324   my $bool = $html->xml;
 325   $html    = $html->xml($bool);
 326
 327 Disable HTML semantics in parser and activate case-sensitivity, defaults to
 328 auto detection based on processing instructions.
 329
 330 =head1 METHODS
 331
 332 L<DOM::Tiny::HTML> implements the following methods.
 333
 334 =head2 new
 335
 336   my $html = DOM::Tiny::HTML->new;
 337   my $html = DOM::Tiny::HTML->new(xml => 1);
 338   my $html = DOM::Tiny::HTML->new({xml => 1});
 339
 340 Construct a new hash-based L<DOM::Tiny::HTML> object.
 341
 342 =head2 parse
 343
 344   $html = $html->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
 345
 346 Parse HTML/XML fragment.
 347
 348 =head2 render
 349
 350   my $str = $html->render;
 351
 352 Render DOM to HTML/XML.
 353
 354 =head1 BUGS
 355
 356 Report any issues on the public bugtracker.
 357
 358 =head1 AUTHOR
 359
 360 Dan Book <dbook@cpan.org>
 361
 362 =head1 COPYRIGHT AND LICENSE
 363
 364 This software is Copyright (c) 2015 by Dan Book.
 365
 366 This is free software, licensed under:
 367
 368   The Artistic License 2.0 (GPL Compatible)
 369
 370 =head1 SEE ALSO
 371
 372 L<Mojo::DOM::HTML>