X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FDOM%2FTiny.pm;h=abb3cc02dd86c027f2c88db48dc4d6af5e28bc7c;hb=e99ef07d6c8b76be11c70996ce7edf40562b4625;hp=cf226c4cd82bf9872956c6bf1cb20a7a98aa8118;hpb=a292be342745c12aa805e7aca023756ad824f5d4;p=catagits%2FDOM-Tiny.git diff --git a/lib/DOM/Tiny.pm b/lib/DOM/Tiny.pm index cf226c4..abb3cc0 100644 --- a/lib/DOM/Tiny.pm +++ b/lib/DOM/Tiny.pm @@ -3,18 +3,1061 @@ package DOM::Tiny; use strict; use warnings; +use overload + '@{}' => sub { shift->child_nodes }, + '%{}' => sub { shift->attr }, + bool => sub {1}, + '""' => sub { shift->to_string }, + fallback => 1; + +use Carp 'croak'; +use DOM::Tiny::Collection; +use DOM::Tiny::CSS; +use DOM::Tiny::HTML; +use Scalar::Util qw(blessed weaken); + our $VERSION = '0.001'; +sub new { + my $class = shift; + my $self = bless \DOM::Tiny::HTML->new, ref $class || $class; + return @_ ? $self->parse(@_) : $self; +} + +sub all_text { shift->_all_text(1, @_) } + +sub ancestors { _select($_[0]->_collect($_[0]->_ancestors), $_[1]) } + +sub append { shift->_add(1, @_) } +sub append_content { shift->_content(1, 0, @_) } + +sub at { + my $self = shift; + return undef unless my $result = $self->_css->select_one(@_); + return $self->_build($result, $self->xml); +} + +sub attr { + my $self = shift; + + # Hash + my $tree = $self->tree; + my $attrs = $tree->[0] ne 'tag' ? {} : $tree->[2]; + return $attrs unless @_; + + # Get + return $attrs->{$_[0]} unless @_ > 1 || ref $_[0]; + + # Set + my $values = ref $_[0] ? $_[0] : {@_}; + @$attrs{keys %$values} = values %$values; + + return $self; +} + +sub child_nodes { $_[0]->_collect(_nodes($_[0]->tree)) } + +sub children { _select($_[0]->_collect(_nodes($_[0]->tree, 1)), $_[1]) } + +sub content { + my $self = shift; + + my $type = $self->type; + if ($type eq 'root' || $type eq 'tag') { + return $self->_content(0, 1, @_) if @_; + my $html = DOM::Tiny::HTML->new(xml => $self->xml); + return join '', map { $html->tree($_)->render } _nodes($self->tree); + } + + return $self->tree->[1] unless @_; + $self->tree->[1] = shift; + return $self; +} + +sub descendant_nodes { $_[0]->_collect(_all(_nodes($_[0]->tree))) } + +sub find { $_[0]->_collect(@{$_[0]->_css->select($_[1])}) } + +sub following { _select($_[0]->_collect(@{$_[0]->_siblings(1)->[1]}), $_[1]) } +sub following_nodes { $_[0]->_collect(@{$_[0]->_siblings->[1]}) } + +sub matches { shift->_css->matches(@_) } + +sub namespace { + my $self = shift; + + return undef if (my $tree = $self->tree)->[0] ne 'tag'; + + # Extract namespace prefix and search parents + my $ns = $tree->[1] =~ /^(.*?):/ ? "xmlns:$1" : undef; + for my $node ($tree, $self->_ancestors) { + + # Namespace for prefix + my $attrs = $node->[2]; + if ($ns) { $_ eq $ns and return $attrs->{$_} for keys %$attrs } + + # Namespace attribute + elsif (defined $attrs->{xmlns}) { return $attrs->{xmlns} } + } + + return undef; +} + +sub next { $_[0]->_maybe($_[0]->_siblings(1, 0)->[1]) } +sub next_node { $_[0]->_maybe($_[0]->_siblings(0, 0)->[1]) } + +sub parent { + my $self = shift; + return undef if $self->tree->[0] eq 'root'; + return $self->_build($self->_parent, $self->xml); +} + +sub parse { shift->_delegate(parse => @_) } + +sub preceding { _select($_[0]->_collect(@{$_[0]->_siblings(1)->[0]}), $_[1]) } +sub preceding_nodes { $_[0]->_collect(@{$_[0]->_siblings->[0]}) } + +sub prepend { shift->_add(0, @_) } +sub prepend_content { shift->_content(0, 0, @_) } + +sub previous { $_[0]->_maybe($_[0]->_siblings(1, -1)->[0]) } +sub previous_node { $_[0]->_maybe($_[0]->_siblings(0, -1)->[0]) } + +sub remove { shift->replace('') } + +sub replace { + my ($self, $new) = @_; + return $self->parse($new) if (my $tree = $self->tree)->[0] eq 'root'; + return $self->_replace($self->_parent, $tree, _nodes($self->_parse($new))); +} + +sub root { + my $self = shift; + return $self unless my $tree = $self->_ancestors(1); + return $self->_build($tree, $self->xml); +} + +sub strip { + my $self = shift; + return $self if (my $tree = $self->tree)->[0] ne 'tag'; + return $self->_replace($tree->[3], $tree, _nodes($tree)); +} + +sub tag { + my ($self, $tag) = @_; + return undef if (my $tree = $self->tree)->[0] ne 'tag'; + return $tree->[1] unless $tag; + $tree->[1] = $tag; + return $self; +} + +sub tap { shift->DOM::Tiny::Collection::tap(@_) } + +sub text { shift->_all_text(0, @_) } + +sub to_string { shift->_delegate('render') } + +sub tree { shift->_delegate(tree => @_) } + +sub type { shift->tree->[0] } + +sub val { + my $self = shift; + + # "option" + return $self->{value} // $self->text if (my $tag = $self->tag) eq 'option'; + + # "textarea", "input" or "button" + return $tag eq 'textarea' ? $self->text : $self->{value} if $tag ne 'select'; + + # "select" + my $v = $self->find('option:checked')->map('val'); + return exists $self->{multiple} ? $v->size ? $v->to_array : undef : $v->last; +} + +sub wrap { shift->_wrap(0, @_) } +sub wrap_content { shift->_wrap(1, @_) } + +sub xml { shift->_delegate(xml => @_) } + +sub _add { + my ($self, $offset, $new) = @_; + + return $self if (my $tree = $self->tree)->[0] eq 'root'; + + my $parent = $self->_parent; + splice @$parent, _offset($parent, $tree) + $offset, 0, + _link($parent, _nodes($self->_parse($new))); + + return $self; +} + +sub _all { + map { $_->[0] eq 'tag' ? ($_, _all(_nodes($_))) : ($_) } @_; +} + +sub _all_text { + my ($self, $recurse, $trim) = @_; + + # Detect "pre" tag + my $tree = $self->tree; + $trim = 1 unless defined $trim; + map { $_->[1] eq 'pre' and $trim = 0 } $self->_ancestors, $tree + if $trim && $tree->[0] ne 'root'; + + return _text([_nodes($tree)], $recurse, $trim); +} + +sub _ancestors { + my ($self, $root) = @_; + + return unless my $tree = $self->_parent; + my @ancestors; + do { push @ancestors, $tree } + while ($tree->[0] eq 'tag') && ($tree = $tree->[3]); + return $root ? $ancestors[-1] : @ancestors[0 .. $#ancestors - 1]; +} + +sub _build { shift->new->tree(shift)->xml(shift) } + +sub _collect { + my $self = shift; + my $xml = $self->xml; + return DOM::Tiny::Collection->new(map { $self->_build($_, $xml) } @_); +} + +sub _content { + my ($self, $start, $offset, $new) = @_; + + my $tree = $self->tree; + unless ($tree->[0] eq 'root' || $tree->[0] eq 'tag') { + my $old = $self->content; + return $self->content($start ? "$old$new" : "$new$old"); + } + + $start = $start ? ($#$tree + 1) : _start($tree); + $offset = $offset ? $#$tree : 0; + splice @$tree, $start, $offset, _link($tree, _nodes($self->_parse($new))); + + return $self; +} + +sub _css { DOM::Tiny::CSS->new(tree => shift->tree) } + +sub _delegate { + my ($self, $method) = (shift, shift); + return $$self->$method unless @_; + $$self->$method(@_); + return $self; +} + +sub _link { + my ($parent, @children) = @_; + + # Link parent to children + for my $node (@children) { + my $offset = $node->[0] eq 'tag' ? 3 : 2; + $node->[$offset] = $parent; + weaken $node->[$offset]; + } + + return @children; +} + +sub _maybe { $_[1] ? $_[0]->_build($_[1], $_[0]->xml) : undef } + +sub _nodes { + return unless my $tree = shift; + my @nodes = @$tree[_start($tree) .. $#$tree]; + return shift() ? grep { $_->[0] eq 'tag' } @nodes : @nodes; +} + +sub _offset { + my ($parent, $child) = @_; + my $i = _start($parent); + $_ eq $child ? last : $i++ for @$parent[$i .. $#$parent]; + return $i; +} + +sub _parent { $_[0]->tree->[$_[0]->type eq 'tag' ? 3 : 2] } + +sub _parse { DOM::Tiny::HTML->new(xml => shift->xml)->parse(shift)->tree } + +sub _replace { + my ($self, $parent, $child, @nodes) = @_; + splice @$parent, _offset($parent, $child), 1, _link($parent, @nodes); + return $self->parent; +} + +sub _select { + my ($collection, $selector) = @_; + return $collection unless $selector; + return $collection->new(grep { $_->matches($selector) } @$collection); +} + +sub _siblings { + my ($self, $tags, $i) = @_; + + return [] unless my $parent = $self->parent; + + my $tree = $self->tree; + my (@before, @after, $match); + for my $node (_nodes($parent->tree)) { + ++$match and next if !$match && $node eq $tree; + next if $tags && $node->[0] ne 'tag'; + $match ? push @after, $node : push @before, $node; + } + + return defined $i ? [$before[$i], $after[$i]] : [\@before, \@after]; +} + +sub _squish { + my $str = shift; + $str =~ s/^\s+//; + $str =~ s/\s+$//; + $str =~ s/\s+/ /g; + return $str; +} + +sub _start { $_[0][0] eq 'root' ? 1 : 4 } + +sub _text { + my ($nodes, $recurse, $trim) = @_; + + # Merge successive text nodes + my $i = 0; + while (my $next = $nodes->[$i + 1]) { + ++$i and next unless $nodes->[$i][0] eq 'text' && $next->[0] eq 'text'; + splice @$nodes, $i, 2, ['text', $nodes->[$i][1] . $next->[1]]; + } + + my $text = ''; + for my $node (@$nodes) { + my $type = $node->[0]; + + # Text + my $chunk = ''; + if ($type eq 'text') { $chunk = $trim ? _squish $node->[1] : $node->[1] } + + # CDATA or raw text + elsif ($type eq 'cdata' || $type eq 'raw') { $chunk = $node->[1] } + + # Nested tag + elsif ($type eq 'tag' && $recurse) { + no warnings 'recursion'; + $chunk = _text([_nodes($node)], 1, $node->[1] eq 'pre' ? 0 : $trim); + } + + # Add leading whitespace if punctuation allows it + $chunk = " $chunk" if $text =~ /\S\z/ && $chunk =~ /^[^.!?,;:\s]+/; + + # Trim whitespace blocks + $text .= $chunk if $chunk =~ /\S+/ || !$trim; + } + + return $text; +} + +sub _wrap { + my ($self, $content, $new) = @_; + + $content = 1 if (my $tree = $self->tree)->[0] eq 'root'; + $content = 0 if $tree->[0] ne 'root' && $tree->[0] ne 'tag'; + + # Find innermost tag + my $current; + my $first = $new = $self->_parse($new); + $current = $first while $first = (_nodes($first, 1))[0]; + return $self unless $current; + + # Wrap content + if ($content) { + push @$current, _link($current, _nodes($tree)); + splice @$tree, _start($tree), $#$tree, _link($tree, _nodes($new)); + return $self; + } + + # Wrap element + $self->_replace($self->_parent, $tree, _nodes($new)); + push @$current, _link($current, $tree); + return $self; +} + 1; +=encoding utf8 + =head1 NAME -DOM::Tiny - Module abstract +DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors =head1 SYNOPSIS + use DOM::Tiny; + + # Parse + my $dom = DOM::Tiny->new('
Test
123
456
'); + $dom->find(':not(p)')->map('strip'); + + # Render + say "$dom"; + =head1 DESCRIPTION +LHi!
'); + say $dom->at('p[id]')->text; + +If XML processing instructions are found, the parser will automatically switch +into XML mode and everything becomes case-sensitive. + + # XML semantics + my $dom = DOM::Tiny->new('Hi!
'); + say $dom->at('P[ID]')->text; + +XML detection can also be disabled with the L"xml"> method. + + # Force XML semantics + my $dom = DOM::Tiny->new->xml(1)->parse('Hi!
'); + say $dom->at('P[ID]')->text; + + # Force HTML semantics + my $dom = DOM::Tiny->new->xml(0)->parse('Hi!
'); + say $dom->at('p[id]')->text; + +=head1 METHODS + +Lbar
baz\nbar
baz\nI ⥠DOM::Tiny!
'); + +Append HTML/XML fragment to this node. + + # "Test 123
" + $dom->parse('Test
')->at('p') + ->child_nodes->first->append(' 123')->root; + +=head2 append_content + + $dom = $dom->append_content('I ⥠DOM::Tiny!
'); + +Append HTML/XML fragment (for CTest123
" + $dom->parse('Test
')->at('p')->append_content('123')->root; + +=head2 at + + my $result = $dom->at('div ~ p'); + +Find first descendant element of this element matching the CSS selector and +return it as a L123
" + $dom->parse('Test123
')->at('p')->child_nodes->first->remove; + + # "" + $dom->parse('123')->child_nodes->first; + + # " Test " + $dom->parse('123')->child_nodes->last->content; + +=head2 children + + my $collection = $dom->children; + my $collection = $dom->children('div ~ p'); + +Find all child elements of this element matching the CSS selector and return a +LI ⥠DOM::Tiny!
'); + +Return this node's content or replace it with HTML/XML fragment (for C123
" + $dom->parse('Test
')->at('p')->content('123')->root; + + # "123
" + $dom->parse('123
') + ->descendant_nodes->grep(sub { $_->type eq 'comment' }) + ->map('remove')->first; + + # "testtest
" + $dom->parse('123456
') + ->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' }) + ->map(content => 'test')->first->root; + +=head2 find + + my $collection = $dom->find('div ~ p'); + +Find all descendant elements of this element matching the CSS selector and +return a LA
C')->at('p')->following_nodes->last->content; + +=head2 matches + + my $bool = $dom->matches('div ~ p'); + +Check if this element matches the CSS selector. All selectors from +LA
')->at('p')->matches('.a'); + $dom->parse('A
')->at('p')->matches('p[class]'); + + # False + $dom->parse('A
')->at('p')->matches('.b'); + $dom->parse('A
')->at('p')->matches('p[id]'); + +=head2 namespace + + my $namespace = $dom->namespace; + +Find this element's namespace or return C123456
') + ->at('b')->next_node->next_node; + + # " Test " + $dom->parse('123456
') + ->at('b')->next_node->content; + +=head2 parent + + my $parent = $dom->parent; + +Return LC
')->at('p')->preceding_nodes->first->content; + +=head2 prepend + + $dom = $dom->prepend('I ⥠DOM::Tiny!
'); + +Prepend HTML/XML fragment to this node. + + # "Test 123
" + $dom->parse('123
') + ->at('p')->child_nodes->first->prepend('Test ')->root; + +=head2 prepend_content + + $dom = $dom->prepend_content('I ⥠DOM::Tiny!
'); + +Prepend HTML/XML fragment (for C123Test
" + $dom->parse('Test
')->at('p')->prepend_content('123')->root; + +=head2 previous + + my $sibling = $dom->previous; + +Return L123456
') + ->at('b')->previous_node->previous_node; + + # " Test " + $dom->parse('123456
') + ->at('b')->previous_node->content; + +=head2 remove + + my $parent = $dom->remove; + +Remove this node and return L"root"> (for C456
" + $dom->parse('123456
') + ->at('p')->child_nodes->first->remove->root; + +=head2 replace + + my $parent = $dom->replace('123
" + $dom->parse('Test
') + ->at('p')->child_nodes->[0]->replace('123')->root; + +=head2 root + + my $root = $dom->root; + +Return Lbar
baz\nbar
baz\nTest
')->type; + + # "tag" + $dom->parse('Test
')->at('p')->type; + + # "text" + $dom->parse('Test
')->at('p')->child_nodes->first->type; + +=head2 val + + my $value = $dom->val; + +Extract value from form element (such as C