X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FDOM%2FTiny.pm;h=febc2a59a2b404cad26404069ead361f36fc93a9;hb=aeae951287c7fa77c0ce5acb7e83b8199ab87862;hp=8c208c7e4b1279d8773e54c989e836f6ca4caa60;hpb=918803403dcdce61810022aed3c3b20d48e74722;p=catagits%2FDOM-Tiny.git diff --git a/lib/DOM/Tiny.pm b/lib/DOM/Tiny.pm index 8c208c7..febc2a5 100644 --- a/lib/DOM/Tiny.pm +++ b/lib/DOM/Tiny.pm @@ -11,12 +11,20 @@ use overload fallback => 1; use Carp 'croak'; -use DOM::Tiny::Collection; -use DOM::Tiny::CSS; -use DOM::Tiny::HTML; +use DOM::Tiny::_Collection; +use DOM::Tiny::_CSS; +use DOM::Tiny::_HTML; use Scalar::Util qw(blessed weaken); -our $VERSION = '0.001'; +our $VERSION = '0.003'; + +sub new { + my $class = shift; + my $self = bless \DOM::Tiny::_HTML->new, ref $class || $class; + return @_ ? $self->parse(@_) : $self; +} + +sub TO_JSON { shift->_delegate('render') } sub all_text { shift->_all_text(1, @_) } @@ -59,7 +67,7 @@ sub content { my $type = $self->type; if ($type eq 'root' || $type eq 'tag') { return $self->_content(0, 1, @_) if @_; - my $html = DOM::Tiny::HTML->new(xml => $self->xml); + my $html = DOM::Tiny::_HTML->new(xml => $self->xml); return join '', map { $html->tree($_)->render } _nodes($self->tree); } @@ -97,12 +105,6 @@ sub namespace { return undef; } -sub new { - my $class = shift; - my $self = bless \DOM::Tiny::HTML->new, ref $class || $class; - return @_ ? $self->parse(@_) : $self; -} - sub next { $_[0]->_maybe($_[0]->_siblings(1, 0)->[1]) } sub next_node { $_[0]->_maybe($_[0]->_siblings(0, 0)->[1]) } @@ -151,7 +153,7 @@ sub tag { return $self; } -sub tap { shift->DOM::Tiny::Collection::tap(@_) } +sub tap { DOM::Tiny::_Collection::tap(@_) } sub text { shift->_all_text(0, @_) } @@ -165,7 +167,8 @@ sub val { my $self = shift; # "option" - return $self->{value} // $self->text if (my $tag = $self->tag) eq 'option'; + return defined($self->{value}) ? 
$self->{value} : $self->text + if (my $tag = $self->tag) eq 'option'; # "textarea", "input" or "button" return $tag eq 'textarea' ? $self->text : $self->{value} if $tag ne 'select'; @@ -211,7 +214,7 @@ sub _all_text { sub _ancestors { my ($self, $root) = @_; - return unless my $tree = $self->_parent; + return () unless my $tree = $self->_parent; my @ancestors; do { push @ancestors, $tree } while ($tree->[0] eq 'tag') && ($tree = $tree->[3]); @@ -223,7 +226,7 @@ sub _build { shift->new->tree(shift)->xml(shift) } sub _collect { my $self = shift; my $xml = $self->xml; - return DOM::Tiny::Collection->new(map { $self->_build($_, $xml) } @_); + return DOM::Tiny::_Collection->new(map { $self->_build($_, $xml) } @_); } sub _content { @@ -242,7 +245,7 @@ sub _content { return $self; } -sub _css { DOM::Tiny::CSS->new(tree => shift->tree) } +sub _css { DOM::Tiny::_CSS->new(tree => shift->tree) } sub _delegate { my ($self, $method) = (shift, shift); @@ -267,7 +270,7 @@ sub _link { sub _maybe { $_[1] ? $_[0]->_build($_[1], $_[0]->xml) : undef } sub _nodes { - return unless my $tree = shift; + return () unless my $tree = shift; my @nodes = @$tree[_start($tree) .. $#$tree]; return shift() ? grep { $_->[0] eq 'tag' } @nodes : @nodes; } @@ -281,7 +284,7 @@ sub _offset { sub _parent { $_[0]->tree->[$_[0]->type eq 'tag' ? 3 : 2] } -sub _parse { DOM::Tiny::HTML->new(xml => shift->xml)->parse(shift)->tree } +sub _parse { DOM::Tiny::_HTML->new(xml => shift->xml)->parse(shift)->tree } sub _replace { my ($self, $parent, $child, @nodes) = @_; @@ -420,9 +423,12 @@ DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors =head1 DESCRIPTION -L is a minimalistic and relaxed HTML/XML DOM parser with CSS -selector support based on L. It will even try to interpret broken -HTML and XML, so you should not use it for validation. +L is a minimalistic and relaxed pure-perl HTML/XML DOM parser based +on L. It supports the L +and L, and +matching based on L. 
It will +even try to interpret broken HTML and XML, so you should not use it for +validation. =head1 NODES AND ELEMENTS @@ -459,8 +465,8 @@ names are lowercased and selectors need to be lowercase as well. my $dom = DOM::Tiny->new('

Hi!

'); say $dom->at('p[id]')->text; -If XML processing instructions are found, the parser will automatically switch -into XML mode and everything becomes case-sensitive. +If an XML declaration is found, the parser will automatically switch into XML +mode and everything becomes case-sensitive. # XML semantics my $dom = DOM::Tiny->new('

Hi!

'); @@ -476,10 +482,276 @@ XML detection can also be disabled with the L method. my $dom = DOM::Tiny->new->xml(0)->parse('

Hi!

'); say $dom->at('p[id]')->text; +=head1 SELECTORS + +L uses a CSS selector engine based on L. All CSS +selectors that make sense for a standalone parser are supported. + +=over + +=item Z<>* + +Any element. + + my $all = $dom->find('*'); + +=item E + +An element of type C. + + my $title = $dom->at('title'); + +=item E[foo] + +An C element with a C attribute. + + my $links = $dom->find('a[href]'); + +=item E[foo="bar"] + +An C element whose C attribute value is exactly equal to C. + + my $case_sensitive = $dom->find('input[type="hidden"]'); + my $case_sensitive = $dom->find('input[type=hidden]'); + +=item E[foo="bar" i] + +An C element whose C attribute value is exactly equal to any +(ASCII-range) case-permutation of C. Note that this selector is +EXPERIMENTAL and might change without warning! + + my $case_insensitive = $dom->find('input[type="hidden" i]'); + my $case_insensitive = $dom->find('input[type=hidden i]'); + my $case_insensitive = $dom->find('input[class~="foo" i]'); + +This selector is part of +L, which is still a work +in progress. + +=item E[foo~="bar"] + +An C element whose C attribute value is a list of whitespace-separated +values, one of which is exactly equal to C. + + my $foo = $dom->find('input[class~="foo"]'); + my $foo = $dom->find('input[class~=foo]'); + +=item E[foo^="bar"] + +An C element whose C attribute value begins exactly with the string +C. + + my $begins_with = $dom->find('input[name^="f"]'); + my $begins_with = $dom->find('input[name^=f]'); + +=item E[foo$="bar"] + +An C element whose C attribute value ends exactly with the string +C. + + my $ends_with = $dom->find('input[name$="o"]'); + my $ends_with = $dom->find('input[name$=o]'); + +=item E[foo*="bar"] + +An C element whose C attribute value contains the substring C. + + my $contains = $dom->find('input[name*="fo"]'); + my $contains = $dom->find('input[name*=fo]'); + +=item E:root + +An C element, root of the document. 
+ + my $root = $dom->at(':root'); + +=item E:nth-child(n) + +An C element, the C child of its parent. + + my $third = $dom->find('div:nth-child(3)'); + my $odd = $dom->find('div:nth-child(odd)'); + my $even = $dom->find('div:nth-child(even)'); + my $top3 = $dom->find('div:nth-child(-n+3)'); + +=item E:nth-last-child(n) + +An C element, the C child of its parent, counting from the last one. + + my $third = $dom->find('div:nth-last-child(3)'); + my $odd = $dom->find('div:nth-last-child(odd)'); + my $even = $dom->find('div:nth-last-child(even)'); + my $bottom3 = $dom->find('div:nth-last-child(-n+3)'); + +=item E:nth-of-type(n) + +An C element, the C sibling of its type. + + my $third = $dom->find('div:nth-of-type(3)'); + my $odd = $dom->find('div:nth-of-type(odd)'); + my $even = $dom->find('div:nth-of-type(even)'); + my $top3 = $dom->find('div:nth-of-type(-n+3)'); + +=item E:nth-last-of-type(n) + +An C element, the C sibling of its type, counting from the last one. + + my $third = $dom->find('div:nth-last-of-type(3)'); + my $odd = $dom->find('div:nth-last-of-type(odd)'); + my $even = $dom->find('div:nth-last-of-type(even)'); + my $bottom3 = $dom->find('div:nth-last-of-type(-n+3)'); + +=item E:first-child + +An C element, first child of its parent. + + my $first = $dom->find('div p:first-child'); + +=item E:last-child + +An C element, last child of its parent. + + my $last = $dom->find('div p:last-child'); + +=item E:first-of-type + +An C element, first sibling of its type. + + my $first = $dom->find('div p:first-of-type'); + +=item E:last-of-type + +An C element, last sibling of its type. + + my $last = $dom->find('div p:last-of-type'); + +=item E:only-child + +An C element, only child of its parent. + + my $lonely = $dom->find('div p:only-child'); + +=item E:only-of-type + +An C element, only sibling of its type. + + my $lonely = $dom->find('div p:only-of-type'); + +=item E:empty + +An C element that has no children (including text nodes). 
+ + my $empty = $dom->find(':empty'); + +=item E:checked + +A user interface element C which is checked (for instance a radio-button or +checkbox). + + my $input = $dom->find(':checked'); + +=item E.warning + +An C element whose class is "warning". + + my $warning = $dom->find('div.warning'); + +=item E#myid + +An C element with C equal to "myid". + + my $foo = $dom->at('div#foo'); + +=item E:not(s) + +An C element that does not match simple selector C. + + my $others = $dom->find('div p:not(:first-child)'); + +=item E F + +An C element descendant of an C element. + + my $headlines = $dom->find('div h1'); + +=item E E F + +An C element child of an C element. + + my $headlines = $dom->find('html > body > div > h1'); + +=item E + F + +An C element immediately preceded by an C element. + + my $second = $dom->find('h1 + h2'); + +=item E ~ F + +An C element preceded by an C element. + + my $second = $dom->find('h1 ~ h2'); + +=item E, F, G + +Elements of type C, C and C. + + my $headlines = $dom->find('h1, h2, h3'); + +=item E[foo=bar][bar=baz] + +An C element whose attributes match all following attribute selectors. + + my $links = $dom->find('a[foo^=b][foo$=ar]'); + +=back + +=head1 OPERATORS + +L overloads the following operators. + +=head2 array + + my @nodes = @$dom; + +Alias for L. + + # "" + $dom->parse('123')->[0]; + +=head2 bool + + my $bool = !!$dom; + +Always true. + +=head2 hash + + my %attrs = %$dom; + +Alias for L. + + # "test" + $dom->parse('
Test
')->at('div')->{id}; + +=head2 stringify + + my $str = "$dom"; + +Alias for L. + =head1 METHODS L implements the following methods. +=head2 new + + my $dom = DOM::Tiny->new; + my $dom = DOM::Tiny->new('I ♥ DOM::Tiny!'); + +Construct a new scalar-based L object and L HTML/XML +fragment if necessary. + =head2 all_text my $trimmed = $dom->all_text; @@ -500,8 +772,8 @@ whitespace trimming is enabled by default. my $collection = $dom->ancestors('div ~ p'); Find all ancestor elements of this node matching the CSS selector and return a -L object containing these elements as L -objects. All selectors from L are supported. +L containing these elements as L +objects. All selectors listed in L are supported. # List tag names of ancestor elements say $dom->ancestors->map('tag')->join("\n"); @@ -543,8 +815,8 @@ node's content. my $result = $dom->at('div ~ p'); Find first descendant element of this element matching the CSS selector and -return it as a L object or return C if none could be found. -All selectors from L are supported. +return it as a L object, or C if none could be found. All +selectors listed in L are supported. # Find first element with "svg" namespace definition my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'}; @@ -571,7 +843,7 @@ This element's attributes. my $collection = $dom->child_nodes; -Return a L object containing all child nodes of this +Return a L containing all child nodes of this element as L objects. # "

123

" @@ -589,8 +861,8 @@ element as L objects. my $collection = $dom->children('div ~ p'); Find all child elements of this element matching the CSS selector and return a -L object containing these elements as L -objects. All selectors from L are supported. +L containing these elements as L +objects. All selectors listed in L are supported. # Show tag name of random child element say $dom->children->shuffle->first->tag; @@ -626,7 +898,7 @@ and C nodes) or raw content. my $collection = $dom->descendant_nodes; -Return a L object containing all descendant nodes of +Return a L containing all descendant nodes of this element as L objects. # "

123

" @@ -644,9 +916,8 @@ this element as L objects. my $collection = $dom->find('div ~ p'); Find all descendant elements of this element matching the CSS selector and -return a L object containing these elements as -L objects. All selectors from L are -supported. +return a L containing these elements as +L objects. All selectors listed in L are supported. # Find a specific element and extract information my $id = $dom->find('div')->[23]{id}; @@ -666,8 +937,8 @@ supported. my $collection = $dom->following('div ~ p'); Find all sibling elements after this node matching the CSS selector and return -a L object containing these elements as L -objects. All selectors from L are supported. +a L containing these elements as L +objects. All selectors listed in L are supported. # List tags of sibling elements after this node say $dom->following->map('tag')->join("\n"); @@ -676,7 +947,7 @@ objects. All selectors from L are supported. my $collection = $dom->following_nodes; -Return a L object containing all sibling nodes after +Return a L containing all sibling nodes after this node as L objects. # "C" @@ -686,8 +957,8 @@ this node as L objects. my $bool = $dom->matches('div ~ p'); -Check if this element matches the CSS selector. All selectors from -L are supported. +Check if this element matches the CSS selector. All selectors listed in +L are supported. # True $dom->parse('

A

')->at('p')->matches('.a'); @@ -701,7 +972,7 @@ L are supported. my $namespace = $dom->namespace; -Find this element's namespace or return C if none could be found. +Find this element's namespace, or return C if none could be found. # Find namespace for an element with namespace prefix my $namespace = $dom->at('svg > svg\:circle')->namespace; @@ -709,20 +980,12 @@ Find this element's namespace or return C if none could be found. # Find namespace for an element that may or may not have a namespace prefix my $namespace = $dom->at('svg > circle')->namespace; -=head2 new - - my $dom = DOM::Tiny->new; - my $dom = DOM::Tiny->new('I ♥ DOM::Tiny!'); - -Construct a new scalar-based L object and L HTML/XML -fragment if necessary. - =head2 next my $sibling = $dom->next; -Return L object for next sibling element or C if there are no -more siblings. +Return L object for next sibling element, or C if there are +no more siblings. # "

123

" $dom->parse('

Test

123

')->at('h1')->next; @@ -731,7 +994,7 @@ more siblings. my $sibling = $dom->next_node; -Return L object for next sibling node or C if there are no +Return L object for next sibling node, or C if there are no more siblings. # "456" @@ -746,17 +1009,20 @@ more siblings. my $parent = $dom->parent; -Return L object for parent of this node or C if this node has -no parent. +Return L object for parent of this node, or C if this node +has no parent. + + # "Test" + $dom->parse('

Test

')->at('i')->parent; =head2 parse $dom = $dom->parse('I ♥ DOM::Tiny!'); -Parse HTML/XML fragment with L. +Parse HTML/XML fragment. # Parse XML - my $dom = DOM::Tiny->new->xml(1)->parse($xml); + my $dom = DOM::Tiny->new->xml(1)->parse('I ♥ DOM::Tiny!'); =head2 preceding @@ -764,8 +1030,8 @@ Parse HTML/XML fragment with L. my $collection = $dom->preceding('div ~ p'); Find all sibling elements before this node matching the CSS selector and return -a L object containing these elements as L -objects. All selectors from L are supported. +a L containing these elements as L +objects. All selectors listed in L are supported. # List tags of sibling elements before this node say $dom->preceding->map('tag')->join("\n"); @@ -774,8 +1040,8 @@ objects. All selectors from L are supported. my $collection = $dom->preceding_nodes; -Return a L object containing all sibling nodes before -this node as L objects. +Return a L containing all sibling nodes +before this node as L objects. # "A" $dom->parse('A

C

')->at('p')->preceding_nodes->first->content; @@ -816,7 +1082,7 @@ node's content. my $sibling = $dom->previous; -Return L object for previous sibling element or C if there +Return L object for previous sibling element, or C if there are no more siblings. # "

Test

" @@ -826,7 +1092,7 @@ are no more siblings. my $sibling = $dom->previous_node; -Return L object for previous sibling node or C if there are +Return L object for previous sibling node, or C if there are no more siblings. # "123" @@ -893,7 +1159,7 @@ This element's tag name. $dom = $dom->tap(sub {...}); -Alias for L. +Equivalent to L. =head2 text @@ -962,10 +1228,10 @@ C, C or C. my $value = $dom->val; Extract value from form element (such as C