X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=README.pod;h=e8ee528cca51312c2a0b585ce68cc587c1665de8;hb=03eb552102cee7ee2bb30ceeb6711c31d3c317cc;hp=91da133e05f813dc5c407d924cc70e9c5f829258;hpb=e99ef07d6c8b76be11c70996ce7edf40562b4625;p=catagits%2FDOM-Tiny.git diff --git a/README.pod b/README.pod index 91da133..e8ee528 100644 --- a/README.pod +++ b/README.pod @@ -35,9 +35,12 @@ DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors =head1 DESCRIPTION -L is a minimalistic and relaxed HTML/XML DOM parser with CSS -selector support based on L. It will even try to interpret broken -HTML and XML, so you should not use it for validation. +L is a minimalistic and relaxed pure-perl HTML/XML DOM parser based +on L. It supports the L +and L, and +matching based on L. It will +even try to interpret broken HTML and XML, so you should not use it for +validation. =head1 NODES AND ELEMENTS @@ -91,6 +94,264 @@ XML detection can also be disabled with the L method. my $dom = DOM::Tiny->new->xml(0)->parse('

Hi!

'); say $dom->at('p[id]')->text; +=head1 SELECTORS + +L uses a CSS selector engine based on L. All CSS +selectors that make sense for a standalone parser are supported. + +=over + +=item Z<>* + +Any element. + + my $all = $dom->find('*'); + +=item E + +An element of type C. + + my $title = $dom->at('title'); + +=item E[foo] + +An C element with a C attribute. + + my $links = $dom->find('a[href]'); + +=item E[foo="bar"] + +An C element whose C attribute value is exactly equal to C. + + my $case_sensitive = $dom->find('input[type="hidden"]'); + my $case_sensitive = $dom->find('input[type=hidden]'); + +=item E[foo="bar" i] + +An C element whose C attribute value is exactly equal to any +(ASCII-range) case-permutation of C. Note that this selector is +EXPERIMENTAL and might change without warning! + + my $case_insensitive = $dom->find('input[type="hidden" i]'); + my $case_insensitive = $dom->find('input[type=hidden i]'); + my $case_insensitive = $dom->find('input[class~="foo" i]'); + +This selector is part of +L, which is still a work +in progress. + +=item E[foo~="bar"] + +An C element whose C attribute value is a list of whitespace-separated +values, one of which is exactly equal to C. + + my $foo = $dom->find('input[class~="foo"]'); + my $foo = $dom->find('input[class~=foo]'); + +=item E[foo^="bar"] + +An C element whose C attribute value begins exactly with the string +C. + + my $begins_with = $dom->find('input[name^="f"]'); + my $begins_with = $dom->find('input[name^=f]'); + +=item E[foo$="bar"] + +An C element whose C attribute value ends exactly with the string +C. + + my $ends_with = $dom->find('input[name$="o"]'); + my $ends_with = $dom->find('input[name$=o]'); + +=item E[foo*="bar"] + +An C element whose C attribute value contains the substring C. + + my $contains = $dom->find('input[name*="fo"]'); + my $contains = $dom->find('input[name*=fo]'); + +=item E:root + +An C element, root of the document. 
+ + my $root = $dom->at(':root'); + +=item E:nth-child(n) + +An C element, the C child of its parent. + + my $third = $dom->find('div:nth-child(3)'); + my $odd = $dom->find('div:nth-child(odd)'); + my $even = $dom->find('div:nth-child(even)'); + my $top3 = $dom->find('div:nth-child(-n+3)'); + +=item E:nth-last-child(n) + +An C element, the C child of its parent, counting from the last one. + + my $third = $dom->find('div:nth-last-child(3)'); + my $odd = $dom->find('div:nth-last-child(odd)'); + my $even = $dom->find('div:nth-last-child(even)'); + my $bottom3 = $dom->find('div:nth-last-child(-n+3)'); + +=item E:nth-of-type(n) + +An C element, the C sibling of its type. + + my $third = $dom->find('div:nth-of-type(3)'); + my $odd = $dom->find('div:nth-of-type(odd)'); + my $even = $dom->find('div:nth-of-type(even)'); + my $top3 = $dom->find('div:nth-of-type(-n+3)'); + +=item E:nth-last-of-type(n) + +An C element, the C sibling of its type, counting from the last one. + + my $third = $dom->find('div:nth-last-of-type(3)'); + my $odd = $dom->find('div:nth-last-of-type(odd)'); + my $even = $dom->find('div:nth-last-of-type(even)'); + my $bottom3 = $dom->find('div:nth-last-of-type(-n+3)'); + +=item E:first-child + +An C element, first child of its parent. + + my $first = $dom->find('div p:first-child'); + +=item E:last-child + +An C element, last child of its parent. + + my $last = $dom->find('div p:last-child'); + +=item E:first-of-type + +An C element, first sibling of its type. + + my $first = $dom->find('div p:first-of-type'); + +=item E:last-of-type + +An C element, last sibling of its type. + + my $last = $dom->find('div p:last-of-type'); + +=item E:only-child + +An C element, only child of its parent. + + my $lonely = $dom->find('div p:only-child'); + +=item E:only-of-type + +An C element, only sibling of its type. + + my $lonely = $dom->find('div p:only-of-type'); + +=item E:empty + +An C element that has no children (including text nodes). 
+ + my $empty = $dom->find(':empty'); + +=item E:checked + +A user interface element C<E> which is checked (for instance a radio-button or +checkbox). + + my $input = $dom->find(':checked'); + +=item E.warning + +An C<E> element whose class is "warning". + + my $warning = $dom->find('div.warning'); + +=item E#myid + +An C<E> element with C<ID> equal to "myid". + + my $foo = $dom->at('div#foo'); + +=item E:not(s) + +An C<E> element that does not match simple selector C<s>. + + my $others = $dom->find('div p:not(:first-child)'); + +=item E F + +An C<F> element descendant of an C<E> element. + + my $headlines = $dom->find('div h1'); + +=item E E<gt> F + +An C<F> element child of an C<E> element. + + my $headlines = $dom->find('html > body > div > h1'); + +=item E + F + +An C<F> element immediately preceded by an C<E> element. + + my $second = $dom->find('h1 + h2'); + +=item E ~ F + +An C<F> element preceded by an C<E> element. + + my $second = $dom->find('h1 ~ h2'); + +=item E, F, G + +Elements of type C<E>, C<F> and C<G>. + + my $headlines = $dom->find('h1, h2, h3'); + +=item E[foo=bar][bar=baz] + +An C<E> element whose attributes match all following attribute selectors. + + my $links = $dom->find('a[foo^=b][foo$=ar]'); + +=back + +=head1 OPERATORS + +L overloads the following operators. + +=head2 array + + my @nodes = @$dom; + +Alias for L. + + # "" + $dom->parse('123')->[0]; + +=head2 bool + + my $bool = !!$dom; + +Always true. + +=head2 hash + + my %attrs = %$dom; + +Alias for L. + + # "test" + $dom->parse('
Test
')->at('div')->{id}; + +=head2 stringify + + my $str = "$dom"; + +Alias for L. + =head1 METHODS L implements the following methods. @@ -123,8 +384,8 @@ whitespace trimming is enabled by default. my $collection = $dom->ancestors('div ~ p'); Find all ancestor elements of this node matching the CSS selector and return a -L object containing these elements as L -objects. All selectors from L are supported. +L containing these elements as L +objects. All selectors listed in L are supported. # List tag names of ancestor elements say $dom->ancestors->map('tag')->join("\n"); @@ -167,7 +428,7 @@ node's content. Find first descendant element of this element matching the CSS selector and return it as a L object or return C if none could be found. -All selectors from L are supported. +All selectors listed in L are supported. # Find first element with "svg" namespace definition my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'}; @@ -194,7 +455,7 @@ This element's attributes. my $collection = $dom->child_nodes; -Return a L object containing all child nodes of this +Return a L containing all child nodes of this element as L objects. # "

123

" @@ -212,8 +473,8 @@ element as L objects. my $collection = $dom->children('div ~ p'); Find all child elements of this element matching the CSS selector and return a -L object containing these elements as L -objects. All selectors from L are supported. +L containing these elements as L +objects. All selectors listed in L are supported. # Show tag name of random child element say $dom->children->shuffle->first->tag; @@ -249,7 +510,7 @@ and C nodes) or raw content. my $collection = $dom->descendant_nodes; -Return a L object containing all descendant nodes of +Return a L containing all descendant nodes of this element as L objects. # "

123

" @@ -267,9 +528,8 @@ this element as L objects. my $collection = $dom->find('div ~ p'); Find all descendant elements of this element matching the CSS selector and -return a L object containing these elements as -L objects. All selectors from L are -supported. +return a L containing these elements as +L objects. All selectors listed in L are supported. # Find a specific element and extract information my $id = $dom->find('div')->[23]{id}; @@ -289,8 +549,8 @@ supported. my $collection = $dom->following('div ~ p'); Find all sibling elements after this node matching the CSS selector and return -a L object containing these elements as L -objects. All selectors from L are supported. +a L containing these elements as L +objects. All selectors listen in L are supported. # List tags of sibling elements after this node say $dom->following->map('tag')->join("\n"); @@ -299,7 +559,7 @@ objects. All selectors from L are supported. my $collection = $dom->following_nodes; -Return a L object containing all sibling nodes after +Return a L containing all sibling nodes after this node as L objects. # "C" @@ -309,8 +569,8 @@ this node as L objects. my $bool = $dom->matches('div ~ p'); -Check if this element matches the CSS selector. All selectors from -L are supported. +Check if this element matches the CSS selector. All selectors listed in +L are supported. # True $dom->parse('

A

')->at('p')->matches('.a'); @@ -368,7 +628,7 @@ no parent. $dom = $dom->parse('I ♥ DOM::Tiny!'); -Parse HTML/XML fragment with L. +Parse HTML/XML fragment. # Parse XML my $dom = DOM::Tiny->new->xml(1)->parse($xml); @@ -379,8 +639,8 @@ Parse HTML/XML fragment with L. my $collection = $dom->preceding('div ~ p'); Find all sibling elements before this node matching the CSS selector and return -a L object containing these elements as L -objects. All selectors from L are supported. +a L containing these elements as L +objects. All selectors listed in L are supported. # List tags of sibling elements before this node say $dom->preceding->map('tag')->join("\n"); @@ -389,8 +649,8 @@ objects. All selectors from L are supported. my $collection = $dom->preceding_nodes; -Return a L object containing all sibling nodes before -this node as L objects. +Return a L containing all sibling nodes +before this node as L objects. # "A" $dom->parse('A

C

')->at('p')->preceding_nodes->first->content; @@ -639,39 +899,216 @@ children of the first innermost element. Disable HTML semantics in parser and activate case-sensitivity, defaults to auto detection based on processing instructions. -=head1 OPERATORS +=head1 COLLECTION METHODS -L overloads the following operators. +Some L methods return an array-based collection object based on +L, which can either be accessed directly as an array +reference, or with the following methods. -=head2 array + # Chain methods + $collection->map(sub { ucfirst })->shuffle->each(sub { + my ($word, $num) = @_; + say "$num: $word"; + }); - my @nodes = @$dom; + # Access array directly to manipulate collection + $collection->[23] += 100; + say for @$collection; -Alias for L. +=head2 compact - # "" - $dom->parse('123')->[0]; + my $new = $collection->compact; -=head2 bool +Create a new collection with all elements that are defined and not an empty +string. - my $bool = !!$dom; + # $collection contains (0, 1, undef, 2, '', 3) + $collection->compact->join(', '); # "0, 1, 2, 3" -Always true. +=head2 each -=head2 hash + my @elements = $collection->each; + $collection = $collection->each(sub {...}); - my %attrs = %$dom; +Evaluate callback for each element in collection or return all elements as a +list if none has been provided. The element will be the first argument passed +to the callback and is also available as C<$_>. -Alias for L. + # Make a numbered list + $collection->each(sub { + my ($e, $num) = @_; + say "$num: $e"; + }); - # "test" - $dom->parse('
Test
')->at('div')->{id}; +=head2 first -=head2 stringify + my $first = $collection->first; + my $first = $collection->first(qr/foo/); + my $first = $collection->first(sub {...}); + my $first = $collection->first($method); + my $first = $collection->first($method, @args); - my $str = "$dom"; +Evaluate regular expression/callback for, or call method on, each element in +collection and return the first one that matched the regular expression, or for +which the callback/method returned true. The element will be the first argument +passed to the callback and is also available as C<$_>. -Alias for L. + # Longer version + my $first = $collection->first(sub { $_->$method(@args) }); + + # Find first value that contains the word "tiny" + my $interesting = $collection->first(qr/tiny/i); + + # Find first value that is greater than 5 + my $greater = $collection->first(sub { $_ > 5 }); + +=head2 flatten + + my $new = $collection->flatten; + +Flatten nested collections/arrays recursively and create a new collection with +all elements. + + # $collection contains (1, [2, [3, 4], 5, [6]], 7) + $collection->flatten->join(', '); # "1, 2, 3, 4, 5, 6, 7" + +=head2 grep + + my $new = $collection->grep(qr/foo/); + my $new = $collection->grep(sub {...}); + my $new = $collection->grep($method); + my $new = $collection->grep($method, @args); + +Evaluate regular expression/callback for, or call method on, each element in +collection and create a new collection with all elements that matched the +regular expression, or for which the callback/method returned true. The element +will be the first argument passed to the callback and is also available as +C<$_>. 
+ + # Longer version + my $new = $collection->grep(sub { $_->$method(@args) }); + + # Find all values that contain the word "tiny" + my $interesting = $collection->grep(qr/tiny/i); + + # Find all values that are greater than 5 + my $greater = $collection->grep(sub { $_ > 5 }); + +=head2 join + + my $stream = $collection->join; + my $stream = $collection->join("\n"); + +Turn collection into string. + + # Join all values with commas + $collection->join(', '); + +=head2 last + + my $last = $collection->last; + +Return the last element in collection. + +=head2 map + + my $new = $collection->map(sub {...}); + my $new = $collection->map($method); + my $new = $collection->map($method, @args); + +Evaluate callback for, or call method on, each element in collection and create +a new collection from the results. The element will be the first argument +passed to the callback and is also available as C<$_>. + + # Longer version + my $new = $collection->map(sub { $_->$method(@args) }); + + # Append the word "tiny" to all values + my $domified = $collection->map(sub { $_ . 'tiny' }); + +=head2 reduce + + my $result = $collection->reduce(sub {...}); + my $result = $collection->reduce(sub {...}, $initial); + +Reduce elements in collection with callback, the first element will be used as +initial value if none has been provided. + + # Calculate the sum of all values + my $sum = $collection->reduce(sub { $a + $b }); + + # Count how often each value occurs in collection + my $hash = $collection->reduce(sub { $a->{$b}++; $a }, {}); + +=head2 reverse + + my $new = $collection->reverse; + +Create a new collection with all elements in reverse order. + +=head2 slice + + my $new = $collection->slice(4 .. 7); + +Create a new collection with all selected elements. + + # $collection contains ('A', 'B', 'C', 'D', 'E') + $collection->slice(1, 2, 4)->join(' '); # "B C E" + +=head2 shuffle + + my $new = $collection->shuffle; + +Create a new collection with all elements in random order. 
+ +=head2 size + + my $size = $collection->size; + +Number of elements in collection. + +=head2 sort + + my $new = $collection->sort; + my $new = $collection->sort(sub {...}); + +Sort elements based on return value of callback and create a new collection +from the results. + + # Sort values case-insensitive + my $case_insensitive = $collection->sort(sub { uc($a) cmp uc($b) }); + +=head2 tap + + $collection = $collection->tap(sub {...}); + +Equivalent to L. + +=head2 to_array + + my $array = $collection->to_array; + +Turn collection into array reference. + +=head2 uniq + + my $new = $collection->uniq; + my $new = $collection->uniq(sub {...}); + my $new = $collection->uniq($method); + my $new = $collection->uniq($method, @args); + +Create a new collection without duplicate elements, using the string +representation of either the elements or the return value of the +callback/method. + + # Longer version + my $new = $collection->uniq(sub { $_->$method(@args) }); + + # $collection contains ('foo', 'bar', 'bar', 'baz') + $collection->uniq->join(' '); # "foo bar baz" + + # $collection contains ([1, 2], [2, 1], [3, 2]) + $collection->uniq(sub{ $_->[1] })->to_array; # "[[1, 2], [2, 1]]" =head1 BUGS @@ -691,6 +1128,8 @@ This is free software, licensed under: =head1 SEE ALSO -L, L, L, L, L +L, L, L, L, L + +=for Pod::Coverage TO_JSON =cut