Hello

=pod =encoding utf8 =head1 NAME DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors =head1 SYNOPSIS use DOM::Tiny; # Parse my $dom = DOM::Tiny->new('

Test

123

'); # Find say $dom->at('#b')->text; say $dom->find('p')->map('text')->join("\n"); say $dom->find('[id]')->map(attr => 'id')->join("\n"); # Iterate $dom->find('p[id]')->reverse->each(sub { say $_->{id} }); # Loop for my $e ($dom->find('p[id]')->each) { say $e->{id}, ':', $e->text; } # Modify $dom->find('div p')->last->append('

456

'); $dom->find(':not(p)')->map('strip'); # Render say "$dom"; =head1 DESCRIPTION L<DOM::Tiny> is a minimalistic and relaxed pure-perl HTML/XML DOM parser based on L<Mojo::DOM>. It supports the L<HTML Living Standard|https://html.spec.whatwg.org/> and L<Extensible Markup Language (XML) 1.0|http://www.w3.org/TR/xml/>, and matching based on L<CSS3 selectors|http://www.w3.org/TR/selectors/>. It will even try to interpret broken HTML and XML, so you should not use it for validation. =head1 NODES AND ELEMENTS When we parse an HTML/XML fragment, it gets turned into a tree of nodes. <!DOCTYPE html> <html> <head><title>Hello</title></head> <body>World!</body> </html> There are currently eight different kinds of nodes, C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>, C<root>, C<tag> and C<text>. Elements are nodes of the type C<tag>. root |- doctype (html) +- tag (html) |- tag (head) | +- tag (title) | +- raw (Hello) +- tag (body) +- text (World!) While all node types are represented as L<DOM::Tiny> objects, some methods like L</"attr"> and L</"namespace"> only apply to elements. =head1 CASE-SENSITIVITY L<DOM::Tiny> defaults to HTML semantics, that means all tags and attribute names are lowercased and selectors need to be lowercase as well. # HTML semantics my $dom = DOM::Tiny->new('

Hi!

'); say $dom->at('p[id]')->text; If XML processing instructions are found, the parser will automatically switch into XML mode and everything becomes case-sensitive. # XML semantics my $dom = DOM::Tiny->new('

Hi!

'); say $dom->at('P[ID]')->text; XML detection can also be disabled with the L method. # Force XML semantics my $dom = DOM::Tiny->new->xml(1)->parse('

Hi!

'); say $dom->at('P[ID]')->text; # Force HTML semantics my $dom = DOM::Tiny->new->xml(0)->parse('

Hi!

'); say $dom->at('p[id]')->text; =head1 SELECTORS L<DOM::Tiny> uses a CSS selector engine based on L<Mojo::DOM::CSS>. All CSS selectors that make sense for a standalone parser are supported. =over =item Z<>* Any element. my $all = $dom->find('*'); =item E An element of type C<E>. my $title = $dom->at('title'); =item E[foo] An C<E> element with a C<foo> attribute. my $links = $dom->find('a[href]'); =item E[foo="bar"] An C<E> element whose C<foo> attribute value is exactly equal to C<bar>. my $case_sensitive = $dom->find('input[type="hidden"]'); my $case_sensitive = $dom->find('input[type=hidden]'); =item E[foo="bar" i] An C<E> element whose C<foo> attribute value is exactly equal to any (ASCII-range) case-permutation of C<bar>. Note that this selector is EXPERIMENTAL and might change without warning! my $case_insensitive = $dom->find('input[type="hidden" i]'); my $case_insensitive = $dom->find('input[type=hidden i]'); my $case_insensitive = $dom->find('input[class~="foo" i]'); This selector is part of L<Selectors Level 4|http://dev.w3.org/csswg/selectors-4>, which is still a work in progress. =item E[foo~="bar"] An C<E> element whose C<foo> attribute value is a list of whitespace-separated values, one of which is exactly equal to C<bar>. my $foo = $dom->find('input[class~="foo"]'); my $foo = $dom->find('input[class~=foo]'); =item E[foo^="bar"] An C<E> element whose C<foo> attribute value begins exactly with the string C<bar>. my $begins_with = $dom->find('input[name^="f"]'); my $begins_with = $dom->find('input[name^=f]'); =item E[foo$="bar"] An C<E> element whose C<foo> attribute value ends exactly with the string C<bar>. my $ends_with = $dom->find('input[name$="o"]'); my $ends_with = $dom->find('input[name$=o]'); =item E[foo*="bar"] An C<E> element whose C<foo> attribute value contains the substring C<bar>. my $contains = $dom->find('input[name*="fo"]'); my $contains = $dom->find('input[name*=fo]'); =item E:root An C<E> element, root of the document. my $root = $dom->at(':root'); =item E:nth-child(n) An C<E> element, the C<n-th> child of its parent. 
my $third = $dom->find('div:nth-child(3)'); my $odd = $dom->find('div:nth-child(odd)'); my $even = $dom->find('div:nth-child(even)'); my $top3 = $dom->find('div:nth-child(-n+3)'); =item E:nth-last-child(n) An C<E> element, the C<n-th> child of its parent, counting from the last one. my $third = $dom->find('div:nth-last-child(3)'); my $odd = $dom->find('div:nth-last-child(odd)'); my $even = $dom->find('div:nth-last-child(even)'); my $bottom3 = $dom->find('div:nth-last-child(-n+3)'); =item E:nth-of-type(n) An C<E> element, the C<n-th> sibling of its type. my $third = $dom->find('div:nth-of-type(3)'); my $odd = $dom->find('div:nth-of-type(odd)'); my $even = $dom->find('div:nth-of-type(even)'); my $top3 = $dom->find('div:nth-of-type(-n+3)'); =item E:nth-last-of-type(n) An C<E> element, the C<n-th> sibling of its type, counting from the last one. my $third = $dom->find('div:nth-last-of-type(3)'); my $odd = $dom->find('div:nth-last-of-type(odd)'); my $even = $dom->find('div:nth-last-of-type(even)'); my $bottom3 = $dom->find('div:nth-last-of-type(-n+3)'); =item E:first-child An C<E> element, first child of its parent. my $first = $dom->find('div p:first-child'); =item E:last-child An C<E> element, last child of its parent. my $last = $dom->find('div p:last-child'); =item E:first-of-type An C<E> element, first sibling of its type. my $first = $dom->find('div p:first-of-type'); =item E:last-of-type An C<E> element, last sibling of its type. my $last = $dom->find('div p:last-of-type'); =item E:only-child An C<E> element, only child of its parent. my $lonely = $dom->find('div p:only-child'); =item E:only-of-type An C<E> element, only sibling of its type. my $lonely = $dom->find('div p:only-of-type'); =item E:empty An C<E> element that has no children (including text nodes). my $empty = $dom->find(':empty'); =item E:checked A user interface element C<E> which is checked (for instance a radio-button or checkbox). my $input = $dom->find(':checked'); =item E.warning An C<E> element whose class is "warning". 
my $warning = $dom->find('div.warning'); =item E#myid An C<E> element with C<ID> equal to "myid". my $foo = $dom->at('div#foo'); =item E:not(s) An C<E> element that does not match simple selector C<s>. my $others = $dom->find('div p:not(:first-child)'); =item E F An C<F> element descendant of an C<E> element. my $headlines = $dom->find('div h1'); =item E E<gt> F An C<F> element child of an C<E> element. my $headlines = $dom->find('html > body > div > h1'); =item E + F An C<F> element immediately preceded by an C<E> element. my $second = $dom->find('h1 + h2'); =item E ~ F An C<F> element preceded by an C<E> element. my $second = $dom->find('h1 ~ h2'); =item E, F, G Elements of type C<E>, C<F> and C<G>. my $headlines = $dom->find('h1, h2, h3'); =item E[foo=bar][bar=baz] An C<E> element whose attributes match all following attribute selectors. my $links = $dom->find('a[foo^=b][foo$=ar]'); =back =head1 OPERATORS L<DOM::Tiny> overloads the following operators. =head2 array my @nodes = @$dom; Alias for L</"child_nodes">. # "<!-- Test -->" $dom->parse('<!-- Test --><b>123</b>')->[0]; =head2 bool my $bool = !!$dom; Always true. =head2 hash my %attrs = %$dom; Alias for L</"attr">. # "test" $dom->parse('
Test
')->at('div')->{id}; =head2 stringify my $str = "$dom"; Alias for L</"to_string">. =head1 METHODS L<DOM::Tiny> implements the following methods. =head2 new my $dom = DOM::Tiny->new; my $dom = DOM::Tiny->new('I ♥ DOM::Tiny!'); Construct a new scalar-based L<DOM::Tiny> object and L</"parse"> HTML/XML fragment if necessary. =head2 all_text my $trimmed = $dom->all_text; my $untrimmed = $dom->all_text(0); Extract text content from all descendant nodes of this element; smart whitespace trimming is enabled by default. # "foo bar baz" $dom->parse("
foo\n
bar
baz\n
")->at('div')->all_text; # "foo\nbarbaz\n" $dom->parse("
foo\n
bar
baz\n
")->at('div')->all_text(0); =head2 ancestors my $collection = $dom->ancestors; my $collection = $dom->ancestors('div ~ p'); Find all ancestor elements of this node matching the CSS selector and return a L containing these elements as L objects. All selectors listed in L are supported. # List tag names of ancestor elements say $dom->ancestors->map('tag')->join("\n"); =head2 append $dom = $dom->append('
I ♥ DOM::Tiny!
'); Append HTML/XML fragment to this node. # "
Test
123
" $dom->parse('
Test
') ->at('h1')->append('
123
')->root; # "
Test 123
" $dom->parse('
Test
')->at('p') ->child_nodes->first->append(' 123')->root; =head2 append_content $dom = $dom->append_content('
I ♥ DOM::Tiny!
'); Append HTML/XML fragment (for C and C nodes) or raw content to this node's content. # "
Test123
" $dom->parse('
Test
') ->at('h1')->append_content('123')->root; # "
" $dom->parse('
') ->child_nodes->first->append_content('123 ')->root; # "
Test123
" $dom->parse('
Test
')->at('p')->append_content('123')->root; =head2 at my $result = $dom->at('div ~ p'); Find first descendant element of this element matching the CSS selector and return it as a L object or return C if none could be found. All selectors listed in L are supported. # Find first element with "svg" namespace definition my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'}; =head2 attr my $hash = $dom->attr; my $foo = $dom->attr('foo'); $dom = $dom->attr({foo => 'bar'}); $dom = $dom->attr(foo => 'bar'); This element's attributes. # Remove an attribute delete $dom->attr->{id}; # Attribute without value $dom->attr(selected => undef); # List id attributes say $dom->find('*')->map(attr => 'id')->compact->join("\n"); =head2 child_nodes my $collection = $dom->child_nodes; Return a L containing all child nodes of this element as L objects. # "
123
" $dom->parse('
Test123
')->at('p')->child_nodes->first->remove; # "" $dom->parse('123')->child_nodes->first; # " Test " $dom->parse('123')->child_nodes->last->content; =head2 children my $collection = $dom->children; my $collection = $dom->children('div ~ p'); Find all child elements of this element matching the CSS selector and return a L containing these elements as L objects. All selectors listed in L are supported. # Show tag name of random child element say $dom->children->shuffle->first->tag; =head2 content my $str = $dom->content; $dom = $dom->content('
I ♥ DOM::Tiny!
'); Return this node's content or replace it with HTML/XML fragment (for C and C nodes) or raw content. # "Test" $dom->parse('
Test
')->at('div')->content; # "
123
" $dom->parse('
Test
')->at('h1')->content('123')->root; # "
123
" $dom->parse('
Test
')->at('p')->content('123')->root; # "
" $dom->parse('
Test
')->at('h1')->content('')->root; # " Test " $dom->parse('
')->child_nodes->first->content; # "
456
" $dom->parse('
456
') ->at('div')->child_nodes->first->content(' 123 ')->root; =head2 descendant_nodes my $collection = $dom->descendant_nodes; Return a L containing all descendant nodes of this element as L objects. # "
123
" $dom->parse('
123
') ->descendant_nodes->grep(sub { $_->type eq 'comment' }) ->map('remove')->first; # "
testtest
" $dom->parse('
123456
') ->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' }) ->map(content => 'test')->first->root; =head2 find my $collection = $dom->find('div ~ p'); Find all descendant elements of this element matching the CSS selector and return a L containing these elements as L objects. All selectors listed in L are supported. # Find a specific element and extract information my $id = $dom->find('div')->[23]{id}; # Extract information from multiple elements my @headers = $dom->find('h1, h2, h3')->map('text')->each; # Count all the different tags my $hash = $dom->find('*')->reduce(sub { $a->{$b->tag}++; $a }, {}); # Find elements with a class that contains dots my @divs = $dom->find('div.foo\.bar')->each; =head2 following my $collection = $dom->following; my $collection = $dom->following('div ~ p'); Find all sibling elements after this node matching the CSS selector and return a L containing these elements as L objects. All selectors listed in L are supported. # List tags of sibling elements after this node say $dom->following->map('tag')->join("\n"); =head2 following_nodes my $collection = $dom->following_nodes; Return a L containing all sibling nodes after this node as L objects. # "C" $dom->parse('
A
C')->at('p')->following_nodes->last->content; =head2 matches my $bool = $dom->matches('div ~ p'); Check if this element matches the CSS selector. All selectors listed in L are supported. # True $dom->parse('
A
')->at('p')->matches('.a'); $dom->parse('
A
')->at('p')->matches('p[class]'); # False $dom->parse('
A
')->at('p')->matches('.b'); $dom->parse('
A
')->at('p')->matches('p[id]'); =head2 namespace my $namespace = $dom->namespace; Find this element's namespace or return C if none could be found. # Find namespace for an element with namespace prefix my $namespace = $dom->at('svg > svg\:circle')->namespace; # Find namespace for an element that may or may not have a namespace prefix my $namespace = $dom->at('svg > circle')->namespace; =head2 next my $sibling = $dom->next; Return L object for next sibling element or C if there are no more siblings. # "
123
" $dom->parse('
Test
123
')->at('h1')->next; =head2 next_node my $sibling = $dom->next_node; Return L object for next sibling node or C if there are no more siblings. # "456" $dom->parse('
123456
') ->at('b')->next_node->next_node; # " Test " $dom->parse('
123456
') ->at('b')->next_node->content; =head2 parent my $parent = $dom->parent; Return L object for parent of this node or C if this node has no parent. =head2 parse $dom = $dom->parse('I ♥ DOM::Tiny!'); Parse HTML/XML fragment. # Parse XML my $dom = DOM::Tiny->new->xml(1)->parse($xml); =head2 preceding my $collection = $dom->preceding; my $collection = $dom->preceding('div ~ p'); Find all sibling elements before this node matching the CSS selector and return a L containing these elements as L objects. All selectors listed in L are supported. # List tags of sibling elements before this node say $dom->preceding->map('tag')->join("\n"); =head2 preceding_nodes my $collection = $dom->preceding_nodes; Return a L containing all sibling nodes before this node as L objects. # "A" $dom->parse('A
C
')->at('p')->preceding_nodes->first->content; =head2 prepend $dom = $dom->prepend('
I ♥ DOM::Tiny!
'); Prepend HTML/XML fragment to this node. # "
Test
123
" $dom->parse('
123
') ->at('h2')->prepend('
Test
')->root; # "
Test 123
" $dom->parse('
123
') ->at('p')->child_nodes->first->prepend('Test ')->root; =head2 prepend_content $dom = $dom->prepend_content('
I ♥ DOM::Tiny!
'); Prepend HTML/XML fragment (for C and C nodes) or raw content to this node's content. # "
Test123
" $dom->parse('
123
') ->at('h2')->prepend_content('Test')->root; # "
" $dom->parse('
') ->child_nodes->first->prepend_content(' Test')->root; # "
123Test
" $dom->parse('
Test
')->at('p')->prepend_content('123')->root; =head2 previous my $sibling = $dom->previous; Return L object for previous sibling element or C if there are no more siblings. # "
Test
" $dom->parse('
Test
123
')->at('h2')->previous; =head2 previous_node my $sibling = $dom->previous_node; Return L object for previous sibling node or C if there are no more siblings. # "123" $dom->parse('
123456
') ->at('b')->previous_node->previous_node; # " Test " $dom->parse('
123456
') ->at('b')->previous_node->content; =head2 remove my $parent = $dom->remove; Remove this node and return L (for C nodes) or L. # "
" $dom->parse('
Test
')->at('h1')->remove; # "
456
" $dom->parse('
123456
') ->at('p')->child_nodes->first->remove->root; =head2 replace my $parent = $dom->replace('
I ♥ DOM::Tiny!
'); Replace this node with HTML/XML fragment and return L (for C nodes) or L. # "
123
" $dom->parse('
Test
')->at('h1')->replace('
123
'); # "
123
" $dom->parse('
Test
') ->at('p')->child_nodes->[0]->replace('123')->root; =head2 root my $root = $dom->root; Return L object for C node. =head2 strip my $parent = $dom->strip; Remove this element while preserving its content and return L. # "
Test
" $dom->parse('
Test
')->at('h1')->strip; =head2 tag my $tag = $dom->tag; $dom = $dom->tag('div'); This element's tag name. # List tag names of child elements say $dom->children->map('tag')->join("\n"); =head2 tap $dom = $dom->tap(sub {...}); Equivalent to L. =head2 text my $trimmed = $dom->text; my $untrimmed = $dom->text(0); Extract text content from this element only (not including child elements), smart whitespace trimming is enabled by default. # "foo baz" $dom->parse("
foo\n
bar
baz\n
")->at('div')->text; # "foo\nbaz\n" $dom->parse("
foo\n
bar
baz\n
")->at('div')->text(0); =head2 to_string my $str = $dom->to_string; Render this node and its content to HTML/XML. # "Test" $dom->parse('
Test
')->at('div b')->to_string; =head2 tree my $tree = $dom->tree; $dom = $dom->tree(['root']); Document Object Model. Note that this structure should only be used very carefully since it is very dynamic. =head2 type my $type = $dom->type; This node's type, usually C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>, C<root>, C<tag> or C<text>. # "cdata" $dom->parse('<![CDATA[Test]]>')->child_nodes->first->type; # "comment" $dom->parse('<!-- Test -->')->child_nodes->first->type; # "doctype" $dom->parse('<!DOCTYPE html>')->child_nodes->first->type; # "pi" $dom->parse('<?xml version="1.0"?>')->child_nodes->first->type; # "raw" $dom->parse('<title>Test</title>')->at('title')->child_nodes->first->type; # "root" $dom->parse('
Test
')->type; # "tag" $dom->parse('
Test
')->at('p')->type; # "text" $dom->parse('
Test
')->at('p')->child_nodes->first->type; =head2 val my $value = $dom->val; Extract value from form element (such as C