=pod

=encoding utf8

=head1 NAME

DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors

=head1 SYNOPSIS

  use DOM::Tiny;

  # Parse
  my $dom = DOM::Tiny->new('<div><p id="a">Test</p><p id="b">123</p></div>');

  # Find
  say $dom->at('#b')->text;
  say $dom->find('p')->map('text')->join("\n");
  say $dom->find('[id]')->map(attr => 'id')->join("\n");

  # Iterate
  $dom->find('p[id]')->reverse->each(sub { say $_->{id} });

  # Loop
  for my $e ($dom->find('p[id]')->each) {
    say $e->{id}, ':', $e->text;
  }

  # Modify
  $dom->find('div p')->last->append('<p id="c">456</p>');
  $dom->find(':not(p)')->map('strip');

  # Render
  say "$dom";

=head1 DESCRIPTION

L<DOM::Tiny> is a minimalistic and relaxed pure-perl HTML/XML DOM parser with
support for the L<HTML Living Standard|https://html.spec.whatwg.org/> and
L<Extensible Markup Language (XML) 1.0|http://www.w3.org/TR/xml/> based on
L<Mojo::DOM>. It will even try to interpret broken HTML and XML, so you should
not use it for validation.

=head1 NODES AND ELEMENTS

When we parse an HTML/XML fragment, it gets turned into a tree of nodes.

  <!DOCTYPE html>
  <html>
    <head><title>Hello</title></head>
    <body>World!</body>
  </html>

There are currently eight different kinds of nodes, C<cdata>, C<comment>,
C<doctype>, C<pi>, C<raw>, C<root>, C<tag> and C<text>. Elements are nodes of
the type C<tag>.

  root
  |- doctype (html)
  +- tag (html)
     |- tag (head)
     |  +- tag (title)
     |     +- raw (Hello)
     +- tag (body)
        +- text (World!)

While all node types are represented as L<DOM::Tiny> objects, some methods like
L</"attr"> and L</"namespace"> only apply to elements.

=head1 CASE-SENSITIVITY

L<DOM::Tiny> defaults to HTML semantics, that means all tags and attribute
names are lowercased and selectors need to be lowercase as well.

  # HTML semantics
  my $dom = DOM::Tiny->new('<P ID="greeting">Hi!</P>');
  say $dom->at('p[id]')->text;

If XML processing instructions are found, the parser will automatically switch
into XML mode and everything becomes case-sensitive.

  # XML semantics
  my $dom = DOM::Tiny->new('<?xml version="1.0"?><P ID="greeting">Hi!</P>');
  say $dom->at('P[ID]')->text;

XML detection can also be disabled with the L</"xml"> method.

  # Force XML semantics
  my $dom = DOM::Tiny->new->xml(1)->parse('<P ID="greeting">Hi!</P>');
  say $dom->at('P[ID]')->text;

  # Force HTML semantics
  my $dom = DOM::Tiny->new->xml(0)->parse('<P ID="greeting">Hi!</P>');
  say $dom->at('p[id]')->text;

=head1 METHODS

L<DOM::Tiny> implements the following methods.

=head2 new

  my $dom = DOM::Tiny->new;
  my $dom = DOM::Tiny->new('<foo bar="baz">I ♥ DOM::Tiny!</foo>');

Construct a new scalar-based L<DOM::Tiny> object and L</"parse"> HTML/XML
fragment if necessary.

=head2 all_text

  my $trimmed   = $dom->all_text;
  my $untrimmed = $dom->all_text(0);

Extract text content from all descendant nodes of this element, smart
whitespace trimming is enabled by default.

  # "foo bar baz"
  $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->all_text;

  # "foo\nbarbaz\n"
  $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->all_text(0);

=head2 ancestors

  my $collection = $dom->ancestors;
  my $collection = $dom->ancestors('div ~ p');

Find all ancestor elements of this node matching the CSS selector and return a
L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.

  # List tag names of ancestor elements
  say $dom->ancestors->map('tag')->join("\n");

=head2 append

  $dom = $dom->append('<p>I ♥ DOM::Tiny!</p>');

Append HTML/XML fragment to this node.

  # "<div><h1>Test</h1><h2>123</h2></div>"
  $dom->parse('<div><h1>Test</h1></div>')
    ->at('h1')->append('<h2>123</h2>')->root;

  # "<p>Test 123</p>"
  $dom->parse('<p>Test</p>')->at('p')
    ->child_nodes->first->append(' 123')->root;

=head2 append_content

  $dom = $dom->append_content('<p>I ♥ DOM::Tiny!</p>');

Append HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
node's content.

  # "<div><h1>Test123</h1></div>"
  $dom->parse('<div><h1>Test</h1></div>')
    ->at('h1')->append_content('123')->root;

  # "<!-- Test 123 --><br>"
  $dom->parse('<!-- Test --><br>')
    ->child_nodes->first->append_content('123 ')->root;

  # "<p>Test<i>123</i></p>"
  $dom->parse('<p>Test</p>')->at('p')->append_content('<i>123</i>')->root;

=head2 at

  my $result = $dom->at('div ~ p');

Find first descendant element of this element matching the CSS selector and
return it as a L<DOM::Tiny> object or return C<undef> if none could be found.
All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.

  # Find first element with "svg" namespace definition
  my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'};

=head2 attr

  my $hash = $dom->attr;
  my $foo  = $dom->attr('foo');
  $dom     = $dom->attr({foo => 'bar'});
  $dom     = $dom->attr(foo => 'bar');

This element's attributes.

  # Remove an attribute
  delete $dom->attr->{id};

  # Attribute without value
  $dom->attr(selected => undef);

  # List id attributes
  say $dom->find('*')->map(attr => 'id')->compact->join("\n");

=head2 child_nodes

  my $collection = $dom->child_nodes;

Return a L<DOM::Tiny::Collection> object containing all child nodes of this
element as L<DOM::Tiny> objects.

  # "<p><b>123</b></p>"
  $dom->parse('<p>Test<b>123</b></p>')->at('p')->child_nodes->first->remove;

  # "<!DOCTYPE html>"
  $dom->parse('<!DOCTYPE html><b>123</b>')->child_nodes->first;

  # " Test "
  $dom->parse('<b>123</b><!-- Test -->')->child_nodes->last->content;

=head2 children

  my $collection = $dom->children;
  my $collection = $dom->children('div ~ p');

Find all child elements of this element matching the CSS selector and return a
L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.

  # Show tag name of random child element
  say $dom->children->shuffle->first->tag;

=head2 content

  my $str = $dom->content;
  $dom    = $dom->content('<p>I ♥ DOM::Tiny!</p>');

Return this node's content or replace it with HTML/XML fragment (for C<root>
and C<tag> nodes) or raw content.

  # "<b>Test</b>"
  $dom->parse('<div><b>Test</b></div>')->at('div')->content;

  # "<div><h1>123</h1></div>"
  $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('123')->root;

  # "<p><i>123</i></p>"
  $dom->parse('<p>Test</p>')->at('p')->content('<i>123</i>')->root;

  # "<div><h1></h1></div>"
  $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('')->root;

  # " Test "
  $dom->parse('<!-- Test --><br>')->child_nodes->first->content;

  # "<div><!-- 123 -->456</div>"
  $dom->parse('<div><!-- Test -->456</div>')
    ->at('div')->child_nodes->first->content(' 123 ')->root;

=head2 descendant_nodes

  my $collection = $dom->descendant_nodes;

Return a L<DOM::Tiny::Collection> object containing all descendant nodes of
this element as L<DOM::Tiny> objects.

  # "<p><b>123</b></p>"
  $dom->parse('<p><!-- Test --><b>123<!-- 456 --></b></p>')
    ->descendant_nodes->grep(sub { $_->type eq 'comment' })
    ->map('remove')->first;

  # "<p><b>test</b>test</p>"
  $dom->parse('<p><b>123</b>456</p>')
    ->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' })
    ->map(content => 'test')->first->root;

=head2 find

  my $collection = $dom->find('div ~ p');

Find all descendant elements of this element matching the CSS selector and
return a L<DOM::Tiny::Collection> object containing these elements as
L<DOM::Tiny> objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are
supported.

  # Find a specific element and extract information
  my $id = $dom->find('div')->[23]{id};

  # Extract information from multiple elements
  my @headers = $dom->find('h1, h2, h3')->map('text')->each;

  # Count all the different tags
  my $hash = $dom->find('*')->reduce(sub { $a->{$b->tag}++; $a }, {});

  # Find elements with a class that contains dots
  my @divs = $dom->find('div.foo\.bar')->each;

=head2 following

  my $collection = $dom->following;
  my $collection = $dom->following('div ~ p');

Find all sibling elements after this node matching the CSS selector and return
a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.

  # List tags of sibling elements after this node
  say $dom->following->map('tag')->join("\n");

=head2 following_nodes

  my $collection = $dom->following_nodes;

Return a L<DOM::Tiny::Collection> object containing all sibling nodes after
this node as L<DOM::Tiny> objects.

  # "C"
  $dom->parse('<p>A</p><!-- B -->C')->at('p')->following_nodes->last->content;

=head2 matches

  my $bool = $dom->matches('div ~ p');

Check if this element matches the CSS selector. All selectors from
L<DOM::Tiny::CSS/"SELECTORS"> are supported.

  # True
  $dom->parse('<p class="a">A</p>')->at('p')->matches('.a');
  $dom->parse('<p class="a">A</p>')->at('p')->matches('p[class]');

  # False
  $dom->parse('<p class="a">A</p>')->at('p')->matches('.b');
  $dom->parse('<p class="a">A</p>')->at('p')->matches('p[id]');

=head2 namespace

  my $namespace = $dom->namespace;

Find this element's namespace or return C<undef> if none could be found.

  # Find namespace for an element with namespace prefix
  my $namespace = $dom->at('svg > svg\:circle')->namespace;

  # Find namespace for an element that may or may not have a namespace prefix
  my $namespace = $dom->at('svg > circle')->namespace;

=head2 next

  my $sibling = $dom->next;

Return L<DOM::Tiny> object for next sibling element or C<undef> if there are no
more siblings.

  # "<h2>123</h2>"
  $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h1')->next;

=head2 next_node

  my $sibling = $dom->next_node;

Return L<DOM::Tiny> object for next sibling node or C<undef> if there are no
more siblings.

  # "456"
  $dom->parse('<p><b>123</b><!-- Test -->456</p>')
    ->at('b')->next_node->next_node;

  # " Test "
  $dom->parse('<p><b>123</b><!-- Test -->456</p>')
    ->at('b')->next_node->content;

=head2 parent

  my $parent = $dom->parent;

Return L<DOM::Tiny> object for parent of this node or C<undef> if this node has
no parent.

=head2 parse

  $dom = $dom->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>');

Parse HTML/XML fragment with L<DOM::Tiny::HTML>.

  # Parse XML
  my $dom = DOM::Tiny->new->xml(1)->parse($xml);

=head2 preceding

  my $collection = $dom->preceding;
  my $collection = $dom->preceding('div ~ p');

Find all sibling elements before this node matching the CSS selector and return
a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.

  # List tags of sibling elements before this node
  say $dom->preceding->map('tag')->join("\n");

=head2 preceding_nodes

  my $collection = $dom->preceding_nodes;

Return a L<DOM::Tiny::Collection> object containing all sibling nodes before
this node as L<DOM::Tiny> objects.

  # "A"
  $dom->parse('A<!-- B --><p>C</p>')->at('p')->preceding_nodes->first->content;

=head2 prepend

  $dom = $dom->prepend('<p>I ♥ DOM::Tiny!</p>');

Prepend HTML/XML fragment to this node.

  # "<div><h1>Test</h1><h2>123</h2></div>"
  $dom->parse('<div><h2>123</h2></div>')
    ->at('h2')->prepend('<h1>Test</h1>')->root;

  # "<p>Test 123</p>"
  $dom->parse('<p>123</p>')
    ->at('p')->child_nodes->first->prepend('Test ')->root;

=head2 prepend_content

  $dom = $dom->prepend_content('<p>I ♥ DOM::Tiny!</p>');

Prepend HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
node's content.

  # "<div><h2>Test123</h2></div>"
  $dom->parse('<div><h2>123</h2></div>')
    ->at('h2')->prepend_content('Test')->root;

  # "<!-- Test 123 --><br>"
  $dom->parse('<!-- 123 --><br>')
    ->child_nodes->first->prepend_content(' Test')->root;

  # "<p><i>123</i>Test</p>"
  $dom->parse('<p>Test</p>')->at('p')->prepend_content('<i>123</i>')->root;

=head2 previous

  my $sibling = $dom->previous;

Return L<DOM::Tiny> object for previous sibling element or C<undef> if there
are no more siblings.

  # "<h1>Test</h1>"
  $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h2')->previous;

=head2 previous_node

  my $sibling = $dom->previous_node;

Return L<DOM::Tiny> object for previous sibling node or C<undef> if there are
no more siblings.

  # "123"
  $dom->parse('<p>123<!-- Test --><b>456</b></p>')
    ->at('b')->previous_node->previous_node;

  # " Test "
  $dom->parse('<p>123<!-- Test --><b>456</b></p>')
    ->at('b')->previous_node->content;

=head2 remove

  my $parent = $dom->remove;

Remove this node and return L</"root"> (for C<root> nodes) or L</"parent">.

  # "<div></div>"
  $dom->parse('<div><h1>Test</h1></div>')->at('h1')->remove;

  # "<p><b>456</b></p>"
  $dom->parse('<p>123<b>456</b></p>')
    ->at('p')->child_nodes->first->remove->root;

=head2 replace

  my $parent = $dom->replace('<div>I ♥ DOM::Tiny!</div>');

Replace this node with HTML/XML fragment and return L</"root"> (for C<root>
nodes) or L</"parent">.

  # "<div><h2>123</h2></div>"
  $dom->parse('<div><h1>Test</h1></div>')->at('h1')->replace('<h2>123</h2>');

  # "<p><b>123</b></p>"
  $dom->parse('<p>Test</p>')
    ->at('p')->child_nodes->[0]->replace('<b>123</b>')->root;

=head2 root

  my $root = $dom->root;

Return L<DOM::Tiny> object for C<root> node.

=head2 strip

  my $parent = $dom->strip;

Remove this element while preserving its content and return L</"parent">.

  # "<div>Test</div>"
  $dom->parse('<div><h1>Test</h1></div>')->at('h1')->strip;

=head2 tag

  my $tag = $dom->tag;
  $dom    = $dom->tag('div');

This element's tag name.

  # List tag names of child elements
  say $dom->children->map('tag')->join("\n");

=head2 tap

  $dom = $dom->tap(sub {...});

Equivalent to L<Mojo::Base/"tap">.

=head2 text

  my $trimmed   = $dom->text;
  my $untrimmed = $dom->text(0);

Extract text content from this element only (not including child elements),
smart whitespace trimming is enabled by default.

  # "foo baz"
  $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text;

  # "foo\nbaz\n"
  $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text(0);

=head2 to_string

  my $str = $dom->to_string;

Render this node and its content to HTML/XML.

  # "<b>Test</b>"
  $dom->parse('<div><b>Test</b></div>')->at('div b')->to_string;

=head2 tree

  my $tree = $dom->tree;
  $dom     = $dom->tree(['root']);

Document Object Model. Note that this structure should only be used very
carefully since it is very dynamic.

=head2 type

  my $type = $dom->type;

This node's type, usually C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>,
C<root>, C<tag> or C<text>.

  # "cdata"
  $dom->parse('<![CDATA[Test]]>')->child_nodes->first->type;

  # "comment"
  $dom->parse('<!-- Test -->')->child_nodes->first->type;

  # "doctype"
  $dom->parse('<!DOCTYPE html>')->child_nodes->first->type;

  # "pi"
  $dom->parse('<?xml version="1.0"?>')->child_nodes->first->type;

  # "raw"
  $dom->parse('<title>Test</title>')->at('title')->child_nodes->first->type;

  # "root"
  $dom->parse('<p>Test</p>')->type;

  # "tag"
  $dom->parse('<p>Test</p>')->at('p')->type;

  # "text"
  $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->type;

=head2 val

  my $value = $dom->val;

Extract value from form element (such as C