=pod
=encoding utf8
=head1 NAME
DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors
=head1 SYNOPSIS
use DOM::Tiny;
# Parse
my $dom = DOM::Tiny->new('<div><p id="a">Test</p><p id="b">123</p></div>');
# Find
say $dom->at('#b')->text;
say $dom->find('p')->map('text')->join("\n");
say $dom->find('[id]')->map(attr => 'id')->join("\n");
# Iterate
$dom->find('p[id]')->reverse->each(sub { say $_->{id} });
# Loop
for my $e ($dom->find('p[id]')->each) {
say $e->{id}, ':', $e->text;
}
# Modify
$dom->find('div p')->last->append('<p id="c">456</p>');
$dom->find(':not(p)')->map('strip');
# Render
say "$dom";
=head1 DESCRIPTION
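L<DOM::Tiny> is a minimalistic and relaxed pure-perl HTML/XML DOM parser with
support for the L<HTML Living Standard|https://html.spec.whatwg.org/> and
L<Extensible Markup Language (XML) 1.0|http://www.w3.org/TR/xml/> based on
L<Mojo::DOM>. It will even try to interpret broken HTML and XML, so you should
not use it for validation.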
=head1 NODES AND ELEMENTS
When we parse an HTML/XML fragment, it gets turned into a tree of nodes.
  <!DOCTYPE html>
  <html>
    <head><title>Hello</title></head>
    <body>World!</body>
  </html>
There are currently eight different kinds of nodes, C<cdata>, C<comment>,
C<doctype>, C<pi>, C<raw>, C<root>, C<tag> and C<text>. Elements are nodes of
the type C<tag>.
  root
  |- doctype (html)
  +- tag (html)
     |- tag (head)
     |  +- tag (title)
     |     +- raw (Hello)
     +- tag (body)
        +- text (World!)
While all node types are represented as L<DOM::Tiny> objects, some methods like
L</"attr"> and L</"namespace"> only apply to elements.
=head1 CASE-SENSITIVITY
L<DOM::Tiny> defaults to HTML semantics, which means all tag and attribute
names are lowercased and selectors need to be lowercase as well.
# HTML semantics
my $dom = DOM::Tiny->new('<P ID="greeting">Hi!</P>');
say $dom->at('p[id]')->text;
If XML processing instructions are found, the parser will automatically switch
into XML mode and everything becomes case-sensitive.
# XML semantics
my $dom = DOM::Tiny->new('<?xml version="1.0"?><P ID="greeting">Hi!</P>');
say $dom->at('P[ID]')->text;
XML detection can also be disabled with the L</"xml"> method.
# Force XML semantics
my $dom = DOM::Tiny->new->xml(1)->parse('<P ID="greeting">Hi!</P>');
say $dom->at('P[ID]')->text;
# Force HTML semantics
my $dom = DOM::Tiny->new->xml(0)->parse('<P ID="greeting">Hi!</P>');
say $dom->at('p[id]')->text;
=head1 METHODS
L<DOM::Tiny> implements the following methods.
=head2 new
my $dom = DOM::Tiny->new;
my $dom = DOM::Tiny->new('I ♥ DOM::Tiny!');
Construct a new scalar-based L<DOM::Tiny> object and L</"parse"> HTML/XML
fragment if necessary.
=head2 all_text
my $trimmed = $dom->all_text;
my $untrimmed = $dom->all_text(0);
Extract text content from all descendant nodes of this element, smart
whitespace trimming is enabled by default.
# "foo bar baz"
$dom->parse("
")->at('div')->all_text(0);
=head2 ancestors
my $collection = $dom->ancestors;
my $collection = $dom->ancestors('div ~ p');
Find all ancestor elements of this node matching the CSS selector and return a
L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
# List tag names of ancestor elements
say $dom->ancestors->map('tag')->join("\n");
=head2 append
$dom = $dom->append('
')->at('p')->append_content('123')->root;
=head2 at
my $result = $dom->at('div ~ p');
Find first descendant element of this element matching the CSS selector and
return it as a L<DOM::Tiny> object or return C<undef> if none could be found.
All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
# Find first element with "svg" namespace definition
my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'};
=head2 attr
my $hash = $dom->attr;
my $foo = $dom->attr('foo');
$dom = $dom->attr({foo => 'bar'});
$dom = $dom->attr(foo => 'bar');
This element's attributes.
# Remove an attribute
delete $dom->attr->{id};
# Attribute without value
$dom->attr(selected => undef);
# List id attributes
say $dom->find('*')->map(attr => 'id')->compact->join("\n");
=head2 child_nodes
my $collection = $dom->child_nodes;
Return a L<DOM::Tiny::Collection> object containing all child nodes of this
element as L<DOM::Tiny> objects.
# "
123
"
$dom->parse('
Test123
')->at('p')->child_nodes->first->remove;
# ""
$dom->parse('123')->child_nodes->first;
# " Test "
$dom->parse('123')->child_nodes->last->content;
=head2 children
my $collection = $dom->children;
my $collection = $dom->children('div ~ p');
Find all child elements of this element matching the CSS selector and return a
L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
# Show tag name of random child element
say $dom->children->shuffle->first->tag;
=head2 content
my $str = $dom->content;
$dom = $dom->content('
I ♥ DOM::Tiny!
');
Return this node's content or replace it with HTML/XML fragment (for C<root>
and C<tag> nodes) or raw content.
# "Test"
$dom->parse('
Test
')->at('div')->content;
# "
123
"
$dom->parse('
Test
')->at('h1')->content('123')->root;
# "
123
"
$dom->parse('
Test
')->at('p')->content('123')->root;
# "
"
$dom->parse('
Test
')->at('h1')->content('')->root;
# " Test "
$dom->parse(' ')->child_nodes->first->content;
# "
456
"
$dom->parse('
456
')
->at('div')->child_nodes->first->content(' 123 ')->root;
=head2 descendant_nodes
my $collection = $dom->descendant_nodes;
Return a L<DOM::Tiny::Collection> object containing all descendant nodes of
this element as L<DOM::Tiny> objects.
# "
')
->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' })
->map(content => 'test')->first->root;
=head2 find
my $collection = $dom->find('div ~ p');
Find all descendant elements of this element matching the CSS selector and
return a L<DOM::Tiny::Collection> object containing these elements as
L<DOM::Tiny> objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are
supported.
# Find a specific element and extract information
my $id = $dom->find('div')->[23]{id};
# Extract information from multiple elements
my @headers = $dom->find('h1, h2, h3')->map('text')->each;
# Count all the different tags
my $hash = $dom->find('*')->reduce(sub { $a->{$b->tag}++; $a }, {});
# Find elements with a class that contains dots
my @divs = $dom->find('div.foo\.bar')->each;
=head2 following
my $collection = $dom->following;
my $collection = $dom->following('div ~ p');
Find all sibling elements after this node matching the CSS selector and return
a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
# List tags of sibling elements after this node
say $dom->following->map('tag')->join("\n");
=head2 following_nodes
my $collection = $dom->following_nodes;
Return a L<DOM::Tiny::Collection> object containing all sibling nodes after
this node as L<DOM::Tiny> objects.
# "C"
$dom->parse('
A
C')->at('p')->following_nodes->last->content;
=head2 matches
my $bool = $dom->matches('div ~ p');
Check if this element matches the CSS selector. All selectors from
L<DOM::Tiny::CSS/"SELECTORS"> are supported.
# True
$dom->parse('
')->at('p')->matches('p[id]');
=head2 namespace
my $namespace = $dom->namespace;
Find this element's namespace or return C<undef> if none could be found.
# Find namespace for an element with namespace prefix
my $namespace = $dom->at('svg > svg\:circle')->namespace;
# Find namespace for an element that may or may not have a namespace prefix
my $namespace = $dom->at('svg > circle')->namespace;
=head2 next
my $sibling = $dom->next;
Return L<DOM::Tiny> object for next sibling element or C<undef> if there are no
more siblings.
# "
123
"
$dom->parse('
Test
123
')->at('h1')->next;
=head2 next_node
my $sibling = $dom->next_node;
Return L<DOM::Tiny> object for next sibling node or C<undef> if there are no
more siblings.
# "456"
$dom->parse('
123456
')
->at('b')->next_node->next_node;
# " Test "
$dom->parse('
123456
')
->at('b')->next_node->content;
=head2 parent
my $parent = $dom->parent;
Return L<DOM::Tiny> object for parent of this node or C<undef> if this node has
no parent.
=head2 parse
$dom = $dom->parse('I ♥ DOM::Tiny!');
Parse HTML/XML fragment with L<DOM::Tiny::HTML>.
# Parse XML
my $dom = DOM::Tiny->new->xml(1)->parse($xml);
=head2 preceding
my $collection = $dom->preceding;
my $collection = $dom->preceding('div ~ p');
Find all sibling elements before this node matching the CSS selector and return
a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
# List tags of sibling elements before this node
say $dom->preceding->map('tag')->join("\n");
=head2 preceding_nodes
my $collection = $dom->preceding_nodes;
Return a L<DOM::Tiny::Collection> object containing all sibling nodes before
this node as L<DOM::Tiny> objects.
# "A"
$dom->parse('A
')->at('p')->prepend_content('123')->root;
=head2 previous
my $sibling = $dom->previous;
Return L<DOM::Tiny> object for previous sibling element or C<undef> if there
are no more siblings.
# "
Test
"
$dom->parse('
Test
123
')->at('h2')->previous;
=head2 previous_node
my $sibling = $dom->previous_node;
Return L<DOM::Tiny> object for previous sibling node or C<undef> if there are
no more siblings.
# "123"
$dom->parse('
123456
')
->at('b')->previous_node->previous_node;
# " Test "
$dom->parse('
123456
')
->at('b')->previous_node->content;
=head2 remove
my $parent = $dom->remove;
Remove this node and return L</"root"> (for C<root> nodes) or L</"parent">.
# ""
$dom->parse('
Test
')->at('h1')->remove;
# "
456
"
$dom->parse('
123456
')
->at('p')->child_nodes->first->remove->root;
=head2 replace
my $parent = $dom->replace('
I ♥ DOM::Tiny!
');
Replace this node with HTML/XML fragment and return L</"root"> (for C<root>
nodes) or L</"parent">.
# "
123
"
$dom->parse('
Test
')->at('h1')->replace('
123
');
# "
123
"
$dom->parse('
Test
')
->at('p')->child_nodes->[0]->replace('123')->root;
=head2 root
my $root = $dom->root;
Return L<DOM::Tiny> object for C<root> node.
=head2 strip
my $parent = $dom->strip;
Remove this element while preserving its content and return L</"parent">.
# "
Test
"
$dom->parse('
Test
')->at('h1')->strip;
=head2 tag
my $tag = $dom->tag;
$dom = $dom->tag('div');
This element's tag name.
# List tag names of child elements
say $dom->children->map('tag')->join("\n");
=head2 tap
$dom = $dom->tap(sub {...});
Equivalent to L<Mojo::Base/"tap">.
=head2 text
my $trimmed = $dom->text;
my $untrimmed = $dom->text(0);
Extract text content from this element only (not including child elements),
smart whitespace trimming is enabled by default.
# "foo baz"
$dom->parse("
foo\n
bar
baz\n
")->at('div')->text;
# "foo\nbaz\n"
$dom->parse("
foo\n
bar
baz\n
")->at('div')->text(0);
=head2 to_string
my $str = $dom->to_string;
Render this node and its content to HTML/XML.
# "Test"
$dom->parse('
Test
')->at('div b')->to_string;
=head2 tree
my $tree = $dom->tree;
$dom = $dom->tree(['root']);
Document Object Model. Note that this structure should only be used very
carefully since it is very dynamic.
=head2 type
my $type = $dom->type;
This node's type, usually C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>,
C<root>, C<tag> or C<text>.
# "cdata"
$dom->parse('<![CDATA[Test]]>')->child_nodes->first->type;
# "comment"
$dom->parse('<!-- Test -->')->child_nodes->first->type;
# "doctype"
$dom->parse('<!DOCTYPE html>')->child_nodes->first->type;
# "pi"
$dom->parse('<?xml version="1.0"?>')->child_nodes->first->type;
# "raw"
$dom->parse('<title>Test</title>')->at('title')->child_nodes->first->type;
# "root"
$dom->parse('<p>Test</p>')->type;
# "tag"
$dom->parse('<p>Test</p>')->at('p')->type;
# "text"
$dom->parse('<p>Test</p>')->at('p')->child_nodes->first->type;
=head2 val
my $value = $dom->val;
Extract value from form element (such as C