X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=README.pod;fp=README.pod;h=2f6ea69c8e41d8472386324eeab8bf31905a6514;hb=d6512b506041e5f51cb53585efc6823ec5f3b109;hp=0000000000000000000000000000000000000000;hpb=a292be342745c12aa805e7aca023756ad824f5d4;p=catagits%2FDOM-Tiny.git diff --git a/README.pod b/README.pod new file mode 100644 index 0000000..2f6ea69 --- /dev/null +++ b/README.pod @@ -0,0 +1,696 @@ +=pod + +=encoding utf8 + +=head1 NAME + +DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors + +=head1 SYNOPSIS + + use DOM::Tiny; + + # Parse + my $dom = DOM::Tiny->new('

Test

123

'); + + # Find + say $dom->at('#b')->text; + say $dom->find('p')->map('text')->join("\n"); + say $dom->find('[id]')->map(attr => 'id')->join("\n"); + + # Iterate + $dom->find('p[id]')->reverse->each(sub { say $_->{id} }); + + # Loop + for my $e ($dom->find('p[id]')->each) { + say $e->{id}, ':', $e->text; + } + + # Modify + $dom->find('div p')->last->append('

456

'); + $dom->find(':not(p)')->map('strip'); + + # Render + say "$dom"; + +=head1 DESCRIPTION + +L is a minimalistic and relaxed HTML/XML DOM parser with CSS +selector support based on L. It will even try to interpret broken +HTML and XML, so you should not use it for validation. + +=head1 NODES AND ELEMENTS + +When we parse an HTML/XML fragment, it gets turned into a tree of nodes. + + + + Hello + World! + + +There are currently eight different kinds of nodes, C, C, +C, C, C, C, C and C. Elements are nodes of +the type C. + + root + |- doctype (html) + +- tag (html) + |- tag (head) + | +- tag (title) + | +- raw (Hello) + +- tag (body) + +- text (World!) + +While all node types are represented as L objects, some methods like +L and L only apply to elements. + +=head1 CASE-SENSITIVITY + +L defaults to HTML semantics, that means all tags and attribute +names are lowercased and selectors need to be lowercase as well. + + # HTML semantics + my $dom = DOM::Tiny->new('

Hi!

'); + say $dom->at('p[id]')->text; + +If XML processing instructions are found, the parser will automatically switch +into XML mode and everything becomes case-sensitive. + + # XML semantics + my $dom = DOM::Tiny->new('

Hi!

'); + say $dom->at('P[ID]')->text; + +XML detection can also be disabled with the L method. + + # Force XML semantics + my $dom = DOM::Tiny->new->xml(1)->parse('

Hi!

'); + say $dom->at('P[ID]')->text; + + # Force HTML semantics + my $dom = DOM::Tiny->new->xml(0)->parse('

Hi!

'); + say $dom->at('p[id]')->text; + +=head1 METHODS + +L implements the following methods. + +=head2 all_text + + my $trimmed = $dom->all_text; + my $untrimmed = $dom->all_text(0); + +Extract text content from all descendant nodes of this element, smart +whitespace trimming is enabled by default. + + # "foo bar baz" + $dom->parse("

foo\n

bar

baz\n

")->at('div')->all_text; + + # "foo\nbarbaz\n" + $dom->parse("

foo\n

bar

baz\n

")->at('div')->all_text(0); + +=head2 ancestors + + my $collection = $dom->ancestors; + my $collection = $dom->ancestors('div ~ p'); + +Find all ancestor elements of this node matching the CSS selector and return a +L object containing these elements as L +objects. All selectors from L are supported. + + # List tag names of ancestor elements + say $dom->ancestors->map('tag')->join("\n"); + +=head2 append + + $dom = $dom->append('

I ♥ DOM::Tiny!

'); + +Append HTML/XML fragment to this node. + + # "

Test

123

" + $dom->parse('

Test

') + ->at('h1')->append('

123

')->root; + + # "

Test 123

" + $dom->parse('

Test

')->at('p') + ->child_nodes->first->append(' 123')->root; + +=head2 append_content + + $dom = $dom->append_content('

I ♥ DOM::Tiny!

'); + +Append HTML/XML fragment (for C and C nodes) or raw content to this +node's content. + + # "

Test123

" + $dom->parse('

Test

') + ->at('h1')->append_content('123')->root; + + # "
" + $dom->parse('
') + ->child_nodes->first->append_content('123 ')->root; + + # "

Test123

" + $dom->parse('

Test

')->at('p')->append_content('123')->root; + +=head2 at + + my $result = $dom->at('div ~ p'); + +Find first descendant element of this element matching the CSS selector and +return it as a L object or return C if none could be found. +All selectors from L are supported. + + # Find first element with "svg" namespace definition + my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'}; + +=head2 attr + + my $hash = $dom->attr; + my $foo = $dom->attr('foo'); + $dom = $dom->attr({foo => 'bar'}); + $dom = $dom->attr(foo => 'bar'); + +This element's attributes. + + # Remove an attribute + delete $dom->attr->{id}; + + # Attribute without value + $dom->attr(selected => undef); + + # List id attributes + say $dom->find('*')->map(attr => 'id')->compact->join("\n"); + +=head2 child_nodes + + my $collection = $dom->child_nodes; + +Return a L object containing all child nodes of this +element as L objects. + + # "

123

" + $dom->parse('

Test123

')->at('p')->child_nodes->first->remove; + + # "" + $dom->parse('123')->child_nodes->first; + + # " Test " + $dom->parse('123')->child_nodes->last->content; + +=head2 children + + my $collection = $dom->children; + my $collection = $dom->children('div ~ p'); + +Find all child elements of this element matching the CSS selector and return a +L object containing these elements as L +objects. All selectors from L are supported. + + # Show tag name of random child element + say $dom->children->shuffle->first->tag; + +=head2 content + + my $str = $dom->content; + $dom = $dom->content('

I ♥ DOM::Tiny!

'); + +Return this node's content or replace it with HTML/XML fragment (for C +and C nodes) or raw content. + + # "Test" + $dom->parse('

Test

')->at('div')->content; + + # "

123

" + $dom->parse('

Test

')->at('h1')->content('123')->root; + + # "

123

" + $dom->parse('

Test

')->at('p')->content('123')->root; + + # "

" + $dom->parse('

Test

')->at('h1')->content('')->root; + + # " Test " + $dom->parse('
')->child_nodes->first->content; + + # "

456

" + $dom->parse('

456

') + ->at('div')->child_nodes->first->content(' 123 ')->root; + +=head2 descendant_nodes + + my $collection = $dom->descendant_nodes; + +Return a L object containing all descendant nodes of +this element as L objects. + + # "

123

" + $dom->parse('

123

') + ->descendant_nodes->grep(sub { $_->type eq 'comment' }) + ->map('remove')->first; + + # "

testtest

" + $dom->parse('

123456

') + ->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' }) + ->map(content => 'test')->first->root; + +=head2 find + + my $collection = $dom->find('div ~ p'); + +Find all descendant elements of this element matching the CSS selector and +return a L object containing these elements as +L objects. All selectors from L are +supported. + + # Find a specific element and extract information + my $id = $dom->find('div')->[23]{id}; + + # Extract information from multiple elements + my @headers = $dom->find('h1, h2, h3')->map('text')->each; + + # Count all the different tags + my $hash = $dom->find('*')->reduce(sub { $a->{$b->tag}++; $a }, {}); + + # Find elements with a class that contains dots + my @divs = $dom->find('div.foo\.bar')->each; + +=head2 following + + my $collection = $dom->following; + my $collection = $dom->following('div ~ p'); + +Find all sibling elements after this node matching the CSS selector and return +a L object containing these elements as L +objects. All selectors from L are supported. + + # List tags of sibling elements after this node + say $dom->following->map('tag')->join("\n"); + +=head2 following_nodes + + my $collection = $dom->following_nodes; + +Return a L object containing all sibling nodes after +this node as L objects. + + # "C" + $dom->parse('

C')->at('p')->following_nodes->last->content; + +=head2 matches + + my $bool = $dom->matches('div ~ p'); + +Check if this element matches the CSS selector. All selectors from +L are supported. + + # True + $dom->parse('

')->at('p')->matches('.a'); + $dom->parse('

')->at('p')->matches('p[class]'); + + # False + $dom->parse('

')->at('p')->matches('.b'); + $dom->parse('

')->at('p')->matches('p[id]'); + +=head2 namespace + + my $namespace = $dom->namespace; + +Find this element's namespace or return C if none could be found. + + # Find namespace for an element with namespace prefix + my $namespace = $dom->at('svg > svg\:circle')->namespace; + + # Find namespace for an element that may or may not have a namespace prefix + my $namespace = $dom->at('svg > circle')->namespace; + +=head2 new + + my $dom = DOM::Tiny->new; + my $dom = DOM::Tiny->new('I ♥ DOM::Tiny!'); + +Construct a new scalar-based L object and L HTML/XML +fragment if necessary. + +=head2 next + + my $sibling = $dom->next; + +Return L object for next sibling element or C if there are no +more siblings. + + # "

123

" + $dom->parse('

Test

123

')->at('h1')->next; + +=head2 next_node + + my $sibling = $dom->next_node; + +Return L object for next sibling node or C if there are no +more siblings. + + # "456" + $dom->parse('

123456

') + ->at('b')->next_node->next_node; + + # " Test " + $dom->parse('

123456

') + ->at('b')->next_node->content; + +=head2 parent + + my $parent = $dom->parent; + +Return L object for parent of this node or C if this node has +no parent. + +=head2 parse + + $dom = $dom->parse('I ♥ DOM::Tiny!'); + +Parse HTML/XML fragment with L. + + # Parse XML + my $dom = DOM::Tiny->new->xml(1)->parse($xml); + +=head2 preceding + + my $collection = $dom->preceding; + my $collection = $dom->preceding('div ~ p'); + +Find all sibling elements before this node matching the CSS selector and return +a L object containing these elements as L +objects. All selectors from L are supported. + + # List tags of sibling elements before this node + say $dom->preceding->map('tag')->join("\n"); + +=head2 preceding_nodes + + my $collection = $dom->preceding_nodes; + +Return a L object containing all sibling nodes before +this node as L objects. + + # "A" + $dom->parse('A

')->at('p')->preceding_nodes->first->content; + +=head2 prepend + + $dom = $dom->prepend('

I ♥ DOM::Tiny!

'); + +Prepend HTML/XML fragment to this node. + + # "

Test

123

" + $dom->parse('

123

') + ->at('h2')->prepend('

Test

')->root; + + # "

Test 123

" + $dom->parse('

123

') + ->at('p')->child_nodes->first->prepend('Test ')->root; + +=head2 prepend_content + + $dom = $dom->prepend_content('

I ♥ DOM::Tiny!

'); + +Prepend HTML/XML fragment (for C and C nodes) or raw content to this +node's content. + + # "

Test123

" + $dom->parse('

123

') + ->at('h2')->prepend_content('Test')->root; + + # "
" + $dom->parse('
') + ->child_nodes->first->prepend_content(' Test')->root; + + # "

123Test

" + $dom->parse('

Test

')->at('p')->prepend_content('123')->root; + +=head2 previous + + my $sibling = $dom->previous; + +Return L object for previous sibling element or C if there +are no more siblings. + + # "

Test

" + $dom->parse('

Test

123

')->at('h2')->previous; + +=head2 previous_node + + my $sibling = $dom->previous_node; + +Return L object for previous sibling node or C if there are +no more siblings. + + # "123" + $dom->parse('

123456

') + ->at('b')->previous_node->previous_node; + + # " Test " + $dom->parse('

123456

') + ->at('b')->previous_node->content; + +=head2 remove + + my $parent = $dom->remove; + +Remove this node and return L (for C nodes) or L. + + # "

" + $dom->parse('

Test

')->at('h1')->remove; + + # "

456

" + $dom->parse('

123456

') + ->at('p')->child_nodes->first->remove->root; + +=head2 replace + + my $parent = $dom->replace('

I ♥ DOM::Tiny!

'); + +Replace this node with HTML/XML fragment and return L (for C +nodes) or L. + + # "

123

" + $dom->parse('

Test

')->at('h1')->replace('

123

'); + + # "

123

" + $dom->parse('

Test

') + ->at('p')->child_nodes->[0]->replace('123')->root; + +=head2 root + + my $root = $dom->root; + +Return L object for C node. + +=head2 strip + + my $parent = $dom->strip; + +Remove this element while preserving its content and return L. + + # "

Test

" + $dom->parse('

Test

')->at('h1')->strip; + +=head2 tag + + my $tag = $dom->tag; + $dom = $dom->tag('div'); + +This element's tag name. + + # List tag names of child elements + say $dom->children->map('tag')->join("\n"); + +=head2 tap + + $dom = $dom->tap(sub {...}); + +Alias for L. + +=head2 text + + my $trimmed = $dom->text; + my $untrimmed = $dom->text(0); + +Extract text content from this element only (not including child elements), +smart whitespace trimming is enabled by default. + + # "foo baz" + $dom->parse("

foo\n

bar

baz\n

")->at('div')->text; + + # "foo\nbaz\n" + $dom->parse("

foo\n

bar

baz\n

")->at('div')->text(0); + +=head2 to_string + + my $str = $dom->to_string; + +Render this node and its content to HTML/XML. + + # "Test" + $dom->parse('

Test

')->at('div b')->to_string; + +=head2 tree + + my $tree = $dom->tree; + $dom = $dom->tree(['root']); + +Document Object Model. Note that this structure should only be used very +carefully since it is very dynamic. + +=head2 type + + my $type = $dom->type; + +This node's type, usually C, C, C, C, C, +C, C or C. + + # "cdata" + $dom->parse('')->child_nodes->first->type; + + # "comment" + $dom->parse('')->child_nodes->first->type; + + # "doctype" + $dom->parse('')->child_nodes->first->type; + + # "pi" + $dom->parse('')->child_nodes->first->type; + + # "raw" + $dom->parse('Test')->at('title')->child_nodes->first->type; + + # "root" + $dom->parse('

Test

')->type; + + # "tag" + $dom->parse('

Test

')->at('p')->type; + + # "text" + $dom->parse('

Test

')->at('p')->child_nodes->first->type; + +=head2 val + + my $value = $dom->val; + +Extract value from form element (such as C