better description for tap methods
[catagits/DOM-Tiny.git] / README.pod
CommitLineData
d6512b50 1=pod
2
3=encoding utf8
4
5=head1 NAME
6
7DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors
8
9=head1 SYNOPSIS
10
11 use DOM::Tiny;
12
13 # Parse
14 my $dom = DOM::Tiny->new('<div><p id="a">Test</p><p id="b">123</p></div>');
15
16 # Find
17 say $dom->at('#b')->text;
18 say $dom->find('p')->map('text')->join("\n");
19 say $dom->find('[id]')->map(attr => 'id')->join("\n");
20
21 # Iterate
22 $dom->find('p[id]')->reverse->each(sub { say $_->{id} });
23
24 # Loop
25 for my $e ($dom->find('p[id]')->each) {
26 say $e->{id}, ':', $e->text;
27 }
28
29 # Modify
30 $dom->find('div p')->last->append('<p id="c">456</p>');
31 $dom->find(':not(p)')->map('strip');
32
33 # Render
34 say "$dom";
35
36=head1 DESCRIPTION
37
38L<DOM::Tiny> is a minimalistic and relaxed HTML/XML DOM parser with CSS
39selector support based on L<Mojo::DOM>. It will even try to interpret broken
40HTML and XML, so you should not use it for validation.
41
42=head1 NODES AND ELEMENTS
43
44When we parse an HTML/XML fragment, it gets turned into a tree of nodes.
45
46 <!DOCTYPE html>
47 <html>
48 <head><title>Hello</title></head>
49 <body>World!</body>
50 </html>
51
52There are currently eight different kinds of nodes, C<cdata>, C<comment>,
53C<doctype>, C<pi>, C<raw>, C<root>, C<tag> and C<text>. Elements are nodes of
54the type C<tag>.
55
56 root
57 |- doctype (html)
58 +- tag (html)
59    |- tag (head)
60    |  +- tag (title)
61    |     +- raw (Hello)
62    +- tag (body)
63       +- text (World!)
64
65While all node types are represented as L<DOM::Tiny> objects, some methods like
66L</"attr"> and L</"namespace"> only apply to elements.
67
68=head1 CASE-SENSITIVITY
69
70L<DOM::Tiny> defaults to HTML semantics, which means all tags and attribute
71names are lowercased and selectors need to be lowercase as well.
72
73 # HTML semantics
74 my $dom = DOM::Tiny->new('<P ID="greeting">Hi!</P>');
75 say $dom->at('p[id]')->text;
76
77If XML processing instructions are found, the parser will automatically switch
78into XML mode and everything becomes case-sensitive.
79
80 # XML semantics
81 my $dom = DOM::Tiny->new('<?xml version="1.0"?><P ID="greeting">Hi!</P>');
82 say $dom->at('P[ID]')->text;
83
84XML detection can also be disabled with the L</"xml"> method.
85
86 # Force XML semantics
87 my $dom = DOM::Tiny->new->xml(1)->parse('<P ID="greeting">Hi!</P>');
88 say $dom->at('P[ID]')->text;
89
90 # Force HTML semantics
91 my $dom = DOM::Tiny->new->xml(0)->parse('<P ID="greeting">Hi!</P>');
92 say $dom->at('p[id]')->text;
93
94=head1 METHODS
95
96L<DOM::Tiny> implements the following methods.
97
3793c28f 98=head2 new
99
100 my $dom = DOM::Tiny->new;
101 my $dom = DOM::Tiny->new('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
102
103Construct a new scalar-based L<DOM::Tiny> object and L</"parse"> HTML/XML
104fragment if necessary.
105
d6512b50 106=head2 all_text
107
108 my $trimmed = $dom->all_text;
109 my $untrimmed = $dom->all_text(0);
110
111Extract text content from all descendant nodes of this element. Smart
112whitespace trimming is enabled by default.
113
114 # "foo bar baz"
115 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->all_text;
116
117 # "foo\nbarbaz\n"
118 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->all_text(0);
119
120=head2 ancestors
121
122 my $collection = $dom->ancestors;
123 my $collection = $dom->ancestors('div ~ p');
124
125Find all ancestor elements of this node matching the CSS selector and return a
126L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
127objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
128
129 # List tag names of ancestor elements
130 say $dom->ancestors->map('tag')->join("\n");
131
132=head2 append
133
134 $dom = $dom->append('<p>I ♥ DOM::Tiny!</p>');
135
136Append HTML/XML fragment to this node.
137
138 # "<div><h1>Test</h1><h2>123</h2></div>"
139 $dom->parse('<div><h1>Test</h1></div>')
140 ->at('h1')->append('<h2>123</h2>')->root;
141
142 # "<p>Test 123</p>"
143 $dom->parse('<p>Test</p>')->at('p')
144 ->child_nodes->first->append(' 123')->root;
145
146=head2 append_content
147
148 $dom = $dom->append_content('<p>I ♥ DOM::Tiny!</p>');
149
150Append HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
151node's content.
152
153 # "<div><h1>Test123</h1></div>"
154 $dom->parse('<div><h1>Test</h1></div>')
155 ->at('h1')->append_content('123')->root;
156
157 # "<!-- Test 123 --><br>"
158 $dom->parse('<!-- Test --><br>')
159 ->child_nodes->first->append_content('123 ')->root;
160
161 # "<p>Test<i>123</i></p>"
162 $dom->parse('<p>Test</p>')->at('p')->append_content('<i>123</i>')->root;
163
164=head2 at
165
166 my $result = $dom->at('div ~ p');
167
168Find first descendant element of this element matching the CSS selector and
169return it as a L<DOM::Tiny> object or return C<undef> if none could be found.
170All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
171
172 # Find first element with "svg" namespace definition
173 my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'};
174
175=head2 attr
176
177 my $hash = $dom->attr;
178 my $foo = $dom->attr('foo');
179 $dom = $dom->attr({foo => 'bar'});
180 $dom = $dom->attr(foo => 'bar');
181
182This element's attributes.
183
184 # Remove an attribute
185 delete $dom->attr->{id};
186
187 # Attribute without value
188 $dom->attr(selected => undef);
189
190 # List id attributes
191 say $dom->find('*')->map(attr => 'id')->compact->join("\n");
192
193=head2 child_nodes
194
195 my $collection = $dom->child_nodes;
196
197Return a L<DOM::Tiny::Collection> object containing all child nodes of this
198element as L<DOM::Tiny> objects.
199
200 # "<p><b>123</b></p>"
201 $dom->parse('<p>Test<b>123</b></p>')->at('p')->child_nodes->first->remove;
202
203 # "<!DOCTYPE html>"
204 $dom->parse('<!DOCTYPE html><b>123</b>')->child_nodes->first;
205
206 # " Test "
207 $dom->parse('<b>123</b><!-- Test -->')->child_nodes->last->content;
208
209=head2 children
210
211 my $collection = $dom->children;
212 my $collection = $dom->children('div ~ p');
213
214Find all child elements of this element matching the CSS selector and return a
215L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
216objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
217
218 # Show tag name of random child element
219 say $dom->children->shuffle->first->tag;
220
221=head2 content
222
223 my $str = $dom->content;
224 $dom = $dom->content('<p>I ♥ DOM::Tiny!</p>');
225
226Return this node's content or replace it with HTML/XML fragment (for C<root>
227and C<tag> nodes) or raw content.
228
229 # "<b>Test</b>"
230 $dom->parse('<div><b>Test</b></div>')->at('div')->content;
231
232 # "<div><h1>123</h1></div>"
233 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('123')->root;
234
235 # "<p><i>123</i></p>"
236 $dom->parse('<p>Test</p>')->at('p')->content('<i>123</i>')->root;
237
238 # "<div><h1></h1></div>"
239 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('')->root;
240
241 # " Test "
242 $dom->parse('<!-- Test --><br>')->child_nodes->first->content;
243
244 # "<div><!-- 123 -->456</div>"
245 $dom->parse('<div><!-- Test -->456</div>')
246 ->at('div')->child_nodes->first->content(' 123 ')->root;
247
248=head2 descendant_nodes
249
250 my $collection = $dom->descendant_nodes;
251
252Return a L<DOM::Tiny::Collection> object containing all descendant nodes of
253this element as L<DOM::Tiny> objects.
254
255 # "<p><b>123</b></p>"
256 $dom->parse('<p><!-- Test --><b>123<!-- 456 --></b></p>')
257 ->descendant_nodes->grep(sub { $_->type eq 'comment' })
258 ->map('remove')->first;
259
260 # "<p><b>test</b>test</p>"
261 $dom->parse('<p><b>123</b>456</p>')
262 ->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' })
263 ->map(content => 'test')->first->root;
264
265=head2 find
266
267 my $collection = $dom->find('div ~ p');
268
269Find all descendant elements of this element matching the CSS selector and
270return a L<DOM::Tiny::Collection> object containing these elements as
271L<DOM::Tiny> objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are
272supported.
273
274 # Find a specific element and extract information
275 my $id = $dom->find('div')->[23]{id};
276
277 # Extract information from multiple elements
278 my @headers = $dom->find('h1, h2, h3')->map('text')->each;
279
280 # Count all the different tags
281 my $hash = $dom->find('*')->reduce(sub { $a->{$b->tag}++; $a }, {});
282
283 # Find elements with a class that contains dots
284 my @divs = $dom->find('div.foo\.bar')->each;
285
286=head2 following
287
288 my $collection = $dom->following;
289 my $collection = $dom->following('div ~ p');
290
291Find all sibling elements after this node matching the CSS selector and return
292a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
293objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
294
295 # List tags of sibling elements after this node
296 say $dom->following->map('tag')->join("\n");
297
298=head2 following_nodes
299
300 my $collection = $dom->following_nodes;
301
302Return a L<DOM::Tiny::Collection> object containing all sibling nodes after
303this node as L<DOM::Tiny> objects.
304
305 # "C"
306 $dom->parse('<p>A</p><!-- B -->C')->at('p')->following_nodes->last->content;
307
308=head2 matches
309
310 my $bool = $dom->matches('div ~ p');
311
312Check if this element matches the CSS selector. All selectors from
313L<DOM::Tiny::CSS/"SELECTORS"> are supported.
314
315 # True
316 $dom->parse('<p class="a">A</p>')->at('p')->matches('.a');
317 $dom->parse('<p class="a">A</p>')->at('p')->matches('p[class]');
318
319 # False
320 $dom->parse('<p class="a">A</p>')->at('p')->matches('.b');
321 $dom->parse('<p class="a">A</p>')->at('p')->matches('p[id]');
322
323=head2 namespace
324
325 my $namespace = $dom->namespace;
326
327Find this element's namespace or return C<undef> if none could be found.
328
329 # Find namespace for an element with namespace prefix
330 my $namespace = $dom->at('svg > svg\:circle')->namespace;
331
332 # Find namespace for an element that may or may not have a namespace prefix
333 my $namespace = $dom->at('svg > circle')->namespace;
334
d6512b50 335=head2 next
336
337 my $sibling = $dom->next;
338
339Return L<DOM::Tiny> object for next sibling element or C<undef> if there are no
340more siblings.
341
342 # "<h2>123</h2>"
343 $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h1')->next;
344
345=head2 next_node
346
347 my $sibling = $dom->next_node;
348
349Return L<DOM::Tiny> object for next sibling node or C<undef> if there are no
350more siblings.
351
352 # "456"
353 $dom->parse('<p><b>123</b><!-- Test -->456</p>')
354 ->at('b')->next_node->next_node;
355
356 # " Test "
357 $dom->parse('<p><b>123</b><!-- Test -->456</p>')
358 ->at('b')->next_node->content;
359
360=head2 parent
361
362 my $parent = $dom->parent;
363
364Return L<DOM::Tiny> object for parent of this node or C<undef> if this node has
365no parent.
366
367=head2 parse
368
369 $dom = $dom->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
370
371Parse HTML/XML fragment with L<DOM::Tiny::HTML>.
372
373 # Parse XML
374 my $dom = DOM::Tiny->new->xml(1)->parse($xml);
375
376=head2 preceding
377
378 my $collection = $dom->preceding;
379 my $collection = $dom->preceding('div ~ p');
380
381Find all sibling elements before this node matching the CSS selector and return
382a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
383objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
384
385 # List tags of sibling elements before this node
386 say $dom->preceding->map('tag')->join("\n");
387
388=head2 preceding_nodes
389
390 my $collection = $dom->preceding_nodes;
391
392Return a L<DOM::Tiny::Collection> object containing all sibling nodes before
393this node as L<DOM::Tiny> objects.
394
395 # "A"
396 $dom->parse('A<!-- B --><p>C</p>')->at('p')->preceding_nodes->first->content;
397
398=head2 prepend
399
400 $dom = $dom->prepend('<p>I ♥ DOM::Tiny!</p>');
401
402Prepend HTML/XML fragment to this node.
403
404 # "<div><h1>Test</h1><h2>123</h2></div>"
405 $dom->parse('<div><h2>123</h2></div>')
406 ->at('h2')->prepend('<h1>Test</h1>')->root;
407
408 # "<p>Test 123</p>"
409 $dom->parse('<p>123</p>')
410 ->at('p')->child_nodes->first->prepend('Test ')->root;
411
412=head2 prepend_content
413
414 $dom = $dom->prepend_content('<p>I ♥ DOM::Tiny!</p>');
415
416Prepend HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
417node's content.
418
419 # "<div><h2>Test123</h2></div>"
420 $dom->parse('<div><h2>123</h2></div>')
421 ->at('h2')->prepend_content('Test')->root;
422
423 # "<!-- Test 123 --><br>"
424 $dom->parse('<!-- 123 --><br>')
425 ->child_nodes->first->prepend_content(' Test')->root;
426
427 # "<p><i>123</i>Test</p>"
428 $dom->parse('<p>Test</p>')->at('p')->prepend_content('<i>123</i>')->root;
429
430=head2 previous
431
432 my $sibling = $dom->previous;
433
434Return L<DOM::Tiny> object for previous sibling element or C<undef> if there
435are no more siblings.
436
437 # "<h1>Test</h1>"
438 $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h2')->previous;
439
440=head2 previous_node
441
442 my $sibling = $dom->previous_node;
443
444Return L<DOM::Tiny> object for previous sibling node or C<undef> if there are
445no more siblings.
446
447 # "123"
448 $dom->parse('<p>123<!-- Test --><b>456</b></p>')
449 ->at('b')->previous_node->previous_node;
450
451 # " Test "
452 $dom->parse('<p>123<!-- Test --><b>456</b></p>')
453 ->at('b')->previous_node->content;
454
455=head2 remove
456
457 my $parent = $dom->remove;
458
459Remove this node and return L</"root"> (for C<root> nodes) or L</"parent">.
460
461 # "<div></div>"
462 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->remove;
463
464 # "<p><b>456</b></p>"
465 $dom->parse('<p>123<b>456</b></p>')
466 ->at('p')->child_nodes->first->remove->root;
467
468=head2 replace
469
470 my $parent = $dom->replace('<div>I ♥ DOM::Tiny!</div>');
471
472Replace this node with HTML/XML fragment and return L</"root"> (for C<root>
473nodes) or L</"parent">.
474
475 # "<div><h2>123</h2></div>"
476 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->replace('<h2>123</h2>');
477
478 # "<p><b>123</b></p>"
479 $dom->parse('<p>Test</p>')
480 ->at('p')->child_nodes->[0]->replace('<b>123</b>')->root;
481
482=head2 root
483
484 my $root = $dom->root;
485
486Return L<DOM::Tiny> object for C<root> node.
487
488=head2 strip
489
490 my $parent = $dom->strip;
491
492Remove this element while preserving its content and return L</"parent">.
493
494 # "<div>Test</div>"
495 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->strip;
496
497=head2 tag
498
499 my $tag = $dom->tag;
500 $dom = $dom->tag('div');
501
502This element's tag name.
503
504 # List tag names of child elements
505 say $dom->children->map('tag')->join("\n");
506
507=head2 tap
508
509 $dom = $dom->tap(sub {...});
510
e99ef07d 511Equivalent to L<Mojo::Base/"tap">.
d6512b50 512
513=head2 text
514
515 my $trimmed = $dom->text;
516 my $untrimmed = $dom->text(0);
517
518Extract text content from this element only (not including child elements).
519Smart whitespace trimming is enabled by default.
520
521 # "foo baz"
522 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text;
523
524 # "foo\nbaz\n"
525 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text(0);
526
527=head2 to_string
528
529 my $str = $dom->to_string;
530
531Render this node and its content to HTML/XML.
532
533 # "<b>Test</b>"
534 $dom->parse('<div><b>Test</b></div>')->at('div b')->to_string;
535
536=head2 tree
537
538 my $tree = $dom->tree;
539 $dom = $dom->tree(['root']);
540
541Document Object Model. Note that this structure should only be used very
542carefully since it is very dynamic.
543
544=head2 type
545
546 my $type = $dom->type;
547
548This node's type, usually C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>,
549C<root>, C<tag> or C<text>.
550
551 # "cdata"
552 $dom->parse('<![CDATA[Test]]>')->child_nodes->first->type;
553
554 # "comment"
555 $dom->parse('<!-- Test -->')->child_nodes->first->type;
556
557 # "doctype"
558 $dom->parse('<!DOCTYPE html>')->child_nodes->first->type;
559
560 # "pi"
561 $dom->parse('<?xml version="1.0"?>')->child_nodes->first->type;
562
563 # "raw"
564 $dom->parse('<title>Test</title>')->at('title')->child_nodes->first->type;
565
566 # "root"
567 $dom->parse('<p>Test</p>')->type;
568
569 # "tag"
570 $dom->parse('<p>Test</p>')->at('p')->type;
571
572 # "text"
573 $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->type;
574
575=head2 val
576
577 my $value = $dom->val;
578
579Extract value from form element (such as C<button>, C<input>, C<option>,
580C<select> and C<textarea>) or return C<undef> if this element has no value. In
581the case of C<select> with C<multiple> attribute, find C<option> elements with
582C<selected> attribute and return an array reference with all values or C<undef>
583if none could be found.
584
585 # "a"
586 $dom->parse('<input name="test" value="a">')->at('input')->val;
587
588 # "b"
589 $dom->parse('<textarea>b</textarea>')->at('textarea')->val;
590
591 # "c"
592 $dom->parse('<option value="c">Test</option>')->at('option')->val;
593
594 # "d"
595 $dom->parse('<select><option selected>d</option></select>')
596 ->at('select')->val;
597
598 # "e"
599 $dom->parse('<select multiple><option selected>e</option></select>')
600 ->at('select')->val->[0];
601
602=head2 wrap
603
604 $dom = $dom->wrap('<div></div>');
605
606Wrap HTML/XML fragment around this node, placing it as the last child of the
607first innermost element.
608
609 # "<p>123<b>Test</b></p>"
610 $dom->parse('<b>Test</b>')->at('b')->wrap('<p>123</p>')->root;
611
612 # "<div><p><b>Test</b></p>123</div>"
613 $dom->parse('<b>Test</b>')->at('b')->wrap('<div><p></p>123</div>')->root;
614
615 # "<p><b>Test</b></p><p>123</p>"
616 $dom->parse('<b>Test</b>')->at('b')->wrap('<p></p><p>123</p>')->root;
617
618 # "<p><b>Test</b></p>"
619 $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->wrap('<b>')->root;
620
621=head2 wrap_content
622
623 $dom = $dom->wrap_content('<div></div>');
624
625Wrap HTML/XML fragment around this node's content, placing that content as the
626last children of the first innermost element.
627
628 # "<p><b>123Test</b></p>"
629 $dom->parse('<p>Test</p>')->at('p')->wrap_content('<b>123</b>')->root;
630
631 # "<p><b>Test</b></p><p>123</p>"
632 $dom->parse('<b>Test</b>')->wrap_content('<p></p><p>123</p>');
633
634=head2 xml
635
636 my $bool = $dom->xml;
637 $dom = $dom->xml($bool);
638
639Disable HTML semantics in parser and activate case-sensitivity. Defaults to
640auto detection based on processing instructions.
641
642=head1 OPERATORS
643
644L<DOM::Tiny> overloads the following operators.
645
646=head2 array
647
648 my @nodes = @$dom;
649
650Alias for L</"child_nodes">.
651
652 # "<!-- Test -->"
653 $dom->parse('<!-- Test --><b>123</b>')->[0];
654
655=head2 bool
656
657 my $bool = !!$dom;
658
659Always true.
660
661=head2 hash
662
663 my %attrs = %$dom;
664
665Alias for L</"attr">.
666
667 # "test"
668 $dom->parse('<div id="test">Test</div>')->at('div')->{id};
669
670=head2 stringify
671
672 my $str = "$dom";
673
674Alias for L</"to_string">.
675
676=head1 BUGS
677
678Report any issues on the public bugtracker.
679
680=head1 AUTHOR
681
682Dan Book <dbook@cpan.org>
683
684=head1 COPYRIGHT AND LICENSE
685
686This software is Copyright (c) 2015 by Dan Book.
687
688This is free software, licensed under:
689
690 The Artistic License 2.0 (GPL Compatible)
691
692=head1 SEE ALSO
693
694L<Mojo::DOM>, L<XML::LibXML>, L<XML::Twig>, L<HTML::TreeBuilder>, L<XML::Smart>
695
696=cut