use JSON::PP instead for test
[catagits/DOM-Tiny.git] / README.pod
CommitLineData
d6512b50 1=pod
2
3=encoding utf8
4
5=head1 NAME
6
7DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors
8
9=head1 SYNOPSIS
10
11 use DOM::Tiny;
12
13 # Parse
14 my $dom = DOM::Tiny->new('<div><p id="a">Test</p><p id="b">123</p></div>');
15
16 # Find
17 say $dom->at('#b')->text;
18 say $dom->find('p')->map('text')->join("\n");
19 say $dom->find('[id]')->map(attr => 'id')->join("\n");
20
21 # Iterate
22 $dom->find('p[id]')->reverse->each(sub { say $_->{id} });
23
24 # Loop
25 for my $e ($dom->find('p[id]')->each) {
26 say $e->{id}, ':', $e->text;
27 }
28
29 # Modify
30 $dom->find('div p')->last->append('<p id="c">456</p>');
31 $dom->find(':not(p)')->map('strip');
32
33 # Render
34 say "$dom";
35
36=head1 DESCRIPTION
37
38L<DOM::Tiny> is a minimalistic and relaxed HTML/XML DOM parser with CSS
39selector support based on L<Mojo::DOM>. It will even try to interpret broken
40HTML and XML, so you should not use it for validation.
41
42=head1 NODES AND ELEMENTS
43
44When we parse an HTML/XML fragment, it gets turned into a tree of nodes.
45
46 <!DOCTYPE html>
47 <html>
48 <head><title>Hello</title></head>
49 <body>World!</body>
50 </html>
51
52There are currently eight different kinds of nodes, C<cdata>, C<comment>,
53C<doctype>, C<pi>, C<raw>, C<root>, C<tag> and C<text>. Elements are nodes of
54the type C<tag>.
55
56 root
57 |- doctype (html)
58 +- tag (html)
59    |- tag (head)
60    |  +- tag (title)
61    |     +- raw (Hello)
62    +- tag (body)
63       +- text (World!)
64
65While all node types are represented as L<DOM::Tiny> objects, some methods like
66L</"attr"> and L</"namespace"> only apply to elements.
67
68=head1 CASE-SENSITIVITY
69
70L<DOM::Tiny> defaults to HTML semantics, which means all tags and attribute
71names are lowercased and selectors need to be lowercase as well.
72
73 # HTML semantics
74 my $dom = DOM::Tiny->new('<P ID="greeting">Hi!</P>');
75 say $dom->at('p[id]')->text;
76
77If XML processing instructions are found, the parser will automatically switch
78into XML mode and everything becomes case-sensitive.
79
80 # XML semantics
81 my $dom = DOM::Tiny->new('<?xml version="1.0"?><P ID="greeting">Hi!</P>');
82 say $dom->at('P[ID]')->text;
83
84XML detection can also be disabled with the L</"xml"> method.
85
86 # Force XML semantics
87 my $dom = DOM::Tiny->new->xml(1)->parse('<P ID="greeting">Hi!</P>');
88 say $dom->at('P[ID]')->text;
89
90 # Force HTML semantics
91 my $dom = DOM::Tiny->new->xml(0)->parse('<P ID="greeting">Hi!</P>');
92 say $dom->at('p[id]')->text;
93
94=head1 METHODS
95
96L<DOM::Tiny> implements the following methods.
97
98=head2 all_text
99
100 my $trimmed = $dom->all_text;
101 my $untrimmed = $dom->all_text(0);
102
103Extract text content from all descendant nodes of this element; smart
104whitespace trimming is enabled by default.
105
106 # "foo bar baz"
107 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->all_text;
108
109 # "foo\nbarbaz\n"
110 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->all_text(0);
111
112=head2 ancestors
113
114 my $collection = $dom->ancestors;
115 my $collection = $dom->ancestors('div ~ p');
116
117Find all ancestor elements of this node matching the CSS selector and return a
118L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
119objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
120
121 # List tag names of ancestor elements
122 say $dom->ancestors->map('tag')->join("\n");
123
124=head2 append
125
126 $dom = $dom->append('<p>I ♥ DOM::Tiny!</p>');
127
128Append HTML/XML fragment to this node.
129
130 # "<div><h1>Test</h1><h2>123</h2></div>"
131 $dom->parse('<div><h1>Test</h1></div>')
132 ->at('h1')->append('<h2>123</h2>')->root;
133
134 # "<p>Test 123</p>"
135 $dom->parse('<p>Test</p>')->at('p')
136 ->child_nodes->first->append(' 123')->root;
137
138=head2 append_content
139
140 $dom = $dom->append_content('<p>I ♥ DOM::Tiny!</p>');
141
142Append HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
143node's content.
144
145 # "<div><h1>Test123</h1></div>"
146 $dom->parse('<div><h1>Test</h1></div>')
147 ->at('h1')->append_content('123')->root;
148
149 # "<!-- Test 123 --><br>"
150 $dom->parse('<!-- Test --><br>')
151 ->child_nodes->first->append_content('123 ')->root;
152
153 # "<p>Test<i>123</i></p>"
154 $dom->parse('<p>Test</p>')->at('p')->append_content('<i>123</i>')->root;
155
156=head2 at
157
158 my $result = $dom->at('div ~ p');
159
160Find first descendant element of this element matching the CSS selector and
161return it as a L<DOM::Tiny> object or return C<undef> if none could be found.
162All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
163
164 # Find first element with "svg" namespace definition
165 my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'};
166
167=head2 attr
168
169 my $hash = $dom->attr;
170 my $foo = $dom->attr('foo');
171 $dom = $dom->attr({foo => 'bar'});
172 $dom = $dom->attr(foo => 'bar');
173
174This element's attributes.
175
176 # Remove an attribute
177 delete $dom->attr->{id};
178
179 # Attribute without value
180 $dom->attr(selected => undef);
181
182 # List id attributes
183 say $dom->find('*')->map(attr => 'id')->compact->join("\n");
184
185=head2 child_nodes
186
187 my $collection = $dom->child_nodes;
188
189Return a L<DOM::Tiny::Collection> object containing all child nodes of this
190element as L<DOM::Tiny> objects.
191
192 # "<p><b>123</b></p>"
193 $dom->parse('<p>Test<b>123</b></p>')->at('p')->child_nodes->first->remove;
194
195 # "<!DOCTYPE html>"
196 $dom->parse('<!DOCTYPE html><b>123</b>')->child_nodes->first;
197
198 # " Test "
199 $dom->parse('<b>123</b><!-- Test -->')->child_nodes->last->content;
200
201=head2 children
202
203 my $collection = $dom->children;
204 my $collection = $dom->children('div ~ p');
205
206Find all child elements of this element matching the CSS selector and return a
207L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
208objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
209
210 # Show tag name of random child element
211 say $dom->children->shuffle->first->tag;
212
213=head2 content
214
215 my $str = $dom->content;
216 $dom = $dom->content('<p>I ♥ DOM::Tiny!</p>');
217
218Return this node's content or replace it with HTML/XML fragment (for C<root>
219and C<tag> nodes) or raw content.
220
221 # "<b>Test</b>"
222 $dom->parse('<div><b>Test</b></div>')->at('div')->content;
223
224 # "<div><h1>123</h1></div>"
225 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('123')->root;
226
227 # "<p><i>123</i></p>"
228 $dom->parse('<p>Test</p>')->at('p')->content('<i>123</i>')->root;
229
230 # "<div><h1></h1></div>"
231 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('')->root;
232
233 # " Test "
234 $dom->parse('<!-- Test --><br>')->child_nodes->first->content;
235
236 # "<div><!-- 123 -->456</div>"
237 $dom->parse('<div><!-- Test -->456</div>')
238 ->at('div')->child_nodes->first->content(' 123 ')->root;
239
240=head2 descendant_nodes
241
242 my $collection = $dom->descendant_nodes;
243
244Return a L<DOM::Tiny::Collection> object containing all descendant nodes of
245this element as L<DOM::Tiny> objects.
246
247 # "<p><b>123</b></p>"
248 $dom->parse('<p><!-- Test --><b>123<!-- 456 --></b></p>')
249 ->descendant_nodes->grep(sub { $_->type eq 'comment' })
250 ->map('remove')->first;
251
252 # "<p><b>test</b>test</p>"
253 $dom->parse('<p><b>123</b>456</p>')
254 ->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' })
255 ->map(content => 'test')->first->root;
256
257=head2 find
258
259 my $collection = $dom->find('div ~ p');
260
261Find all descendant elements of this element matching the CSS selector and
262return a L<DOM::Tiny::Collection> object containing these elements as
263L<DOM::Tiny> objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are
264supported.
265
266 # Find a specific element and extract information
267 my $id = $dom->find('div')->[23]{id};
268
269 # Extract information from multiple elements
270 my @headers = $dom->find('h1, h2, h3')->map('text')->each;
271
272 # Count all the different tags
273 my $hash = $dom->find('*')->reduce(sub { $a->{$b->tag}++; $a }, {});
274
275 # Find elements with a class that contains dots
276 my @divs = $dom->find('div.foo\.bar')->each;
277
278=head2 following
279
280 my $collection = $dom->following;
281 my $collection = $dom->following('div ~ p');
282
283Find all sibling elements after this node matching the CSS selector and return
284a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
285objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
286
287 # List tags of sibling elements after this node
288 say $dom->following->map('tag')->join("\n");
289
290=head2 following_nodes
291
292 my $collection = $dom->following_nodes;
293
294Return a L<DOM::Tiny::Collection> object containing all sibling nodes after
295this node as L<DOM::Tiny> objects.
296
297 # "C"
298 $dom->parse('<p>A</p><!-- B -->C')->at('p')->following_nodes->last->content;
299
300=head2 matches
301
302 my $bool = $dom->matches('div ~ p');
303
304Check if this element matches the CSS selector. All selectors from
305L<DOM::Tiny::CSS/"SELECTORS"> are supported.
306
307 # True
308 $dom->parse('<p class="a">A</p>')->at('p')->matches('.a');
309 $dom->parse('<p class="a">A</p>')->at('p')->matches('p[class]');
310
311 # False
312 $dom->parse('<p class="a">A</p>')->at('p')->matches('.b');
313 $dom->parse('<p class="a">A</p>')->at('p')->matches('p[id]');
314
315=head2 namespace
316
317 my $namespace = $dom->namespace;
318
319Find this element's namespace or return C<undef> if none could be found.
320
321 # Find namespace for an element with namespace prefix
322 my $namespace = $dom->at('svg > svg\:circle')->namespace;
323
324 # Find namespace for an element that may or may not have a namespace prefix
325 my $namespace = $dom->at('svg > circle')->namespace;
326
327=head2 new
328
329 my $dom = DOM::Tiny->new;
330 my $dom = DOM::Tiny->new('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
331
332Construct a new scalar-based L<DOM::Tiny> object and L</"parse"> HTML/XML
333fragment if necessary.
334
335=head2 next
336
337 my $sibling = $dom->next;
338
339Return L<DOM::Tiny> object for next sibling element or C<undef> if there are no
340more siblings.
341
342 # "<h2>123</h2>"
343 $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h1')->next;
344
345=head2 next_node
346
347 my $sibling = $dom->next_node;
348
349Return L<DOM::Tiny> object for next sibling node or C<undef> if there are no
350more siblings.
351
352 # "456"
353 $dom->parse('<p><b>123</b><!-- Test -->456</p>')
354 ->at('b')->next_node->next_node;
355
356 # " Test "
357 $dom->parse('<p><b>123</b><!-- Test -->456</p>')
358 ->at('b')->next_node->content;
359
360=head2 parent
361
362 my $parent = $dom->parent;
363
364Return L<DOM::Tiny> object for parent of this node or C<undef> if this node has
365no parent.
366
367=head2 parse
368
369 $dom = $dom->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
370
371Parse HTML/XML fragment with L<DOM::Tiny::HTML>.
372
373 # Parse XML
374 my $dom = DOM::Tiny->new->xml(1)->parse($xml);
375
376=head2 preceding
377
378 my $collection = $dom->preceding;
379 my $collection = $dom->preceding('div ~ p');
380
381Find all sibling elements before this node matching the CSS selector and return
382a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
383objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
384
385 # List tags of sibling elements before this node
386 say $dom->preceding->map('tag')->join("\n");
387
388=head2 preceding_nodes
389
390 my $collection = $dom->preceding_nodes;
391
392Return a L<DOM::Tiny::Collection> object containing all sibling nodes before
393this node as L<DOM::Tiny> objects.
394
395 # "A"
396 $dom->parse('A<!-- B --><p>C</p>')->at('p')->preceding_nodes->first->content;
397
398=head2 prepend
399
400 $dom = $dom->prepend('<p>I ♥ DOM::Tiny!</p>');
401
402Prepend HTML/XML fragment to this node.
403
404 # "<div><h1>Test</h1><h2>123</h2></div>"
405 $dom->parse('<div><h2>123</h2></div>')
406 ->at('h2')->prepend('<h1>Test</h1>')->root;
407
408 # "<p>Test 123</p>"
409 $dom->parse('<p>123</p>')
410 ->at('p')->child_nodes->first->prepend('Test ')->root;
411
412=head2 prepend_content
413
414 $dom = $dom->prepend_content('<p>I ♥ DOM::Tiny!</p>');
415
416Prepend HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
417node's content.
418
419 # "<div><h2>Test123</h2></div>"
420 $dom->parse('<div><h2>123</h2></div>')
421 ->at('h2')->prepend_content('Test')->root;
422
423 # "<!-- Test 123 --><br>"
424 $dom->parse('<!-- 123 --><br>')
425 ->child_nodes->first->prepend_content(' Test')->root;
426
427 # "<p><i>123</i>Test</p>"
428 $dom->parse('<p>Test</p>')->at('p')->prepend_content('<i>123</i>')->root;
429
430=head2 previous
431
432 my $sibling = $dom->previous;
433
434Return L<DOM::Tiny> object for previous sibling element or C<undef> if there
435are no more siblings.
436
437 # "<h1>Test</h1>"
438 $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h2')->previous;
439
440=head2 previous_node
441
442 my $sibling = $dom->previous_node;
443
444Return L<DOM::Tiny> object for previous sibling node or C<undef> if there are
445no more siblings.
446
447 # "123"
448 $dom->parse('<p>123<!-- Test --><b>456</b></p>')
449 ->at('b')->previous_node->previous_node;
450
451 # " Test "
452 $dom->parse('<p>123<!-- Test --><b>456</b></p>')
453 ->at('b')->previous_node->content;
454
455=head2 remove
456
457 my $parent = $dom->remove;
458
459Remove this node and return L</"root"> (for C<root> nodes) or L</"parent">.
460
461 # "<div></div>"
462 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->remove;
463
464 # "<p><b>456</b></p>"
465 $dom->parse('<p>123<b>456</b></p>')
466 ->at('p')->child_nodes->first->remove->root;
467
468=head2 replace
469
470 my $parent = $dom->replace('<div>I ♥ DOM::Tiny!</div>');
471
472Replace this node with HTML/XML fragment and return L</"root"> (for C<root>
473nodes) or L</"parent">.
474
475 # "<div><h2>123</h2></div>"
476 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->replace('<h2>123</h2>');
477
478 # "<p><b>123</b></p>"
479 $dom->parse('<p>Test</p>')
480 ->at('p')->child_nodes->[0]->replace('<b>123</b>')->root;
481
482=head2 root
483
484 my $root = $dom->root;
485
486Return L<DOM::Tiny> object for C<root> node.
487
488=head2 strip
489
490 my $parent = $dom->strip;
491
492Remove this element while preserving its content and return L</"parent">.
493
494 # "<div>Test</div>"
495 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->strip;
496
497=head2 tag
498
499 my $tag = $dom->tag;
500 $dom = $dom->tag('div');
501
502This element's tag name.
503
504 # List tag names of child elements
505 say $dom->children->map('tag')->join("\n");
506
507=head2 tap
508
509 $dom = $dom->tap(sub {...});
510
511Alias for L<Mojo::Base/"tap">.
512
513=head2 text
514
515 my $trimmed = $dom->text;
516 my $untrimmed = $dom->text(0);
517
518Extract text content from this element only (not including child elements);
519smart whitespace trimming is enabled by default.
520
521 # "foo baz"
522 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text;
523
524 # "foo\nbaz\n"
525 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text(0);
526
527=head2 to_string
528
529 my $str = $dom->to_string;
530
531Render this node and its content to HTML/XML.
532
533 # "<b>Test</b>"
534 $dom->parse('<div><b>Test</b></div>')->at('div b')->to_string;
535
536=head2 tree
537
538 my $tree = $dom->tree;
539 $dom = $dom->tree(['root']);
540
541Document Object Model. Note that this structure should only be used very
542carefully since it is very dynamic.
543
544=head2 type
545
546 my $type = $dom->type;
547
548This node's type, usually C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>,
549C<root>, C<tag> or C<text>.
550
551 # "cdata"
552 $dom->parse('<![CDATA[Test]]>')->child_nodes->first->type;
553
554 # "comment"
555 $dom->parse('<!-- Test -->')->child_nodes->first->type;
556
557 # "doctype"
558 $dom->parse('<!DOCTYPE html>')->child_nodes->first->type;
559
560 # "pi"
561 $dom->parse('<?xml version="1.0"?>')->child_nodes->first->type;
562
563 # "raw"
564 $dom->parse('<title>Test</title>')->at('title')->child_nodes->first->type;
565
566 # "root"
567 $dom->parse('<p>Test</p>')->type;
568
569 # "tag"
570 $dom->parse('<p>Test</p>')->at('p')->type;
571
572 # "text"
573 $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->type;
574
575=head2 val
576
577 my $value = $dom->val;
578
579Extract value from form element (such as C<button>, C<input>, C<option>,
580C<select> and C<textarea>) or return C<undef> if this element has no value. In
581the case of C<select> with C<multiple> attribute, find C<option> elements with
582C<selected> attribute and return an array reference with all values or C<undef>
583if none could be found.
584
585 # "a"
586 $dom->parse('<input name="test" value="a">')->at('input')->val;
587
588 # "b"
589 $dom->parse('<textarea>b</textarea>')->at('textarea')->val;
590
591 # "c"
592 $dom->parse('<option value="c">Test</option>')->at('option')->val;
593
594 # "d"
595 $dom->parse('<select><option selected>d</option></select>')
596 ->at('select')->val;
597
598 # "e"
599 $dom->parse('<select multiple><option selected>e</option></select>')
600 ->at('select')->val->[0];
601
602=head2 wrap
603
604 $dom = $dom->wrap('<div></div>');
605
606Wrap HTML/XML fragment around this node, placing it as the last child of the
607first innermost element.
608
609 # "<p>123<b>Test</b></p>"
610 $dom->parse('<b>Test</b>')->at('b')->wrap('<p>123</p>')->root;
611
612 # "<div><p><b>Test</b></p>123</div>"
613 $dom->parse('<b>Test</b>')->at('b')->wrap('<div><p></p>123</div>')->root;
614
615 # "<p><b>Test</b></p><p>123</p>"
616 $dom->parse('<b>Test</b>')->at('b')->wrap('<p></p><p>123</p>')->root;
617
618 # "<p><b>Test</b></p>"
619 $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->wrap('<b>')->root;
620
621=head2 wrap_content
622
623 $dom = $dom->wrap_content('<div></div>');
624
625Wrap HTML/XML fragment around this node's content, placing the content as the
626last children of the first innermost element.
627
628 # "<p><b>123Test</b></p>"
629 $dom->parse('<p>Test</p>')->at('p')->wrap_content('<b>123</b>')->root;
630
631 # "<p><b>Test</b></p><p>123</p>"
632 $dom->parse('<b>Test</b>')->wrap_content('<p></p><p>123</p>');
633
634=head2 xml
635
636 my $bool = $dom->xml;
637 $dom = $dom->xml($bool);
638
639Disable HTML semantics in parser and activate case-sensitivity; defaults to
640auto detection based on processing instructions.
641
642=head1 OPERATORS
643
644L<DOM::Tiny> overloads the following operators.
645
646=head2 array
647
648 my @nodes = @$dom;
649
650Alias for L</"child_nodes">.
651
652 # "<!-- Test -->"
653 $dom->parse('<!-- Test --><b>123</b>')->[0];
654
655=head2 bool
656
657 my $bool = !!$dom;
658
659Always true.
660
661=head2 hash
662
663 my %attrs = %$dom;
664
665Alias for L</"attr">.
666
667 # "test"
668 $dom->parse('<div id="test">Test</div>')->at('div')->{id};
669
670=head2 stringify
671
672 my $str = "$dom";
673
674Alias for L</"to_string">.
675
676=head1 BUGS
677
678Report any issues on the public bugtracker.
679
680=head1 AUTHOR
681
682Dan Book <dbook@cpan.org>
683
684=head1 COPYRIGHT AND LICENSE
685
686This software is Copyright (c) 2015 by Dan Book.
687
688This is free software, licensed under:
689
690 The Artistic License 2.0 (GPL Compatible)
691
692=head1 SEE ALSO
693
694L<Mojo::DOM>, L<XML::LibXML>, L<XML::Twig>, L<HTML::TreeBuilder>, L<XML::Smart>
695
696=cut