more descriptive main description
[catagits/DOM-Tiny.git] / README.pod
CommitLineData
d6512b50 1=pod
2
3=encoding utf8
4
5=head1 NAME
6
7DOM::Tiny - Minimalistic HTML/XML DOM parser with CSS selectors
8
9=head1 SYNOPSIS
10
11 use DOM::Tiny;
12
13 # Parse
14 my $dom = DOM::Tiny->new('<div><p id="a">Test</p><p id="b">123</p></div>');
15
16 # Find
17 say $dom->at('#b')->text;
18 say $dom->find('p')->map('text')->join("\n");
19 say $dom->find('[id]')->map(attr => 'id')->join("\n");
20
21 # Iterate
22 $dom->find('p[id]')->reverse->each(sub { say $_->{id} });
23
24 # Loop
25 for my $e ($dom->find('p[id]')->each) {
26 say $e->{id}, ':', $e->text;
27 }
28
29 # Modify
30 $dom->find('div p')->last->append('<p id="c">456</p>');
31 $dom->find(':not(p)')->map('strip');
32
33 # Render
34 say "$dom";
35
36=head1 DESCRIPTION
37
5a70ee9d 38L<DOM::Tiny> is a minimalistic and relaxed pure-Perl HTML/XML DOM parser with
39support for the L<HTML Living Standard|https://html.spec.whatwg.org/> and
40L<CSS3 selectors|http://www.w3.org/TR/selectors/> based on L<Mojo::DOM>. It
41will even try to interpret broken HTML and XML, so you should not use it for
42validation.
d6512b50 43
44=head1 NODES AND ELEMENTS
45
46When we parse an HTML/XML fragment, it gets turned into a tree of nodes.
47
48 <!DOCTYPE html>
49 <html>
50 <head><title>Hello</title></head>
51 <body>World!</body>
52 </html>
53
54There are currently eight different kinds of nodes, C<cdata>, C<comment>,
55C<doctype>, C<pi>, C<raw>, C<root>, C<tag> and C<text>. Elements are nodes of
56the type C<tag>.
57
58 root
59 |- doctype (html)
60 +- tag (html)
61 |- tag (head)
62 | +- tag (title)
63 | +- raw (Hello)
64 +- tag (body)
65 +- text (World!)
66
67While all node types are represented as L<DOM::Tiny> objects, some methods like
68L</"attr"> and L</"namespace"> only apply to elements.
69
70=head1 CASE-SENSITIVITY
71
72L<DOM::Tiny> defaults to HTML semantics, which means all tags and attribute
73names are lowercased and selectors need to be lowercase as well.
74
75 # HTML semantics
76 my $dom = DOM::Tiny->new('<P ID="greeting">Hi!</P>');
77 say $dom->at('p[id]')->text;
78
79If XML processing instructions are found, the parser will automatically switch
80into XML mode and everything becomes case-sensitive.
81
82 # XML semantics
83 my $dom = DOM::Tiny->new('<?xml version="1.0"?><P ID="greeting">Hi!</P>');
84 say $dom->at('P[ID]')->text;
85
86XML detection can also be disabled with the L</"xml"> method.
87
88 # Force XML semantics
89 my $dom = DOM::Tiny->new->xml(1)->parse('<P ID="greeting">Hi!</P>');
90 say $dom->at('P[ID]')->text;
91
92 # Force HTML semantics
93 my $dom = DOM::Tiny->new->xml(0)->parse('<P ID="greeting">Hi!</P>');
94 say $dom->at('p[id]')->text;
95
96=head1 METHODS
97
98L<DOM::Tiny> implements the following methods.
99
3793c28f 100=head2 new
101
102 my $dom = DOM::Tiny->new;
103 my $dom = DOM::Tiny->new('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
104
105Construct a new scalar-based L<DOM::Tiny> object and L</"parse"> HTML/XML
106fragment if necessary.
107
d6512b50 108=head2 all_text
109
110 my $trimmed = $dom->all_text;
111 my $untrimmed = $dom->all_text(0);
112
113Extract text content from all descendant nodes of this element; smart
114whitespace trimming is enabled by default.
115
116 # "foo bar baz"
117 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->all_text;
118
119 # "foo\nbarbaz\n"
120 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->all_text(0);
121
122=head2 ancestors
123
124 my $collection = $dom->ancestors;
125 my $collection = $dom->ancestors('div ~ p');
126
127Find all ancestor elements of this node matching the CSS selector and return a
128L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
129objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
130
131 # List tag names of ancestor elements
132 say $dom->ancestors->map('tag')->join("\n");
133
134=head2 append
135
136 $dom = $dom->append('<p>I ♥ DOM::Tiny!</p>');
137
138Append HTML/XML fragment to this node.
139
140 # "<div><h1>Test</h1><h2>123</h2></div>"
141 $dom->parse('<div><h1>Test</h1></div>')
142 ->at('h1')->append('<h2>123</h2>')->root;
143
144 # "<p>Test 123</p>"
145 $dom->parse('<p>Test</p>')->at('p')
146 ->child_nodes->first->append(' 123')->root;
147
148=head2 append_content
149
150 $dom = $dom->append_content('<p>I ♥ DOM::Tiny!</p>');
151
152Append HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
153node's content.
154
155 # "<div><h1>Test123</h1></div>"
156 $dom->parse('<div><h1>Test</h1></div>')
157 ->at('h1')->append_content('123')->root;
158
159 # "<!-- Test 123 --><br>"
160 $dom->parse('<!-- Test --><br>')
161 ->child_nodes->first->append_content('123 ')->root;
162
163 # "<p>Test<i>123</i></p>"
164 $dom->parse('<p>Test</p>')->at('p')->append_content('<i>123</i>')->root;
165
166=head2 at
167
168 my $result = $dom->at('div ~ p');
169
170Find first descendant element of this element matching the CSS selector and
171return it as a L<DOM::Tiny> object or return C<undef> if none could be found.
172All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
173
174 # Find first element with "svg" namespace definition
175 my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'};
176
177=head2 attr
178
179 my $hash = $dom->attr;
180 my $foo = $dom->attr('foo');
181 $dom = $dom->attr({foo => 'bar'});
182 $dom = $dom->attr(foo => 'bar');
183
184This element's attributes.
185
186 # Remove an attribute
187 delete $dom->attr->{id};
188
189 # Attribute without value
190 $dom->attr(selected => undef);
191
192 # List id attributes
193 say $dom->find('*')->map(attr => 'id')->compact->join("\n");
194
195=head2 child_nodes
196
197 my $collection = $dom->child_nodes;
198
199Return a L<DOM::Tiny::Collection> object containing all child nodes of this
200element as L<DOM::Tiny> objects.
201
202 # "<p><b>123</b></p>"
203 $dom->parse('<p>Test<b>123</b></p>')->at('p')->child_nodes->first->remove;
204
205 # "<!DOCTYPE html>"
206 $dom->parse('<!DOCTYPE html><b>123</b>')->child_nodes->first;
207
208 # " Test "
209 $dom->parse('<b>123</b><!-- Test -->')->child_nodes->last->content;
210
211=head2 children
212
213 my $collection = $dom->children;
214 my $collection = $dom->children('div ~ p');
215
216Find all child elements of this element matching the CSS selector and return a
217L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
218objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
219
220 # Show tag name of random child element
221 say $dom->children->shuffle->first->tag;
222
223=head2 content
224
225 my $str = $dom->content;
226 $dom = $dom->content('<p>I ♥ DOM::Tiny!</p>');
227
228Return this node's content or replace it with HTML/XML fragment (for C<root>
229and C<tag> nodes) or raw content.
230
231 # "<b>Test</b>"
232 $dom->parse('<div><b>Test</b></div>')->at('div')->content;
233
234 # "<div><h1>123</h1></div>"
235 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('123')->root;
236
237 # "<p><i>123</i></p>"
238 $dom->parse('<p>Test</p>')->at('p')->content('<i>123</i>')->root;
239
240 # "<div><h1></h1></div>"
241 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('')->root;
242
243 # " Test "
244 $dom->parse('<!-- Test --><br>')->child_nodes->first->content;
245
246 # "<div><!-- 123 -->456</div>"
247 $dom->parse('<div><!-- Test -->456</div>')
248 ->at('div')->child_nodes->first->content(' 123 ')->root;
249
250=head2 descendant_nodes
251
252 my $collection = $dom->descendant_nodes;
253
254Return a L<DOM::Tiny::Collection> object containing all descendant nodes of
255this element as L<DOM::Tiny> objects.
256
257 # "<p><b>123</b></p>"
258 $dom->parse('<p><!-- Test --><b>123<!-- 456 --></b></p>')
259 ->descendant_nodes->grep(sub { $_->type eq 'comment' })
260 ->map('remove')->first;
261
262 # "<p><b>test</b>test</p>"
263 $dom->parse('<p><b>123</b>456</p>')
264 ->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' })
265 ->map(content => 'test')->first->root;
266
267=head2 find
268
269 my $collection = $dom->find('div ~ p');
270
271Find all descendant elements of this element matching the CSS selector and
272return a L<DOM::Tiny::Collection> object containing these elements as
273L<DOM::Tiny> objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are
274supported.
275
276 # Find a specific element and extract information
277 my $id = $dom->find('div')->[23]{id};
278
279 # Extract information from multiple elements
280 my @headers = $dom->find('h1, h2, h3')->map('text')->each;
281
282 # Count all the different tags
283 my $hash = $dom->find('*')->reduce(sub { $a->{$b->tag}++; $a }, {});
284
285 # Find elements with a class that contains dots
286 my @divs = $dom->find('div.foo\.bar')->each;
287
288=head2 following
289
290 my $collection = $dom->following;
291 my $collection = $dom->following('div ~ p');
292
293Find all sibling elements after this node matching the CSS selector and return
294a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
295objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
296
297 # List tags of sibling elements after this node
298 say $dom->following->map('tag')->join("\n");
299
300=head2 following_nodes
301
302 my $collection = $dom->following_nodes;
303
304Return a L<DOM::Tiny::Collection> object containing all sibling nodes after
305this node as L<DOM::Tiny> objects.
306
307 # "C"
308 $dom->parse('<p>A</p><!-- B -->C')->at('p')->following_nodes->last->content;
309
310=head2 matches
311
312 my $bool = $dom->matches('div ~ p');
313
314Check if this element matches the CSS selector. All selectors from
315L<DOM::Tiny::CSS/"SELECTORS"> are supported.
316
317 # True
318 $dom->parse('<p class="a">A</p>')->at('p')->matches('.a');
319 $dom->parse('<p class="a">A</p>')->at('p')->matches('p[class]');
320
321 # False
322 $dom->parse('<p class="a">A</p>')->at('p')->matches('.b');
323 $dom->parse('<p class="a">A</p>')->at('p')->matches('p[id]');
324
325=head2 namespace
326
327 my $namespace = $dom->namespace;
328
329Find this element's namespace or return C<undef> if none could be found.
330
331 # Find namespace for an element with namespace prefix
332 my $namespace = $dom->at('svg > svg\:circle')->namespace;
333
334 # Find namespace for an element that may or may not have a namespace prefix
335 my $namespace = $dom->at('svg > circle')->namespace;
336
d6512b50 337=head2 next
338
339 my $sibling = $dom->next;
340
341Return L<DOM::Tiny> object for next sibling element or C<undef> if there are no
342more siblings.
343
344 # "<h2>123</h2>"
345 $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h1')->next;
346
347=head2 next_node
348
349 my $sibling = $dom->next_node;
350
351Return L<DOM::Tiny> object for next sibling node or C<undef> if there are no
352more siblings.
353
354 # "456"
355 $dom->parse('<p><b>123</b><!-- Test -->456</p>')
356 ->at('b')->next_node->next_node;
357
358 # " Test "
359 $dom->parse('<p><b>123</b><!-- Test -->456</p>')
360 ->at('b')->next_node->content;
361
362=head2 parent
363
364 my $parent = $dom->parent;
365
366Return L<DOM::Tiny> object for parent of this node or C<undef> if this node has
367no parent.
368
369=head2 parse
370
371 $dom = $dom->parse('<foo bar="baz">I ♥ DOM::Tiny!</foo>');
372
373Parse HTML/XML fragment with L<DOM::Tiny::HTML>.
374
375 # Parse XML
376 my $dom = DOM::Tiny->new->xml(1)->parse($xml);
377
378=head2 preceding
379
380 my $collection = $dom->preceding;
381 my $collection = $dom->preceding('div ~ p');
382
383Find all sibling elements before this node matching the CSS selector and return
384a L<DOM::Tiny::Collection> object containing these elements as L<DOM::Tiny>
385objects. All selectors from L<DOM::Tiny::CSS/"SELECTORS"> are supported.
386
387 # List tags of sibling elements before this node
388 say $dom->preceding->map('tag')->join("\n");
389
390=head2 preceding_nodes
391
392 my $collection = $dom->preceding_nodes;
393
394Return a L<DOM::Tiny::Collection> object containing all sibling nodes before
395this node as L<DOM::Tiny> objects.
396
397 # "A"
398 $dom->parse('A<!-- B --><p>C</p>')->at('p')->preceding_nodes->first->content;
399
400=head2 prepend
401
402 $dom = $dom->prepend('<p>I ♥ DOM::Tiny!</p>');
403
404Prepend HTML/XML fragment to this node.
405
406 # "<div><h1>Test</h1><h2>123</h2></div>"
407 $dom->parse('<div><h2>123</h2></div>')
408 ->at('h2')->prepend('<h1>Test</h1>')->root;
409
410 # "<p>Test 123</p>"
411 $dom->parse('<p>123</p>')
412 ->at('p')->child_nodes->first->prepend('Test ')->root;
413
414=head2 prepend_content
415
416 $dom = $dom->prepend_content('<p>I ♥ DOM::Tiny!</p>');
417
418Prepend HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
419node's content.
420
421 # "<div><h2>Test123</h2></div>"
422 $dom->parse('<div><h2>123</h2></div>')
423 ->at('h2')->prepend_content('Test')->root;
424
425 # "<!-- Test 123 --><br>"
426 $dom->parse('<!-- 123 --><br>')
427 ->child_nodes->first->prepend_content(' Test')->root;
428
429 # "<p><i>123</i>Test</p>"
430 $dom->parse('<p>Test</p>')->at('p')->prepend_content('<i>123</i>')->root;
431
432=head2 previous
433
434 my $sibling = $dom->previous;
435
436Return L<DOM::Tiny> object for previous sibling element or C<undef> if there
437are no more siblings.
438
439 # "<h1>Test</h1>"
440 $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h2')->previous;
441
442=head2 previous_node
443
444 my $sibling = $dom->previous_node;
445
446Return L<DOM::Tiny> object for previous sibling node or C<undef> if there are
447no more siblings.
448
449 # "123"
450 $dom->parse('<p>123<!-- Test --><b>456</b></p>')
451 ->at('b')->previous_node->previous_node;
452
453 # " Test "
454 $dom->parse('<p>123<!-- Test --><b>456</b></p>')
455 ->at('b')->previous_node->content;
456
457=head2 remove
458
459 my $parent = $dom->remove;
460
461Remove this node and return L</"root"> (for C<root> nodes) or L</"parent">.
462
463 # "<div></div>"
464 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->remove;
465
466 # "<p><b>456</b></p>"
467 $dom->parse('<p>123<b>456</b></p>')
468 ->at('p')->child_nodes->first->remove->root;
469
470=head2 replace
471
472 my $parent = $dom->replace('<div>I ♥ DOM::Tiny!</div>');
473
474Replace this node with HTML/XML fragment and return L</"root"> (for C<root>
475nodes) or L</"parent">.
476
477 # "<div><h2>123</h2></div>"
478 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->replace('<h2>123</h2>');
479
480 # "<p><b>123</b></p>"
481 $dom->parse('<p>Test</p>')
482 ->at('p')->child_nodes->[0]->replace('<b>123</b>')->root;
483
484=head2 root
485
486 my $root = $dom->root;
487
488Return L<DOM::Tiny> object for C<root> node.
489
490=head2 strip
491
492 my $parent = $dom->strip;
493
494Remove this element while preserving its content and return L</"parent">.
495
496 # "<div>Test</div>"
497 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->strip;
498
499=head2 tag
500
501 my $tag = $dom->tag;
502 $dom = $dom->tag('div');
503
504This element's tag name.
505
506 # List tag names of child elements
507 say $dom->children->map('tag')->join("\n");
508
509=head2 tap
510
511 $dom = $dom->tap(sub {...});
512
e99ef07d 513Equivalent to L<Mojo::Base/"tap">.
d6512b50 514
515=head2 text
516
517 my $trimmed = $dom->text;
518 my $untrimmed = $dom->text(0);
519
520Extract text content from this element only (not including child elements);
521smart whitespace trimming is enabled by default.
522
523 # "foo baz"
524 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text;
525
526 # "foo\nbaz\n"
527 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text(0);
528
529=head2 to_string
530
531 my $str = $dom->to_string;
532
533Render this node and its content to HTML/XML.
534
535 # "<b>Test</b>"
536 $dom->parse('<div><b>Test</b></div>')->at('div b')->to_string;
537
538=head2 tree
539
540 my $tree = $dom->tree;
541 $dom = $dom->tree(['root']);
542
543Document Object Model. Note that this structure should only be used very
544carefully since it is very dynamic.
545
546=head2 type
547
548 my $type = $dom->type;
549
550This node's type, usually C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>,
551C<root>, C<tag> or C<text>.
552
553 # "cdata"
554 $dom->parse('<![CDATA[Test]]>')->child_nodes->first->type;
555
556 # "comment"
557 $dom->parse('<!-- Test -->')->child_nodes->first->type;
558
559 # "doctype"
560 $dom->parse('<!DOCTYPE html>')->child_nodes->first->type;
561
562 # "pi"
563 $dom->parse('<?xml version="1.0"?>')->child_nodes->first->type;
564
565 # "raw"
566 $dom->parse('<title>Test</title>')->at('title')->child_nodes->first->type;
567
568 # "root"
569 $dom->parse('<p>Test</p>')->type;
570
571 # "tag"
572 $dom->parse('<p>Test</p>')->at('p')->type;
573
574 # "text"
575 $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->type;
576
577=head2 val
578
579 my $value = $dom->val;
580
581Extract value from form element (such as C<button>, C<input>, C<option>,
582C<select> and C<textarea>) or return C<undef> if this element has no value. In
583the case of C<select> with C<multiple> attribute, find C<option> elements with
584C<selected> attribute and return an array reference with all values or C<undef>
585if none could be found.
586
587 # "a"
588 $dom->parse('<input name="test" value="a">')->at('input')->val;
589
590 # "b"
591 $dom->parse('<textarea>b</textarea>')->at('textarea')->val;
592
593 # "c"
594 $dom->parse('<option value="c">Test</option>')->at('option')->val;
595
596 # "d"
597 $dom->parse('<select><option selected>d</option></select>')
598 ->at('select')->val;
599
600 # "e"
601 $dom->parse('<select multiple><option selected>e</option></select>')
602 ->at('select')->val->[0];
603
604=head2 wrap
605
606 $dom = $dom->wrap('<div></div>');
607
608Wrap HTML/XML fragment around this node, placing it as the last child of the
609first innermost element.
610
611 # "<p>123<b>Test</b></p>"
612 $dom->parse('<b>Test</b>')->at('b')->wrap('<p>123</p>')->root;
613
614 # "<div><p><b>Test</b></p>123</div>"
615 $dom->parse('<b>Test</b>')->at('b')->wrap('<div><p></p>123</div>')->root;
616
617 # "<p><b>Test</b></p><p>123</p>"
618 $dom->parse('<b>Test</b>')->at('b')->wrap('<p></p><p>123</p>')->root;
619
620 # "<p><b>Test</b></p>"
621 $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->wrap('<b>')->root;
622
623=head2 wrap_content
624
625 $dom = $dom->wrap_content('<div></div>');
626
627Wrap HTML/XML fragment around this node's content, placing that content as the
628last children of the first innermost element.
629
630 # "<p><b>123Test</b></p>"
631 $dom->parse('<p>Test</p>')->at('p')->wrap_content('<b>123</b>')->root;
632
633 # "<p><b>Test</b></p><p>123</p>"
634 $dom->parse('<b>Test</b>')->wrap_content('<p></p><p>123</p>');
635
636=head2 xml
637
638 my $bool = $dom->xml;
639 $dom = $dom->xml($bool);
640
641Disable HTML semantics in parser and activate case-sensitivity. Defaults to
642auto detection based on processing instructions.
643
644=head1 OPERATORS
645
646L<DOM::Tiny> overloads the following operators.
647
648=head2 array
649
650 my @nodes = @$dom;
651
652Alias for L</"child_nodes">.
653
654 # "<!-- Test -->"
655 $dom->parse('<!-- Test --><b>123</b>')->[0];
656
657=head2 bool
658
659 my $bool = !!$dom;
660
661Always true.
662
663=head2 hash
664
665 my %attrs = %$dom;
666
667Alias for L</"attr">.
668
669 # "test"
670 $dom->parse('<div id="test">Test</div>')->at('div')->{id};
671
672=head2 stringify
673
674 my $str = "$dom";
675
676Alias for L</"to_string">.
677
678=head1 BUGS
679
680Report any issues on the public bugtracker.
681
682=head1 AUTHOR
683
684Dan Book <dbook@cpan.org>
685
686=head1 COPYRIGHT AND LICENSE
687
688This software is Copyright (c) 2015 by Dan Book.
689
690This is free software, licensed under:
691
692 The Artistic License 2.0 (GPL Compatible)
693
694=head1 SEE ALSO
695
696L<Mojo::DOM>, L<XML::LibXML>, L<XML::Twig>, L<HTML::TreeBuilder>, L<XML::Smart>
697
698=cut