1 .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.3
4 .\" ========================================================================
5 .de Sh \" Subsection heading
13 .de Sp \" Vertical space (when we can't use .PP)
17 .de Vb \" Begin verbatim text
22 .de Ve \" End verbatim text
26 .\" Set up some character translations and predefined strings. \*(-- will
27 .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28 .\" double quote, and \*(R" will give a right double quote. | will give a
29 .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30 .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31 .\" expand to `' in nroff, nothing in troff, for use with C<>.
33 .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
37 . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38 . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
51 .\" If the F register is turned on, we'll generate index entries on stderr for
52 .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53 .\" entries marked with X<> in POD. Of course, you'll have to process the
54 .\" output yourself in some meaningful fashion.
57 . tm Index:\\$1\t\\n%\t"\\$2"
63 .\" For nroff, turn off justification. Always turn off hyphenation; it makes
64 .\" way too many mistakes in technical documents.
68 .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69 .\" Fear. Run. Save yourself. No user-serviceable parts.
70 . \" fudge factors for nroff and troff
79 . ds #H ((1u-(\\\\n(.fu%2u))*.13m)
85 . \" simple accents for nroff and troff
95 . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96 . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97 . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98 . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99 . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100 . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
102 . \" troff and (daisy-wheel) nroff accents
103 .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104 .ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105 .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106 .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107 .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108 .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109 .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110 .ds ae a\h'-(\w'a'u*4/10)'e
111 .ds Ae A\h'-(\w'A'u*4/10)'E
112 . \" corrections for vroff
113 .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114 .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115 . \" for low resolution devices (crt and lpr)
116 .if \n(.H>23 .if \n(.V>19 \
129 .\" ========================================================================
131 .IX Title "HTML::LinkExtor 3"
132 .TH HTML::LinkExtor 3 "2009-02-09" "perl v5.8.7" "User Contributed Perl Documentation"
134 HTML::LinkExtor \- Extract links from an HTML document
136 .IX Header "SYNOPSIS"
138 \& require HTML::LinkExtor;
139 \& $p = HTML::LinkExtor\->new(\e&cb, "http://www.perl.org/");
141 \& my($tag, %links) = @_;
142 \& print "$tag @{[%links]}\en";
144 \& $p\->parse_file("index.html");
147 .IX Header "DESCRIPTION"
148 \&\fIHTML::LinkExtor\fR is an \s-1HTML\s0 parser that extracts links from an
149 \&\s-1HTML\s0 document. The \fIHTML::LinkExtor\fR is a subclass of
150 \&\fIHTML::Parser\fR. This means that the document should be given to the
151 parser by calling the \f(CW$p\fR\->\fIparse()\fR or \f(CW$p\fR\->\fIparse_file()\fR methods.
152 .IP "$p = HTML::LinkExtor\->new" 4
153 .IX Item "$p = HTML::LinkExtor->new"
155 .ie n .IP "$p = HTML::LinkExtor\->new( $callback )" 4
156 .el .IP "$p = HTML::LinkExtor\->new( \f(CW$callback\fR )" 4
157 .IX Item "$p = HTML::LinkExtor->new( $callback )"
158 .ie n .IP "$p = HTML::LinkExtor\->new( $callback, $base )" 4
159 .el .IP "$p = HTML::LinkExtor\->new( \f(CW$callback\fR, \f(CW$base\fR )" 4
160 .IX Item "$p = HTML::LinkExtor->new( $callback, $base )"
162 The constructor takes two optional arguments. The first is a reference
163 to a callback routine. It will be called as links are found. If a
164 callback is not provided, then links are just accumulated internally
165 and can be retrieved by calling the \f(CW$p\fR\->\fIlinks()\fR method.
167 The \f(CW$base\fR argument is an optional base \s-1URL\s0 used to absolutize all URLs found.
168 You need to have the \fI\s-1URI\s0\fR module installed if you provide \f(CW$base\fR.
170 The callback is called with the lowercase tag name as the first argument,
171 and then all link attributes as separate key/value pairs. All
172 non-link attributes are removed.
175 Returns a list of all links found in the document. The returned
176 values will be anonymous arrays with the following elements:
179 \& [$tag, $attr1 => $url1, $attr2 => $url2,...]
182 The \f(CW$p\fR\->links method will also truncate the internal link list. This
183 means that if the method is called twice without any parsing
184 between them, the second call will return an empty list.
186 Also note that \f(CW$p\fR\->links will always be empty if a callback routine
187 was provided when the \fIHTML::LinkExtor\fR was created.
190 This is an example showing how you can extract links from a document
191 received using \s-1LWP:\s0
194 \& use LWP::UserAgent;
195 \& use HTML::LinkExtor;
200 \& $url = "http://www.perl.org/"; # for instance
201 \& $ua = LWP::UserAgent\->new;
205 \& # Set up a callback that collects image links
208 \& my($tag, %attr) = @_;
209 \& return if $tag ne 'img'; # we only look closer at <img ...>
210 \& push(@imgs, values %attr);
215 \& # Make the parser. Unfortunately, we don't know the base yet
216 \& # (it might be different from $url)
217 \& $p = HTML::LinkExtor\->new(\e&callback);
221 \& # Request document and parse it as it arrives
222 \& $res = $ua\->request(HTTP::Request\->new(GET => $url),
223 \& sub {$p\->parse($_[0])});
227 \& # Expand all image URLs to absolute ones
228 \& my $base = $res\->base;
229 \& @imgs = map { $_ = url($_, $base)\->abs; } @imgs;
234 \& print join("\en", @imgs), "\en";
237 .IX Header "SEE ALSO"
238 HTML::Parser, HTML::Tagset, \s-1LWP\s0, \s-1URI::URL\s0
240 .IX Header "COPYRIGHT"
241 Copyright 1996\-2001 Gisle Aas.
243 This library is free software; you can redistribute it and/or
244 modify it under the same terms as Perl itself.