local-lib5/man/man3/HTML::LinkExtor.3pm

   1 .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.3
   2 .\"
   3 .\" Standard preamble:
   4 .\" ========================================================================
   5 .de Sh \" Subsection heading
   6 .br
   7 .if t .Sp
   8 .ne 5
   9 .PP
  10 \fB\\$1\fR
  11 .PP
  12 ..
  13 .de Sp \" Vertical space (when we can't use .PP)
  14 .if t .sp .5v
  15 .if n .sp
  16 ..
  17 .de Vb \" Begin verbatim text
  18 .ft CW
  19 .nf
  20 .ne \\$1
  21 ..
  22 .de Ve \" End verbatim text
  23 .ft R
  24 .fi
  25 ..
  26 .\" Set up some character translations and predefined strings.  \*(-- will
  27 .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
  28 .\" double quote, and \*(R" will give a right double quote.  | will give a
  29 .\" real vertical bar.  \*(C+ will give a nicer C++.  Capital omega is used to
  30 .\" do unbreakable dashes and therefore won't be available.  \*(C` and \*(C'
  31 .\" expand to `' in nroff, nothing in troff, for use with C<>.
  32 .tr \(*W-|\(bv\*(Tr
  33 .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
  34 .ie n \{\
  35 .    ds -- \(*W-
  36 .    ds PI pi
  37 .    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
  38 .    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
  39 .    ds L" ""
  40 .    ds R" ""
  41 .    ds C` ""
  42 .    ds C' ""
  43 'br\}
  44 .el\{\
  45 .    ds -- \|\(em\|
  46 .    ds PI \(*p
  47 .    ds L" ``
  48 .    ds R" ''
  49 'br\}
  50 .\"
  51 .\" If the F register is turned on, we'll generate index entries on stderr for
  52 .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
  53 .\" entries marked with X<> in POD.  Of course, you'll have to process the
  54 .\" output yourself in some meaningful fashion.
  55 .if \nF \{\
  56 .    de IX
  57 .    tm Index:\\$1\t\\n%\t"\\$2"
  58 ..
  59 .    nr % 0
  60 .    rr F
  61 .\}
  62 .\"
  63 .\" For nroff, turn off justification.  Always turn off hyphenation; it makes
  64 .\" way too many mistakes in technical documents.
  65 .hy 0
  66 .if n .na
  67 .\"
  68 .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
  69 .\" Fear.  Run.  Save yourself.  No user-serviceable parts.
  70 .    \" fudge factors for nroff and troff
  71 .if n \{\
  72 .    ds #H 0
  73 .    ds #V .8m
  74 .    ds #F .3m
  75 .    ds #[ \f1
  76 .    ds #] \fP
  77 .\}
  78 .if t \{\
  79 .    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
  80 .    ds #V .6m
  81 .    ds #F 0
  82 .    ds #[ \&
  83 .    ds #] \&
  84 .\}
  85 .    \" simple accents for nroff and troff
  86 .if n \{\
  87 .    ds ' \&
  88 .    ds ` \&
  89 .    ds ^ \&
  90 .    ds , \&
  91 .    ds ~ ~
  92 .    ds /
  93 .\}
  94 .if t \{\
  95 .    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
  96 .    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
  97 .    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
  98 .    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
  99 .    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
 100 .    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
 101 .\}
 102 .    \" troff and (daisy-wheel) nroff accents
 103 .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
 104 .ds 8 \h'\*(#H'\(*b\h'-\*(#H'
 105 .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
 106 .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
 107 .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
 108 .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
 109 .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
 110 .ds ae a\h'-(\w'a'u*4/10)'e
 111 .ds Ae A\h'-(\w'A'u*4/10)'E
 112 .    \" corrections for vroff
 113 .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
 114 .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
 115 .    \" for low resolution devices (crt and lpr)
 116 .if \n(.H>23 .if \n(.V>19 \
 117 \{\
 118 .    ds : e
 119 .    ds 8 ss
 120 .    ds o a
 121 .    ds d- d\h'-1'\(ga
 122 .    ds D- D\h'-1'\(hy
 123 .    ds th \o'bp'
 124 .    ds Th \o'LP'
 125 .    ds ae ae
 126 .    ds Ae AE
 127 .\}
 128 .rm #[ #] #H #V #F C
 129 .\" ========================================================================
 130 .\"
 131 .IX Title "HTML::LinkExtor 3"
 132 .TH HTML::LinkExtor 3 "2009-02-09" "perl v5.8.7" "User Contributed Perl Documentation"
 133 .SH "NAME"
 134 HTML::LinkExtor \- Extract links from an HTML document
 135 .SH "SYNOPSIS"
 136 .IX Header "SYNOPSIS"
 137 .Vb 7
 138 \& require HTML::LinkExtor;
 139 \& $p = HTML::LinkExtor\->new(\e&cb, "http://www.perl.org/");
 140 \& sub cb {
 141 \&     my($tag, %links) = @_;
 142 \&     print "$tag @{[%links]}\en";
 143 \& }
 144 \& $p\->parse_file("index.html");
 145 .Ve
 146 .SH "DESCRIPTION"
 147 .IX Header "DESCRIPTION"
 148 \&\fIHTML::LinkExtor\fR is an \s-1HTML\s0 parser that extracts links from an
 149 \&\s-1HTML\s0 document.  The \fIHTML::LinkExtor\fR is a subclass of
 150 \&\fIHTML::Parser\fR. This means that the document should be given to the
 151 parser by calling the \f(CW$p\fR\->\fIparse()\fR or \f(CW$p\fR\->\fIparse_file()\fR methods.
 152 .IP "$p = HTML::LinkExtor\->new" 4
 153 .IX Item "$p = HTML::LinkExtor->new"
 154 .PD 0
 155 .ie n .IP "$p = HTML::LinkExtor\->new( $callback )" 4
 156 .el .IP "$p = HTML::LinkExtor\->new( \f(CW$callback\fR )" 4
 157 .IX Item "$p = HTML::LinkExtor->new( $callback )"
 158 .ie n .IP "$p = HTML::LinkExtor\->new( $callback, $base )" 4
 159 .el .IP "$p = HTML::LinkExtor\->new( \f(CW$callback\fR, \f(CW$base\fR )" 4
 160 .IX Item "$p = HTML::LinkExtor->new( $callback, $base )"
 161 .PD
 162 The constructor takes two optional arguments. The first is a reference
 163 to a callback routine. It will be called as links are found. If a
 164 callback is not provided, then links are just accumulated internally
 165 and can be retrieved by calling the \f(CW$p\fR\->\fIlinks()\fR method.
 166 .Sp
 167 The \f(CW$base\fR argument is an optional base \s-1URL\s0 used to absolutize all URLs found.
 168 You need to have the \fI\s-1URI\s0\fR module installed if you provide \f(CW$base\fR.
 169 .Sp
 170 The callback is called with the lowercase tag name as first argument,
 171 and then all link attributes as separate key/value pairs.  All
 172 non-link attributes are removed.
 173 .IP "$p\->links" 4
 174 .IX Item "$p->links"
 175 Returns a list of all links found in the document.  The returned
 176 values will be anonymous arrays with the following elements:
 177 .Sp
 178 .Vb 1
 179 \&  [$tag, $attr1 => $url1, $attr2 => $url2,...]
 180 .Ve
 181 .Sp
 182 The \f(CW$p\fR\->links method will also truncate the internal link list.  This
 183 means that if the method is called twice without any parsing
 184 between them the second call will return an empty list.
 185 .Sp
 186 Also note that \f(CW$p\fR\->links will always be empty if a callback routine
 187 was provided when the \fIHTML::LinkExtor\fR was created.
 188 .SH "EXAMPLE"
 189 .IX Header "EXAMPLE"
 190 This is an example showing how you can extract links from a document
 191 received using \s-1LWP\s0:
 192 .PP
 193 .Vb 3
 194 \&  use LWP::UserAgent;
 195 \&  use HTML::LinkExtor;
 196 \&  use URI::URL;
 197 .Ve
 198 .PP
 199 .Vb 2
 200 \&  $url = "http://www.perl.org/";  # for instance
 201 \&  $ua = LWP::UserAgent\->new;
 202 .Ve
 203 .PP
 204 .Vb 7
 205 \&  # Set up a callback that collects image links
 206 \&  my @imgs = ();
 207 \&  sub callback {
 208 \&     my($tag, %attr) = @_;
 209 \&     return if $tag ne 'img';  # we only look closer at <img ...>
 210 \&     push(@imgs, values %attr);
 211 \&  }
 212 .Ve
 213 .PP
 214 .Vb 3
 215 \&  # Make the parser.  Unfortunately, we don't know the base yet
 216 \&  # (it might be different from $url)
 217 \&  $p = HTML::LinkExtor\->new(\e&callback);
 218 .Ve
 219 .PP
 220 .Vb 3
 221 \&  # Request document and parse it as it arrives
 222 \&  $res = $ua\->request(HTTP::Request\->new(GET => $url),
 223 \&                      sub {$p\->parse($_[0])});
 224 .Ve
 225 .PP
 226 .Vb 3
 227 \&  # Expand all image URLs to absolute ones
 228 \&  my $base = $res\->base;
 229 \&  @imgs = map { $_ = url($_, $base)\->abs; } @imgs;
 230 .Ve
 231 .PP
 232 .Vb 2
 233 \&  # Print them out
 234 \&  print join("\en", @imgs), "\en";
 235 .Ve
 236 .SH "SEE ALSO"
 237 .IX Header "SEE ALSO"
 238 HTML::Parser, HTML::Tagset, \s-1LWP\s0, \s-1URI::URL\s0
 239 .SH "COPYRIGHT"
 240 .IX Header "COPYRIGHT"
 241 Copyright 1996\-2001 Gisle Aas.
 242 .PP
 243 This library is free software; you can redistribute it and/or
 244 modify it under the same terms as Perl itself.