package WWW::RobotRules;

use URI ();

sub Version { $VERSION; }

sub new {
    my($class, $ua) = @_;

    # This ugly hack is needed to ensure backwards compatibility.
    # The "WWW::RobotRules" class is now really abstract.
    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";

    my $self = bless { }, $class;
    $self->agent($ua);
    $self;
}

sub parse {
    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
    $robot_txt_uri = URI->new("$robot_txt_uri");
    my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;

    $self->clear_rules($netloc);
    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));

    my $ua;
    my $is_me = 0;             # 1 iff this record is for me
    my $is_anon = 0;           # 1 iff this record is for *
    my $seen_disallow = 0;     # watch for missing record separators
    my @me_disallowed = ();    # rules disallowed for me
    my @anon_disallowed = ();  # rules disallowed for *

    # blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones
    $txt =~ s/\015\012/\012/g;

    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
    for (split(/[\012\015]/, $txt)) {

        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary.
        next if /^\s*\#/;

        s/\s*\#.*//;        # remove comments at end-of-line

        if (/^\s*$/) {      # blank line
            last if $is_me; # That was our record. No need to read the rest.
            $is_anon = 0;
            $seen_disallow = 0;
        }
        elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
            $ua = $1;
            $ua =~ s/\s+$//;

            if ($seen_disallow) {
                # treat as start of a new record
                $seen_disallow = 0;
                last if $is_me; # That was our record. No need to read the rest.
                $is_anon = 0;
            }

            if ($is_me) {
                # This record already had a User-agent that
                # we matched, so just continue.
            }
            elsif ($ua eq '*') {
                $is_anon = 1;
            }
            elsif ($self->is_me($ua)) {
                $is_me = 1;
            }
        }
        elsif (/^\s*Disallow\s*:\s*(.*)/i) {
            unless (defined $ua) {
                warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
                $is_anon = 1;  # assume that User-agent: * was intended
            }
            my $disallow = $1;
            $disallow =~ s/\s+$//;
            $seen_disallow = 1;
            if (length $disallow) {
                my $ignore;
                eval {
                    my $u = URI->new_abs($disallow, $robot_txt_uri);
                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                    $ignore++ if $u->port ne $robot_txt_uri->port;
                    $disallow = $u->path_query;
                    $disallow = "/" unless length $disallow;
                };
                next if $@;
                next if $ignore;
            }

            if ($is_me) {
                push(@me_disallowed, $disallow);
            }
            elsif ($is_anon) {
                push(@anon_disallowed, $disallow);
            }
        }
        elsif (/\S\s*:/) {
            # ignore other (unrecognized) fields
        }
        else {
            warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
        }
    }

    if ($is_me) {
        $self->push_rules($netloc, @me_disallowed);
    }
    else {
        $self->push_rules($netloc, @anon_disallowed);
    }
}

# is_me()
#
# Returns TRUE if the given name matches the
# name of this robot
#
sub is_me {
    my($self, $ua_line) = @_;
    my $me = $self->agent;

    # Check whether the name from the "User-Agent: ..." line that we
    # were passed is a substring of my own short name:

    if (index(lc($me), lc($ua_line)) >= 0) {
        return 1;
    }
    else {
        return '';
    }
}

sub allowed {
    my($self, $uri) = @_;
    $uri = URI->new("$uri");

    return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
    # Robots.txt applies to only those schemes.

    my $netloc = $uri->host . ":" . $uri->port;

    my $fresh_until = $self->fresh_until($netloc);
    return -1 if !defined($fresh_until) || $fresh_until < time;

    my $str = $uri->path_query;
    my $rule;
    for $rule ($self->rules($netloc)) {
        return 1 unless length $rule;
        return 0 if index($str, $rule) == 0;
    }
    return 1;
}

# The following methods must be provided by the subclass.
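# (Sketch of that interface: the code above uses agent, fresh_until,
# push_rules, clear_rules and rules; the in-core implementation below
# additionally provides visit, no_visits, last_visit and dump.
# WWW::RobotRules::AnyDBM_File provides a persistent variant.)
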
package WWW::RobotRules::InCore;

use vars qw(@ISA);
@ISA = qw(WWW::RobotRules);

sub agent {
    my ($self, $name) = @_;
    my $old = $self->{'ua'};
    if ($name) {
        # Strip it so that it's just the short name.
        # I.e., "FooBot"                                      => "FooBot"
        #       "FooBot/1.2"                                  => "FooBot"
        #       "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"

        $name = $1 if $name =~ m/(\S+)/; # get first word
        $name =~ s!/.*!!;  # get rid of version
        unless ($old && $old eq $name) {
            delete $self->{'loc'}; # all old info is now stale
            $self->{'ua'} = $name;
        }
    }
    $old;
}

sub visit {
    my($self, $netloc, $time) = @_;
    return unless $netloc;
    $time ||= time;

    $self->{'loc'}{$netloc}{'last'} = $time;
    my $count = \$self->{'loc'}{$netloc}{'count'};
    if (!defined $$count) {
        $$count = 1;
    }
    else {
        $$count++;
    }
}

sub no_visits {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'count'};
}

sub last_visit {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'last'};
}

sub fresh_until {
    my ($self, $netloc, $fresh_until) = @_;
    my $old = $self->{'loc'}{$netloc}{'fresh'};
    if (defined $fresh_until) {
        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
    }
    $old;
}

sub push_rules {
    my($self, $netloc, @rules) = @_;
    push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
}

sub clear_rules {
    my($self, $netloc) = @_;
    delete $self->{'loc'}{$netloc}{'rules'};
}

sub rules {
    my($self, $netloc) = @_;
    if (defined $self->{'loc'}{$netloc}{'rules'}) {
        return @{$self->{'loc'}{$netloc}{'rules'}};
    }
    else {
        return ();
    }
}

sub dump {
    my $self = shift;
    for (keys %$self) {
        next if $_ eq 'loc';
        print "$_ = $self->{$_}\n";
    }
    for (keys %{$self->{'loc'}}) {
        my @rules = $self->rules($_);
        print "$_: ", join("; ", @rules), "\n";
    }
}

1;

__END__

# Bender: "Well, I don't have anything else
#          planned for today.  Let's get drunk!"

=head1 NAME

WWW::RobotRules - database of robots.txt-derived permissions

=head1 SYNOPSIS

 use WWW::RobotRules;
 my $rules = WWW::RobotRules->new('MOMspider/1.0');

 use LWP::Simple qw(get);

 {
   my $url = "http://some.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 {
   my $url = "http://some.other.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 # Now we can check if a URL is valid for those servers
 # whose "robots.txt" files we've gotten and parsed:
 if($rules->allowed($url)) {
     $c = get $url;
     ...
 }

=head1 DESCRIPTION

This module parses F</robots.txt> files as specified in
"A Standard for Robot Exclusion", at
<http://www.robotstxt.org/wc/norobots.html>.
Webmasters can use the F</robots.txt> file to forbid conforming
robots from accessing parts of their web site.

The parsed files are kept in a WWW::RobotRules object, and this object
provides methods to check if access to a given URL is prohibited. The
same WWW::RobotRules object can be used for one or more parsed
F</robots.txt> files on any number of hosts.

The following methods are provided:

=over 4

=item $rules = WWW::RobotRules->new($robot_name)

This is the constructor for WWW::RobotRules objects. The first
argument given to new() is the name of the robot.

=item $rules->parse($robot_txt_url, $content, $fresh_until)

The parse() method takes as arguments the URL that was used to
retrieve the F</robots.txt> file, and the contents of the file. An
optional third argument gives the time (as an epoch timestamp) until
which the parsed rules should be considered fresh; if it is omitted,
the rules are kept for one year.

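For example, to keep the rules for at most one day instead of the
default (a sketch, using the argument names from above):

  $rules->parse($robot_txt_url, $content, time + 24*3600);
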
=item $rules->allowed($uri)

Returns TRUE if this robot is allowed to retrieve this URL. Only C<http:>
and C<https:> URLs are checked against the rules; URLs with any other
scheme are always allowed.

=item $rules->agent([$name])

Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
rules and expire times out of the cache.

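For example (a sketch of how the in-core implementation shown above
normalizes names; the names themselves are only illustrative):

  $rules->agent('MOMspider/1.0');    # stored as the short name "MOMspider"
  $rules->agent('MOMspider/2.0');    # same short name, cached rules are kept
  $rules->agent('OtherSpider/1.0');  # new short name, cached rules are dropped
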
=back

=head1 ROBOTS.TXT

The format and semantics of the "/robots.txt" file are as follows
(this is an edited abstract of
<http://www.robotstxt.org/wc/norobots.html>):

The file consists of one or more records separated by one or more
blank lines. Each record contains lines of the form

  <field-name>: <value>

The field name is case insensitive. Text after the '#' character on a
line is ignored during parsing. This is used for comments. The
following <field-names> can be used:

=over 4

=item User-Agent

The value of this field is the name of the robot the record is
describing access policy for. If more than one I<User-Agent> field is
present the record describes an identical access policy for more than
one robot. At least one field needs to be present per record. If the
value is '*', the record describes the default access policy for any
robot that has not matched any of the other records.

The I<User-Agent> fields must occur before the I<Disallow> fields. If a
record contains a I<User-Agent> field after a I<Disallow> field, that
constitutes a malformed record. This parser will assume that a blank
line should have been placed before that I<User-Agent> field, and will
break the record into two. All the fields before the I<User-Agent> field
will constitute a record, and the I<User-Agent> field will be the first
field in a new record.

=item Disallow

The value of this field specifies a partial URL that is not to be
visited. This can be a full path, or a partial path; any URL that
starts with this value will not be retrieved.

=back

Unrecognized records are ignored.

=head1 ROBOTS.TXT EXAMPLES

The following example "/robots.txt" file specifies that no robots
should visit any URL starting with "/cyberworld/map/" or "/tmp/":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  Disallow: /tmp/ # these will soon disappear

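Fed to this module, the file above would be interpreted roughly like this
(a sketch; the host name is made up and C<$robots_txt> holds the text above):

  $rules->parse("http://cyber.example.com/robots.txt", $robots_txt);
  $rules->allowed("http://cyber.example.com/cyberworld/map/austria");  # 0
  $rules->allowed("http://cyber.example.com/cyberworld/index.html");   # 1
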
This example "/robots.txt" file specifies that no robots should visit
any URL starting with "/cyberworld/map/", except the robot called
"cybermapper":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space

  # Cybermapper knows where to go.
  User-agent: cybermapper
  Disallow:

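Since only the rules that matched the agent name at parse() time are kept,
a robot has to parse the file again after changing its name (a sketch; the
host name is made up):

  $rules->agent('cybermapper/1.0');
  $rules->parse("http://cyber.example.com/robots.txt", $robots_txt);
  $rules->allowed("http://cyber.example.com/cyberworld/map/austria");  # 1

  $rules->agent('MOMspider/1.0');   # different name, cached rules are cleared
  $rules->parse("http://cyber.example.com/robots.txt", $robots_txt);
  $rules->allowed("http://cyber.example.com/cyberworld/map/austria");  # 0
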
This example indicates that no robots should visit this site further:

  User-agent: *
  Disallow: /

This is an example of a malformed robots.txt file.

  # robots.txt for ancientcastle.example.com
  # I've locked myself away.
  User-agent: *
  Disallow: /
  # The castle is your home now, so you can go anywhere you like.
  User-agent: Belle
  Disallow: /west-wing/ # except the west wing!
  # It's good to be the Prince...

This file is missing the required blank lines between records.
However, the intention is clear.

=head1 SEE ALSO

L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>