package WWW::RobotRules;

use URI ();

sub Version { $VERSION; }

sub new {
    my($class, $ua) = @_;

    # This ugly hack is needed to ensure backwards compatibility.
    # The "WWW::RobotRules" class is now really abstract.
    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";

    my $self = bless { }, $class;
    $self->agent($ua);
    $self;
}

sub parse {
    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
    $robot_txt_uri = URI->new("$robot_txt_uri");
    my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;

    $self->clear_rules($netloc);
    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));

    my $ua;
    my $is_me = 0;             # 1 iff this record is for me
    my $is_anon = 0;           # 1 iff this record is for *
    my $seen_disallow = 0;     # watch for missing record separators
    my @me_disallowed = ();    # rules disallowed for me
    my @anon_disallowed = ();  # rules disallowed for *

    # blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones
    $txt =~ s/\015\012/\012/g;

    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
    for (split(/[\012\015]/, $txt)) {

        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary.
        next if /^\s*\#/;

        s/\s*\#.*//;        # remove comments at end-of-line

        if (/^\s*$/) {      # blank line
            last if $is_me; # That was our record. No need to read the rest.
            $is_anon = 0;
            $seen_disallow = 0;
        }
        elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
            $ua = $1;
            $ua =~ s/\s+$//;

            if ($seen_disallow) {
                # treat as start of a new record
                $seen_disallow = 0;
                last if $is_me; # That was our record. No need to read the rest.
                $is_anon = 0;
            }

            if ($is_me) {
                # This record already had a User-agent that
                # we matched, so just continue.
            }
            elsif ($ua eq '*') {
                $is_anon = 1;
            }
            elsif ($self->is_me($ua)) {
                $is_me = 1;
            }
        }
        elsif (/^\s*Disallow\s*:\s*(.*)/i) {
            unless (defined $ua) {
                warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
                $is_anon = 1;  # assume that User-agent: * was intended
            }
            my $disallow = $1;
            $disallow =~ s/\s+$//;
            $seen_disallow = 1;
            if (length $disallow) {
                my $ignore;
                eval {
                    my $u = URI->new_abs($disallow, $robot_txt_uri);
                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                    $ignore++ if $u->port ne $robot_txt_uri->port;
                    $disallow = $u->path_query;
                    $disallow = "/" unless length $disallow;
                };
                next if $@;
                next if $ignore;
            }

            if ($is_me) {
                push(@me_disallowed, $disallow);
            }
            elsif ($is_anon) {
                push(@anon_disallowed, $disallow);
            }
        }
        elsif (/\S\s*:/) {
            # ignore other (unrecognized) fields
        }
        else {
            warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
        }
    }

    if ($is_me) {
        $self->push_rules($netloc, @me_disallowed);
    }
    else {
        $self->push_rules($netloc, @anon_disallowed);
    }
}

# is_me()
#
# Returns TRUE if the given name matches the
# name of this robot
#
sub is_me {
    my($self, $ua_line) = @_;
    my $me = $self->agent;

    # Check whether the name from the "User-Agent: ..." line that we
    # were passed is a substring of my own short name:

    if (index(lc($me), lc($ua_line)) >= 0) {
        return 1;
    }
    else {
        return '';
    }
}

sub allowed {
    my($self, $uri) = @_;
    $uri = URI->new("$uri");

    return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
    # Robots.txt applies to only those schemes.

    my $netloc = $uri->host . ":" . $uri->port;

    my $fresh_until = $self->fresh_until($netloc);
    return -1 if !defined($fresh_until) || $fresh_until < time;

    my $str = $uri->path_query;
    my $rule;
    for $rule ($self->rules($netloc)) {
        return 1 unless length $rule;
        return 0 if index($str, $rule) == 0;
    }
    return 1;
}

# The following methods must be provided by the subclass.
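# (Sketch of that interface: the code above uses agent, fresh_until,
# push_rules, clear_rules and rules; the in-core implementation below
# additionally provides visit, no_visits, last_visit and dump.
# WWW::RobotRules::AnyDBM_File provides a persistent variant.)
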
package WWW::RobotRules::InCore;

use vars qw(@ISA);
@ISA = qw(WWW::RobotRules);

sub agent {
    my ($self, $name) = @_;
    my $old = $self->{'ua'};
    if ($name) {
        # Strip it so that it's just the short name.
        # I.e., "FooBot"                                      => "FooBot"
        #       "FooBot/1.2"                                  => "FooBot"
        #       "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"

        $name = $1 if $name =~ m/(\S+)/; # get first word
        $name =~ s!/.*!!;  # get rid of version
        unless ($old && $old eq $name) {
            delete $self->{'loc'}; # all old info is now stale
            $self->{'ua'} = $name;
        }
    }
    $old;
}

sub visit {
    my($self, $netloc, $time) = @_;
    return unless $netloc;
    $time ||= time;

    $self->{'loc'}{$netloc}{'last'} = $time;
    my $count = \$self->{'loc'}{$netloc}{'count'};
    if (!defined $$count) {
        $$count = 1;
    }
    else {
        $$count++;
    }
}

sub no_visits {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'count'};
}

sub last_visit {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'last'};
}

sub fresh_until {
    my ($self, $netloc, $fresh_until) = @_;
    my $old = $self->{'loc'}{$netloc}{'fresh'};
    if (defined $fresh_until) {
        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
    }
    $old;
}

sub push_rules {
    my($self, $netloc, @rules) = @_;
    push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
}

sub clear_rules {
    my($self, $netloc) = @_;
    delete $self->{'loc'}{$netloc}{'rules'};
}

sub rules {
    my($self, $netloc) = @_;
    if (defined $self->{'loc'}{$netloc}{'rules'}) {
        return @{$self->{'loc'}{$netloc}{'rules'}};
    }
    else {
        return ();
    }
}

sub dump {
    my $self = shift;
    for (keys %$self) {
        next if $_ eq 'loc';
        print "$_ = $self->{$_}\n";
    }
    for (keys %{$self->{'loc'}}) {
        my @rules = $self->rules($_);
        print "$_: ", join("; ", @rules), "\n";
    }
}

1;

__END__

# Bender: "Well, I don't have anything else
#          planned for today.  Let's get drunk!"

=head1 NAME

WWW::RobotRules - database of robots.txt-derived permissions

=head1 SYNOPSIS

 use WWW::RobotRules;
 my $rules = WWW::RobotRules->new('MOMspider/1.0');

 use LWP::Simple qw(get);

 {
   my $url = "http://some.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 {
   my $url = "http://some.other.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 # Now we can check if a URL is valid for those servers
 # whose "robots.txt" files we've gotten and parsed:
 if($rules->allowed($url)) {
     $c = get $url;
     ...
 }

=head1 DESCRIPTION

This module parses F</robots.txt> files as specified in
"A Standard for Robot Exclusion", at
<http://www.robotstxt.org/wc/norobots.html>.
Webmasters can use the F</robots.txt> file to forbid conforming
robots from accessing parts of their web site.

The parsed files are kept in a WWW::RobotRules object, and this object
provides methods to check if access to a given URL is prohibited. The
same WWW::RobotRules object can be used for one or more parsed
F</robots.txt> files on any number of hosts.

The following methods are provided:

=over 4

=item $rules = WWW::RobotRules->new($robot_name)

This is the constructor for WWW::RobotRules objects. The first
argument given to new() is the name of the robot.

=item $rules->parse($robot_txt_url, $content, $fresh_until)

The parse() method takes as arguments the URL that was used to
retrieve the F</robots.txt> file, and the contents of the file. An
optional third argument gives the time (as an epoch timestamp) until
which the parsed rules should be considered fresh; if it is omitted,
the rules are kept for one year.

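For example, to keep the rules for at most one day instead of the
default (a sketch, using the argument names from above):

  $rules->parse($robot_txt_url, $content, time + 24*3600);
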
=item $rules->allowed($uri)

Returns TRUE if this robot is allowed to retrieve this URL. Only C<http:>
and C<https:> URLs are checked against the rules; URLs with any other
scheme are always allowed.

=item $rules->agent([$name])

Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
rules and expire times out of the cache.

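For example (a sketch of how the in-core implementation shown above
normalizes names; the names themselves are only illustrative):

  $rules->agent('MOMspider/1.0');    # stored as the short name "MOMspider"
  $rules->agent('MOMspider/2.0');    # same short name, cached rules are kept
  $rules->agent('OtherSpider/1.0');  # new short name, cached rules are dropped
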
=back

=head1 ROBOTS.TXT

The format and semantics of the "/robots.txt" file are as follows
(this is an edited abstract of
<http://www.robotstxt.org/wc/norobots.html>):

The file consists of one or more records separated by one or more
blank lines. Each record contains lines of the form

  <field-name>: <value>

The field name is case insensitive. Text after the '#' character on a
line is ignored during parsing. This is used for comments. The
following <field-names> can be used:

=over 4

=item User-Agent

The value of this field is the name of the robot the record is
describing access policy for. If more than one I<User-Agent> field is
present the record describes an identical access policy for more than
one robot. At least one field needs to be present per record. If the
value is '*', the record describes the default access policy for any
robot that has not matched any of the other records.

The I<User-Agent> fields must occur before the I<Disallow> fields. If a
record contains a I<User-Agent> field after a I<Disallow> field, that
constitutes a malformed record. This parser will assume that a blank
line should have been placed before that I<User-Agent> field, and will
break the record into two. All the fields before the I<User-Agent> field
will constitute a record, and the I<User-Agent> field will be the first
field in a new record.

=item Disallow

The value of this field specifies a partial URL that is not to be
visited. This can be a full path, or a partial path; any URL that
starts with this value will not be retrieved.

=back

Unrecognized records are ignored.

=head1 ROBOTS.TXT EXAMPLES

The following example "/robots.txt" file specifies that no robots
should visit any URL starting with "/cyberworld/map/" or "/tmp/":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  Disallow: /tmp/ # these will soon disappear

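Fed to this module, the file above would be interpreted roughly like this
(a sketch; the host name is made up and C<$robots_txt> holds the text above):

  $rules->parse("http://cyber.example.com/robots.txt", $robots_txt);
  $rules->allowed("http://cyber.example.com/cyberworld/map/austria");  # 0
  $rules->allowed("http://cyber.example.com/cyberworld/index.html");   # 1
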
This example "/robots.txt" file specifies that no robots should visit
any URL starting with "/cyberworld/map/", except the robot called
"cybermapper":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space

  # Cybermapper knows where to go.
  User-agent: cybermapper
  Disallow:

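Since only the rules that matched the agent name at parse() time are kept,
a robot has to parse the file again after changing its name (a sketch; the
host name is made up):

  $rules->agent('cybermapper/1.0');
  $rules->parse("http://cyber.example.com/robots.txt", $robots_txt);
  $rules->allowed("http://cyber.example.com/cyberworld/map/austria");  # 1

  $rules->agent('MOMspider/1.0');   # different name, cached rules are cleared
  $rules->parse("http://cyber.example.com/robots.txt", $robots_txt);
  $rules->allowed("http://cyber.example.com/cyberworld/map/austria");  # 0
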
This example indicates that no robots should visit this site further:

  User-agent: *
  Disallow: /

This is an example of a malformed robots.txt file.

  # robots.txt for ancientcastle.example.com
  # I've locked myself away.
  User-agent: *
  Disallow: /
  # The castle is your home now, so you can go anywhere you like.
  User-agent: Belle
  Disallow: /west-wing/ # except the west wing!
  # It's good to be the Prince...

This file is missing the required blank lines between records.
However, the intention is clear.

=head1 SEE ALSO

L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>