path: root/Lib/robotparser.py
Diffstat (limited to 'Lib/robotparser.py')
-rw-r--r--  Lib/robotparser.py  140
1 file changed, 30 insertions(+), 110 deletions(-)
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index cf0a58d6ee..726854b49f 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -9,15 +9,11 @@
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
-import urlparse,urllib
+import urlparse
+import urllib
__all__ = ["RobotFileParser"]
-debug = 0
-
-def _debug(msg):
- if debug: print msg
-
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
@@ -59,26 +55,22 @@ class RobotFileParser:
"""Reads the robots.txt URL and feeds it to the parser."""
opener = URLopener()
f = opener.open(self.url)
- lines = []
- line = f.readline()
- while line:
- lines.append(line.strip())
- line = f.readline()
+ lines = [line.strip() for line in f]
+ f.close()
self.errcode = opener.errcode
- if self.errcode == 401 or self.errcode == 403:
+ if self.errcode in (401, 403):
self.disallow_all = True
- _debug("disallow all")
elif self.errcode >= 400:
self.allow_all = True
- _debug("allow all")
elif self.errcode == 200 and lines:
- _debug("parse lines")
self.parse(lines)
def _add_entry(self, entry):
if "*" in entry.useragents:
# the default entry is considered last
- self.default_entry = entry
+ if self.default_entry is None:
+ # the first default entry wins
+ self.default_entry = entry
else:
self.entries.append(entry)
@@ -86,26 +78,27 @@ class RobotFileParser:
"""parse the input lines from a robots.txt file.
We allow that a user-agent: line is not preceded by
one or more blank lines."""
+ # states:
+ # 0: start state
+ # 1: saw user-agent line
+ # 2: saw an allow or disallow line
state = 0
linenumber = 0
entry = Entry()
for line in lines:
- linenumber = linenumber + 1
+ linenumber += 1
if not line:
- if state==1:
- _debug("line %d: warning: you should insert"
- " allow: or disallow: directives below any"
- " user-agent: line" % linenumber)
+ if state == 1:
entry = Entry()
state = 0
- elif state==2:
+ elif state == 2:
self._add_entry(entry)
entry = Entry()
state = 0
# remove optional comment and strip line
i = line.find('#')
- if i>=0:
+ if i >= 0:
line = line[:i]
line = line.strip()
if not line:
@@ -115,41 +108,25 @@ class RobotFileParser:
line[0] = line[0].strip().lower()
line[1] = urllib.unquote(line[1].strip())
if line[0] == "user-agent":
- if state==2:
- _debug("line %d: warning: you should insert a blank"
- " line before any user-agent"
- " directive" % linenumber)
+ if state == 2:
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
state = 1
elif line[0] == "disallow":
- if state==0:
- _debug("line %d: error: you must insert a user-agent:"
- " directive before this line" % linenumber)
- else:
+ if state != 0:
entry.rulelines.append(RuleLine(line[1], False))
state = 2
elif line[0] == "allow":
- if state==0:
- _debug("line %d: error: you must insert a user-agent:"
- " directive before this line" % linenumber)
- else:
+ if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
- else:
- _debug("line %d: warning: unknown key %s" % (linenumber,
- line[0]))
- else:
- _debug("line %d: error: malformed line %s"%(linenumber, line))
- if state==2:
- self.entries.append(entry)
- _debug("Parsed rules:\n%s" % str(self))
+ state = 2
+ if state == 2:
+ self._add_entry(entry)
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
- _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
- (useragent, url))
if self.disallow_all:
return False
if self.allow_all:
@@ -168,10 +145,7 @@ class RobotFileParser:
def __str__(self):
- ret = ""
- for entry in self.entries:
- ret = ret + str(entry) + "\n"
- return ret
+ return ''.join([str(entry) + "\n" for entry in self.entries])
class RuleLine:
@@ -185,10 +159,10 @@ class RuleLine:
self.allowance = allowance
def applies_to(self, filename):
- return self.path=="*" or filename.startswith(self.path)
+ return self.path == "*" or filename.startswith(self.path)
def __str__(self):
- return (self.allowance and "Allow" or "Disallow")+": "+self.path
+ return (self.allowance and "Allow" or "Disallow") + ": " + self.path
class Entry:
@@ -198,19 +172,19 @@ class Entry:
self.rulelines = []
def __str__(self):
- ret = ""
+ ret = []
for agent in self.useragents:
- ret = ret + "User-agent: "+agent+"\n"
+ ret.extend(["User-agent: ", agent, "\n"])
for line in self.rulelines:
- ret = ret + str(line) + "\n"
- return ret
+ ret.extend([str(line), "\n"])
+ return ''.join(ret)
def applies_to(self, useragent):
"""check if this entry applies to the specified agent"""
# split the name token and make it lower case
useragent = useragent.split("/")[0].lower()
for agent in self.useragents:
- if agent=='*':
+ if agent == '*':
# we have the catch-all agent
return True
agent = agent.lower()
@@ -223,7 +197,6 @@ class Entry:
- our agent applies to this entry
- filename is URL decoded"""
for line in self.rulelines:
- _debug((filename, str(line), line.allowance))
if line.applies_to(filename):
return line.allowance
return True
@@ -242,56 +215,3 @@ class URLopener(urllib.FancyURLopener):
self.errcode = errcode
return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
errmsg, headers)
-
-def _check(a,b):
- if not b:
- ac = "access denied"
- else:
- ac = "access allowed"
- if a!=b:
- print "failed"
- else:
- print "ok (%s)" % ac
- print
-
-def _test():
- global debug
- rp = RobotFileParser()
- debug = 1
-
- # robots.txt that exists, gotten to by redirection
- rp.set_url('http://www.musi-cal.com/robots.txt')
- rp.read()
-
- # test for re.escape
- _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
- # this should match the first rule, which is a disallow
- _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
- # various cherry pickers
- _check(rp.can_fetch('CherryPickerSE',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- _check(rp.can_fetch('CherryPickerSE/1.0',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- _check(rp.can_fetch('CherryPickerSE/1.5',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- # case sensitivity
- _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
- _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
- # substring test
- _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
- # tests for catch-all * agent
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-
- # robots.txt that does not exist
- rp.set_url('http://www.lycos.com/robots.txt')
- rp.read()
- _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
-
-if __name__ == '__main__':
- _test()
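
For reference, a minimal usage sketch of the parser this patch touches, run entirely offline by feeding parse() a list of lines instead of calling read(). The host example.com, the rules, and the agent names below are illustrative only and are not taken from the patch:

    import robotparser

    lines = [
        "User-agent: CherryPickerSE",
        "Disallow: /cgi-bin/",
        "",
        "User-agent: *",
        "Disallow: /search",
    ]

    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.com/robots.txt')  # informational only; read() is never called
    rp.parse(lines)

    # The CherryPickerSE entry matches on the name token before the '/'.
    print rp.can_fetch('CherryPickerSE/1.0', 'http://example.com/cgi-bin/event-search')  # False
    # No named entry matches 'spam', so the catch-all '*' entry decides.
    print rp.can_fetch('spam', 'http://example.com/search')  # False
    print rp.can_fetch('spam', 'http://example.com/')        # True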