path: root/Lib/robotparser.py
Diffstat (limited to 'Lib/robotparser.py')
-rw-r--r--  Lib/robotparser.py  140
1 file changed, 30 insertions(+), 110 deletions(-)
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index cf0a58d6ee..726854b49f 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -9,15 +9,11 @@
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
-import urlparse,urllib
+import urlparse
+import urllib
__all__ = ["RobotFileParser"]
-debug = 0
-
-def _debug(msg):
- if debug: print msg
-
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
@@ -59,26 +55,22 @@ class RobotFileParser:
"""Reads the robots.txt URL and feeds it to the parser."""
opener = URLopener()
f = opener.open(self.url)
- lines = []
- line = f.readline()
- while line:
- lines.append(line.strip())
- line = f.readline()
+ lines = [line.strip() for line in f]
+ f.close()
self.errcode = opener.errcode
- if self.errcode == 401 or self.errcode == 403:
+ if self.errcode in (401, 403):
self.disallow_all = True
- _debug("disallow all")
elif self.errcode >= 400:
self.allow_all = True
- _debug("allow all")
elif self.errcode == 200 and lines:
- _debug("parse lines")
self.parse(lines)
def _add_entry(self, entry):
if "*" in entry.useragents:
# the default entry is considered last
- self.default_entry = entry
+ if self.default_entry is None:
+ # the first default entry wins
+ self.default_entry = entry
else:
self.entries.append(entry)
@@ -86,26 +78,27 @@ class RobotFileParser:
"""parse the input lines from a robots.txt file.
We allow that a user-agent: line is not preceded by
one or more blank lines."""
+ # states:
+ # 0: start state
+ # 1: saw user-agent line
+ # 2: saw an allow or disallow line
state = 0
linenumber = 0
entry = Entry()
for line in lines:
- linenumber = linenumber + 1
+ linenumber += 1
if not line:
- if state==1:
- _debug("line %d: warning: you should insert"
- " allow: or disallow: directives below any"
- " user-agent: line" % linenumber)
+ if state == 1:
entry = Entry()
state = 0
- elif state==2:
+ elif state == 2:
self._add_entry(entry)
entry = Entry()
state = 0
# remove optional comment and strip line
i = line.find('#')
- if i>=0:
+ if i >= 0:
line = line[:i]
line = line.strip()
if not line:
@@ -115,41 +108,25 @@ class RobotFileParser:
line[0] = line[0].strip().lower()
line[1] = urllib.unquote(line[1].strip())
if line[0] == "user-agent":
- if state==2:
- _debug("line %d: warning: you should insert a blank"
- " line before any user-agent"
- " directive" % linenumber)
+ if state == 2:
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
state = 1
elif line[0] == "disallow":
- if state==0:
- _debug("line %d: error: you must insert a user-agent:"
- " directive before this line" % linenumber)
- else:
+ if state != 0:
entry.rulelines.append(RuleLine(line[1], False))
state = 2
elif line[0] == "allow":
- if state==0:
- _debug("line %d: error: you must insert a user-agent:"
- " directive before this line" % linenumber)
- else:
+ if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
- else:
- _debug("line %d: warning: unknown key %s" % (linenumber,
- line[0]))
- else:
- _debug("line %d: error: malformed line %s"%(linenumber, line))
- if state==2:
- self.entries.append(entry)
- _debug("Parsed rules:\n%s" % str(self))
+ state = 2
+ if state == 2:
+ self._add_entry(entry)
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
- _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
- (useragent, url))
if self.disallow_all:
return False
if self.allow_all:
@@ -168,10 +145,7 @@ class RobotFileParser:
def __str__(self):
- ret = ""
- for entry in self.entries:
- ret = ret + str(entry) + "\n"
- return ret
+ return ''.join([str(entry) + "\n" for entry in self.entries])
class RuleLine:
@@ -185,10 +159,10 @@ class RuleLine:
self.allowance = allowance
def applies_to(self, filename):
- return self.path=="*" or filename.startswith(self.path)
+ return self.path == "*" or filename.startswith(self.path)
def __str__(self):
- return (self.allowance and "Allow" or "Disallow")+": "+self.path
+ return (self.allowance and "Allow" or "Disallow") + ": " + self.path
class Entry:
@@ -198,19 +172,19 @@ class Entry:
self.rulelines = []
def __str__(self):
- ret = ""
+ ret = []
for agent in self.useragents:
- ret = ret + "User-agent: "+agent+"\n"
+ ret.extend(["User-agent: ", agent, "\n"])
for line in self.rulelines:
- ret = ret + str(line) + "\n"
- return ret
+ ret.extend([str(line), "\n"])
+ return ''.join(ret)
def applies_to(self, useragent):
"""check if this entry applies to the specified agent"""
# split the name token and make it lower case
useragent = useragent.split("/")[0].lower()
for agent in self.useragents:
- if agent=='*':
+ if agent == '*':
# we have the catch-all agent
return True
agent = agent.lower()
@@ -223,7 +197,6 @@ class Entry:
- our agent applies to this entry
- filename is URL decoded"""
for line in self.rulelines:
- _debug((filename, str(line), line.allowance))
if line.applies_to(filename):
return line.allowance
return True
@@ -242,56 +215,3 @@ class URLopener(urllib.FancyURLopener):
self.errcode = errcode
return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
errmsg, headers)
-
-def _check(a,b):
- if not b:
- ac = "access denied"
- else:
- ac = "access allowed"
- if a!=b:
- print "failed"
- else:
- print "ok (%s)" % ac
- print
-
-def _test():
- global debug
- rp = RobotFileParser()
- debug = 1
-
- # robots.txt that exists, gotten to by redirection
- rp.set_url('http://www.musi-cal.com/robots.txt')
- rp.read()
-
- # test for re.escape
- _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
- # this should match the first rule, which is a disallow
- _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
- # various cherry pickers
- _check(rp.can_fetch('CherryPickerSE',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- _check(rp.can_fetch('CherryPickerSE/1.0',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- _check(rp.can_fetch('CherryPickerSE/1.5',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- # case sensitivity
- _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
- _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
- # substring test
- _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
- # tests for catch-all * agent
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-
- # robots.txt that does not exist
- rp.set_url('http://www.lycos.com/robots.txt')
- rp.read()
- _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
-
-if __name__ == '__main__':
- _test()
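
For reference, a minimal usage sketch of the parser this patch touches, run entirely offline by feeding parse() a list of lines instead of calling read(). The host example.com, the rules, and the agent names below are illustrative only and are not taken from the patch:

    import robotparser

    lines = [
        "User-agent: CherryPickerSE",
        "Disallow: /cgi-bin/",
        "",
        "User-agent: *",
        "Disallow: /search",
    ]

    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.com/robots.txt')  # informational only; read() is never called
    rp.parse(lines)

    # The CherryPickerSE entry matches on the name token before the '/'.
    print rp.can_fetch('CherryPickerSE/1.0', 'http://example.com/cgi-bin/event-search')  # False
    # No named entry matches 'spam', so the catch-all '*' entry decides.
    print rp.can_fetch('spam', 'http://example.com/search')  # False
    print rp.can_fetch('spam', 'http://example.com/')        # True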