diff options
Diffstat (limited to 'Lib/robotparser.py')
| -rw-r--r-- | Lib/robotparser.py | 123 | 
1 files changed, 89 insertions, 34 deletions
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index d627c9af28..ff25dfe498 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -39,28 +39,19 @@ class RobotFileParser:
         self.host, self.path = urlparse.urlparse(url)[1:3]
 
     def read(self):
-        import httplib
-        tries = 0
-        while tries<5:
-            connection = httplib.HTTP(self.host)
-            connection.putrequest("GET", self.path)
-            connection.putheader("Host", self.host)
-            connection.endheaders()
-            status, text, mime = connection.getreply()
-            if status in [301,302] and mime:
-                tries = tries + 1
-                newurl = mime.get("Location", mime.get("Uri", ""))
-                newurl = urlparse.urljoin(self.url, newurl)
-                self.set_url(newurl)
-            else:
-                break
-        if status==401 or status==403:
+        opener = URLopener()
+        f = opener.open(self.url)
+        lines = f.readlines()
+        self.errcode = opener.errcode
+        if self.errcode == 401 or self.errcode == 403:
             self.disallow_all = 1
-        elif status>=400:
+            _debug("disallow all")
+        elif self.errcode >= 400:
             self.allow_all = 1
-        else:
-            # status < 400
-            self.parse(connection.getfile().readlines())
+            _debug("allow all")
+        elif self.errcode == 200 and lines:
+            _debug("parse lines")
+            self.parse(lines)
 
     def parse(self, lines):
         """parse the input lines from a robot.txt file.
@@ -129,15 +120,15 @@ class RobotFileParser:
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
-        _debug("Checking robot.txt allowance for\n%s\n%s" % (useragent, url))
+        _debug("Checking robot.txt allowance for:\n  user agent: %s\n  url: %s" %
+               (useragent, url))
         if self.disallow_all:
             return 0
         if self.allow_all:
             return 1
         # search for given user agent matches
         # the first match counts
-        useragent = useragent.lower()
-        url = urllib.quote(urlparse.urlparse(url)[2])
+        url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
@@ -181,11 +172,16 @@ class Entry:
         return ret
 
     def applies_to(self, useragent):
-        "check if this entry applies to the specified agent"
+        """check if this entry applies to the specified agent"""
+        # split the name token and make it lower case
+        useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent=="*":
+            if agent=='*':
+                # we have the catch-all agent
                 return 1
-            if re.match(agent, useragent):
+            agent = agent.lower()
+            # don't forget to re.escape
+            if re.search(re.escape(useragent), agent):
                 return 1
         return 0
 
@@ -194,25 +190,84 @@ class Entry:
         - our agent applies to this entry
         - filename is URL decoded"""
         for line in self.rulelines:
+            _debug((filename, str(line), line.allowance))
             if line.applies_to(filename):
                 return line.allowance
         return 1
 
+class URLopener(urllib.FancyURLopener):
+    def __init__(self, *args):
+        apply(urllib.FancyURLopener.__init__, (self,) + args)
+        self.errcode = 200
+        self.tries = 0
+        self.maxtries = 10
+        
+    def http_error_default(self, url, fp, errcode, errmsg, headers):
+        self.errcode = errcode
+        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
+                                                        errmsg, headers)
+
+    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
+        self.tries += 1
+        if self.tries >= self.maxtries:
+            return self.http_error_default(url, fp, 500,
+                                           "Internal Server Error: Redirect Recursion",
+                                           headers)
+        result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
+                                                      errmsg, headers, data)
+        self.tries = 0
+        return result
+
+def _check(a,b):
+    if not b:
+        ac = "access denied"
+    else:
+        ac = "access allowed"
+    if a!=b:
+        print "failed"
+    else:
+        print "ok (%s)" % ac
+    print
 
 def _test():
     global debug
     import sys
     rp = RobotFileParser()
     debug = 1
-    if len(sys.argv) <= 1:
-        rp.set_url('http://www.musi-cal.com/robots.txt')
-        rp.read()
-    else:
-        rp.parse(open(sys.argv[1]).readlines())
-    print rp.can_fetch('*', 'http://www.musi-cal.com/')
-    print rp.can_fetch('Musi-Cal-Robot/1.0',
+
+    # robots.txt that exists, gotten to by redirection
+    rp.set_url('http://www.musi-cal.com/robots.txt')
+    rp.read()
+
+    # test for re.escape
+    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
+    # this should match the first rule, which is a disallow
+    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
+    # various cherry pickers
+    _check(rp.can_fetch('CherryPickerSE',
+                       'http://www.musi-cal.com/cgi-bin/event-search'
+                       '?city=San+Francisco'), 0)
+    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco')
+                       '?city=San+Francisco'), 0)
+    _check(rp.can_fetch('CherryPickerSE/1.5',
+                       'http://www.musi-cal.com/cgi-bin/event-search'
+                       '?city=San+Francisco'), 0)
+    # case sensitivity
+    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
+    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
+    # substring test
+    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
+    # tests for catch-all * agent
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+
+    # robots.txt that does not exist
+    rp.set_url('http://www.lycos.com/robots.txt')
+    rp.read()
+    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
 
 if __name__ == '__main__':
     _test()
