diff options
| author | Berker Peksag <berker.peksag@gmail.com> | 2015-10-08 12:27:06 +0300 | 
|---|---|---|
| committer | Berker Peksag <berker.peksag@gmail.com> | 2015-10-08 12:27:06 +0300 | 
| commit | 960e848f0d32399824d61dce2ff5736bb596ed44 (patch) | |
| tree | 7274633785f0e6b8d0ce64375bdca6351b0d32db /Lib/urllib/robotparser.py | |
| parent | 2137dc15737e286e576d69ad10124973c3f9ba1e (diff) | |
| download | cpython-git-960e848f0d32399824d61dce2ff5736bb596ed44.tar.gz | |
Issue #16099: RobotFileParser now supports Crawl-delay and Request-rate
extensions.
Patch by Nikolay Bogoychev.
Diffstat (limited to 'Lib/urllib/robotparser.py')
| -rw-r--r-- | Lib/urllib/robotparser.py | 39 | 
1 file changed, 37 insertions, 2 deletions
```diff
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 4fbb0cb995..4ac553af20 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -10,7 +10,9 @@
     http://www.robotstxt.org/norobots-rfc.txt
 """
 
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
 
 __all__ = ["RobotFileParser"]
 
@@ -120,10 +122,29 @@ class RobotFileParser:
                     if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
                         state = 2
+                elif line[0] == "crawl-delay":
+                    if state != 0:
+                        # before trying to convert to int we need to make
+                        # sure that robots.txt has valid syntax otherwise
+                        # it will crash
+                        if line[1].strip().isdigit():
+                            entry.delay = int(line[1])
+                        state = 2
+                elif line[0] == "request-rate":
+                    if state != 0:
+                        numbers = line[1].split('/')
+                        # check if all values are sane
+                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
+                            and numbers[1].strip().isdigit()):
+                            req_rate = collections.namedtuple('req_rate',
+                                                              'requests seconds')
+                            entry.req_rate = req_rate
+                            entry.req_rate.requests = int(numbers[0])
+                            entry.req_rate.seconds = int(numbers[1])
+                        state = 2
         if state == 2:
             self._add_entry(entry)
 
-
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
         if self.disallow_all:
@@ -153,6 +174,18 @@ class RobotFileParser:
         # agent not found ==> access granted
         return True
 
+    def crawl_delay(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.delay
+        return None
+
+    def request_rate(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.req_rate
+        return None
+
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
 
@@ -180,6 +213,8 @@ class Entry:
     def __init__(self):
         self.useragents = []
         self.rulelines = []
+        self.delay = None
+        self.req_rate = None
 
     def __str__(self):
         ret = []
```
