diff options
| author | Georg Brandl <georg@python.org> | 2011-01-04 11:27:42 +0100 |
|---|---|---|
| committer | Georg Brandl <georg@python.org> | 2011-01-04 11:27:42 +0100 |
| commit | 51852c0e8732085bc3f933667a3394320c1e98f2 (patch) | |
| tree | 52eecb2e8dfe9f0186eb64903dc32165d50b9c0c | |
| parent | 3642b521ed5a8c85faf4b37eb1bc83d6cfa63716 (diff) | |
| download | sphinx-git-51852c0e8732085bc3f933667a3394320c1e98f2.tar.gz | |
#472: linkcheck builder: Check links in parallel, use HTTP HEAD requests and allow configuring the timeout.
New config values: :confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | CHANGES | 4 |
| -rw-r--r-- | doc/config.rst | 15 |
| -rw-r--r-- | sphinx/builders/linkcheck.py | 181 |
| -rw-r--r-- | sphinx/config.py | 2 |
4 files changed, 127 insertions, 75 deletions
@@ -24,6 +24,10 @@ Release 1.1 (in development) * #443: Allow referencing external graphviz files. +* #472: linkcheck builder: Check links in parallel, use HTTP HEAD + requests and allow configuring the timeout. New config values: + :confval:`linkcheck_timeout` and :confval:`linkcheck_workers`. + * #221: Add Swedish locale. * Added ``inline`` option to graphviz directives, and fixed the diff --git a/doc/config.rst b/doc/config.rst index 65601d8cf..992a714ef 100644 --- a/doc/config.rst +++ b/doc/config.rst @@ -1132,6 +1132,21 @@ Options for the linkcheck builder .. versionadded:: 1.1 +.. confval:: linkcheck_timeout + + A timeout value, in seconds, for the linkcheck builder. **Only works in + Python 2.6 and higher.** The default is to use Python's global socket + timeout. + + .. versionadded:: 1.1 + +.. confval:: linkcheck_workers + + The number of worker threads to use when checking links. Default is 5 + threads. + + .. versionadded:: 1.1 + .. rubric:: Footnotes diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index dd87d70d8..e40919861 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -10,9 +10,12 @@ """ import re +import sys +import Queue import socket +import threading from os import path -from urllib2 import build_opener, HTTPError +from urllib2 import build_opener, Request from docutils import nodes @@ -24,6 +27,12 @@ opener = build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] +class HeadRequest(Request): + """Subclass of urllib2.Request that sends a HEAD request.""" + def get_method(self): + return 'HEAD' + + class CheckExternalLinksBuilder(Builder): """ Checks for broken external links. 
@@ -40,6 +49,83 @@ class CheckExternalLinksBuilder(Builder): # create output file open(path.join(self.outdir, 'output.txt'), 'w').close() + # create queues and worker threads + self.wqueue = Queue.Queue() + self.rqueue = Queue.Queue() + self.workers = [] + for i in range(self.app.config.linkcheck_workers): + thread = threading.Thread(target=self.check_thread) + thread.setDaemon(True) + thread.start() + self.workers.append(thread) + + def check_thread(self): + kwargs = {} + if sys.version_info > (2, 5) and self.app.config.linkcheck_timeout: + kwargs['timeout'] = self.app.config.linkcheck_timeout + + def check(): + # check for various conditions without bothering the network + if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:': + return 'unchecked', '' + elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'): + return 'local', '' + elif uri in self.good: + return 'working', '' + elif uri in self.broken: + return 'broken', self.broken[uri] + elif uri in self.redirected: + return 'redirected', self.redirected[uri] + for rex in self.to_ignore: + if rex.match(uri): + return 'ignored', '' + + # need to actually check the URI + try: + f = opener.open(HeadRequest(uri), **kwargs) + f.close() + except Exception, err: + self.broken[uri] = str(err) + return 'broken', str(err) + if f.url.rstrip('/') == uri.rstrip('/'): + self.good.add(uri) + return 'working', 'new' + else: + self.redirected[uri] = f.url + return 'redirected', f.url + + while True: + uri, docname, lineno = self.wqueue.get() + if uri is None: + break + status, info = check() + self.rqueue.put((uri, docname, lineno, status, info)) + + def process_result(self, result): + uri, docname, lineno, status, info = result + if status == 'unchecked': + return + if status == 'working' and info != 'new': + return + if lineno: + self.info('(line %3d) ' % lineno, nonl=1) + if status == 'ignored': + self.info(uri + ' - ' + darkgray('ignored')) + elif status == 'local': + self.info(uri + ' - ' + darkgray('local')) + 
self.write_entry('local', docname, lineno, uri) + elif status == 'working': + self.info(uri + ' - ' + darkgreen('working')) + elif status == 'broken': + self.info(uri + ' - ' + red('broken: ') + info) + self.write_entry('broken', docname, lineno, uri + ': ' + info) + if self.app.quiet: + self.warn('broken link: %s' % uri, + '%s:%s' % (self.env.doc2path(docname), lineno)) + elif status == 'redirected': + self.info(uri + ' - ' + purple('redirected') + ' to ' + info) + self.write_entry('redirected', docname, lineno, uri + ' to ' + info) + def get_target_uri(self, docname, typ=None): return '' @@ -51,65 +137,25 @@ class CheckExternalLinksBuilder(Builder): def write_doc(self, docname, doctree): self.info() + n = 0 for node in doctree.traverse(nodes.reference): - try: - self.check(node, docname) - except KeyError: + if 'refuri' not in node: continue - - def check(self, node, docname): - uri = node['refuri'] - - if '#' in uri: - uri = uri.split('#')[0] - - if uri in self.good: - return - - lineno = None - while lineno is None: - node = node.parent - if node is None: - break - lineno = node.line - - if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:': - return - - if lineno: - self.info('(line %3d) ' % lineno, nonl=1) - for rex in self.to_ignore: - if rex.match(uri): - self.info(uri + ' - ' + darkgray('ignored')) - return - if uri[0:5] == 'http:' or uri[0:6] == 'https:': - self.info(uri, nonl=1) - - if uri in self.broken: - (r, s) = self.broken[uri] - elif uri in self.redirected: - (r, s) = self.redirected[uri] - else: - (r, s) = self.resolve(uri) - - if r == 0: - self.info(' - ' + darkgreen('working')) - self.good.add(uri) - elif r == 2: - self.info(' - ' + red('broken: ') + s) - self.write_entry('broken', docname, lineno, uri + ': ' + s) - self.broken[uri] = (r, s) - if self.app.quiet: - self.warn('broken link: %s' % uri, - '%s:%s' % (self.env.doc2path(docname), lineno)) - else: - self.info(' - ' + purple('redirected') + ' to ' + s) - 
self.write_entry('redirected', docname, - lineno, uri + ' to ' + s) - self.redirected[uri] = (r, s) - else: - self.info(uri + ' - ' + darkgray('local')) - self.write_entry('local', docname, lineno, uri) + uri = node['refuri'] + if '#' in uri: + uri = uri.split('#')[0] + lineno = None + while lineno is None: + node = node.parent + if node is None: + break + lineno = node.line + self.wqueue.put((uri, docname, lineno), False) + n += 1 + done = 0 + while done < n: + self.process_result(self.rqueue.get()) + done += 1 if self.broken: self.app.statuscode = 1 @@ -120,21 +166,6 @@ class CheckExternalLinksBuilder(Builder): line, what, uri)) output.close() - def resolve(self, uri): - try: - f = opener.open(uri) - f.close() - except HTTPError, err: - #if err.code == 403 and uri.startswith('http://en.wikipedia.org/'): - # # Wikipedia blocks requests from urllib User-Agent - # return (0, 0) - return (2, str(err)) - except Exception, err: - return (2, str(err)) - if f.url.rstrip('/') == uri.rstrip('/'): - return (0, 0) - else: - return (1, f.url) - def finish(self): - return + for worker in self.workers: + self.wqueue.put((None, None, None), False) diff --git a/sphinx/config.py b/sphinx/config.py index 922af803a..b461c94bd 100644 --- a/sphinx/config.py +++ b/sphinx/config.py @@ -168,6 +168,8 @@ class Config(object): # linkcheck options linkcheck_ignore = ([], None), + linkcheck_timeout = (None, None), + linkcheck_workers = (5, None), ) def __init__(self, dirname, filename, overrides, tags): |
