summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorg Brandl <georg@python.org>2011-01-04 11:27:42 +0100
committerGeorg Brandl <georg@python.org>2011-01-04 11:27:42 +0100
commit51852c0e8732085bc3f933667a3394320c1e98f2 (patch)
tree52eecb2e8dfe9f0186eb64903dc32165d50b9c0c
parent3642b521ed5a8c85faf4b37eb1bc83d6cfa63716 (diff)
downloadsphinx-git-51852c0e8732085bc3f933667a3394320c1e98f2.tar.gz
#472: linkcheck builder: Check links in parallel, use HTTP HEAD requests and allow configuring the timeout.
New config values: :confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
-rw-r--r--CHANGES4
-rw-r--r--doc/config.rst15
-rw-r--r--sphinx/builders/linkcheck.py181
-rw-r--r--sphinx/config.py2
4 files changed, 127 insertions, 75 deletions
diff --git a/CHANGES b/CHANGES
index 174998567..3fe98cfc1 100644
--- a/CHANGES
+++ b/CHANGES
@@ -24,6 +24,10 @@ Release 1.1 (in development)
* #443: Allow referencing external graphviz files.
+* #472: linkcheck builder: Check links in parallel, use HTTP HEAD
+ requests and allow configuring the timeout. New config values:
+ :confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
+
* #221: Add Swedish locale.
* Added ``inline`` option to graphviz directives, and fixed the
diff --git a/doc/config.rst b/doc/config.rst
index 65601d8cf..992a714ef 100644
--- a/doc/config.rst
+++ b/doc/config.rst
@@ -1132,6 +1132,21 @@ Options for the linkcheck builder
.. versionadded:: 1.1
+.. confval:: linkcheck_timeout
+
+ A timeout value, in seconds, for the linkcheck builder. **Only works in
+ Python 2.6 and higher.** The default is to use Python's global socket
+ timeout.
+
+ .. versionadded:: 1.1
+
+.. confval:: linkcheck_workers
+
+ The number of worker threads to use when checking links. Default is 5
+ threads.
+
+ .. versionadded:: 1.1
+
.. rubric:: Footnotes
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index dd87d70d8..e40919861 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -10,9 +10,12 @@
"""
import re
+import sys
+import Queue
import socket
+import threading
from os import path
-from urllib2 import build_opener, HTTPError
+from urllib2 import build_opener, Request
from docutils import nodes
@@ -24,6 +27,12 @@ opener = build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
+class HeadRequest(Request):
+ """Subclass of urllib2.Request that sends a HEAD request."""
+ def get_method(self):
+ return 'HEAD'
+
+
class CheckExternalLinksBuilder(Builder):
"""
Checks for broken external links.
@@ -40,6 +49,83 @@ class CheckExternalLinksBuilder(Builder):
# create output file
open(path.join(self.outdir, 'output.txt'), 'w').close()
+ # create queues and worker threads
+ self.wqueue = Queue.Queue()
+ self.rqueue = Queue.Queue()
+ self.workers = []
+ for i in range(self.app.config.linkcheck_workers):
+ thread = threading.Thread(target=self.check_thread)
+ thread.setDaemon(True)
+ thread.start()
+ self.workers.append(thread)
+
+ def check_thread(self):
+ kwargs = {}
+ if sys.version_info > (2, 5) and self.app.config.linkcheck_timeout:
+ kwargs['timeout'] = self.app.config.linkcheck_timeout
+
+ def check():
+ # check for various conditions without bothering the network
+ if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
+ return 'unchecked', ''
+ elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'):
+ return 'local', ''
+ elif uri in self.good:
+ return 'working', ''
+ elif uri in self.broken:
+ return 'broken', self.broken[uri]
+ elif uri in self.redirected:
+ return 'redirected', self.redirected[uri]
+ for rex in self.to_ignore:
+ if rex.match(uri):
+ return 'ignored', ''
+
+ # need to actually check the URI
+ try:
+ f = opener.open(HeadRequest(uri), **kwargs)
+ f.close()
+ except Exception, err:
+ self.broken[uri] = str(err)
+ return 'broken', str(err)
+ if f.url.rstrip('/') == uri.rstrip('/'):
+ self.good.add(uri)
+ return 'working', 'new'
+ else:
+ self.redirected[uri] = f.url
+ return 'redirected', f.url
+
+ while True:
+ uri, docname, lineno = self.wqueue.get()
+ if uri is None:
+ break
+ status, info = check()
+ self.rqueue.put((uri, docname, lineno, status, info))
+
+ def process_result(self, result):
+ uri, docname, lineno, status, info = result
+ if status == 'unchecked':
+ return
+ if status == 'working' and info != 'new':
+ return
+ if lineno:
+ self.info('(line %3d) ' % lineno, nonl=1)
+ if status == 'ignored':
+ self.info(uri + ' - ' + darkgray('ignored'))
+ elif status == 'local':
+ self.info(uri + ' - ' + darkgray('local'))
+ self.write_entry('local', docname, lineno, uri)
+ elif status == 'working':
+ self.info(uri + ' - ' + darkgreen('working'))
+ elif status == 'broken':
+ self.info(uri + ' - ' + red('broken: ') + info)
+ self.write_entry('broken', docname, lineno, uri + ': ' + info)
+ if self.app.quiet:
+ self.warn('broken link: %s' % uri,
+ '%s:%s' % (self.env.doc2path(docname), lineno))
+ elif status == 'redirected':
+ self.info(uri + ' - ' + purple('redirected') + ' to ' + info)
+ self.write_entry('redirected', docname, lineno, uri + ' to ' + info)
+
def get_target_uri(self, docname, typ=None):
return ''
@@ -51,65 +137,25 @@ class CheckExternalLinksBuilder(Builder):
def write_doc(self, docname, doctree):
self.info()
+ n = 0
for node in doctree.traverse(nodes.reference):
- try:
- self.check(node, docname)
- except KeyError:
+ if 'refuri' not in node:
continue
-
- def check(self, node, docname):
- uri = node['refuri']
-
- if '#' in uri:
- uri = uri.split('#')[0]
-
- if uri in self.good:
- return
-
- lineno = None
- while lineno is None:
- node = node.parent
- if node is None:
- break
- lineno = node.line
-
- if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
- return
-
- if lineno:
- self.info('(line %3d) ' % lineno, nonl=1)
- for rex in self.to_ignore:
- if rex.match(uri):
- self.info(uri + ' - ' + darkgray('ignored'))
- return
- if uri[0:5] == 'http:' or uri[0:6] == 'https:':
- self.info(uri, nonl=1)
-
- if uri in self.broken:
- (r, s) = self.broken[uri]
- elif uri in self.redirected:
- (r, s) = self.redirected[uri]
- else:
- (r, s) = self.resolve(uri)
-
- if r == 0:
- self.info(' - ' + darkgreen('working'))
- self.good.add(uri)
- elif r == 2:
- self.info(' - ' + red('broken: ') + s)
- self.write_entry('broken', docname, lineno, uri + ': ' + s)
- self.broken[uri] = (r, s)
- if self.app.quiet:
- self.warn('broken link: %s' % uri,
- '%s:%s' % (self.env.doc2path(docname), lineno))
- else:
- self.info(' - ' + purple('redirected') + ' to ' + s)
- self.write_entry('redirected', docname,
- lineno, uri + ' to ' + s)
- self.redirected[uri] = (r, s)
- else:
- self.info(uri + ' - ' + darkgray('local'))
- self.write_entry('local', docname, lineno, uri)
+ uri = node['refuri']
+ if '#' in uri:
+ uri = uri.split('#')[0]
+ lineno = None
+ while lineno is None:
+ node = node.parent
+ if node is None:
+ break
+ lineno = node.line
+ self.wqueue.put((uri, docname, lineno), False)
+ n += 1
+ done = 0
+ while done < n:
+ self.process_result(self.rqueue.get())
+ done += 1
if self.broken:
self.app.statuscode = 1
@@ -120,21 +166,6 @@ class CheckExternalLinksBuilder(Builder):
line, what, uri))
output.close()
- def resolve(self, uri):
- try:
- f = opener.open(uri)
- f.close()
- except HTTPError, err:
- #if err.code == 403 and uri.startswith('http://en.wikipedia.org/'):
- # # Wikipedia blocks requests from urllib User-Agent
- # return (0, 0)
- return (2, str(err))
- except Exception, err:
- return (2, str(err))
- if f.url.rstrip('/') == uri.rstrip('/'):
- return (0, 0)
- else:
- return (1, f.url)
-
def finish(self):
- return
+ for worker in self.workers:
+ self.wqueue.put((None, None, None), False)
diff --git a/sphinx/config.py b/sphinx/config.py
index 922af803a..b461c94bd 100644
--- a/sphinx/config.py
+++ b/sphinx/config.py
@@ -168,6 +168,8 @@ class Config(object):
# linkcheck options
linkcheck_ignore = ([], None),
+ linkcheck_timeout = (None, None),
+ linkcheck_workers = (5, None),
)
def __init__(self, dirname, filename, overrides, tags):