summaryrefslogtreecommitdiff
path: root/sphinx/builders/linkcheck.py
diff options
context:
space:
mode:
Diffstat (limited to 'sphinx/builders/linkcheck.py')
-rw-r--r--sphinx/builders/linkcheck.py80
1 files changed, 32 insertions, 48 deletions
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index 7c42e7506..95892dbaf 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
"""
sphinx.builders.linkcheck
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -9,26 +8,16 @@
:license: BSD, see LICENSE for details.
"""
-import codecs
+import queue
import re
import socket
import threading
+from html.parser import HTMLParser
from os import path
+from urllib.parse import unquote
from docutils import nodes
from requests.exceptions import HTTPError
-from six.moves import queue, html_parser
-from six.moves.urllib.parse import unquote
-
-# 2015-06-25 barry@python.org. This exception was deprecated in Python 3.3 and
-# removed in Python 3.5, however for backward compatibility reasons, we're not
-# going to just remove it. If it doesn't exist, define an exception that will
-# never be caught but leaves the code in check_anchor() intact.
-try:
- from six.moves.html_parser import HTMLParseError # type: ignore
-except ImportError:
- class HTMLParseError(Exception): # type: ignore
- pass
from sphinx.builders import Builder
from sphinx.locale import __
@@ -36,6 +25,7 @@ from sphinx.util import encode_uri, requests, logging
from sphinx.util.console import ( # type: ignore
purple, red, darkgreen, darkgray, darkred, turquoise
)
+from sphinx.util.nodes import traverse_parent
from sphinx.util.requests import is_ssl_error
if False:
@@ -48,12 +38,12 @@ if False:
logger = logging.getLogger(__name__)
-class AnchorCheckParser(html_parser.HTMLParser):
+class AnchorCheckParser(HTMLParser):
"""Specialized HTML parser that looks for a specific anchor."""
def __init__(self, search_anchor):
- # type: (unicode) -> None
- html_parser.HTMLParser.__init__(self)
+ # type: (str) -> None
+ super().__init__()
self.search_anchor = search_anchor
self.found = False
@@ -67,23 +57,18 @@ class AnchorCheckParser(html_parser.HTMLParser):
def check_anchor(response, anchor):
- # type: (Response, unicode) -> bool
+ # type: (Response, str) -> bool
"""Reads HTML data from a response object `response` searching for `anchor`.
Returns True if anchor was found, False otherwise.
"""
parser = AnchorCheckParser(anchor)
- try:
- # Read file in chunks. If we find a matching anchor, we break
- # the loop early in hopes not to have to download the whole thing.
- for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
- parser.feed(chunk)
- if parser.found:
- break
- parser.close()
- except HTMLParseError:
- # HTMLParser is usually pretty good with sloppy HTML, but it tends to
- # choke on EOF. But we're done then anyway.
- pass
+ # Read file in chunks. If we find a matching anchor, we break
+ # the loop early in hopes not to have to download the whole thing.
+ for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
+ parser.feed(chunk)
+ if parser.found:
+ break
+ parser.close()
return parser.found
@@ -100,9 +85,9 @@ class CheckExternalLinksBuilder(Builder):
self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore]
self.anchors_ignore = [re.compile(x)
for x in self.app.config.linkcheck_anchors_ignore]
- self.good = set() # type: Set[unicode]
- self.broken = {} # type: Dict[unicode, unicode]
- self.redirected = {} # type: Dict[unicode, Tuple[unicode, int]]
+ self.good = set() # type: Set[str]
+ self.broken = {} # type: Dict[str, str]
+ self.redirected = {} # type: Dict[str, Tuple[str, int]]
# set a timeout for non-responding servers
socket.setdefaulttimeout(5.0)
# create output file
@@ -130,7 +115,7 @@ class CheckExternalLinksBuilder(Builder):
kwargs['timeout'] = self.app.config.linkcheck_timeout
def check_uri():
- # type: () -> Tuple[unicode, unicode, int]
+ # type: () -> Tuple[str, str, int]
# split off anchor
if '#' in uri:
req_url, anchor = uri.split('#', 1)
@@ -194,7 +179,7 @@ class CheckExternalLinksBuilder(Builder):
return 'redirected', new_url, 0
def check():
- # type: () -> Tuple[unicode, unicode, int]
+ # type: () -> Tuple[str, str, int]
# check for various conditions without bothering the network
if len(uri) == 0 or uri.startswith(('#', 'mailto:', 'ftp:')):
return 'unchecked', '', 0
@@ -233,14 +218,14 @@ class CheckExternalLinksBuilder(Builder):
self.rqueue.put((uri, docname, lineno, status, info, code))
def process_result(self, result):
- # type: (Tuple[unicode, unicode, int, unicode, unicode, int]) -> None
+ # type: (Tuple[str, str, int, str, str, int]) -> None
uri, docname, lineno, status, info, code = result
if status == 'unchecked':
return
if status == 'working' and info == 'old':
return
if lineno:
- logger.info('(line %4d) ', lineno, nonl=1)
+ logger.info('(line %4d) ', lineno, nonl=True)
if status == 'ignored':
if info:
logger.info(darkgray('-ignored- ') + uri + ': ' + info)
@@ -271,19 +256,19 @@ class CheckExternalLinksBuilder(Builder):
logger.info(color('redirect ') + uri + color(' - ' + text + ' to ' + info))
def get_target_uri(self, docname, typ=None):
- # type: (unicode, unicode) -> unicode
+ # type: (str, str) -> str
return ''
def get_outdated_docs(self):
- # type: () -> Set[unicode]
+ # type: () -> Set[str]
return self.env.found_docs
def prepare_writing(self, docnames):
- # type: (nodes.Node) -> None
+ # type: (Set[str]) -> None
return
def write_doc(self, docname, doctree):
- # type: (unicode, nodes.Node) -> None
+ # type: (str, nodes.Node) -> None
logger.info('')
n = 0
for node in doctree.traverse(nodes.reference):
@@ -291,11 +276,10 @@ class CheckExternalLinksBuilder(Builder):
continue
uri = node['refuri']
lineno = None
- while lineno is None:
- node = node.parent
- if node is None:
+ for parent in traverse_parent(node):
+ if parent.line:
+ lineno = parent.line
break
- lineno = node.line
self.wqueue.put((uri, docname, lineno), False)
n += 1
done = 0
@@ -307,8 +291,8 @@ class CheckExternalLinksBuilder(Builder):
self.app.statuscode = 1
def write_entry(self, what, docname, line, uri):
- # type: (unicode, unicode, int, unicode) -> None
- with codecs.open(path.join(self.outdir, 'output.txt'), 'a', 'utf-8') as output: # type: ignore # NOQA
+ # type: (str, str, int, str) -> None
+ with open(path.join(self.outdir, 'output.txt'), 'a', encoding='utf-8') as output:
output.write("%s:%s: [%s] %s\n" % (self.env.doc2path(docname, None),
line, what, uri))
@@ -319,7 +303,7 @@ class CheckExternalLinksBuilder(Builder):
def setup(app):
- # type: (Sphinx) -> Dict[unicode, Any]
+ # type: (Sphinx) -> Dict[str, Any]
app.add_builder(CheckExternalLinksBuilder)
app.add_config_value('linkcheck_ignore', [], None)