diff options
Diffstat (limited to 'sphinx/builders/linkcheck.py')
| -rw-r--r-- | sphinx/builders/linkcheck.py | 80 |
1 files changed, 32 insertions, 48 deletions
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 7c42e7506..95892dbaf 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ sphinx.builders.linkcheck ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -9,26 +8,16 @@ :license: BSD, see LICENSE for details. """ -import codecs +import queue import re import socket import threading +from html.parser import HTMLParser from os import path +from urllib.parse import unquote from docutils import nodes from requests.exceptions import HTTPError -from six.moves import queue, html_parser -from six.moves.urllib.parse import unquote - -# 2015-06-25 barry@python.org. This exception was deprecated in Python 3.3 and -# removed in Python 3.5, however for backward compatibility reasons, we're not -# going to just remove it. If it doesn't exist, define an exception that will -# never be caught but leaves the code in check_anchor() intact. -try: - from six.moves.html_parser import HTMLParseError # type: ignore -except ImportError: - class HTMLParseError(Exception): # type: ignore - pass from sphinx.builders import Builder from sphinx.locale import __ @@ -36,6 +25,7 @@ from sphinx.util import encode_uri, requests, logging from sphinx.util.console import ( # type: ignore purple, red, darkgreen, darkgray, darkred, turquoise ) +from sphinx.util.nodes import traverse_parent from sphinx.util.requests import is_ssl_error if False: @@ -48,12 +38,12 @@ if False: logger = logging.getLogger(__name__) -class AnchorCheckParser(html_parser.HTMLParser): +class AnchorCheckParser(HTMLParser): """Specialized HTML parser that looks for a specific anchor.""" def __init__(self, search_anchor): - # type: (unicode) -> None - html_parser.HTMLParser.__init__(self) + # type: (str) -> None + super().__init__() self.search_anchor = search_anchor self.found = False @@ -67,23 +57,18 @@ class AnchorCheckParser(html_parser.HTMLParser): def check_anchor(response, anchor): - # type: (Response, unicode) -> bool + # type: (Response, str) -> bool """Reads HTML data from a response object `response` searching for `anchor`. Returns True if anchor was found, False otherwise. """ parser = AnchorCheckParser(anchor) - try: - # Read file in chunks. If we find a matching anchor, we break - # the loop early in hopes not to have to download the whole thing. - for chunk in response.iter_content(chunk_size=4096, decode_unicode=True): - parser.feed(chunk) - if parser.found: - break - parser.close() - except HTMLParseError: - # HTMLParser is usually pretty good with sloppy HTML, but it tends to - # choke on EOF. But we're done then anyway. - pass + # Read file in chunks. If we find a matching anchor, we break + # the loop early in hopes not to have to download the whole thing. + for chunk in response.iter_content(chunk_size=4096, decode_unicode=True): + parser.feed(chunk) + if parser.found: + break + parser.close() return parser.found @@ -100,9 +85,9 @@ class CheckExternalLinksBuilder(Builder): self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore] self.anchors_ignore = [re.compile(x) for x in self.app.config.linkcheck_anchors_ignore] - self.good = set() # type: Set[unicode] - self.broken = {} # type: Dict[unicode, unicode] - self.redirected = {} # type: Dict[unicode, Tuple[unicode, int]] + self.good = set() # type: Set[str] + self.broken = {} # type: Dict[str, str] + self.redirected = {} # type: Dict[str, Tuple[str, int]] # set a timeout for non-responding servers socket.setdefaulttimeout(5.0) # create output file @@ -130,7 +115,7 @@ class CheckExternalLinksBuilder(Builder): kwargs['timeout'] = self.app.config.linkcheck_timeout def check_uri(): - # type: () -> Tuple[unicode, unicode, int] + # type: () -> Tuple[str, str, int] # split off anchor if '#' in uri: req_url, anchor = uri.split('#', 1) @@ -194,7 +179,7 @@ class CheckExternalLinksBuilder(Builder): return 'redirected', new_url, 0 def check(): - # type: () -> Tuple[unicode, unicode, int] + # type: () -> Tuple[str, str, int] # check for various conditions without bothering the network if len(uri) == 0 or uri.startswith(('#', 'mailto:', 'ftp:')): return 'unchecked', '', 0 @@ -233,14 +218,14 @@ class CheckExternalLinksBuilder(Builder): self.rqueue.put((uri, docname, lineno, status, info, code)) def process_result(self, result): - # type: (Tuple[unicode, unicode, int, unicode, unicode, int]) -> None + # type: (Tuple[str, str, int, str, str, int]) -> None uri, docname, lineno, status, info, code = result if status == 'unchecked': return if status == 'working' and info == 'old': return if lineno: - logger.info('(line %4d) ', lineno, nonl=1) + logger.info('(line %4d) ', lineno, nonl=True) if status == 'ignored': if info: logger.info(darkgray('-ignored- ') + uri + ': ' + info) @@ -271,19 +256,19 @@ class CheckExternalLinksBuilder(Builder): logger.info(color('redirect ') + uri + color(' - ' + text + ' to ' + info)) def get_target_uri(self, docname, typ=None): - # type: (unicode, unicode) -> unicode + # type: (str, str) -> str return '' def get_outdated_docs(self): - # type: () -> Set[unicode] + # type: () -> Set[str] return self.env.found_docs def prepare_writing(self, docnames): - # type: (nodes.Node) -> None + # type: (Set[str]) -> None return def write_doc(self, docname, doctree): - # type: (unicode, nodes.Node) -> None + # type: (str, nodes.Node) -> None logger.info('') n = 0 for node in doctree.traverse(nodes.reference): @@ -291,11 +276,10 @@ class CheckExternalLinksBuilder(Builder): continue uri = node['refuri'] lineno = None - while lineno is None: - node = node.parent - if node is None: + for parent in traverse_parent(node): + if parent.line: + lineno = parent.line break - lineno = node.line self.wqueue.put((uri, docname, lineno), False) n += 1 done = 0 @@ -307,8 +291,8 @@ class CheckExternalLinksBuilder(Builder): self.app.statuscode = 1 def write_entry(self, what, docname, line, uri): - # type: (unicode, unicode, int, unicode) -> None - with codecs.open(path.join(self.outdir, 'output.txt'), 'a', 'utf-8') as output: # type: ignore # NOQA + # type: (str, str, int, str) -> None + with open(path.join(self.outdir, 'output.txt'), 'a', encoding='utf-8') as output: output.write("%s:%s: [%s] %s\n" % (self.env.doc2path(docname, None), line, what, uri)) @@ -319,7 +303,7 @@ class CheckExternalLinksBuilder(Builder): def setup(app): - # type: (Sphinx) -> Dict[unicode, Any] + # type: (Sphinx) -> Dict[str, Any] app.add_builder(CheckExternalLinksBuilder) app.add_config_value('linkcheck_ignore', [], None) |
