1 files changed, 178 insertions, 68 deletions
diff --git a/sphinx/io.py b/sphinx/io.py
index 8365e22e0..2d584de91 100644
--- a/sphinx/io.py
+++ b/sphinx/io.py
@@ -8,10 +8,15 @@
     :copyright: Copyright 2007-2018 by the Sphinx team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
-from docutils.io import FileInput
+import re
+import codecs
+
+from docutils.io import FileInput, NullOutput
+from docutils.core import Publisher
 from docutils.readers import standalone
+from docutils.statemachine import StringList, string2lines
 from docutils.writers import UnfilteredWriter
-from six import string_types, text_type, iteritems
+from six import text_type
 from typing import Any, Union  # NOQA
 
 from sphinx.transforms import (
@@ -24,7 +29,7 @@ from sphinx.transforms.compact_bullet_list import RefOnlyBulletListTransform
 from sphinx.transforms.i18n import (
     PreserveTranslatableMessages, Locale, RemoveTranslatableInline,
 )
-from sphinx.util import import_object, split_docinfo
+from sphinx.util import logging
 from sphinx.util.docutils import LoggingReporter
 
 if False:
@@ -38,41 +43,18 @@ if False:
     from sphinx.builders import Builder  # NOQA
     from sphinx.environment import BuildEnvironment  # NOQA
 
+docinfo_re = re.compile(':\\w+:.*?')
+
+
+logger = logging.getLogger(__name__)
+
 
 class SphinxBaseReader(standalone.Reader):
     """
-    Add our source parsers
-    """
-    def __init__(self, app, parsers={}, *args, **kwargs):
-        # type: (Sphinx, Dict[unicode, Parser], Any, Any) -> None
-        standalone.Reader.__init__(self, *args, **kwargs)
-        self.parser_map = {}  # type: Dict[unicode, Parser]
-        for suffix, parser_class in parsers.items():
-            if isinstance(parser_class, string_types):
-                parser_class = import_object(parser_class, 'source parser')  # type: ignore
-            parser = parser_class()
-            if hasattr(parser, 'set_application'):
-                parser.set_application(app)
-            self.parser_map[suffix] = parser
-
-    def read(self, source, parser, settings):
-        # type: (Input, Parser, Dict) -> nodes.document
-        self.source = source
-
-        for suffix in self.parser_map:
-            if source.source_path.endswith(suffix):
-                self.parser = self.parser_map[suffix]
-                break
-        else:
-            # use special parser for unknown file-extension '*' (if exists)
-            self.parser = self.parser_map.get('*')
+    A base class of readers for Sphinx.
 
-        if not self.parser:
-            self.parser = parser
-        self.settings = settings
-        self.input = self.source.read()
-        self.parse()
-        return self.document
+    This replaces reporter by Sphinx's on generating document.
+    """
 
     def get_transforms(self):
         # type: () -> List[Transform]
@@ -80,50 +62,59 @@ class SphinxBaseReader(standalone.Reader):
 
     def new_document(self):
         # type: () -> nodes.document
+        """Creates a new document object which having a special reporter object good
+        for logging.
+        """
         document = standalone.Reader.new_document(self)
         reporter = document.reporter
-        document.reporter = LoggingReporter(reporter.source, reporter.report_level,
-                                            reporter.halt_level, reporter.debug_flag,
-                                            reporter.error_handler)
+        document.reporter = LoggingReporter.from_reporter(reporter)
+        document.reporter.set_source(self.source)
         return document
 
 
 class SphinxStandaloneReader(SphinxBaseReader):
     """
-    Add our own transforms.
+    A basic document reader for Sphinx.
     """
     transforms = [ApplySourceWorkaround, ExtraTranslatableNodes, PreserveTranslatableMessages,
                   Locale, CitationReferences, DefaultSubstitutions, MoveModuleTargets,
                   HandleCodeBlocks, AutoNumbering, AutoIndexUpgrader, SortIds,
                   RemoveTranslatableInline, PreserveTranslatableMessages, FilterSystemMessages,
-                  RefOnlyBulletListTransform, UnreferencedFootnotesDetector]
+                  RefOnlyBulletListTransform, UnreferencedFootnotesDetector
+                  ]  # type: List[Transform]
+
+    def __init__(self, app, *args, **kwargs):
+        # type: (Sphinx, Any, Any) -> None
+        self.transforms = self.transforms + app.registry.get_transforms()
+        SphinxBaseReader.__init__(self, *args, **kwargs)  # type: ignore
 
 
 class SphinxI18nReader(SphinxBaseReader):
     """
-    Replacer for document.reporter.get_source_and_line method.
+    A document reader for i18n.
 
-    reST text lines for translation do not have the original source line number.
-    This class provides the correct line numbers when reporting.
+    This returns the source line number of original text as current source line number
+    to let users know where the error happened.
+    Because the translated texts are partial and they don't have correct line numbers.
     """
 
+    lineno = None  # type: int
     transforms = [ApplySourceWorkaround, ExtraTranslatableNodes, CitationReferences,
                   DefaultSubstitutions, MoveModuleTargets, HandleCodeBlocks,
                   AutoNumbering, SortIds, RemoveTranslatableInline,
                   FilterSystemMessages, RefOnlyBulletListTransform,
                   UnreferencedFootnotesDetector]
 
-    def __init__(self, *args, **kwargs):
-        # type: (Any, Any) -> None
-        SphinxBaseReader.__init__(self, *args, **kwargs)
-        self.lineno = None  # type: int
-
     def set_lineno_for_reporter(self, lineno):
         # type: (int) -> None
+        """Stores the source line number of original text."""
         self.lineno = lineno
 
     def new_document(self):
         # type: () -> nodes.document
+        """Creates a new document object which having a special reporter object for
+        translation.
+        """
         document = SphinxBaseReader.new_document(self)
         reporter = document.reporter
 
@@ -136,6 +127,8 @@ class SphinxI18nReader(SphinxBaseReader):
 
 
 class SphinxDummyWriter(UnfilteredWriter):
+    """Dummy writer module used for generating doctree."""
+
     supported = ('html',)  # needed to keep "meta" nodes
 
     def translate(self):
@@ -143,11 +136,26 @@ class SphinxDummyWriter(UnfilteredWriter):
         pass
 
 
-class SphinxFileInput(FileInput):
+def SphinxDummySourceClass(source, *args, **kwargs):
+    """Bypass source object as is to cheat Publisher."""
+    return source
+
+
+class SphinxBaseFileInput(FileInput):
+    """A base class of SphinxFileInput.
+
+    It supports to replace unknown Unicode characters to '?'. And it also emits
+    Sphinx events ``source-read`` on reading.
+    """
+
     def __init__(self, app, env, *args, **kwds):
         # type: (Sphinx, BuildEnvironment, Any, Any) -> None
         self.app = app
         self.env = env
+
+        # set up error handler
+        codecs.register_error('sphinx', self.warn_and_replace)  # type: ignore
+
         kwds['error_handler'] = 'sphinx'  # py3: handle error on open.
         FileInput.__init__(self, *args, **kwds)
 
@@ -159,25 +167,127 @@ class SphinxFileInput(FileInput):
 
     def read(self):
         # type: () -> unicode
-        def get_parser_type(source_path):
-            # type: (unicode) -> Tuple[unicode]
-            for suffix, parser_class in iteritems(self.app.registry.get_source_parsers()):
-                if source_path.endswith(suffix):
-                    if isinstance(parser_class, string_types):
-                        parser_class = import_object(parser_class, 'source parser')  # type: ignore  # NOQA
-                    return parser_class.supported
-            else:
-                return ('restructuredtext',)
+        """Reads the contents from file.
 
+        After reading, it emits Sphinx event ``source-read``.
+        """
         data = FileInput.read(self)
-        if self.app:
-            arg = [data]
-            self.app.emit('source-read', self.env.docname, arg)
-            data = arg[0]
-        docinfo, data = split_docinfo(data)
-        if 'restructuredtext' in get_parser_type(self.source_path):
-            if self.env.config.rst_epilog:
-                data = data + '\n' + self.env.config.rst_epilog + '\n'
-            if self.env.config.rst_prolog:
-                data = self.env.config.rst_prolog + '\n' + data
-        return docinfo + data
+
+        # emit source-read event
+        arg = [data]
+        self.app.emit('source-read', self.env.docname, arg)
+        return arg[0]
+
+    def warn_and_replace(self, error):
+        # type: (Any) -> Tuple
+        """Custom decoding error handler that warns and replaces."""
+        linestart = error.object.rfind(b'\n', 0, error.start)
+        lineend = error.object.find(b'\n', error.start)
+        if lineend == -1:
+            lineend = len(error.object)
+        lineno = error.object.count(b'\n', 0, error.start) + 1
+        logger.warning('undecodable source characters, replacing with "?": %r',
+                       (error.object[linestart + 1:error.start] + b'>>>' +
+                        error.object[error.start:error.end] + b'<<<' +
+                        error.object[error.end:lineend]),
+                       location=(self.env.docname, lineno))
+        return (u'?', error.end)
+
+
+class SphinxFileInput(SphinxBaseFileInput):
+    """A basic FileInput for Sphinx."""
+    pass
+
+
+class SphinxRSTFileInput(SphinxBaseFileInput):
+    """A reST FileInput for Sphinx.
+
+    This FileInput automatically prepends and appends text by :confval:`rst_prolog` and
+    :confval:`rst_epilog`.
+
+    .. important::
+
+       This FileInput uses an instance of ``StringList`` as a return value of ``read()``
+       method to indicate original source filename and line numbers after prepending and
+       appending.
+       For that reason, ``sphinx.parsers.RSTParser`` should be used with this to parse
+       a content correctly.
+    """
+
+    def prepend_prolog(self, text, prolog):
+        # type: (StringList, unicode) -> None
+        docinfo = self.count_docinfo_lines(text)
+        if docinfo:
+            # insert a blank line after docinfo
+            text.insert(docinfo, '', '<generated>', 0)
+            docinfo += 1
+
+        # insert prolog (after docinfo if exists)
+        for lineno, line in enumerate(prolog.splitlines()):
+            text.insert(docinfo + lineno, line, '<rst_prolog>', lineno)
+
+        text.insert(docinfo + lineno + 1, '', '<generated>', 0)
+
+    def append_epilog(self, text, epilog):
+        # type: (StringList, unicode) -> None
+        # append a blank line and rst_epilog
+        text.append('', '<generated>', 0)
+        for lineno, line in enumerate(epilog.splitlines()):
+            text.append(line, '<rst_epilog>', lineno)
+
+    def read(self):
+        # type: () -> StringList
+        inputstring = SphinxBaseFileInput.read(self)
+        lines = string2lines(inputstring, convert_whitespace=True)
+        content = StringList()
+        for lineno, line in enumerate(lines):
+            content.append(line, self.source_path, lineno)
+
+        if self.env.config.rst_prolog:
+            self.prepend_prolog(content, self.env.config.rst_prolog)
+        if self.env.config.rst_epilog:
+            self.append_epilog(content, self.env.config.rst_epilog)
+
+        return content
+
+    def count_docinfo_lines(self, content):
+        # type: (StringList) -> int
+        if len(content) == 0:
+            return 0
+        else:
+            for lineno, line in enumerate(content.data):
+                if not docinfo_re.match(line):
+                    break
+            return lineno
+
+
+def read_doc(app, env, filename):
+    # type: (Sphinx, BuildEnvironment, unicode) -> nodes.document
+    """Parse a document and convert to doctree."""
+    input_class = app.registry.get_source_input(filename)
+    reader = SphinxStandaloneReader(app)
+    source = input_class(app, env, source=None, source_path=filename,
+                         encoding=env.config.source_encoding)
+    parser = app.registry.create_source_parser(app, filename)
+
+    pub = Publisher(reader=reader,
+                    parser=parser,
+                    writer=SphinxDummyWriter(),
+                    source_class=SphinxDummySourceClass,
+                    destination=NullOutput())
+    pub.set_components(None, 'restructuredtext', None)
+    pub.process_programmatic_settings(None, env.settings, None)
+    pub.set_source(source, filename)
+    pub.publish()
+    return pub.document
+
+
+def setup(app):
+    app.registry.add_source_input('*', SphinxFileInput)
+    app.registry.add_source_input('restructuredtext', SphinxRSTFileInput)
+
+    return {
+        'version': 'builtin',
+        'parallel_read_safe': True,
+        'parallel_write_safe': True,
+    }