Do not replace Unicode characters with LaTeX macros on LaTeX engines that support Unicode

author: Takeshi KOMIYA <i.tkomiya@gmail.com> 2019-10-20 22:29:46 +0900
committer: Takeshi KOMIYA <i.tkomiya@gmail.com> 2019-10-20 22:54:21 +0900
commit: 56bbb08b2cc4925e4796752ac798fe0001267a49 (patch)
tree: 60f7eaae59f1f99e750dd8c2ae38c74ff1e20e9d
parent: a499b7d83ed90ee5802dc5e39aceb56021d5addc (diff)
download: sphinx-git-56bbb08b2cc4925e4796752ac798fe0001267a49.tar.gz
7 files changed, 70 insertions, 10 deletions
diff --git a/CHANGES b/CHANGES
index 26ea1afb6..15b8368ac 100644
--- a/CHANGES
+++ b/CHANGES
@@ -28,6 +28,8 @@ Bugs fixed
 
   .. _latex3/latex2e#173: https://github.com/latex3/latex2e/issues/173
 * #6618: LaTeX: Avoid section names at the end of a page
+* #6738: LaTeX: Do not replace Unicode characters with LaTeX macros on LaTeX
+  engines that support Unicode
 * #6704: linkcheck: Be defensive and handle newly defined HTTP error code
 * #6655: image URLs containing ``data:`` causes gettext builder crashed
 * #6584: i18n: Error when compiling message catalogs on Hindi
diff --git a/sphinx/util/template.py b/sphinx/util/template.py
index fd8886944..3a43db9a5 100644
--- a/sphinx/util/template.py
+++ b/sphinx/util/template.py
@@ -63,14 +63,14 @@ class SphinxRenderer(FileRenderer):
 
 
 class LaTeXRenderer(SphinxRenderer):
-    def __init__(self, template_path: str = None) -> None:
+    def __init__(self, template_path: str = None, latex_engine: str = None) -> None:
         if template_path is None:
             template_path = os.path.join(package_dir, 'templates', 'latex')
         super().__init__(template_path)
 
         # use texescape as escape filter
-        self.env.filters['e'] = texescape.escape
-        self.env.filters['escape'] = texescape.escape
+        self.env.filters['e'] = texescape.get_escape_func(latex_engine)
+        self.env.filters['escape'] = texescape.get_escape_func(latex_engine)
         self.env.filters['eabbr'] = texescape.escape_abbr
 
         # use JSP/eRuby like tagging instead because curly bracket; the default
diff --git a/sphinx/util/texescape.py b/sphinx/util/texescape.py
index 408ec1253..4e7055119 100644
--- a/sphinx/util/texescape.py
+++ b/sphinx/util/texescape.py
@@ -9,7 +9,7 @@
 """
 
 import re
-from typing import Dict
+from typing import Callable, Dict
 
 tex_replacements = [
     # map TeX special chars
@@ -46,6 +46,14 @@ tex_replacements = [
     ('|', r'\textbar{}'),
     ('ℯ', r'e'),
     ('ⅈ', r'i'),
+    # Greek alphabet not escaped: pdflatex handles it via textalpha and inputenc
+    # OHM SIGN U+2126 is handled by LaTeX textcomp package
+]
+
+# A map of Unicode characters to their LaTeX representation
+# (for LaTeX engines which don't support Unicode)
+unicode_tex_replacements = [
+    # superscript
     ('⁰', r'\(\sp{\text{0}}\)'),
     ('¹', r'\(\sp{\text{1}}\)'),
     ('²', r'\(\sp{\text{2}}\)'),
@@ -56,6 +64,7 @@ tex_replacements = [
     ('⁷', r'\(\sp{\text{7}}\)'),
     ('⁸', r'\(\sp{\text{8}}\)'),
     ('⁹', r'\(\sp{\text{9}}\)'),
+    # subscript
     ('₀', r'\(\sb{\text{0}}\)'),
     ('₁', r'\(\sb{\text{1}}\)'),
     ('₂', r'\(\sb{\text{2}}\)'),
@@ -66,20 +75,32 @@ tex_replacements = [
     ('₇', r'\(\sb{\text{7}}\)'),
     ('₈', r'\(\sb{\text{8}}\)'),
     ('₉', r'\(\sb{\text{9}}\)'),
-    # Greek alphabet not escaped: pdflatex handles it via textalpha and inputenc
-    # OHM SIGN U+2126 is handled by LaTeX textcomp package
 ]
 
 tex_escape_map = {}  # type: Dict[int, str]
+tex_escape_map_without_unicode = {}  # type: Dict[int, str]
 tex_replace_map = {}
 tex_hl_escape_map_new = {}
 
 
+def get_escape_func(latex_engine: str) -> Callable[[str], str]:
+    """Get escape() function for given latex_engine."""
+    if latex_engine in ('lualatex', 'xelatex'):
+        return escape_for_unicode_latex_engine
+    else:
+        return escape
+
+
 def escape(s: str) -> str:
     """Escape text for LaTeX output."""
     return s.translate(tex_escape_map)
 
 
+def escape_for_unicode_latex_engine(s: str) -> str:
+    """Escape text for unicode supporting LaTeX engine."""
+    return s.translate(tex_escape_map_without_unicode)
+
+
 def escape_abbr(text: str) -> str:
     """Adjust spacing after abbreviations. Works with @ letter or other."""
     return re.sub(r'\.(?=\s|$)', r'.\@{}', text)
@@ -88,6 +109,11 @@ def escape_abbr(text: str) -> str:
 def init() -> None:
     for a, b in tex_replacements:
         tex_escape_map[ord(a)] = b
+        tex_escape_map_without_unicode[ord(a)] = b
+        tex_replace_map[ord(a)] = '_'
+
+    for a, b in unicode_tex_replacements:
+        tex_escape_map[ord(a)] = b
         tex_replace_map[ord(a)] = '_'
 
     for a, b in tex_replacements:
diff --git a/sphinx/writers/latex.py b/sphinx/writers/latex.py
index 0d2b2bc97..bb84bba7a 100644
--- a/sphinx/writers/latex.py
+++ b/sphinx/writers/latex.py
@@ -32,7 +32,7 @@ from sphinx.util import split_into, logging
 from sphinx.util.docutils import SphinxTranslator
 from sphinx.util.nodes import clean_astext, get_prev_node
 from sphinx.util.template import LaTeXRenderer
-from sphinx.util.texescape import escape, tex_replace_map
+from sphinx.util.texescape import get_escape_func, tex_replace_map
 
 try:
     from docutils.utils.roman import toRoman
@@ -500,7 +500,7 @@ class LaTeXTranslator(SphinxTranslator):
         self.first_param = 0
 
         # escape helper
-        self.escape = escape
+        self.escape = get_escape_func(self.config.latex_engine)
 
         # sort out some elements
         self.elements = self.builder.context.copy()
@@ -795,13 +795,14 @@ class LaTeXTranslator(SphinxTranslator):
 
     def render(self, template_name, variables):
         # type: (str, Dict) -> str
+        renderer = LaTeXRenderer(latex_engine=self.config.latex_engine)
         for template_dir in self.builder.config.templates_path:
             template = path.join(self.builder.confdir, template_dir,
                                  template_name)
             if path.exists(template):
-                return LaTeXRenderer().render(template, variables)
+                return renderer.render(template, variables)
 
-        return LaTeXRenderer().render(template_name, variables)
+        return renderer.render(template_name, variables)
 
     def visit_document(self, node):
         # type: (nodes.Element) -> None
diff --git a/tests/roots/test-latex-unicode/conf.py b/tests/roots/test-latex-unicode/conf.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tests/roots/test-latex-unicode/conf.py
diff --git a/tests/roots/test-latex-unicode/index.rst b/tests/roots/test-latex-unicode/index.rst
new file mode 100644
index 000000000..2abeca98f
--- /dev/null
+++ b/tests/roots/test-latex-unicode/index.rst
@@ -0,0 +1,7 @@
+test-latex-unicode
+==================
+
+* script small e: ℯ
+* double struck italic small i: ⅈ
+* superscript: ⁰, ¹
+* subscript: ₀, ₁
diff --git a/tests/test_build_latex.py b/tests/test_build_latex.py
index 8410bbd03..80869dea3 100644
--- a/tests/test_build_latex.py
+++ b/tests/test_build_latex.py
@@ -1437,3 +1437,27 @@ def test_index_on_title(app, status, warning):
             '\\label{\\detokenize{contents:test-for-index-in-top-level-title}}'
             '\\index{index@\\spxentry{index}}\n'
             in result)
+
+
+@pytest.mark.sphinx('latex', testroot='latex-unicode',
+                    confoverrides={'latex_engine': 'pdflatex'})
+def test_texescape_for_non_unicode_supported_engine(app, status, warning):
+    app.builder.build_all()
+    result = (app.outdir / 'python.tex').text()
+    print(result)
+    assert 'script small e: e' in result
+    assert 'double struck italic small i: i' in result
+    assert r'superscript: \(\sp{\text{0}}\), \(\sp{\text{1}}\)' in result
+    assert r'subscript: \(\sb{\text{0}}\), \(\sb{\text{1}}\)' in result
+
+
+@pytest.mark.sphinx('latex', testroot='latex-unicode',
+                    confoverrides={'latex_engine': 'xelatex'})
+def test_texescape_for_unicode_supported_engine(app, status, warning):
+    app.builder.build_all()
+    result = (app.outdir / 'python.tex').text()
+    print(result)
+    assert 'script small e: e' in result
+    assert 'double struck italic small i: i' in result
+    assert 'superscript: ⁰, ¹' in result
+    assert 'subscript: ₀, ₁' in result
author	Takeshi KOMIYA <i.tkomiya@gmail.com>	2019-10-20 22:29:46 +0900
committer	Takeshi KOMIYA <i.tkomiya@gmail.com>	2019-10-20 22:54:21 +0900
commit	56bbb08b2cc4925e4796752ac798fe0001267a49 (patch)
tree	60f7eaae59f1f99e750dd8c2ae38c74ff1e20e9d
parent	a499b7d83ed90ee5802dc5e39aceb56021d5addc (diff)
download	sphinx-git-56bbb08b2cc4925e4796752ac798fe0001267a49.tar.gz