Fix #3045: HTML search index creator should ignore "raw" content if not html

author: Takeshi KOMIYA <i.tkomiya@gmail.com> 2016-10-17 16:06:45 +0900
committer: Takeshi KOMIYA <i.tkomiya@gmail.com> 2016-10-17 16:08:37 +0900
commit: 53ea1cb2808e90b51f0ed9468740a34c00decc2a (patch)
tree: e442ad075ba6ba89ced0ce6e4ee7abc7e9cf4277
parent: 78d96b4abb1ac98efb96e73b79f70375f3496194 (diff)
download: sphinx-git-53ea1cb2808e90b51f0ed9468740a34c00decc2a.tar.gz
4 files changed, 27 insertions, 9 deletions
diff --git a/CHANGES b/CHANGES
index cdc428cd0..c5c06efb3 100644
--- a/CHANGES
+++ b/CHANGES
@@ -53,6 +53,7 @@ Bugs fixed
 * #3031: incompatibility with LaTeX package ``tocloft``
 * #3003: literal blocks in footnotes are not supported by Latex
 * #3047: spacing before footnote in pdf output is not coherent and allows breaks
+* #3045: HTML search index creator should ignore "raw" content if not html
 
 Testing
 --------
diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py
index 09430876b..d3c6c0eba 100644
--- a/sphinx/search/__init__.py
+++ b/sphinx/search/__init__.py
@@ -196,13 +196,14 @@ class WordCollector(NodeVisitor):
         if issubclass(nodetype, comment):
             raise SkipNode
         if issubclass(nodetype, raw):
-            # Some people might put content in raw HTML that should be searched,
-            # so we just amateurishly strip HTML tags and index the remaining
-            # content
-            nodetext = re.sub(r'(?is)<style.*?</style>', '', node.astext())
-            nodetext = re.sub(r'(?is)<script.*?</script>', '', nodetext)
-            nodetext = re.sub(r'<[^<]+?>', '', nodetext)
-            self.found_words.extend(self.lang.split(nodetext))
+            if 'html' in node.get('format', '').split():
+                # Some people might put content in raw HTML that should be searched,
+                # so we just amateurishly strip HTML tags and index the remaining
+                # content
+                nodetext = re.sub(r'(?is)<style.*?</style>', '', node.astext())
+                nodetext = re.sub(r'(?is)<script.*?</script>', '', nodetext)
+                nodetext = re.sub(r'<[^<]+?>', '', nodetext)
+                self.found_words.extend(self.lang.split(nodetext))
             raise SkipNode
         if issubclass(nodetype, Text):
             self.found_words.extend(self.lang.split(node.astext()))
diff --git a/tests/roots/test-search/index.rst b/tests/roots/test-search/index.rst
index 21fcdf53c..b593c6cad 100644
--- a/tests/roots/test-search/index.rst
+++ b/tests/roots/test-search/index.rst
@@ -17,4 +17,12 @@ textinheading
 
 .. toctree::
 
-   tocitem
-\ No newline at end of file
+   tocitem
+
+.. raw:: html
+
+   <span class="raw">rawword"</span>
+
+.. raw:: latex
+
+   latex_keyword
diff --git a/tests/test_search.py b/tests/test_search.py
index a363b30be..d4b8817de 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -114,4 +114,12 @@ def test_term_in_heading_and_section(app, status, warning):
     # both documents should be a hit in the search index as a title,
     # respectively text hit
     assert 'textinhead:1' in searchindex
-    assert 'textinhead:0' in searchindex
-\ No newline at end of file
+    assert 'textinhead:0' in searchindex
+
+
+@with_app(testroot='search')
+def test_term_in_raw_directive(app, status, warning):
+    searchindex = jsload(app.outdir / 'searchindex.js')
+    assert not is_registered_term(searchindex, 'raw')
+    assert is_registered_term(searchindex, 'rawword')
+    assert not is_registered_term(searchindex, 'latex_keyword')
author	Takeshi KOMIYA <i.tkomiya@gmail.com>	2016-10-17 16:06:45 +0900
committer	Takeshi KOMIYA <i.tkomiya@gmail.com>	2016-10-17 16:08:37 +0900
commit	53ea1cb2808e90b51f0ed9468740a34c00decc2a (patch)
tree	e442ad075ba6ba89ced0ce6e4ee7abc7e9cf4277
parent	78d96b4abb1ac98efb96e73b79f70375f3496194 (diff)
download	sphinx-git-53ea1cb2808e90b51f0ed9468740a34c00decc2a.tar.gz