summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Timotheus Kampik <timotheus.kampik@gmail.com> 2018-12-25 18:41:52 +0100
committer: GitHub <noreply@github.com> 2018-12-25 18:41:52 +0100
commit: 221614654fed1ce7e07e6bb4e242e8c60c4caa67 (patch)
tree: c98a3930e86b22913d1bfef1b4ad0c3029dc4e12
parent: 920aafaee60413f4e536f3168f3122779df01683 (diff)
download: sphinx-git-221614654fed1ce7e07e6bb4e242e8c60c4caa67.tar.gz
#5605 fix Chinese search index (#5611)
generate search index for Latin words correctly if search language is Chinese
-rw-r--r-- CHANGES 2
-rw-r--r-- sphinx/search/zh.py 18
-rw-r--r-- tests/roots/test-search/tocitem.rst 6
-rw-r--r-- tests/test_search.py 15
4 files changed, 39 insertions, 2 deletions
diff --git a/CHANGES b/CHANGES
index 867cdc950..d3f8b0a0e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -15,6 +15,8 @@ Features added
Bugs fixed
----------
+* #5605 If the documentation language is set to Chinese, English words could not
+ be searched.
Testing
--------
diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py
index 6c5b65d6b..3753bc990 100644
--- a/sphinx/search/zh.py
+++ b/sphinx/search/zh.py
@@ -233,7 +233,8 @@ class SearchChinese(SearchLanguage):
language_name = 'Chinese'
js_stemmer_code = js_porter_stemmer
stopwords = english_stopwords
- latin1_letters = re.compile(u'(?u)\\w+[\u0000-\u00ff]')
+ latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
+ latin_terms = [] # type: List[unicode]
def init(self, options):
# type: (Dict) -> None
@@ -250,7 +251,9 @@ class SearchChinese(SearchLanguage):
if JIEBA:
chinese = list(jieba.cut_for_search(input))
- latin1 = self.latin1_letters.findall(input)
+ latin1 = \
+ [term.strip() for term in self.latin1_letters.findall(input)]
+ self.latin_terms.extend(latin1)
return chinese + latin1
def word_filter(self, stemmed_word):
@@ -259,4 +262,15 @@ class SearchChinese(SearchLanguage):
def stem(self, word):
# type: (unicode) -> unicode
+
+ # Don't stem Latin words that are long enough to be relevant for search
+ # if not stemmed, but would be too short after being stemmed
+ # avoids some issues with acronyms
+ should_not_be_stemmed = (
+ word in self.latin_terms and
+ len(word) >= 3 and
+ len(self.stemmer.stem(word.lower())) < 3
+ )
+ if should_not_be_stemmed:
+ return word.lower()
return self.stemmer.stem(word.lower())
diff --git a/tests/roots/test-search/tocitem.rst b/tests/roots/test-search/tocitem.rst
index f082abf65..5d99f0a66 100644
--- a/tests/roots/test-search/tocitem.rst
+++ b/tests/roots/test-search/tocitem.rst
@@ -8,3 +8,9 @@ textinheading
=============
lorem ipsum
+
+可以查看 FAQ 模块中 Chinesetest 部分
+
+模块中 CAS service部分
+
+可以Chinesetesttwo查看
diff --git a/tests/test_search.py b/tests/test_search.py
index 886151831..4c7eb8b21 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -240,3 +240,18 @@ def test_IndexBuilder_lookup():
# zh_CN
index = IndexBuilder(env, 'zh_CN', {}, None)
assert index.lang.lang == 'zh'
+
+
+@pytest.mark.sphinx(
+ testroot='search',
+ confoverrides={'html_search_language': 'zh'},
+ srcdir='search_zh'
+)
+def test_search_index_gen_zh(app, status, warning):
+ app.builder.build_all()
+ # jsdump fails if search language is 'zh'; hence we just get the text:
+ searchindex = (app.outdir / 'searchindex.js').text()
+ assert 'chinesetest ' not in searchindex
+ assert 'chinesetest' in searchindex
+ assert 'chinesetesttwo' in searchindex
+ assert 'cas' in searchindex