5 files changed, 47 insertions, 222 deletions
diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py
index 5fa35ae16..c3e46ce22 100644
--- a/sphinx/search/__init__.py
+++ b/sphinx/search/__init__.py
@@ -11,7 +11,6 @@ from docutils.nodes import Element, Node
 
 from sphinx import addnodes, package_dir
 from sphinx.environment import BuildEnvironment
-from sphinx.search.jssplitter import splitter_code
 from sphinx.util import jsdump
 
 
@@ -48,7 +47,7 @@ class SearchLanguage:
     lang: str = None
     language_name: str = None
     stopwords: Set[str] = set()
-    js_splitter_code: str = None
+    js_splitter_code: str = ""
     js_stemmer_rawcode: str = None
     js_stemmer_code = """
 /**
@@ -262,7 +261,7 @@ class IndexBuilder:
                 self.js_scorer_code = fp.read().decode()
         else:
             self.js_scorer_code = ''
-        self.js_splitter_code = splitter_code
+        self.js_splitter_code = ""
 
     def load(self, stream: IO, format: Any) -> None:
         """Reconstruct from frozen data."""
diff --git a/sphinx/search/jssplitter.py b/sphinx/search/jssplitter.py
deleted file mode 100644
index 983d1bfc2..000000000
--- a/sphinx/search/jssplitter.py
+++ /dev/null
@@ -1,105 +0,0 @@
-"""Provides Python compatible word splitter to JavaScript
-
-DO NOT EDIT. This is generated by utils/jssplitter_generator.py
-"""
-
-splitter_code = """
-const splitChars = new Set(
-    [[0, 48], [58, 65], [91, 95], [96, 97], [123, 170], [171, 178], [180, 181], [182, 185],
-     [187, 188], [191, 192], [215, 216], [247, 248], [706, 710], [722, 736], [741, 748],
-     [749, 750], [751, 880], [885, 886], [888, 890], [894, 895], [896, 902], [903, 904],
-     [907, 908], [909, 910], [930, 931], [1014, 1015], [1154, 1162], [1328, 1329],
-     [1367, 1369], [1370, 1376], [1417, 1488], [1515, 1519], [1523, 1568], [1611, 1632],
-     [1642, 1646], [1648, 1649], [1748, 1749], [1750, 1765], [1767, 1774], [1789, 1791],
-     [1792, 1808], [1809, 1810], [1840, 1869], [1958, 1969], [1970, 1984], [2027, 2036],
-     [2038, 2042], [2043, 2048], [2070, 2074], [2075, 2084], [2085, 2088], [2089, 2112],
-     [2137, 2144], [2155, 2208], [2229, 2230], [2248, 2308], [2362, 2365], [2366, 2384],
-     [2385, 2392], [2402, 2406], [2416, 2417], [2433, 2437], [2445, 2447], [2449, 2451],
-     [2473, 2474], [2481, 2482], [2483, 2486], [2490, 2493], [2494, 2510], [2511, 2524],
-     [2526, 2527], [2530, 2534], [2546, 2548], [2554, 2556], [2557, 2565], [2571, 2575],
-     [2577, 2579], [2601, 2602], [2609, 2610], [2612, 2613], [2615, 2616], [2618, 2649],
-     [2653, 2654], [2655, 2662], [2672, 2674], [2677, 2693], [2702, 2703], [2706, 2707],
-     [2729, 2730], [2737, 2738], [2740, 2741], [2746, 2749], [2750, 2768], [2769, 2784],
-     [2786, 2790], [2800, 2809], [2810, 2821], [2829, 2831], [2833, 2835], [2857, 2858],
-     [2865, 2866], [2868, 2869], [2874, 2877], [2878, 2908], [2910, 2911], [2914, 2918],
-     [2928, 2929], [2936, 2947], [2948, 2949], [2955, 2958], [2961, 2962], [2966, 2969],
-     [2971, 2972], [2973, 2974], [2976, 2979], [2981, 2984], [2987, 2990], [3002, 3024],
-     [3025, 3046], [3059, 3077], [3085, 3086], [3089, 3090], [3113, 3114], [3130, 3133],
-     [3134, 3160], [3163, 3168], [3170, 3174], [3184, 3192], [3199, 3200], [3201, 3205],
-     [3213, 3214], [3217, 3218], [3241, 3242], [3252, 3253], [3258, 3261], [3262, 3294],
-     [3295, 3296], [3298, 3302], [3312, 3313], [3315, 3332], [3341, 3342], [3345, 3346],
-     [3387, 3389], [3390, 3406], [3407, 3412], [3415, 3416], [3426, 3430], [3449, 3450],
-     [3456, 3461], [3479, 3482], [3506, 3507], [3516, 3517], [3518, 3520], [3527, 3558],
-     [3568, 3585], [3633, 3634], [3636, 3648], [3655, 3664], [3674, 3713], [3715, 3716],
-     [3717, 3718], [3723, 3724], [3748, 3749], [3750, 3751], [3761, 3762], [3764, 3773],
-     [3774, 3776], [3781, 3782], [3783, 3792], [3802, 3804], [3808, 3840], [3841, 3872],
-     [3892, 3904], [3912, 3913], [3949, 3976], [3981, 4096], [4139, 4159], [4170, 4176],
-     [4182, 4186], [4190, 4193], [4194, 4197], [4199, 4206], [4209, 4213], [4226, 4238],
-     [4239, 4240], [4250, 4256], [4294, 4295], [4296, 4301], [4302, 4304], [4347, 4348],
-     [4681, 4682], [4686, 4688], [4695, 4696], [4697, 4698], [4702, 4704], [4745, 4746],
-     [4750, 4752], [4785, 4786], [4790, 4792], [4799, 4800], [4801, 4802], [4806, 4808],
-     [4823, 4824], [4881, 4882], [4886, 4888], [4955, 4969], [4989, 4992], [5008, 5024],
-     [5110, 5112], [5118, 5121], [5741, 5743], [5760, 5761], [5787, 5792], [5867, 5870],
-     [5881, 5888], [5901, 5902], [5906, 5920], [5938, 5952], [5970, 5984], [5997, 5998],
-     [6001, 6016], [6068, 6103], [6104, 6108], [6109, 6112], [6122, 6128], [6138, 6160],
-     [6170, 6176], [6265, 6272], [6277, 6279], [6313, 6314], [6315, 6320], [6390, 6400],
-     [6431, 6470], [6510, 6512], [6517, 6528], [6572, 6576], [6602, 6608], [6619, 6656],
-     [6679, 6688], [6741, 6784], [6794, 6800], [6810, 6823], [6824, 6917], [6964, 6981],
-     [6988, 6992], [7002, 7043], [7073, 7086], [7142, 7168], [7204, 7232], [7242, 7245],
-     [7294, 7296], [7305, 7312], [7355, 7357], [7360, 7401], [7405, 7406], [7412, 7413],
-     [7415, 7418], [7419, 7424], [7616, 7680], [7958, 7960], [7966, 7968], [8006, 8008],
-     [8014, 8016], [8024, 8025], [8026, 8027], [8028, 8029], [8030, 8031], [8062, 8064],
-     [8117, 8118], [8125, 8126], [8127, 8130], [8133, 8134], [8141, 8144], [8148, 8150],
-     [8156, 8160], [8173, 8178], [8181, 8182], [8189, 8304], [8306, 8308], [8314, 8319],
-     [8330, 8336], [8349, 8450], [8451, 8455], [8456, 8458], [8468, 8469], [8470, 8473],
-     [8478, 8484], [8485, 8486], [8487, 8488], [8489, 8490], [8494, 8495], [8506, 8508],
-     [8512, 8517], [8522, 8526], [8527, 8528], [8586, 9312], [9372, 9450], [9472, 10102],
-     [10132, 11264], [11311, 11312], [11359, 11360], [11493, 11499], [11503, 11506],
-     [11508, 11517], [11518, 11520], [11558, 11559], [11560, 11565], [11566, 11568],
-     [11624, 11631], [11632, 11648], [11671, 11680], [11687, 11688], [11695, 11696],
-     [11703, 11704], [11711, 11712], [11719, 11720], [11727, 11728], [11735, 11736],
-     [11743, 11823], [11824, 12293], [12296, 12321], [12330, 12337], [12342, 12344],
-     [12349, 12353], [12439, 12445], [12448, 12449], [12539, 12540], [12544, 12549],
-     [12592, 12593], [12687, 12690], [12694, 12704], [12736, 12784], [12800, 12832],
-     [12842, 12872], [12880, 12881], [12896, 12928], [12938, 12977], [12992, 13312],
-     [19904, 19968], [40957, 40960], [42125, 42192], [42238, 42240], [42509, 42512],
-     [42540, 42560], [42607, 42623], [42654, 42656], [42736, 42775], [42784, 42786],
-     [42889, 42891], [42944, 42946], [42955, 42997], [43010, 43011], [43014, 43015],
-     [43019, 43020], [43043, 43056], [43062, 43072], [43124, 43138], [43188, 43216],
-     [43226, 43250], [43256, 43259], [43260, 43261], [43263, 43264], [43302, 43312],
-     [43335, 43360], [43389, 43396], [43443, 43471], [43482, 43488], [43493, 43494],
-     [43519, 43520], [43561, 43584], [43587, 43588], [43596, 43600], [43610, 43616],
-     [43639, 43642], [43643, 43646], [43696, 43697], [43698, 43701], [43703, 43705],
-     [43710, 43712], [43713, 43714], [43715, 43739], [43742, 43744], [43755, 43762],
-     [43765, 43777], [43783, 43785], [43791, 43793], [43799, 43808], [43815, 43816],
-     [43823, 43824], [43867, 43868], [43882, 43888], [44003, 44016], [44026, 44032],
-     [55204, 55216], [55239, 55243], [55292, 55296], [57344, 63744], [64110, 64112],
-     [64218, 64256], [64263, 64275], [64280, 64285], [64286, 64287], [64297, 64298],
-     [64311, 64312], [64317, 64318], [64319, 64320], [64322, 64323], [64325, 64326],
-     [64434, 64467], [64830, 64848], [64912, 64914], [64968, 65008], [65020, 65136],
-     [65141, 65142], [65277, 65296], [65306, 65313], [65339, 65345], [65371, 65382],
-     [65471, 65474], [65480, 65482], [65488, 65490], [65496, 65498]].map(
-        ([start, end]) => Array(end - start).fill(0).map((_, i) => start + i)
-    ).flat()
-)
-
-const splitQuery = (query) => {
-    const result = [];
-    let start = null;
-    for (let i = 0; i < query.length; i++) {
-        if (splitChars.has(query.charCodeAt(i))) {
-            if (start !== null) {
-                result.push(query.slice(start, i));
-                start = null;
-            }
-        } else {
-            if (start === null) start = i;
-            if (i === query.length - 1) {
-                result.push(query.slice(start));
-            }
-        }
-    }
-    return result;
-}
-
-"""
diff --git a/sphinx/themes/basic/static/searchtools.js b/sphinx/themes/basic/static/searchtools.js
index 403718cd3..50473461d 100644
--- a/sphinx/themes/basic/static/searchtools.js
+++ b/sphinx/themes/basic/static/searchtools.js
@@ -13,7 +13,7 @@
 /**
  * Simple result scoring code.
  */
-if (!Scorer) {
+if (typeof Scorer === "undefined") {
   var Scorer = {
     // Implement the following function to further tweak the score for each result
     // The function takes a result array [docname, title, anchor, descr, score, filename]
@@ -133,6 +133,20 @@ const _displayNextItem = (
 };
 
 /**
+ * Default splitQuery function. Can be overridden in ``sphinx.search`` with a
+ * custom function per language.
+ *
+ * The regular expression works by splitting the string on consecutive characters
+ * that are not Unicode letters, numbers, underscores, or emoji characters.
+ * This mirrors splitting on ``\W+`` in Python, except that emoji (surrogate pairs) are kept.
+ */
+if (typeof splitQuery === "undefined") {
+  var splitQuery = (query) => query
+      .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu)
+      .filter(term => term)  // remove remaining empty strings
+}
+
+/**
  * Search Module
  */
 const Search = {
diff --git a/tests/js/searchtools.js b/tests/js/searchtools.js
index 007eeb7db..c9e0c43d3 100644
--- a/tests/js/searchtools.js
+++ b/tests/js/searchtools.js
@@ -30,3 +30,33 @@ describe('Basic html theme search', function() {
   });
 
 });
+
+// This is a regression test for https://github.com/sphinx-doc/sphinx/issues/3150
+describe('splitQuery regression tests', () => {
+
+  it('can split English words', () => {
+    const parts = splitQuery('   Hello    World   ')
+    expect(parts).toEqual(['Hello', 'World'])
+  })
+
+  it('can split special characters', () => {
+    const parts = splitQuery('Pin-Code')
+    expect(parts).toEqual(['Pin', 'Code'])
+  })
+
+  it('can split Chinese characters', () => {
+    const parts = splitQuery('Hello from 中国 上海')
+    expect(parts).toEqual(['Hello', 'from', '中国', '上海'])
+  })
+
+  it('can split Emoji (surrogate pair) characters. It should keep emojis.', () => {
+    const parts = splitQuery('😁😁')
+    expect(parts).toEqual(['😁😁'])
+  })
+
+  it('can split umlauts. It should keep umlauts.', () => {
+    const parts = splitQuery('Löschen Prüfung Abändern ærlig spørsmål')
+    expect(parts).toEqual(['Löschen', 'Prüfung', 'Abändern', 'ærlig', 'spørsmål'])
+  })
+
+})
diff --git a/utils/jssplitter_generator.py b/utils/jssplitter_generator.py
deleted file mode 100644
index f37559dfa..000000000
--- a/utils/jssplitter_generator.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import json
-import subprocess
-
-begin = -1
-ranges = []
-
-for i in range(65536):
-    # Get all non 'word' codepoints. This means skipping all alphanumerics and
-    # '_' (U+0095), matching the `\w` character class in `re`. We also skip
-    # 0xd800-0xdfff, the surrogate pair area.
-    if not (chr(i).isalnum() or i == 95) and not (0xd800 <= i <= 0xdfff):
-        if begin == -1:
-            begin = i
-    elif begin != -1:
-        ranges.append((begin, i))
-        begin = -1
-
-
-# fold json within almost 80 chars per line
-def fold(json_data, splitter):
-    code = json.dumps(json_data)
-    lines = []
-    while True:
-        if len(code) < 75:
-            lines.append('    ' + code)
-            break
-        index = code.index(splitter, 74)
-        lines.append('    ' + code[:index + len(splitter)])
-        code = code[index + len(splitter):]
-    lines[0] = lines[0][4:]
-    return '\n'.join(lines)
-
-
-# JavaScript code
-js_src = '''\
-const splitChars = new Set(
-    ''' + fold(ranges, "],") + '''.map(
-        ([start, end]) => Array(end - start).fill(0).map((_, i) => start + i)
-    ).flat()
-)
-
-const splitQuery = (query) => {
-    const result = [];
-    let start = null;
-    for (let i = 0; i < query.length; i++) {
-        if (splitChars.has(query.charCodeAt(i))) {
-            if (start !== null) {
-                result.push(query.slice(start, i));
-                start = null;
-            }
-        } else {
-            if (start === null) start = i;
-            if (i === query.length - 1) {
-                result.push(query.slice(start));
-            }
-        }
-    }
-    return result;
-}
-'''
-
-js_test_src = f'''\
-// This is regression test for https://github.com/sphinx-doc/sphinx/issues/3150
-// generated by compat_regexp_generator.py
-// it needs node.js for testing
-const assert = require('assert');
-
-{js_src}
-
-console.log("test splitting English words")
-assert.deepEqual(['Hello', 'World'], splitQuery('   Hello    World   '));
-console.log('   ... ok\\n')
-
-console.log("test splitting special characters")
-assert.deepEqual(['Pin', 'Code'], splitQuery('Pin-Code'));
-console.log('   ... ok\\n')
-
-console.log("test splitting Chinese characters")
-assert.deepEqual(['Hello', 'from', '中国', '上海'], splitQuery('Hello from 中国 上海'));
-console.log('   ... ok\\n')
-
-console.log("test splitting Emoji (surrogate pair) characters. It should keep emojis.")
-assert.deepEqual(['😁😁'], splitQuery('😁😁'));
-console.log('   ... ok\\n')
-
-console.log("test splitting umlauts. It should keep umlauts.")
-assert.deepEqual(
-    ['Löschen', 'Prüfung', 'Abändern', 'ærlig', 'spørsmål'],
-    splitQuery('Löschen Prüfung Abändern ærlig spørsmål'));
-console.log('   ... ok\\n')
-
-'''
-
-
-python_src = '''\
-"""Provides Python compatible word splitter to JavaScript
-
-DO NOT EDIT. This is generated by utils/jssplitter_generator.py
-"""
-
-splitter_code = """
-{js_src}
-"""
-'''
-
-with open('../sphinx/search/jssplitter.py', 'w', encoding="utf-8") as f:
-    f.write(python_src)
-
-with open('./regression_test.js', 'w', encoding="utf-8") as f:
-    f.write(js_test_src)
-
-print("starting test...")
-raise SystemExit(subprocess.call(['node', './regression_test.js']))