diff options
author | Takeshi KOMIYA <i.tkomiya@gmail.com> | 2022-03-20 03:18:03 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-03-20 03:18:03 +0900 |
commit | 9ba8d43821eb6331972fbd3af03852c1dba5d30b (patch) | |
tree | 8ce5fb90ee28e2f1a103cb33c2650b646bf65fa3 | |
parent | aa1bc83c2ac9357b39048b555c3fdb0efff3449c (diff) | |
parent | 2beeaf746fab1634e47f01c215c9ff69d35833ee (diff) | |
download | sphinx-git-9ba8d43821eb6331972fbd3af03852c1dba5d30b.tar.gz |
Merge pull request #10247 from AA-Turner/fix-searchtools
Fix query splitting in `searchtools.js`; update `jssplitter`
-rw-r--r-- | sphinx/search/jssplitter.py | 177 | ||||
-rw-r--r-- | sphinx/themes/basic/static/searchtools.js | 51 | ||||
-rw-r--r-- | utils/jssplitter_generator.py | 106 |
3 files changed, 157 insertions, 177 deletions
diff --git a/sphinx/search/jssplitter.py b/sphinx/search/jssplitter.py index babdb8e47..983d1bfc2 100644 --- a/sphinx/search/jssplitter.py +++ b/sphinx/search/jssplitter.py @@ -4,98 +4,101 @@ DO NOT EDIT. This is generated by utils/jssplitter_generator.py """ splitter_code = """ +const splitChars = new Set( + [[0, 48], [58, 65], [91, 95], [96, 97], [123, 170], [171, 178], [180, 181], [182, 185], + [187, 188], [191, 192], [215, 216], [247, 248], [706, 710], [722, 736], [741, 748], + [749, 750], [751, 880], [885, 886], [888, 890], [894, 895], [896, 902], [903, 904], + [907, 908], [909, 910], [930, 931], [1014, 1015], [1154, 1162], [1328, 1329], + [1367, 1369], [1370, 1376], [1417, 1488], [1515, 1519], [1523, 1568], [1611, 1632], + [1642, 1646], [1648, 1649], [1748, 1749], [1750, 1765], [1767, 1774], [1789, 1791], + [1792, 1808], [1809, 1810], [1840, 1869], [1958, 1969], [1970, 1984], [2027, 2036], + [2038, 2042], [2043, 2048], [2070, 2074], [2075, 2084], [2085, 2088], [2089, 2112], + [2137, 2144], [2155, 2208], [2229, 2230], [2248, 2308], [2362, 2365], [2366, 2384], + [2385, 2392], [2402, 2406], [2416, 2417], [2433, 2437], [2445, 2447], [2449, 2451], + [2473, 2474], [2481, 2482], [2483, 2486], [2490, 2493], [2494, 2510], [2511, 2524], + [2526, 2527], [2530, 2534], [2546, 2548], [2554, 2556], [2557, 2565], [2571, 2575], + [2577, 2579], [2601, 2602], [2609, 2610], [2612, 2613], [2615, 2616], [2618, 2649], + [2653, 2654], [2655, 2662], [2672, 2674], [2677, 2693], [2702, 2703], [2706, 2707], + [2729, 2730], [2737, 2738], [2740, 2741], [2746, 2749], [2750, 2768], [2769, 2784], + [2786, 2790], [2800, 2809], [2810, 2821], [2829, 2831], [2833, 2835], [2857, 2858], + [2865, 2866], [2868, 2869], [2874, 2877], [2878, 2908], [2910, 2911], [2914, 2918], + [2928, 2929], [2936, 2947], [2948, 2949], [2955, 2958], [2961, 2962], [2966, 2969], + [2971, 2972], [2973, 2974], [2976, 2979], [2981, 2984], [2987, 2990], [3002, 3024], + [3025, 3046], [3059, 3077], [3085, 3086], [3089, 3090], [3113, 3114], [3130, 3133], + [3134, 3160], [3163, 3168], [3170, 3174], [3184, 3192], [3199, 3200], [3201, 3205], + [3213, 3214], [3217, 3218], [3241, 3242], [3252, 3253], [3258, 3261], [3262, 3294], + [3295, 3296], [3298, 3302], [3312, 3313], [3315, 3332], [3341, 3342], [3345, 3346], + [3387, 3389], [3390, 3406], [3407, 3412], [3415, 3416], [3426, 3430], [3449, 3450], + [3456, 3461], [3479, 3482], [3506, 3507], [3516, 3517], [3518, 3520], [3527, 3558], + [3568, 3585], [3633, 3634], [3636, 3648], [3655, 3664], [3674, 3713], [3715, 3716], + [3717, 3718], [3723, 3724], [3748, 3749], [3750, 3751], [3761, 3762], [3764, 3773], + [3774, 3776], [3781, 3782], [3783, 3792], [3802, 3804], [3808, 3840], [3841, 3872], + [3892, 3904], [3912, 3913], [3949, 3976], [3981, 4096], [4139, 4159], [4170, 4176], + [4182, 4186], [4190, 4193], [4194, 4197], [4199, 4206], [4209, 4213], [4226, 4238], + [4239, 4240], [4250, 4256], [4294, 4295], [4296, 4301], [4302, 4304], [4347, 4348], + [4681, 4682], [4686, 4688], [4695, 4696], [4697, 4698], [4702, 4704], [4745, 4746], + [4750, 4752], [4785, 4786], [4790, 4792], [4799, 4800], [4801, 4802], [4806, 4808], + [4823, 4824], [4881, 4882], [4886, 4888], [4955, 4969], [4989, 4992], [5008, 5024], + [5110, 5112], [5118, 5121], [5741, 5743], [5760, 5761], [5787, 5792], [5867, 5870], + [5881, 5888], [5901, 5902], [5906, 5920], [5938, 5952], [5970, 5984], [5997, 5998], + [6001, 6016], [6068, 6103], [6104, 6108], [6109, 6112], [6122, 6128], [6138, 6160], + [6170, 6176], [6265, 6272], [6277, 6279], [6313, 6314], [6315, 6320], [6390, 6400], + [6431, 6470], [6510, 6512], [6517, 6528], [6572, 6576], [6602, 6608], [6619, 6656], + [6679, 6688], [6741, 6784], [6794, 6800], [6810, 6823], [6824, 6917], [6964, 6981], + [6988, 6992], [7002, 7043], [7073, 7086], [7142, 7168], [7204, 7232], [7242, 7245], + [7294, 7296], [7305, 7312], [7355, 7357], [7360, 7401], [7405, 7406], [7412, 7413], + [7415, 7418], [7419, 7424], [7616, 7680], [7958, 7960], [7966, 7968], [8006, 8008], + [8014, 8016], [8024, 8025], [8026, 8027], [8028, 8029], [8030, 8031], [8062, 8064], + [8117, 8118], [8125, 8126], [8127, 8130], [8133, 8134], [8141, 8144], [8148, 8150], + [8156, 8160], [8173, 8178], [8181, 8182], [8189, 8304], [8306, 8308], [8314, 8319], + [8330, 8336], [8349, 8450], [8451, 8455], [8456, 8458], [8468, 8469], [8470, 8473], + [8478, 8484], [8485, 8486], [8487, 8488], [8489, 8490], [8494, 8495], [8506, 8508], + [8512, 8517], [8522, 8526], [8527, 8528], [8586, 9312], [9372, 9450], [9472, 10102], + [10132, 11264], [11311, 11312], [11359, 11360], [11493, 11499], [11503, 11506], + [11508, 11517], [11518, 11520], [11558, 11559], [11560, 11565], [11566, 11568], + [11624, 11631], [11632, 11648], [11671, 11680], [11687, 11688], [11695, 11696], + [11703, 11704], [11711, 11712], [11719, 11720], [11727, 11728], [11735, 11736], + [11743, 11823], [11824, 12293], [12296, 12321], [12330, 12337], [12342, 12344], + [12349, 12353], [12439, 12445], [12448, 12449], [12539, 12540], [12544, 12549], + [12592, 12593], [12687, 12690], [12694, 12704], [12736, 12784], [12800, 12832], + [12842, 12872], [12880, 12881], [12896, 12928], [12938, 12977], [12992, 13312], + [19904, 19968], [40957, 40960], [42125, 42192], [42238, 42240], [42509, 42512], + [42540, 42560], [42607, 42623], [42654, 42656], [42736, 42775], [42784, 42786], + [42889, 42891], [42944, 42946], [42955, 42997], [43010, 43011], [43014, 43015], + [43019, 43020], [43043, 43056], [43062, 43072], [43124, 43138], [43188, 43216], + [43226, 43250], [43256, 43259], [43260, 43261], [43263, 43264], [43302, 43312], + [43335, 43360], [43389, 43396], [43443, 43471], [43482, 43488], [43493, 43494], + [43519, 43520], [43561, 43584], [43587, 43588], [43596, 43600], [43610, 43616], + [43639, 43642], [43643, 43646], [43696, 43697], [43698, 43701], [43703, 43705], + [43710, 43712], [43713, 43714], [43715, 43739], [43742, 43744], [43755, 43762], + [43765, 43777], [43783, 43785], [43791, 43793], [43799, 43808], [43815, 43816], + [43823, 43824], [43867, 43868], [43882, 43888], [44003, 44016], [44026, 44032], + [55204, 55216], [55239, 55243], [55292, 55296], [57344, 63744], [64110, 64112], + [64218, 64256], [64263, 64275], [64280, 64285], [64286, 64287], [64297, 64298], + [64311, 64312], [64317, 64318], [64319, 64320], [64322, 64323], [64325, 64326], + [64434, 64467], [64830, 64848], [64912, 64914], [64968, 65008], [65020, 65136], + [65141, 65142], [65277, 65296], [65306, 65313], [65339, 65345], [65371, 65382], + [65471, 65474], [65480, 65482], [65488, 65490], [65496, 65498]].map( + ([start, end]) => Array(end - start).fill(0).map((_, i) => start + i) + ).flat() +) -var splitChars = (function() { - var result = {}; - var singles = [96, 180, 187, 191, 215, 247, 749, 885, 903, 907, 909, 930, 1014, 1648, - 1748, 1809, 2416, 2473, 2481, 2526, 2601, 2609, 2612, 2615, 2653, 2702, - 2706, 2729, 2737, 2740, 2857, 2865, 2868, 2910, 2928, 2948, 2961, 2971, - 2973, 3085, 3089, 3113, 3124, 3213, 3217, 3241, 3252, 3295, 3341, 3345, - 3369, 3506, 3516, 3633, 3715, 3721, 3736, 3744, 3748, 3750, 3756, 3761, - 3781, 3912, 4239, 4347, 4681, 4695, 4697, 4745, 4785, 4799, 4801, 4823, - 4881, 5760, 5901, 5997, 6313, 7405, 8024, 8026, 8028, 8030, 8117, 8125, - 8133, 8181, 8468, 8485, 8487, 8489, 8494, 8527, 11311, 11359, 11687, 11695, - 11703, 11711, 11719, 11727, 11735, 12448, 12539, 43010, 43014, 43019, 43587, - 43696, 43713, 64286, 64297, 64311, 64317, 64319, 64322, 64325, 65141]; - var i, j, start, end; - for (i = 0; i < singles.length; i++) { - result[singles[i]] = true; - } - var ranges = [[0, 47], [58, 64], [91, 94], [123, 169], [171, 177], [182, 184], [706, 709], - [722, 735], [741, 747], [751, 879], [888, 889], [894, 901], [1154, 1161], - [1318, 1328], [1367, 1368], [1370, 1376], [1416, 1487], [1515, 1519], [1523, 1568], - [1611, 1631], [1642, 1645], [1750, 1764], [1767, 1773], [1789, 1790], [1792, 1807], - [1840, 1868], [1958, 1968], [1970, 1983], [2027, 2035], [2038, 2041], [2043, 2047], - [2070, 2073], [2075, 2083], [2085, 2087], [2089, 2307], [2362, 2364], [2366, 2383], - [2385, 2391], [2402, 2405], [2419, 2424], [2432, 2436], [2445, 2446], [2449, 2450], - [2483, 2485], [2490, 2492], [2494, 2509], [2511, 2523], [2530, 2533], [2546, 2547], - [2554, 2564], [2571, 2574], [2577, 2578], [2618, 2648], [2655, 2661], [2672, 2673], - [2677, 2692], [2746, 2748], [2750, 2767], [2769, 2783], [2786, 2789], [2800, 2820], - [2829, 2830], [2833, 2834], [2874, 2876], [2878, 2907], [2914, 2917], [2930, 2946], - [2955, 2957], [2966, 2968], [2976, 2978], [2981, 2983], [2987, 2989], [3002, 3023], - [3025, 3045], [3059, 3076], [3130, 3132], [3134, 3159], [3162, 3167], [3170, 3173], - [3184, 3191], [3199, 3204], [3258, 3260], [3262, 3293], [3298, 3301], [3312, 3332], - [3386, 3388], [3390, 3423], [3426, 3429], [3446, 3449], [3456, 3460], [3479, 3481], - [3518, 3519], [3527, 3584], [3636, 3647], [3655, 3663], [3674, 3712], [3717, 3718], - [3723, 3724], [3726, 3731], [3752, 3753], [3764, 3772], [3774, 3775], [3783, 3791], - [3802, 3803], [3806, 3839], [3841, 3871], [3892, 3903], [3949, 3975], [3980, 4095], - [4139, 4158], [4170, 4175], [4182, 4185], [4190, 4192], [4194, 4196], [4199, 4205], - [4209, 4212], [4226, 4237], [4250, 4255], [4294, 4303], [4349, 4351], [4686, 4687], - [4702, 4703], [4750, 4751], [4790, 4791], [4806, 4807], [4886, 4887], [4955, 4968], - [4989, 4991], [5008, 5023], [5109, 5120], [5741, 5742], [5787, 5791], [5867, 5869], - [5873, 5887], [5906, 5919], [5938, 5951], [5970, 5983], [6001, 6015], [6068, 6102], - [6104, 6107], [6109, 6111], [6122, 6127], [6138, 6159], [6170, 6175], [6264, 6271], - [6315, 6319], [6390, 6399], [6429, 6469], [6510, 6511], [6517, 6527], [6572, 6592], - [6600, 6607], [6619, 6655], [6679, 6687], [6741, 6783], [6794, 6799], [6810, 6822], - [6824, 6916], [6964, 6980], [6988, 6991], [7002, 7042], [7073, 7085], [7098, 7167], - [7204, 7231], [7242, 7244], [7294, 7400], [7410, 7423], [7616, 7679], [7958, 7959], - [7966, 7967], [8006, 8007], [8014, 8015], [8062, 8063], [8127, 8129], [8141, 8143], - [8148, 8149], [8156, 8159], [8173, 8177], [8189, 8303], [8306, 8307], [8314, 8318], - [8330, 8335], [8341, 8449], [8451, 8454], [8456, 8457], [8470, 8472], [8478, 8483], - [8506, 8507], [8512, 8516], [8522, 8525], [8586, 9311], [9372, 9449], [9472, 10101], - [10132, 11263], [11493, 11498], [11503, 11516], [11518, 11519], [11558, 11567], - [11622, 11630], [11632, 11647], [11671, 11679], [11743, 11822], [11824, 12292], - [12296, 12320], [12330, 12336], [12342, 12343], [12349, 12352], [12439, 12444], - [12544, 12548], [12590, 12592], [12687, 12689], [12694, 12703], [12728, 12783], - [12800, 12831], [12842, 12880], [12896, 12927], [12938, 12976], [12992, 13311], - [19894, 19967], [40908, 40959], [42125, 42191], [42238, 42239], [42509, 42511], - [42540, 42559], [42592, 42593], [42607, 42622], [42648, 42655], [42736, 42774], - [42784, 42785], [42889, 42890], [42893, 43002], [43043, 43055], [43062, 43071], - [43124, 43137], [43188, 43215], [43226, 43249], [43256, 43258], [43260, 43263], - [43302, 43311], [43335, 43359], [43389, 43395], [43443, 43470], [43482, 43519], - [43561, 43583], [43596, 43599], [43610, 43615], [43639, 43641], [43643, 43647], - [43698, 43700], [43703, 43704], [43710, 43711], [43715, 43738], [43742, 43967], - [44003, 44015], [44026, 44031], [55204, 55215], [55239, 55242], [55292, 55295], - [57344, 63743], [64046, 64047], [64110, 64111], [64218, 64255], [64263, 64274], - [64280, 64284], [64434, 64466], [64830, 64847], [64912, 64913], [64968, 65007], - [65020, 65135], [65277, 65295], [65306, 65312], [65339, 65344], [65371, 65381], - [65471, 65473], [65480, 65481], [65488, 65489], [65496, 65497]]; - for (i = 0; i < ranges.length; i++) { - start = ranges[i][0]; - end = ranges[i][1]; - for (j = start; j <= end; j++) { - result[j] = true; - } - } - return result; -})(); - -function splitQuery(query) { - var result = []; - var start = -1; - for (var i = 0; i < query.length; i++) { - if (splitChars[query.charCodeAt(i)]) { - if (start !== -1) { +const splitQuery = (query) => { + const result = []; + let start = null; + for (let i = 0; i < query.length; i++) { + if (splitChars.has(query.charCodeAt(i))) { + if (start !== null) { result.push(query.slice(start, i)); - start = -1; + start = null; + } + } else { + if (start === null) start = i; + if (i === query.length - 1) { + result.push(query.slice(start)); } - } else if (start === -1) { - start = i; } } - if (start !== -1) { - result.push(query.slice(start)); - } return result; } diff --git a/sphinx/themes/basic/static/searchtools.js b/sphinx/themes/basic/static/searchtools.js index 46a181a2a..a37b7e55e 100644 --- a/sphinx/themes/basic/static/searchtools.js +++ b/sphinx/themes/basic/static/searchtools.js @@ -228,34 +228,31 @@ const Search = { const searchTerms = new Set(); const excludedTerms = new Set(); const highlightTerms = new Set(); - const objectTerms = new Set(query.toLowerCase().trim().split(/\s+/)); - query - .trim() - .split(/\s+/) - .forEach((queryTerm) => { - const queryTermLower = queryTerm.toLowerCase(); - - // maybe skip this "word" - // stopwords array is from language_data.js - if ( - stopwords.indexOf(queryTermLower) !== -1 || - queryTerm.match(/^\d+$/) - ) - return; + const objectTerms = new Set(splitQuery(query.toLowerCase().trim())); + splitQuery(query.trim()).forEach((queryTerm) => { + const queryTermLower = queryTerm.toLowerCase(); - // stem the word - let word = stemmer.stemWord(queryTermLower); - // prevent stemmer from cutting word smaller than two chars - if (word.length < 3 && queryTerm.length >= 3) { - word = queryTerm; - } - // select the correct list - if (word[0] === "-") excludedTerms.add(word.substr(1)); - else { - searchTerms.add(word); - highlightTerms.add(queryTermLower); - } - }); + // maybe skip this "word" + // stopwords array is from language_data.js + if ( + stopwords.indexOf(queryTermLower) !== -1 || + queryTerm.match(/^\d+$/) + ) + return; + + // stem the word + let word = stemmer.stemWord(queryTermLower); + // prevent stemmer from cutting word smaller than two chars + if (word.length < 3 && queryTerm.length >= 3) { + word = queryTerm; + } + // select the correct list + if (word[0] === "-") excludedTerms.add(word.substr(1)); + else { + searchTerms.add(word); + highlightTerms.add(queryTermLower); + } + }); // console.debug("SEARCH: searching for:"); // console.info("required: ", [...searchTerms]); diff --git a/utils/jssplitter_generator.py b/utils/jssplitter_generator.py index f0918ec3f..f37559dfa 100644 --- a/utils/jssplitter_generator.py +++ b/utils/jssplitter_generator.py @@ -1,91 +1,71 @@ import json -import re import subprocess -import sys -# find char codes they are matched with Python's (?u)\\w - -match = re.compile(r'(?u)\w') begin = -1 - ranges = [] -singles = [] for i in range(65536): - # 0xd800-0xdfff is surrogate pair area. skip this. - if not match.match(chr(i)) and not (0xd800 <= i <= 0xdfff): + # Get all non 'word' codepoints. This means skipping all alphanumerics and + # '_' (U+0095), matching the `\w` character class in `re`. We also skip + # 0xd800-0xdfff, the surrogate pair area. + if not (chr(i).isalnum() or i == 95) and not (0xd800 <= i <= 0xdfff): if begin == -1: begin = i elif begin != -1: - if begin + 1 == i: - singles.append(begin) - else: - ranges.append((begin, i - 1)) + ranges.append((begin, i)) begin = -1 # fold json within almost 80 chars per line -def fold(jsonData, splitter): - code = json.dumps(jsonData) +def fold(json_data, splitter): + code = json.dumps(json_data) lines = [] while True: - if len(code) < 71: - lines.append(' ' + code) + if len(code) < 75: + lines.append(' ' + code) break - index = code.index(splitter, 70) - lines.append(' ' + code[:index + len(splitter)]) + index = code.index(splitter, 74) + lines.append(' ' + code[:index + len(splitter)]) code = code[index + len(splitter):] - lines[0] = lines[0][8:] + lines[0] = lines[0][4:] return '\n'.join(lines) # JavaScript code -js_src = ''' -var splitChars = (function() { - var result = {}; - var singles = %s; - var i, j, start, end; - for (i = 0; i < singles.length; i++) { - result[singles[i]] = true; - } - var ranges = %s; - for (i = 0; i < ranges.length; i++) { - start = ranges[i][0]; - end = ranges[i][1]; - for (j = start; j <= end; j++) { - result[j] = true; - } - } - return result; -})(); - -function splitQuery(query) { - var result = []; - var start = -1; - for (var i = 0; i < query.length; i++) { - if (splitChars[query.charCodeAt(i)]) { - if (start !== -1) { +js_src = '''\ +const splitChars = new Set( + ''' + fold(ranges, "],") + '''.map( + ([start, end]) => Array(end - start).fill(0).map((_, i) => start + i) + ).flat() +) + +const splitQuery = (query) => { + const result = []; + let start = null; + for (let i = 0; i < query.length; i++) { + if (splitChars.has(query.charCodeAt(i))) { + if (start !== null) { result.push(query.slice(start, i)); - start = -1; + start = null; + } + } else { + if (start === null) start = i; + if (i === query.length - 1) { + result.push(query.slice(start)); } - } else if (start === -1) { - start = i; } } - if (start !== -1) { - result.push(query.slice(start)); - } return result; } -''' % (fold(singles, ','), fold(ranges, '],')) +''' -js_test_src = ''' +js_test_src = f'''\ // This is regression test for https://github.com/sphinx-doc/sphinx/issues/3150 // generated by compat_regexp_generator.py // it needs node.js for testing -var assert = require('assert'); +const assert = require('assert'); -%s +{js_src} console.log("test splitting English words") assert.deepEqual(['Hello', 'World'], splitQuery(' Hello World ')); @@ -99,7 +79,7 @@ console.log("test splitting Chinese characters") assert.deepEqual(['Hello', 'from', '中国', '上海'], splitQuery('Hello from 中国 上海')); console.log(' ... ok\\n') -console.log("test splitting Emoji(surrogate pair) characters. It should keep emojis.") +console.log("test splitting Emoji (surrogate pair) characters. It should keep emojis.") assert.deepEqual(['😁😁'], splitQuery('😁😁')); console.log(' ... ok\\n') @@ -109,7 +89,8 @@ assert.deepEqual( splitQuery('Löschen Prüfung Abändern ærlig spørsmål')); console.log(' ... ok\\n') -''' % js_src +''' + python_src = '''\ """Provides Python compatible word splitter to JavaScript @@ -118,16 +99,15 @@ DO NOT EDIT. This is generated by utils/jssplitter_generator.py """ splitter_code = """ -%s +{js_src} """ -''' % js_src +''' -with open('../sphinx/search/jssplitter.py', 'w') as f: +with open('../sphinx/search/jssplitter.py', 'w', encoding="utf-8") as f: f.write(python_src) -with open('./regression_test.js', 'w') as f: +with open('./regression_test.js', 'w', encoding="utf-8") as f: f.write(js_test_src) print("starting test...") -result = subprocess.call(['node', './regression_test.js']) -sys.exit(result) +raise SystemExit(subprocess.call(['node', './regression_test.js'])) |