summaryrefslogtreecommitdiff
path: root/examples/booleansearchparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'examples/booleansearchparser.py')
-rw-r--r--examples/booleansearchparser.py346
1 files changed, 211 insertions, 135 deletions
diff --git a/examples/booleansearchparser.py b/examples/booleansearchparser.py
index d970e98..7ac502c 100644
--- a/examples/booleansearchparser.py
+++ b/examples/booleansearchparser.py
@@ -64,8 +64,8 @@ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CONTRIBUTORS:
@@ -81,46 +81,56 @@ TODO:
- add more kinds of wildcards ('*' at the beginning and '*' inside a word)?
"""
-from pyparsing import Word, alphanums, Keyword, Group, Forward, Suppress, OneOrMore, oneOf
+from pyparsing import (
+ Word,
+ alphanums,
+ Keyword,
+ Group,
+ Forward,
+ Suppress,
+ OneOrMore,
+ oneOf,
+)
import re
alphabet_ranges = [
##CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)
- [int("0400",16), int("04FF",16)],
+ [int("0400", 16), int("04FF", 16)],
##THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block)
- [int("0E00",16), int("0E7F",16)],
+ [int("0E00", 16), int("0E7F", 16)],
##ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F) )
- [int("0600",16), int("07FF",16)],
+ [int("0600", 16), int("07FF", 16)],
##CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
- [int("0400",16), int("09FF",16)],
- #JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system
- [int("3040",16), int("30FF",16)],
- #KOREAN : https://en.wikipedia.org/wiki/Hangul
- [int("AC00",16), int("D7AF",16)],
- [int("1100",16), int("11FF",16)],
- [int("3130",16), int("318F",16)],
- [int("3200",16), int("32FF",16)],
- [int("A960",16), int("A97F",16)],
- [int("D7B0",16), int("D7FF",16)],
- [int("FF00",16), int("FFEF",16)],
+ [int("0400", 16), int("09FF", 16)],
+ # JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system
+ [int("3040", 16), int("30FF", 16)],
+ # KOREAN : https://en.wikipedia.org/wiki/Hangul
+ [int("AC00", 16), int("D7AF", 16)],
+ [int("1100", 16), int("11FF", 16)],
+ [int("3130", 16), int("318F", 16)],
+ [int("3200", 16), int("32FF", 16)],
+ [int("A960", 16), int("A97F", 16)],
+ [int("D7B0", 16), int("D7FF", 16)],
+ [int("FF00", 16), int("FFEF", 16)],
]
-class BooleanSearchParser:
- def __init__(self,only_parse=False):
+
+class BooleanSearchParser:
+ def __init__(self, only_parse=False):
self._methods = {
- 'and': self.evaluateAnd,
- 'or': self.evaluateOr,
- 'not': self.evaluateNot,
- 'parenthesis': self.evaluateParenthesis,
- 'quotes': self.evaluateQuotes,
- 'word': self.evaluateWord,
- 'wordwildcardprefix': self.evaluateWordWildcardPrefix,
- 'wordwildcardsufix': self.evaluateWordWildcardSufix,
+ "and": self.evaluateAnd,
+ "or": self.evaluateOr,
+ "not": self.evaluateNot,
+ "parenthesis": self.evaluateParenthesis,
+ "quotes": self.evaluateQuotes,
+ "word": self.evaluateWord,
+ "wordwildcardprefix": self.evaluateWordWildcardPrefix,
+ "wordwildcardsufix": self.evaluateWordWildcardSufix,
}
- self._parser = self.parser()
- self.text = ''
- self.words = []
+ self._parser = self.parser()
+ self.text = ""
+ self.words = []
def parser(self):
"""
@@ -141,47 +151,54 @@ class BooleanSearchParser:
alphabet = alphanums
- #suport for non-western alphabets
+ # support for non-western alphabets
for r in alphabet_ranges:
- alphabet += ''.join(chr(c) for c in range(*r) if not chr(c).isspace())
-
- operatorWord = Group(
- Word(alphabet + '*')
- ).setResultsName('word*')
+ alphabet += "".join(chr(c) for c in range(*r) if not chr(c).isspace())
+ operatorWord = Group(Word(alphabet + "*")).setResultsName("word*")
operatorQuotesContent = Forward()
- operatorQuotesContent << (
- (operatorWord + operatorQuotesContent) | operatorWord
- )
+ operatorQuotesContent << ((operatorWord + operatorQuotesContent) | operatorWord)
- operatorQuotes = Group(
- Suppress('"') + operatorQuotesContent + Suppress('"')
- ).setResultsName("quotes") | operatorWord
+ operatorQuotes = (
+ Group(Suppress('"') + operatorQuotesContent + Suppress('"')).setResultsName(
+ "quotes"
+ )
+ | operatorWord
+ )
- operatorParenthesis = Group(
- Suppress("(") + operatorOr + Suppress(")")
- ).setResultsName("parenthesis") | operatorQuotes
+ operatorParenthesis = (
+ Group(Suppress("(") + operatorOr + Suppress(")")).setResultsName(
+ "parenthesis"
+ )
+ | operatorQuotes
+ )
operatorNot = Forward()
- operatorNot << (Group(
- Suppress(Keyword("not", caseless=True)) + operatorNot
- ).setResultsName("not") | operatorParenthesis)
+ operatorNot << (
+ Group(Suppress(Keyword("not", caseless=True)) + operatorNot).setResultsName(
+ "not"
+ )
+ | operatorParenthesis
+ )
operatorAnd = Forward()
operatorAnd << (
Group(
operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd
- ).setResultsName("and")|
- Group(
+ ).setResultsName("and")
+ | Group(
operatorNot + OneOrMore(~oneOf("and or") + operatorAnd)
- ).setResultsName("and") |
- operatorNot
+ ).setResultsName("and")
+ | operatorNot
)
- operatorOr << (Group(
- operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
- ).setResultsName("or") | operatorAnd)
+ operatorOr << (
+ Group(
+ operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
+ ).setResultsName("or")
+ | operatorAnd
+ )
return operatorOr.parseString
@@ -204,26 +221,26 @@ class BooleanSearchParser:
function GetQuoted to only return the subset of ID's that contain the
literal string.
"""
- #r = set()
+ # r = set()
r = False
search_terms = []
for item in argument:
search_terms.append(item[0])
r = r and self.evaluate(item)
- return self.GetQuotes(' '.join(search_terms), r)
+ return self.GetQuotes(" ".join(search_terms), r)
def evaluateWord(self, argument):
wildcard_count = argument[0].count("*")
if wildcard_count > 0:
if wildcard_count == 1 and argument[0].startswith("*"):
- return self.GetWordWildcard(argument[0][1:], method = "endswith")
+ return self.GetWordWildcard(argument[0][1:], method="endswith")
if wildcard_count == 1 and argument[0].endswith("*"):
- return self.GetWordWildcard(argument[0][:-1], method = "startswith")
+ return self.GetWordWildcard(argument[0][:-1], method="startswith")
else:
- _regex = argument[0].replace("*",".+")
+ _regex = argument[0].replace("*", ".+")
matched = False
for w in self.words:
- matched = bool(re.search(_regex,w))
+ matched = bool(re.search(_regex, w))
if matched:
break
return matched
@@ -231,10 +248,10 @@ class BooleanSearchParser:
return self.GetWord(argument[0])
def evaluateWordWildcardPrefix(self, argument):
- return self.GetWordWildcard(argument[0], method = "endswith")
+ return self.GetWordWildcard(argument[0], method="endswith")
def evaluateWordWildcardSufix(self, argument):
- return self.GetWordWildcard(argument[0], method = "startswith")
+ return self.GetWordWildcard(argument[0], method="startswith")
def evaluate(self, argument):
return self._methods[argument.getName()](argument)
@@ -245,10 +262,10 @@ class BooleanSearchParser:
def GetWord(self, word):
return word in self.words
- def GetWordWildcard(self, word, method = "startswith"):
+ def GetWordWildcard(self, word, method="startswith"):
matched = False
for w in self.words:
- matched = getattr(w,method)(word)
+ matched = getattr(w, method)(word)
if matched:
break
return matched
@@ -265,24 +282,22 @@ class BooleanSearchParser:
def GetQuotes(self, search_string, tmp_result):
return search_string in self.text
-
def GetNot(self, not_set):
return not not_set
-
- def _split_words(self,text):
+ def _split_words(self, text):
words = []
"""
>>> import string
>>> string.punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
"""
- #it will keep @, # and
- #usernames and hashtags can contain dots, so a double check is done
- r = re.compile(r'[\s{}]+'.format(re.escape('!"$%&\'()*+,-/:;<=>?[\\]^`{|}~')))
- _words = r.split(text)
+ # it will keep @, #, . and _
+ # usernames and hashtags can contain dots, so a double check is done
+ r = re.compile(r"[\s{}]+".format(re.escape("!\"$%&'()*+,-/:;<=>?[\\]^`{|}~")))
+ _words = r.split(text)
for _w in _words:
- if '.' in _w and not _w.startswith("#") and not _w.startswith("@"):
+ if "." in _w and not _w.startswith("#") and not _w.startswith("@"):
for __w in _w.split("."):
words.append(__w)
continue
@@ -291,9 +306,9 @@ class BooleanSearchParser:
return words
- def match(self,text,expr):
- self.text = text
- self.words = self._split_words(text)
+ def match(self, text, expr):
+ self.text = text
+ self.words = self._split_words(text)
return self.Parse(expr)
@@ -305,72 +320,132 @@ class ParserTest(BooleanSearchParser):
def Test(self):
exprs = {
- '0' : 'help',
- '1' : 'help or hulp',
- '2' : 'help and hulp',
- '3' : 'help hulp',
- '4' : 'help and hulp or hilp',
- '5' : 'help or hulp and hilp',
- '6' : 'help or hulp or hilp or halp',
- '7' : '(help or hulp) and (hilp or halp)',
- '8' : 'help and (hilp or halp)',
- '9' : '(help and (hilp or halp)) or hulp',
- '10': 'not help',
- '11': 'not hulp and halp',
- '12': 'not (help and halp)',
- '13': '"help me please"',
- '14': '"help me please" or hulp',
- '15': '"help me please" or (hulp and halp)',
- '16': 'help*',
- '17': 'help or hulp*',
- '18': 'help* and hulp',
- '19': 'help and hulp* or hilp',
- '20': 'help* or hulp or hilp or halp',
- '21': '(help or hulp*) and (hilp* or halp)',
- '22': 'help* and (hilp* or halp*)',
- '23': '(help and (hilp* or halp)) or hulp*',
- '24': 'not help* and halp',
- '25': 'not (help* and helpe*)',
- '26': '"help* me please"',
- '27': '"help* me* please" or hulp*',
- '28': '"help me please*" or (hulp and halp)',
- '29': '"help me please" not (hulp and halp)',
- '30': '"help me please" hulp',
- '31': 'help and hilp and not holp',
- '32': 'help hilp not holp',
- '33': 'help hilp and not holp',
- '34': '*lp and halp',
- '35': '*신은 and 어떠세요',
+ "0": "help",
+ "1": "help or hulp",
+ "2": "help and hulp",
+ "3": "help hulp",
+ "4": "help and hulp or hilp",
+ "5": "help or hulp and hilp",
+ "6": "help or hulp or hilp or halp",
+ "7": "(help or hulp) and (hilp or halp)",
+ "8": "help and (hilp or halp)",
+ "9": "(help and (hilp or halp)) or hulp",
+ "10": "not help",
+ "11": "not hulp and halp",
+ "12": "not (help and halp)",
+ "13": '"help me please"',
+ "14": '"help me please" or hulp',
+ "15": '"help me please" or (hulp and halp)',
+ "16": "help*",
+ "17": "help or hulp*",
+ "18": "help* and hulp",
+ "19": "help and hulp* or hilp",
+ "20": "help* or hulp or hilp or halp",
+ "21": "(help or hulp*) and (hilp* or halp)",
+ "22": "help* and (hilp* or halp*)",
+ "23": "(help and (hilp* or halp)) or hulp*",
+ "24": "not help* and halp",
+ "25": "not (help* and helpe*)",
+ "26": '"help* me please"',
+ "27": '"help* me* please" or hulp*',
+ "28": '"help me please*" or (hulp and halp)',
+ "29": '"help me please" not (hulp and halp)',
+ "30": '"help me please" hulp',
+ "31": "help and hilp and not holp",
+ "32": "help hilp not holp",
+ "33": "help hilp and not holp",
+ "34": "*lp and halp",
+ "35": "*신은 and 어떠세요",
}
texts_matcheswith = {
"halp thinks he needs help": [
- "25", "22", "20", "21", "11", "17", "16", "23", "34", "1", "0", "5", "7", "6", "9", "8"
- ],
- "he needs halp": [
- "24", "25", "20", "11", "10", "12", "34", "6"
- ],
- "help": [
- "25", "20", "12", "17", "16", "1", "0", "5", "6"
+ "25",
+ "22",
+ "20",
+ "21",
+ "11",
+ "17",
+ "16",
+ "23",
+ "34",
+ "1",
+ "0",
+ "5",
+ "7",
+ "6",
+ "9",
+ "8",
],
+ "he needs halp": ["24", "25", "20", "11", "10", "12", "34", "6"],
+ "help": ["25", "20", "12", "17", "16", "1", "0", "5", "6"],
"help hilp": [
- "25", "22", "20", "32", "21", "12", "17", "16", "19", "31", "23", "1", "0", "5", "4", "7", "6", "9", "8", "33"
+ "25",
+ "22",
+ "20",
+ "32",
+ "21",
+ "12",
+ "17",
+ "16",
+ "19",
+ "31",
+ "23",
+ "1",
+ "0",
+ "5",
+ "4",
+ "7",
+ "6",
+ "9",
+ "8",
+ "33",
],
"help me please hulp": [
- "30", "25", "27", "20", "13", "12", "15", "14", "17", "16", "19", "18", "23", "29", "1", "0", "3", "2", "5", "4", "6", "9"
- ],
- "helper": [
- "20", "10", "12", "16"
+ "30",
+ "25",
+ "27",
+ "20",
+ "13",
+ "12",
+ "15",
+ "14",
+ "17",
+ "16",
+ "19",
+ "18",
+ "23",
+ "29",
+ "1",
+ "0",
+ "3",
+ "2",
+ "5",
+ "4",
+ "6",
+ "9",
],
+ "helper": ["20", "10", "12", "16"],
"hulp hilp": [
- "25", "27", "20", "21", "10", "12", "14", "17", "19", "23", "1", "5", "4", "7", "6", "9"
+ "25",
+ "27",
+ "20",
+ "21",
+ "10",
+ "12",
+ "14",
+ "17",
+ "19",
+ "23",
+ "1",
+ "5",
+ "4",
+ "7",
+ "6",
+ "9",
],
- "nothing": [
- "25", "10", "12"
- ],
- "안녕하세요, 당신은 어떠세요?": [
- "10", "12", "25", "35"
- ]
+ "nothing": ["25", "10", "12"],
+ "안녕하세요, 당신은 어떠세요?": ["10", "12", "25", "35"],
}
all_ok = True
@@ -382,16 +457,17 @@ class ParserTest(BooleanSearchParser):
test_passed = sorted(matches) == sorted(_matches)
if not test_passed:
- print('Failed', repr(text), 'expected', matches, 'matched', _matches)
+ print("Failed", repr(text), "expected", matches, "matched", _matches)
all_ok = all_ok and test_passed
return all_ok
-if __name__=='__main__':
+
+if __name__ == "__main__":
if ParserTest().Test():
- print ('All tests OK')
+ print("All tests OK")
exit(0)
else:
- print ('One or more tests FAILED')
+ print("One or more tests FAILED")
exit(1)