diff options
Diffstat (limited to 'examples/booleansearchparser.py')
-rw-r--r-- | examples/booleansearchparser.py | 346 |
1 files changed, 211 insertions, 135 deletions
diff --git a/examples/booleansearchparser.py b/examples/booleansearchparser.py index d970e98..7ac502c 100644 --- a/examples/booleansearchparser.py +++ b/examples/booleansearchparser.py @@ -64,8 +64,8 @@ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. CONTRIBUTORS: @@ -81,46 +81,56 @@ TODO: - add more kinds of wildcards ('*' at the beginning and '*' inside a word)? """ -from pyparsing import Word, alphanums, Keyword, Group, Forward, Suppress, OneOrMore, oneOf +from pyparsing import ( + Word, + alphanums, + Keyword, + Group, + Forward, + Suppress, + OneOrMore, + oneOf, +) import re alphabet_ranges = [ ##CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block) - [int("0400",16), int("04FF",16)], + [int("0400", 16), int("04FF", 16)], ##THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block) - [int("0E00",16), int("0E7F",16)], + [int("0E00", 16), int("0E7F", 16)], ##ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F) ) - [int("0600",16), int("07FF",16)], + [int("0600", 16), int("07FF", 16)], ##CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - [int("0400",16), int("09FF",16)], - #JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system - [int("3040",16), int("30FF",16)], - #KOREAN : https://en.wikipedia.org/wiki/Hangul - [int("AC00",16), int("D7AF",16)], - [int("1100",16), int("11FF",16)], - [int("3130",16), int("318F",16)], - [int("3200",16), int("32FF",16)], - [int("A960",16), int("A97F",16)], - [int("D7B0",16), int("D7FF",16)], - [int("FF00",16), int("FFEF",16)], + [int("0400", 16), int("09FF", 16)], + # JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system + [int("3040", 16), int("30FF", 16)], + # KOREAN : https://en.wikipedia.org/wiki/Hangul + [int("AC00", 16), int("D7AF", 16)], + [int("1100", 16), int("11FF", 16)], + [int("3130", 16), int("318F", 16)], + [int("3200", 16), int("32FF", 16)], + [int("A960", 16), int("A97F", 16)], + [int("D7B0", 16), int("D7FF", 16)], + [int("FF00", 16), int("FFEF", 16)], ] -class BooleanSearchParser: - def __init__(self,only_parse=False): + +class BooleanSearchParser: + def __init__(self, only_parse=False): self._methods = { - 'and': self.evaluateAnd, - 'or': self.evaluateOr, - 'not': self.evaluateNot, - 'parenthesis': self.evaluateParenthesis, - 'quotes': self.evaluateQuotes, - 'word': self.evaluateWord, - 'wordwildcardprefix': self.evaluateWordWildcardPrefix, - 'wordwildcardsufix': self.evaluateWordWildcardSufix, + "and": self.evaluateAnd, + "or": self.evaluateOr, + "not": self.evaluateNot, + "parenthesis": self.evaluateParenthesis, + "quotes": self.evaluateQuotes, + "word": self.evaluateWord, + "wordwildcardprefix": self.evaluateWordWildcardPrefix, + "wordwildcardsufix": self.evaluateWordWildcardSufix, } - self._parser = self.parser() - self.text = '' - self.words = [] + self._parser = self.parser() + self.text = "" + self.words = [] def parser(self): """ @@ -141,47 +151,54 @@ class BooleanSearchParser: alphabet = alphanums - #suport for non-western alphabets + # suport for non-western alphabets for r in alphabet_ranges: - alphabet += ''.join(chr(c) for c in range(*r) if not chr(c).isspace()) - - operatorWord = Group( - Word(alphabet + '*') - ).setResultsName('word*') + alphabet += "".join(chr(c) for c in range(*r) if not chr(c).isspace()) + operatorWord = Group(Word(alphabet + "*")).setResultsName("word*") operatorQuotesContent = Forward() - operatorQuotesContent << ( - (operatorWord + operatorQuotesContent) | operatorWord - ) + operatorQuotesContent << ((operatorWord + operatorQuotesContent) | operatorWord) - operatorQuotes = Group( - Suppress('"') + operatorQuotesContent + Suppress('"') - ).setResultsName("quotes") | operatorWord + operatorQuotes = ( + Group(Suppress('"') + operatorQuotesContent + Suppress('"')).setResultsName( + "quotes" + ) + | operatorWord + ) - operatorParenthesis = Group( - Suppress("(") + operatorOr + Suppress(")") - ).setResultsName("parenthesis") | operatorQuotes + operatorParenthesis = ( + Group(Suppress("(") + operatorOr + Suppress(")")).setResultsName( + "parenthesis" + ) + | operatorQuotes + ) operatorNot = Forward() - operatorNot << (Group( - Suppress(Keyword("not", caseless=True)) + operatorNot - ).setResultsName("not") | operatorParenthesis) + operatorNot << ( + Group(Suppress(Keyword("not", caseless=True)) + operatorNot).setResultsName( + "not" + ) + | operatorParenthesis + ) operatorAnd = Forward() operatorAnd << ( Group( operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd - ).setResultsName("and")| - Group( + ).setResultsName("and") + | Group( operatorNot + OneOrMore(~oneOf("and or") + operatorAnd) - ).setResultsName("and") | - operatorNot + ).setResultsName("and") + | operatorNot ) - operatorOr << (Group( - operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr - ).setResultsName("or") | operatorAnd) + operatorOr << ( + Group( + operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr + ).setResultsName("or") + | operatorAnd + ) return operatorOr.parseString @@ -204,26 +221,26 @@ class BooleanSearchParser: function GetQuoted to only return the subset of ID's that contain the literal string. """ - #r = set() + # r = set() r = False search_terms = [] for item in argument: search_terms.append(item[0]) r = r and self.evaluate(item) - return self.GetQuotes(' '.join(search_terms), r) + return self.GetQuotes(" ".join(search_terms), r) def evaluateWord(self, argument): wildcard_count = argument[0].count("*") if wildcard_count > 0: if wildcard_count == 1 and argument[0].startswith("*"): - return self.GetWordWildcard(argument[0][1:], method = "endswith") + return self.GetWordWildcard(argument[0][1:], method="endswith") if wildcard_count == 1 and argument[0].endswith("*"): - return self.GetWordWildcard(argument[0][:-1], method = "startswith") + return self.GetWordWildcard(argument[0][:-1], method="startswith") else: - _regex = argument[0].replace("*",".+") + _regex = argument[0].replace("*", ".+") matched = False for w in self.words: - matched = bool(re.search(_regex,w)) + matched = bool(re.search(_regex, w)) if matched: break return matched @@ -231,10 +248,10 @@ class BooleanSearchParser: return self.GetWord(argument[0]) def evaluateWordWildcardPrefix(self, argument): - return self.GetWordWildcard(argument[0], method = "endswith") + return self.GetWordWildcard(argument[0], method="endswith") def evaluateWordWildcardSufix(self, argument): - return self.GetWordWildcard(argument[0], method = "startswith") + return self.GetWordWildcard(argument[0], method="startswith") def evaluate(self, argument): return self._methods[argument.getName()](argument) @@ -245,10 +262,10 @@ class BooleanSearchParser: def GetWord(self, word): return word in self.words - def GetWordWildcard(self, word, method = "startswith"): + def GetWordWildcard(self, word, method="startswith"): matched = False for w in self.words: - matched = getattr(w,method)(word) + matched = getattr(w, method)(word) if matched: break return matched @@ -265,24 +282,22 @@ class BooleanSearchParser: def GetQuotes(self, search_string, tmp_result): return search_string in self.text - def GetNot(self, not_set): return not not_set - - def _split_words(self,text): + def _split_words(self, text): words = [] """ >>> import string >>> string.punctuation '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' """ - #it will keep @, # and - #usernames and hashtags can contain dots, so a double check is done - r = re.compile(r'[\s{}]+'.format(re.escape('!"$%&\'()*+,-/:;<=>?[\\]^`{|}~'))) - _words = r.split(text) + # it will keep @, # and + # usernames and hashtags can contain dots, so a double check is done + r = re.compile(r"[\s{}]+".format(re.escape("!\"$%&'()*+,-/:;<=>?[\\]^`{|}~"))) + _words = r.split(text) for _w in _words: - if '.' in _w and not _w.startswith("#") and not _w.startswith("@"): + if "." in _w and not _w.startswith("#") and not _w.startswith("@"): for __w in _w.split("."): words.append(__w) continue @@ -291,9 +306,9 @@ class BooleanSearchParser: return words - def match(self,text,expr): - self.text = text - self.words = self._split_words(text) + def match(self, text, expr): + self.text = text + self.words = self._split_words(text) return self.Parse(expr) @@ -305,72 +320,132 @@ class ParserTest(BooleanSearchParser): def Test(self): exprs = { - '0' : 'help', - '1' : 'help or hulp', - '2' : 'help and hulp', - '3' : 'help hulp', - '4' : 'help and hulp or hilp', - '5' : 'help or hulp and hilp', - '6' : 'help or hulp or hilp or halp', - '7' : '(help or hulp) and (hilp or halp)', - '8' : 'help and (hilp or halp)', - '9' : '(help and (hilp or halp)) or hulp', - '10': 'not help', - '11': 'not hulp and halp', - '12': 'not (help and halp)', - '13': '"help me please"', - '14': '"help me please" or hulp', - '15': '"help me please" or (hulp and halp)', - '16': 'help*', - '17': 'help or hulp*', - '18': 'help* and hulp', - '19': 'help and hulp* or hilp', - '20': 'help* or hulp or hilp or halp', - '21': '(help or hulp*) and (hilp* or halp)', - '22': 'help* and (hilp* or halp*)', - '23': '(help and (hilp* or halp)) or hulp*', - '24': 'not help* and halp', - '25': 'not (help* and helpe*)', - '26': '"help* me please"', - '27': '"help* me* please" or hulp*', - '28': '"help me please*" or (hulp and halp)', - '29': '"help me please" not (hulp and halp)', - '30': '"help me please" hulp', - '31': 'help and hilp and not holp', - '32': 'help hilp not holp', - '33': 'help hilp and not holp', - '34': '*lp and halp', - '35': '*신은 and 어떠세요', + "0": "help", + "1": "help or hulp", + "2": "help and hulp", + "3": "help hulp", + "4": "help and hulp or hilp", + "5": "help or hulp and hilp", + "6": "help or hulp or hilp or halp", + "7": "(help or hulp) and (hilp or halp)", + "8": "help and (hilp or halp)", + "9": "(help and (hilp or halp)) or hulp", + "10": "not help", + "11": "not hulp and halp", + "12": "not (help and halp)", + "13": '"help me please"', + "14": '"help me please" or hulp', + "15": '"help me please" or (hulp and halp)', + "16": "help*", + "17": "help or hulp*", + "18": "help* and hulp", + "19": "help and hulp* or hilp", + "20": "help* or hulp or hilp or halp", + "21": "(help or hulp*) and (hilp* or halp)", + "22": "help* and (hilp* or halp*)", + "23": "(help and (hilp* or halp)) or hulp*", + "24": "not help* and halp", + "25": "not (help* and helpe*)", + "26": '"help* me please"', + "27": '"help* me* please" or hulp*', + "28": '"help me please*" or (hulp and halp)', + "29": '"help me please" not (hulp and halp)', + "30": '"help me please" hulp', + "31": "help and hilp and not holp", + "32": "help hilp not holp", + "33": "help hilp and not holp", + "34": "*lp and halp", + "35": "*신은 and 어떠세요", } texts_matcheswith = { "halp thinks he needs help": [ - "25", "22", "20", "21", "11", "17", "16", "23", "34", "1", "0", "5", "7", "6", "9", "8" - ], - "he needs halp": [ - "24", "25", "20", "11", "10", "12", "34", "6" - ], - "help": [ - "25", "20", "12", "17", "16", "1", "0", "5", "6" + "25", + "22", + "20", + "21", + "11", + "17", + "16", + "23", + "34", + "1", + "0", + "5", + "7", + "6", + "9", + "8", ], + "he needs halp": ["24", "25", "20", "11", "10", "12", "34", "6"], + "help": ["25", "20", "12", "17", "16", "1", "0", "5", "6"], "help hilp": [ - "25", "22", "20", "32", "21", "12", "17", "16", "19", "31", "23", "1", "0", "5", "4", "7", "6", "9", "8", "33" + "25", + "22", + "20", + "32", + "21", + "12", + "17", + "16", + "19", + "31", + "23", + "1", + "0", + "5", + "4", + "7", + "6", + "9", + "8", + "33", ], "help me please hulp": [ - "30", "25", "27", "20", "13", "12", "15", "14", "17", "16", "19", "18", "23", "29", "1", "0", "3", "2", "5", "4", "6", "9" - ], - "helper": [ - "20", "10", "12", "16" + "30", + "25", + "27", + "20", + "13", + "12", + "15", + "14", + "17", + "16", + "19", + "18", + "23", + "29", + "1", + "0", + "3", + "2", + "5", + "4", + "6", + "9", ], + "helper": ["20", "10", "12", "16"], "hulp hilp": [ - "25", "27", "20", "21", "10", "12", "14", "17", "19", "23", "1", "5", "4", "7", "6", "9" + "25", + "27", + "20", + "21", + "10", + "12", + "14", + "17", + "19", + "23", + "1", + "5", + "4", + "7", + "6", + "9", ], - "nothing": [ - "25", "10", "12" - ], - "안녕하세요, 당신은 어떠세요?": [ - "10", "12", "25", "35" - ] + "nothing": ["25", "10", "12"], + "안녕하세요, 당신은 어떠세요?": ["10", "12", "25", "35"], } all_ok = True @@ -382,16 +457,17 @@ class ParserTest(BooleanSearchParser): test_passed = sorted(matches) == sorted(_matches) if not test_passed: - print('Failed', repr(text), 'expected', matches, 'matched', _matches) + print("Failed", repr(text), "expected", matches, "matched", _matches) all_ok = all_ok and test_passed return all_ok -if __name__=='__main__': + +if __name__ == "__main__": if ParserTest().Test(): - print ('All tests OK') + print("All tests OK") exit(0) else: - print ('One or more tests FAILED') + print("One or more tests FAILED") exit(1) |