"""
Boolean Search query parser (Based on searchparser: https://github.com/pyparsing/pyparsing/blob/master/examples/searchparser.py)

version 2018-07-22

This search query parser uses the excellent Pyparsing module
(http://pyparsing.sourceforge.net/) to parse search queries by users.
It handles:

* 'and', 'or' and implicit 'and' operators;
* parentheses;
* quoted strings;
* wildcards at the end of a search term (help*);
* wildcards at the beginning of a search term (*lp);
* non-western languages

Requirements:
* Python
* Pyparsing

SAMPLE USAGE:
from booleansearchparser import BooleanSearchParser
from __future__ import print_function
bsp = BooleanSearchParser()
text = u"wildcards at the beginning of a search term "
exprs= [
    u"*cards and term", #True
    u"wild* and term",  #True
    u"not terms",       #True
    u"terms or begin",  #False
]
for expr in exprs:
    print (bsp.match(text,expr))

#non-western samples
text = u"안녕하세요, 당신은 어떠세요?"
exprs= [
    u"*신은 and 어떠세요", #True
    u"not 당신은",       #False
    u"당신 or 당",  #False
]
for expr in exprs:
    print (bsp.match(text,expr))
-------------------------------------------------------------------------------
Copyright (c) 2006, Estrate, the Netherlands
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
* Neither the name of Estrate nor the names of its contributors may be used
  to endorse or promote products derived from this software without specific
  prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CONTRIBUTORS:
- Steven Mooij
- Rudolph Froger
- Paul McGuire
- Guiem Bosch
- Francesc Garcia

TODO:
- add more docs
- ask someone to check my English texts
- add more kinds of wildcards ('*' at the beginning and '*' inside a word)?

"""
from pyparsing import (
    Word,
    alphanums,
    CaselessKeyword,
    Group,
    Forward,
    Suppress,
    OneOrMore,
    one_of,
)
import re


# Updated on 02 Dec 2021 according to ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt
alphabet_ranges = [
    # CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)
    [int("0400", 16), int("04FF", 16)],
    # ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F))
    [int("0600", 16), int("07FF", 16)],
    # THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block)
    [int("0E00", 16), int("0E7F", 16)],
    # JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system (Hiragana (3040–309F) + Katakana (30A0–30FF))
    [int("3040", 16), int("30FF", 16)],
    # Enclosed CJK Letters and Months
    [int("3200", 16), int("32FF", 16)],
    # CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    [int("4E00", 16), int("9FFF", 16)],
    # KOREAN : https://en.wikipedia.org/wiki/Hangul
    [int("1100", 16), int("11FF", 16)],
    [int("3130", 16), int("318F", 16)],
    [int("A960", 16), int("A97F", 16)],
    [int("AC00", 16), int("D7AF", 16)],
    [int("D7B0", 16), int("D7FF", 16)],
    # Halfwidth and Fullwidth Forms
    [int("FF00", 16), int("FFEF", 16)],
]


class BooleanSearchParser:
    def __init__(self, only_parse=False):
        self._methods = {
            "and": self.evaluateAnd,
            "or": self.evaluateOr,
            "not": self.evaluateNot,
            "parenthesis": self.evaluateParenthesis,
            "quotes": self.evaluateQuotes,
            "word": self.evaluateWord,
            "wordwildcardprefix": self.evaluateWordWildcardPrefix,
            "wordwildcardsufix": self.evaluateWordWildcardSufix,
        }
        self._parser = self.parser()
        self.text = ""
        self.words = []

    def parser(self):
        """
        This function returns a parser.
        The grammar should be like most full text search engines (Google, Tsearch, Lucene).

        Grammar:
        - a query consists of alphanumeric words, with an optional '*'
          wildcard at the end or the beginning of a word
        - a sequence of words between quotes is a literal string
        - words can be used together by using operators ('and' or 'or')
        - words with operators can be grouped with parenthesis
        - a word or group of words can be preceded by a 'not' operator
        - the 'and' operator precedes an 'or' operator
        - if an operator is missing, use an 'and' operator
        """
        operatorOr = Forward()

        alphabet = alphanums

        # support for non-western alphabets
        for lo, hi in alphabet_ranges:
            alphabet += "".join(chr(c) for c in range(lo, hi + 1) if not chr(c).isspace())

        operatorWord = Group(Word(alphabet + "*")).set_results_name("word*")

        operatorQuotesContent = Forward()
        operatorQuotesContent << ((operatorWord + operatorQuotesContent) | operatorWord)

        operatorQuotes = (
            Group(Suppress('"') + operatorQuotesContent + Suppress('"')).set_results_name(
                "quotes"
            )
            | operatorWord
        )

        operatorParenthesis = (
            Group(Suppress("(") + operatorOr + Suppress(")")).set_results_name(
                "parenthesis"
            )
            | operatorQuotes
        )

        operatorNot = Forward()
        operatorNot << (
            Group(Suppress(CaselessKeyword("not")) + operatorNot).set_results_name(
                "not"
            )
            | operatorParenthesis
        )

        operatorAnd = Forward()
        operatorAnd << (
            Group(
                operatorNot + Suppress(CaselessKeyword("and")) + operatorAnd
            ).set_results_name("and")
            | Group(
                operatorNot + OneOrMore(~one_of("and or") + operatorAnd)
            ).set_results_name("and")
            | operatorNot
        )

        operatorOr << (
            Group(
                operatorAnd + Suppress(CaselessKeyword("or")) + operatorOr
            ).set_results_name("or")
            | operatorAnd
        )

        return operatorOr.parse_string

    def evaluateAnd(self, argument):
        return all(self.evaluate(arg) for arg in argument)

    def evaluateOr(self, argument):
        return any(self.evaluate(arg) for arg in argument)

    def evaluateNot(self, argument):
        return self.GetNot(self.evaluate(argument[0]))

    def evaluateParenthesis(self, argument):
        return self.evaluate(argument[0])

    def evaluateQuotes(self, argument):
        """Evaluate quoted strings

        First is does an 'and' on the individual search terms, then it asks the
        function GetQuoted to only return the subset of ID's that contain the
        literal string.
        """
        # r = set()
        r = False
        search_terms = []
        for item in argument:
            search_terms.append(item[0])
            r = r and self.evaluate(item)
        return self.GetQuotes(" ".join(search_terms), r)

    def evaluateWord(self, argument):
        wildcard_count = argument[0].count("*")
        if wildcard_count > 0:
            if wildcard_count == 1 and argument[0].startswith("*"):
                return self.GetWordWildcard(argument[0][1:], method="endswith")
            if wildcard_count == 1 and argument[0].endswith("*"):
                return self.GetWordWildcard(argument[0][:-1], method="startswith")
            else:
                _regex = argument[0].replace("*", ".+")
                matched = False
                for w in self.words:
                    matched = bool(re.search(_regex, w))
                    if matched:
                        break
                return matched

        return self.GetWord(argument[0])

    def evaluateWordWildcardPrefix(self, argument):
        return self.GetWordWildcard(argument[0], method="endswith")

    def evaluateWordWildcardSufix(self, argument):
        return self.GetWordWildcard(argument[0], method="startswith")

    def evaluate(self, argument):
        return self._methods[argument.getName()](argument)

    def Parse(self, query):
        return self.evaluate(self._parser(query)[0])

    def GetWord(self, word):
        return word in self.words

    def GetWordWildcard(self, word, method="startswith"):
        matched = False
        for w in self.words:
            matched = getattr(w, method)(word)
            if matched:
                break
        return matched

    """
    def GetKeyword(self, name, value):
        return set()

    def GetBetween(self, min, max):
        print (min,max)
        return set()
    """

    def GetQuotes(self, search_string, tmp_result):
        return search_string in self.text

    def GetNot(self, not_set):
        return not not_set

    def _split_words(self, text):
        words = []
        """
        >>> import string
        >>> string.punctuation
        '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        """
        # it will keep @, # and
        # usernames and hashtags can contain dots, so a double check is done
        r = re.compile(r"[\s{}]+".format(re.escape("!\"$%&'()*+,-/:;<=>?[\\]^`{|}~")))
        _words = r.split(text)
        for _w in _words:
            if "." in _w and not _w.startswith("#") and not _w.startswith("@"):
                for __w in _w.split("."):
                    words.append(__w)
                continue

            words.append(_w)

        return words

    def match(self, text, expr):
        self.text = text
        self.words = self._split_words(text)

        return self.Parse(expr)


class ParserTest(BooleanSearchParser):
    """Tests the parser with some search queries
    tests contains a dictionary with tests and expected results.
    """

    def Test(self):
        exprs = {
            "0": "help",
            "1": "help or hulp",
            "2": "help and hulp",
            "3": "help hulp",
            "4": "help and hulp or hilp",
            "5": "help or hulp and hilp",
            "6": "help or hulp or hilp or halp",
            "7": "(help or hulp) and (hilp or halp)",
            "8": "help and (hilp or halp)",
            "9": "(help and (hilp or halp)) or hulp",
            "10": "not help",
            "11": "not hulp and halp",
            "12": "not (help and halp)",
            "13": '"help me please"',
            "14": '"help me please" or hulp',
            "15": '"help me please" or (hulp and halp)',
            "16": "help*",
            "17": "help or hulp*",
            "18": "help* and hulp",
            "19": "help and hulp* or hilp",
            "20": "help* or hulp or hilp or halp",
            "21": "(help or hulp*) and (hilp* or halp)",
            "22": "help* and (hilp* or halp*)",
            "23": "(help and (hilp* or halp)) or hulp*",
            "24": "not help* and halp",
            "25": "not (help* and helpe*)",
            "26": '"help* me please"',
            "27": '"help* me* please" or hulp*',
            "28": '"help me please*" or (hulp and halp)',
            "29": '"help me please" not (hulp and halp)',
            "30": '"help me please" hulp',
            "31": "help and hilp and not holp",
            "32": "help hilp not holp",
            "33": "help hilp and not holp",
            "34": "*lp and halp",
            "35": "*신은 and 어떠세요",
        }

        texts_matcheswith = {
            "halp thinks he needs help": [
                "25",
                "22",
                "20",
                "21",
                "11",
                "17",
                "16",
                "23",
                "34",
                "1",
                "0",
                "5",
                "7",
                "6",
                "9",
                "8",
            ],
            "he needs halp": ["24", "25", "20", "11", "10", "12", "34", "6"],
            "help": ["25", "20", "12", "17", "16", "1", "0", "5", "6"],
            "help hilp": [
                "25",
                "22",
                "20",
                "32",
                "21",
                "12",
                "17",
                "16",
                "19",
                "31",
                "23",
                "1",
                "0",
                "5",
                "4",
                "7",
                "6",
                "9",
                "8",
                "33",
            ],
            "help me please hulp": [
                "30",
                "25",
                "27",
                "20",
                "13",
                "12",
                "15",
                "14",
                "17",
                "16",
                "19",
                "18",
                "23",
                "29",
                "1",
                "0",
                "3",
                "2",
                "5",
                "4",
                "6",
                "9",
            ],
            "helper": ["20", "10", "12", "16"],
            "hulp hilp": [
                "25",
                "27",
                "20",
                "21",
                "10",
                "12",
                "14",
                "17",
                "19",
                "23",
                "1",
                "5",
                "4",
                "7",
                "6",
                "9",
            ],
            "nothing": ["25", "10", "12"],
            "안녕하세요, 당신은 어떠세요?": ["10", "12", "25", "35"],
        }

        all_ok = True
        for text, matches in texts_matcheswith.items():
            _matches = []
            for _id, expr in exprs.items():
                if self.match(text, expr):
                    _matches.append(_id)

            test_passed = sorted(matches) == sorted(_matches)
            if not test_passed:
                print("Failed", repr(text), "expected", matches, "matched", _matches)

            all_ok = all_ok and test_passed

        # Tests for non western characters, should fail with
        # pyparsing.exceptions.ParseException under the previous
        # configuration
        non_western_exprs = {
            "0": "*",
            "1": "ヿ",  # Edge character
            "2": "亀",  # Character in CJK block
            "3": "ヿ or 亀",
            "4": "ヿ and 亀",
            "5": "not ヿ"
        }

        non_western_texts_matcheswith = {
            "안녕하세요, 당신은 어떠세요?": ["0", "5"],
            "ヿ": ["0", "1", "3"],
            "亀": ["0", "2", "3", "5"],
            "亀 ヿ": ["0", "1", "2", "3", "4"],
        }

        for text, matches in non_western_texts_matcheswith.items():
            _matches = []
            for _id, expr in non_western_exprs.items():
                if self.match(text, expr):
                    _matches.append(_id)

            test_passed = sorted(matches) == sorted(_matches)
            if not test_passed:
                print("Failed", repr(text), "expected", matches, "matched", _matches)

            all_ok = all_ok and test_passed

        return all_ok


if __name__ == "__main__":
    if ParserTest().Test():
        print("All tests OK")
        exit(0)
    else:
        print("One or more tests FAILED")
        exit(1)