""" antlr_grammar.py Created on 4 sept. 2010 @author: luca Submitted by Luca DallOlio, September, 2010 (Minor updates by Paul McGuire, June, 2012) (Code idiom updates by Paul McGuire, April, 2019) """ from pyparsing import ( Word, ZeroOrMore, printables, Suppress, OneOrMore, Group, LineEnd, Optional, White, originalTextFor, hexnums, nums, Combine, Literal, Keyword, cStyleComment, Regex, Forward, MatchFirst, And, oneOf, alphas, alphanums, delimitedList, Char, ) # http://www.antlr.org/grammar/ANTLR/ANTLRv3.g ( QUOTE, APOS, EQ, LBRACK, RBRACK, LBRACE, RBRACE, LPAR, RPAR, ROOT, BANG, AT, TIL, SEMI, COLON, VERT, ) = map(Suppress, "\"'=[]{}()^!@~;:|") BSLASH = Literal("\\") keywords = ( SRC_, SCOPE_, OPTIONS_, TOKENS_, FRAGMENT, ID, LEXER, PARSER, GRAMMAR, TREE, CATCH, FINALLY, THROWS, PROTECTED, PUBLIC, PRIVATE, ) = map( Keyword, """src scope options tokens fragment id lexer parser grammar tree catch finally throws protected public private """.split(), ) KEYWORD = MatchFirst(keywords) # Tokens EOL = Suppress(LineEnd()) # $ SGL_PRINTABLE = Char(printables) singleTextString = originalTextFor( ZeroOrMore(~EOL + (White(" \t") | Word(printables))) ).leaveWhitespace() XDIGIT = hexnums INT = Word(nums) ESC = BSLASH + ( oneOf(list(r"nrtbf\">" + "'")) | ("u" + Word(hexnums, exact=4)) | SGL_PRINTABLE ) LITERAL_CHAR = ESC | ~(APOS | BSLASH) + SGL_PRINTABLE CHAR_LITERAL = APOS + LITERAL_CHAR + APOS STRING_LITERAL = APOS + Combine(OneOrMore(LITERAL_CHAR)) + APOS DOUBLE_QUOTE_STRING_LITERAL = '"' + ZeroOrMore(LITERAL_CHAR) + '"' DOUBLE_ANGLE_STRING_LITERAL = "<<" + ZeroOrMore(SGL_PRINTABLE) + ">>" TOKEN_REF = Word(alphas.upper(), alphanums + "_") RULE_REF = Word(alphas.lower(), alphanums + "_") ACTION_ESC = ( BSLASH.suppress() + APOS | BSLASH.suppress() | BSLASH.suppress() + (~(APOS | QUOTE) + SGL_PRINTABLE) ) ACTION_CHAR_LITERAL = APOS + (ACTION_ESC | ~(BSLASH | APOS) + SGL_PRINTABLE) + APOS ACTION_STRING_LITERAL = ( QUOTE + ZeroOrMore(ACTION_ESC | ~(BSLASH | QUOTE) + SGL_PRINTABLE) + QUOTE ) SRC = SRC_.suppress() + ACTION_STRING_LITERAL("file") + INT("line") id = TOKEN_REF | RULE_REF SL_COMMENT = ( Suppress("//") + Suppress("$ANTLR") + SRC | ZeroOrMore(~EOL + Word(printables)) + EOL ) ML_COMMENT = cStyleComment WS = OneOrMore( Suppress(" ") | Suppress("\t") | (Optional(Suppress("\r")) + Literal("\n")) ) WS_LOOP = ZeroOrMore(SL_COMMENT | ML_COMMENT) NESTED_ARG_ACTION = Forward() NESTED_ARG_ACTION << ( LBRACK + ZeroOrMore(NESTED_ARG_ACTION | ACTION_STRING_LITERAL | ACTION_CHAR_LITERAL) + RBRACK ) ARG_ACTION = NESTED_ARG_ACTION NESTED_ACTION = Forward() NESTED_ACTION << ( LBRACE + ZeroOrMore( NESTED_ACTION | SL_COMMENT | ML_COMMENT | ACTION_STRING_LITERAL | ACTION_CHAR_LITERAL ) + RBRACE ) ACTION = NESTED_ACTION + Optional("?") SCOPE = SCOPE_.suppress() OPTIONS = OPTIONS_.suppress() + LBRACE # + WS_LOOP + Suppress('{') TOKENS = TOKENS_.suppress() + LBRACE # + WS_LOOP + Suppress('{') TREE_BEGIN = ROOT + LPAR RANGE = Suppress("..") REWRITE = Suppress("->") # General Parser Definitions # Grammar heading optionValue = id | STRING_LITERAL | CHAR_LITERAL | INT | Literal("*").setName("s") option = Group(id("id") + EQ + optionValue("value"))("option") optionsSpec = OPTIONS + Group(OneOrMore(option + SEMI))("options") + RBRACE tokenSpec = ( Group(TOKEN_REF("token_ref") + (EQ + (STRING_LITERAL | CHAR_LITERAL)("lit")))( "token" ) + SEMI ) tokensSpec = TOKENS + Group(OneOrMore(tokenSpec))("tokens") + RBRACE attrScope = SCOPE_.suppress() + id + ACTION grammarType = LEXER + PARSER + TREE actionScopeName = id | LEXER("l") | PARSER("p") action = AT + Optional(actionScopeName + Suppress("::")) + id + ACTION grammarHeading = ( Optional(ML_COMMENT("ML_COMMENT")) + Optional(grammarType) + GRAMMAR + id("grammarName") + SEMI + Optional(optionsSpec) + Optional(tokensSpec) + ZeroOrMore(attrScope) + ZeroOrMore(action) ) modifier = PROTECTED | PUBLIC | PRIVATE | FRAGMENT ruleAction = AT + id + ACTION throwsSpec = THROWS.suppress() + delimitedList(id) ruleScopeSpec = ( (SCOPE_.suppress() + ACTION) | (SCOPE_.suppress() + delimitedList(id) + SEMI) | (SCOPE_.suppress() + ACTION + SCOPE_.suppress() + delimitedList(id) + SEMI) ) unary_op = oneOf("^ !") notTerminal = CHAR_LITERAL | TOKEN_REF | STRING_LITERAL terminal = ( CHAR_LITERAL | TOKEN_REF + Optional(ARG_ACTION) | STRING_LITERAL | "." ) + Optional(unary_op) block = Forward() notSet = TIL + (notTerminal | block) rangeNotPython = CHAR_LITERAL("c1") + RANGE + CHAR_LITERAL("c2") atom = Group( (rangeNotPython + Optional(unary_op)("op")) | terminal | (notSet + Optional(unary_op)("op")) | (RULE_REF + Optional(ARG_ACTION("arg")) + Optional(unary_op)("op")) ) element = Forward() treeSpec = ROOT + LPAR + element * (2,) + RPAR ebnfSuffix = oneOf("? * +") ebnf = block + Optional(ebnfSuffix("op") | "=>") elementNoOptionSpec = ( (id("result_name") + oneOf("= +=")("labelOp") + atom("atom") + Optional(ebnfSuffix)) | (id("result_name") + oneOf("= +=")("labelOp") + block + Optional(ebnfSuffix)) | atom("atom") + Optional(ebnfSuffix) | ebnf | ACTION | (treeSpec + Optional(ebnfSuffix)) ) # | SEMPRED ( '=>' -> GATED_SEMPRED | -> SEMPRED ) element <<= Group(elementNoOptionSpec)("element") # Do not ask me why group is needed twice... seems like the xml that you see is not always the real structure? alternative = Group(Group(OneOrMore(element))("elements")) rewrite = Optional(Literal("TODO REWRITE RULES TODO")) block <<= ( LPAR + Optional(Optional(optionsSpec("opts")) + COLON) + Group( alternative("a1") + rewrite + Group(ZeroOrMore(VERT + alternative("a2") + rewrite))("alternatives") )("block") + RPAR ) altList = ( alternative("a1") + rewrite + Group(ZeroOrMore(VERT + alternative("a2") + rewrite))("alternatives") ) exceptionHandler = CATCH.suppress() + ARG_ACTION + ACTION finallyClause = FINALLY.suppress() + ACTION exceptionGroup = (OneOrMore(exceptionHandler) + Optional(finallyClause)) | finallyClause ruleHeading = ( Optional(ML_COMMENT)("ruleComment") + Optional(modifier)("modifier") + id("ruleName") + Optional("!") + Optional(ARG_ACTION("arg")) + Optional(Suppress("returns") + ARG_ACTION("rt")) + Optional(throwsSpec) + Optional(optionsSpec) + Optional(ruleScopeSpec) + ZeroOrMore(ruleAction) ) rule = Group(ruleHeading + COLON + altList + SEMI + Optional(exceptionGroup))("rule") grammarDef = grammarHeading + Group(OneOrMore(rule))("rules") def grammar(): return grammarDef def __antlrAlternativesConverter(pyparsingRules, antlrBlock): rule = None if ( hasattr(antlrBlock, "alternatives") and antlrBlock.alternatives != "" and len(antlrBlock.alternatives) > 0 ): alternatives = [] alternatives.append(__antlrAlternativeConverter(pyparsingRules, antlrBlock.a1)) for alternative in antlrBlock.alternatives: alternatives.append( __antlrAlternativeConverter(pyparsingRules, alternative) ) rule = MatchFirst(alternatives)("anonymous_or") elif hasattr(antlrBlock, "a1") and antlrBlock.a1 != "": rule = __antlrAlternativeConverter(pyparsingRules, antlrBlock.a1) else: raise Exception("Not yet implemented") assert rule != None return rule def __antlrAlternativeConverter(pyparsingRules, antlrAlternative): elementList = [] for element in antlrAlternative.elements: rule = None if hasattr(element.atom, "c1") and element.atom.c1 != "": regex = r"[" + str(element.atom.c1[0]) + "-" + str(element.atom.c2[0] + "]") rule = Regex(regex)("anonymous_regex") elif hasattr(element, "block") and element.block != "": rule = __antlrAlternativesConverter(pyparsingRules, element.block) else: ruleRef = element.atom[0] assert ruleRef in pyparsingRules rule = pyparsingRules[ruleRef](ruleRef) if hasattr(element, "op") and element.op != "": if element.op == "+": rule = Group(OneOrMore(rule))("anonymous_one_or_more") elif element.op == "*": rule = Group(ZeroOrMore(rule))("anonymous_zero_or_more") elif element.op == "?": rule = Optional(rule) else: raise Exception("rule operator not yet implemented : " + element.op) rule = rule elementList.append(rule) if len(elementList) > 1: rule = Group(And(elementList))("anonymous_and") else: rule = elementList[0] assert rule is not None return rule def __antlrRuleConverter(pyparsingRules, antlrRule): rule = None rule = __antlrAlternativesConverter(pyparsingRules, antlrRule) assert rule != None rule(antlrRule.ruleName) return rule def antlrConverter(antlrGrammarTree): pyparsingRules = {} antlrTokens = {} for antlrToken in antlrGrammarTree.tokens: antlrTokens[antlrToken.token_ref] = antlrToken.lit for antlrTokenName, antlrToken in list(antlrTokens.items()): pyparsingRules[antlrTokenName] = Literal(antlrToken) antlrRules = {} for antlrRule in antlrGrammarTree.rules: antlrRules[antlrRule.ruleName] = antlrRule pyparsingRules[antlrRule.ruleName] = Forward() # antlr is a top down grammar for antlrRuleName, antlrRule in list(antlrRules.items()): pyparsingRule = __antlrRuleConverter(pyparsingRules, antlrRule) assert pyparsingRule != None pyparsingRules[antlrRuleName] <<= pyparsingRule return pyparsingRules if __name__ == "__main__": text = """\ grammar SimpleCalc; options { language = Python; } tokens { PLUS = '+' ; MINUS = '-' ; MULT = '*' ; DIV = '/' ; } /*------------------------------------------------------------------ * PARSER RULES *------------------------------------------------------------------*/ expr : term ( ( PLUS | MINUS ) term )* ; term : factor ( ( MULT | DIV ) factor )* ; factor : NUMBER ; /*------------------------------------------------------------------ * LEXER RULES *------------------------------------------------------------------*/ NUMBER : (DIGIT)+ ; /* WHITESPACE : ( '\t' | ' ' | '\r' | '\n'| '\u000C' )+ { $channel = HIDDEN; } ; */ fragment DIGIT : '0'..'9' ; """ grammar().validate() antlrGrammarTree = grammar().parseString(text) print(antlrGrammarTree.dump()) pyparsingRules = antlrConverter(antlrGrammarTree) pyparsingRule = pyparsingRules["expr"] pyparsingTree = pyparsingRule.parseString("2 - 5 * 42 + 7 / 25") print(pyparsingTree.dump())