author | kotfu <jared@kotfu.net> | 2018-05-02 20:27:14 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-05-02 20:27:14 -0600 |
commit | 6b5c23c127e78ca4f9e30a1cbea76c61f10e418c (patch) | |
tree | 07ebf6b0b51227800b287a6d26d1951d449cbde6 /cmd2/parsing.py | |
parent | d37004d05d0be67f48f35d6986ea899a35bcf89f (diff) | |
parent | fa94eed90cf81b24e5b83c2b4c7e16025d849996 (diff) | |
download | cmd2-git-6b5c23c127e78ca4f9e30a1cbea76c61f10e418c.tar.gz | |
Merge pull request #370 from python-cmd2/ply
Switch parsing logic from pyparsing to shlex
Diffstat (limited to 'cmd2/parsing.py')
-rw-r--r-- | cmd2/parsing.py | 446 |
1 file changed, 446 insertions, 0 deletions
diff --git a/cmd2/parsing.py b/cmd2/parsing.py
new file mode 100644
index 00000000..908e9272
--- /dev/null
+++ b/cmd2/parsing.py
@@ -0,0 +1,446 @@
+#
+# -*- coding: utf-8 -*-
+"""Statement parsing classes for cmd2"""
+
+import re
+import shlex
+from typing import List, Tuple
+
+from . import constants
+from . import utils
+
+LINE_FEED = '\n'
+
+
+class Statement(str):
+    """String subclass with additional attributes to store the results of parsing.
+
+    The cmd module in the standard library passes commands around as a
+    string. To retain backwards compatibility, cmd2 does the same. However, we
+    need a place to capture the additional output of the command parsing, so we add
+    our own attributes to this subclass.
+
+    The string portion of the class contains the arguments, but not the command, nor
+    the output redirection clauses.
+
+    :var raw: string containing exactly what was input by the user
+    :type raw: str
+    :var command: the command, i.e. the first whitespace delimited word
+    :type command: str or None
+    :var multiline_command: if the command is a multiline command, the name of the
+                            command, otherwise None
+    :type multiline_command: str or None
+    :var args: the arguments to the command, not including any output
+               redirection or terminators. quoted arguments remain
+               quoted.
+    :type args: str or None
+    :var argv: a list of arguments a la sys.argv. Quotes, if any, are removed
+               from the elements of the list, and aliases and shortcuts
+               are expanded
+    :type argv: list
+    :var terminator: the character which terminated the multiline command, if
+                     there was one
+    :type terminator: str or None
+    :var suffix: characters appearing after the terminator but before output
+                 redirection, if any
+    :type suffix: str or None
+    :var pipe_to: if output was piped to a shell command, the shell command
+    :type pipe_to: str or None
+    :var output: if output was redirected, the redirection token, i.e. '>>'
+    :type output: str or None
+    :var output_to: if output was redirected, the destination, usually a filename
+    :type output_to: str or None
+
+    """
+    def __init__(self, obj):
+        super().__init__()
+        self.raw = str(obj)
+        self.command = None
+        self.multiline_command = None
+        self.args = None
+        self.argv = None
+        self.terminator = None
+        self.suffix = None
+        self.pipe_to = None
+        self.output = None
+        self.output_to = None
+
+    @property
+    def command_and_args(self):
+        """Combine command and args with a space separating them.
+
+        Quoted arguments remain quoted.
+        """
+        if self.command and self.args:
+            rtn = '{} {}'.format(self.command, self.args)
+        elif self.command:
+            # we are trusting that if we get here, self.args is None
+            rtn = self.command
+        else:
+            rtn = None
+        return rtn
+
+
+class StatementParser():
+    """Parse raw text into command components.
+
+    Shortcuts is a list of tuples with each tuple containing the shortcut and the expansion.
+    """
+    def __init__(
+            self,
+            allow_redirection=True,
+            terminators=None,
+            multiline_commands=None,
+            aliases=None,
+            shortcuts=None,
+    ):
+        self.allow_redirection = allow_redirection
+        if terminators is None:
+            self.terminators = [';']
+        else:
+            self.terminators = terminators
+        if multiline_commands is None:
+            self.multiline_commands = []
+        else:
+            self.multiline_commands = multiline_commands
+        if aliases is None:
+            self.aliases = {}
+        else:
+            self.aliases = aliases
+        if shortcuts is None:
+            self.shortcuts = []
+        else:
+            self.shortcuts = shortcuts
+
+        # this regular expression matches C-style comments and quoted
+        # strings, i.e. stuff between single or double quote marks
+        # it's used with _comment_replacer() to strip out the C-style
+        # comments, while leaving C-style comments that are inside either
+        # double or single quotes.
+        #
+        # this big regular expression can be broken down into 3 regular
+        # expressions that are OR'ed together.
+        #
+        #   /\*.*?(\*/|$)        matches C-style comments, with an optional
+        #                        closing '*/'. The optional closing '*/' is
+        #                        there to retain backward compatibility with
+        #                        the pyparsing implementation of cmd2 < 0.9.0
+        #   \'(?:\\.|[^\\\'])*\' matches a single quoted string, allowing
+        #                        for embedded backslash escaped single quote
+        #                        marks
+        #   "(?:\\.|[^\\"])*"    matches a double quoted string, allowing
+        #                        for embedded backslash escaped double quote
+        #                        marks
+        #
+        # by way of reminder, the (?:...) regular expression syntax is just
+        # a non-capturing version of regular parentheses. We need the non-
+        # capturing syntax because _comment_replacer() looks at match
+        # groups
+        self.comment_pattern = re.compile(
+            r'/\*.*?(\*/|$)|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+            re.DOTALL | re.MULTILINE
+        )
+
+        # aliases have to be a word, so make a regular expression
+        # that matches the first word in the line. This regex has two
+        # parts, the first parenthesis enclosed group matches one
+        # or more non-whitespace characters, and the second group
+        # matches either a whitespace character or the end of the
+        # string. We use \A and \Z to ensure we always match the
+        # beginning and end of a string that may have multiple
+        # lines
+        self.command_pattern = re.compile(r'\A(\S+)(\s|\Z)')
+
+    def tokenize(self, line: str) -> List[str]:
+        """Lex a string into a list of tokens.
+
+        Comments are removed, and shortcuts and aliases are expanded.
+        """
+
+        # strip C-style comments
+        # shlex will handle the python/shell style comments for us
+        line = re.sub(self.comment_pattern, self._comment_replacer, line)
+
+        # expand shortcuts and aliases
+        line = self._expand(line)
+
+        # split on whitespace
+        lexer = shlex.shlex(line, posix=False)
+        lexer.whitespace_split = True
+
+        # custom lexing
+        tokens = self._split_on_punctuation(list(lexer))
+        return tokens
+
+    def parse(self, rawinput: str) -> Statement:
+        """Tokenize the input and parse it into a Statement object, stripping
+        comments, expanding aliases and shortcuts, and extracting output
+        redirection directives.
+        """
+
+        # handle the special case/hardcoded terminator of a blank line
+        # we have to do this before we tokenize because tokenizing
+        # destroys all unquoted whitespace in the input
+        terminator = None
+        if rawinput[-1:] == LINE_FEED:
+            terminator = LINE_FEED
+
+        command = None
+        args = None
+        argv = None
+
+        # lex the input into a list of tokens
+        tokens = self.tokenize(rawinput)
+
+        # of the valid terminators, find the first one to occur in the input
+        terminator_pos = len(tokens)+1
+        for test_terminator in self.terminators:
+            try:
+                pos = tokens.index(test_terminator)
+                if pos < terminator_pos:
+                    terminator_pos = pos
+                    terminator = test_terminator
+                    break
+            except ValueError:
+                # the terminator is not in the tokens
+                pass
+
+        if terminator:
+            if terminator == LINE_FEED:
+                terminator_pos = len(tokens)+1
+            else:
+                terminator_pos = tokens.index(terminator)
+            # everything before the first terminator is the command and the args
+            argv = tokens[:terminator_pos]
+            (command, args) = self._command_and_args(argv)
+            # we will set the suffix later
+            # remove all the tokens before and including the terminator
+            tokens = tokens[terminator_pos+1:]
+        else:
+            (testcommand, testargs) = self._command_and_args(tokens)
+            if testcommand in self.multiline_commands:
+                # no terminator on this line but we have a multiline command
+                # everything else on the line is part of the args
+                # because redirectors can only be after a terminator
+                command = testcommand
+                args = testargs
+                argv = tokens
+                tokens = []
+
+        # check for output redirect
+        output = None
+        output_to = None
+        try:
+            output_pos = tokens.index('>')
+            output = '>'
+            output_to = ' '.join(tokens[output_pos+1:])
+            # remove all the tokens after the output redirect
+            tokens = tokens[:output_pos]
+        except ValueError:
+            pass
+
+        try:
+            output_pos = tokens.index('>>')
+            output = '>>'
+            output_to = ' '.join(tokens[output_pos+1:])
+            # remove all tokens after the output redirect
+            tokens = tokens[:output_pos]
+        except ValueError:
+            pass
+
+        # check for pipes
+        try:
+            # find the first pipe if it exists
+            pipe_pos = tokens.index('|')
+            # save everything after the first pipe
+            pipe_to = ' '.join(tokens[pipe_pos+1:])
+            # remove all the tokens after the pipe
+            tokens = tokens[:pipe_pos]
+        except ValueError:
+            # no pipe in the tokens
+            pipe_to = None
+
+        if terminator:
+            # whatever is left is the suffix
+            suffix = ' '.join(tokens)
+        else:
+            # no terminator, so whatever is left is the command and the args
+            suffix = None
+            if not command:
+                # command could already have been set, if so, don't set it again
+                argv = tokens
+                (command, args) = self._command_and_args(argv)
+
+        # set multiline
+        if command in self.multiline_commands:
+            multiline_command = command
+        else:
+            multiline_command = None
+
+        # build the statement
+        # string representation of args must be an empty string instead of
+        # None for compatibility with standard library cmd
+        statement = Statement('' if args is None else args)
+        statement.raw = rawinput
+        statement.command = command
+        # if there are no args we will use None since we don't have to worry
+        # about compatibility with standard library cmd
+        statement.args = args
+        statement.argv = list(map(lambda x: utils.strip_quotes(x), argv))
+        statement.terminator = terminator
+        statement.output = output
+        statement.output_to = output_to
+        statement.pipe_to = pipe_to
+        statement.suffix = suffix
+        statement.multiline_command = multiline_command
+        return statement
+
+    def parse_command_only(self, rawinput: str) -> Statement:
+        """Partially parse input into a Statement object. The command is
+        identified, and shortcuts and aliases are expanded.
+        Terminators, multiline commands, and output redirection are not
+        parsed.
+        """
+        # lex the input into a list of tokens
+        tokens = self.tokenize(rawinput)
+
+        # parse out the command and everything else
+        (command, args) = self._command_and_args(tokens)
+
+        # build the statement
+        # string representation of args must be an empty string instead of
+        # None for compatibility with standard library cmd
+        statement = Statement('' if args is None else args)
+        statement.raw = rawinput
+        statement.command = command
+        statement.args = args
+        statement.argv = tokens
+        return statement
+
+    def _expand(self, line: str) -> str:
+        """Expand shortcuts and aliases"""
+
+        # expand aliases
+        # make a copy of aliases so we can edit it
+        tmp_aliases = list(self.aliases.keys())
+        keep_expanding = bool(tmp_aliases)
+        while keep_expanding:
+            for cur_alias in tmp_aliases:
+                keep_expanding = False
+                # apply our regex to line
+                match = self.command_pattern.search(line)
+                if match:
+                    # we got a match, extract the command
+                    command = match.group(1)
+                    if command == cur_alias:
+                        # rebuild line with the expanded alias
+                        line = self.aliases[cur_alias] + match.group(2) + line[match.end(2):]
+                        tmp_aliases.remove(cur_alias)
+                        keep_expanding = bool(tmp_aliases)
+                        break
+
+        # expand shortcuts
+        for (shortcut, expansion) in self.shortcuts:
+            if line.startswith(shortcut):
+                # If the next character after the shortcut isn't a space, then insert one
+                shortcut_len = len(shortcut)
+                if len(line) == shortcut_len or line[shortcut_len] != ' ':
+                    expansion += ' '
+
+                # Expand the shortcut
+                line = line.replace(shortcut, expansion, 1)
+                break
+        return line
+
+    @staticmethod
+    def _command_and_args(tokens: List[str]) -> Tuple[str, str]:
+        """Given a list of tokens, return a tuple of the command
+        and the args as a string.
+
+        The args string will be '' instead of None to retain backwards compatibility
+        with cmd in the standard library.
+        """
+        command = None
+        args = None
+
+        if tokens:
+            command = tokens[0]
+
+            if len(tokens) > 1:
+                args = ' '.join(tokens[1:])
+
+        return (command, args)
+
+    @staticmethod
+    def _comment_replacer(match):
+        matched_string = match.group(0)
+        if matched_string.startswith('/'):
+            # the matched string was a comment, so remove it
+            return ''
+        # the matched string was a quoted string, return the match
+        return matched_string
+
+    def _split_on_punctuation(self, tokens: List[str]) -> List[str]:
+        """
+        Further splits tokens from a command line using punctuation characters
+        as word breaks when they are in unquoted strings. Each run of punctuation
+        characters is treated as a single token.
+
+        :param tokens: the tokens as parsed by shlex
+        :return: the punctuated tokens
+        """
+        punctuation = []
+        punctuation.extend(self.terminators)
+        if self.allow_redirection:
+            punctuation.extend(constants.REDIRECTION_CHARS)
+
+        punctuated_tokens = []
+
+        for cur_initial_token in tokens:
+
+            # Save tokens up to 1 character in length or quoted tokens. No need to parse these.
+            if len(cur_initial_token) <= 1 or cur_initial_token[0] in constants.QUOTES:
+                punctuated_tokens.append(cur_initial_token)
+                continue
+
+            # Iterate over each character in this token
+            cur_index = 0
+            cur_char = cur_initial_token[cur_index]
+
+            # Keep track of the token we are building
+            new_token = ''
+
+            while True:
+                if cur_char not in punctuation:
+
+                    # Keep appending to new_token until we hit a punctuation char
+                    while cur_char not in punctuation:
+                        new_token += cur_char
+                        cur_index += 1
+                        if cur_index < len(cur_initial_token):
+                            cur_char = cur_initial_token[cur_index]
+                        else:
+                            break
+
+                else:
+                    cur_punc = cur_char
+
+                    # Keep appending to new_token until we hit something other than cur_punc
+                    while cur_char == cur_punc:
+                        new_token += cur_char
+                        cur_index += 1
+                        if cur_index < len(cur_initial_token):
+                            cur_char = cur_initial_token[cur_index]
+                        else:
+                            break
+
+                # Save the new token
+                punctuated_tokens.append(new_token)
+                new_token = ''
+
+                # Check if we've viewed all characters
+                if cur_index >= len(cur_initial_token):
+                    break
+
+        return punctuated_tokens
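To give reviewers a quick feel for the new shlex-based parser, here is a minimal usage sketch. It is not part of this commit: it assumes a 0.9-era cmd2 is installed so that this module is importable as `cmd2.parsing`, and the command names, alias, and shortcut passed to the constructor are made-up example values. The attribute names come from the `Statement` class in the diff above.

```python
# Minimal sketch (not part of this commit) exercising the StatementParser added above.
# Assumes cmd2 is installed so `from cmd2.parsing import StatementParser` resolves.
from cmd2.parsing import StatementParser

parser = StatementParser(
    terminators=[';'],                  # same default the constructor applies
    multiline_commands=['orate'],       # hypothetical multiline command name
    aliases={'greet': 'speak hello'},   # hypothetical alias
    shortcuts=[('!', 'shell')],         # hypothetical shortcut
)

statement = parser.parse('speak hello world; now > /tmp/out.txt')

print(statement.command)      # 'speak'
print(statement.args)         # 'hello world'
print(statement.argv)         # ['speak', 'hello', 'world']
print(statement.terminator)   # ';'
print(statement.suffix)       # 'now'
print(statement.output)       # '>'
print(statement.output_to)    # '/tmp/out.txt'
print(str(statement))         # 'hello world' -- the string portion holds only the args
```

One behavior worth noting from the code: `_split_on_punctuation()` passes quoted tokens through untouched, so a terminator glued directly to a closing quote (e.g. `"hello world";`) stays inside that quoted token rather than being split out as a separate terminator.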