author | kotfu <jared@kotfu.net> | 2018-05-02 20:27:14 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-05-02 20:27:14 -0600 |
commit | 6b5c23c127e78ca4f9e30a1cbea76c61f10e418c (patch) | |
tree | 07ebf6b0b51227800b287a6d26d1951d449cbde6 /cmd2/parsing.py | |
parent | d37004d05d0be67f48f35d6986ea899a35bcf89f (diff) | |
parent | fa94eed90cf81b24e5b83c2b4c7e16025d849996 (diff) | |
download | cmd2-git-6b5c23c127e78ca4f9e30a1cbea76c61f10e418c.tar.gz | |
Merge pull request #370 from python-cmd2/ply
Switch parsing logic from pyparsing to shlex
Diffstat (limited to 'cmd2/parsing.py')
-rw-r--r-- | cmd2/parsing.py | 446 |
1 file changed, 446 insertions, 0 deletions
diff --git a/cmd2/parsing.py b/cmd2/parsing.py
new file mode 100644
index 00000000..908e9272
--- /dev/null
+++ b/cmd2/parsing.py
@@ -0,0 +1,446 @@
+#
+# -*- coding: utf-8 -*-
+"""Statement parsing classes for cmd2"""
+
+import re
+import shlex
+from typing import List, Tuple
+
+from . import constants
+from . import utils
+
+LINE_FEED = '\n'
+
+
+class Statement(str):
+    """String subclass with additional attributes to store the results of parsing.
+
+    The cmd module in the standard library passes commands around as a
+    string. To retain backwards compatibility, cmd2 does the same. However, we
+    need a place to capture the additional output of the command parsing, so we add
+    our own attributes to this subclass.
+
+    The string portion of the class contains the arguments, but not the command, nor
+    the output redirection clauses.
+
+    :var raw: string containing exactly what was input by the user
+    :type raw: str
+    :var command: the command, i.e. the first whitespace delimited word
+    :type command: str or None
+    :var multiline_command: if the command is a multiline command, the name of the
+                            command, otherwise None
+    :type multiline_command: str or None
+    :var args: the arguments to the command, not including any output
+               redirection or terminators. quoted arguments remain
+               quoted.
+    :type args: str or None
+    :var argv: a list of arguments a la sys.argv. Quotes, if any, are removed
+               from the elements of the list, and aliases and shortcuts
+               are expanded
+    :type argv: list
+    :var terminator: the character which terminated the multiline command, if
+                     there was one
+    :type terminator: str or None
+    :var suffix: characters appearing after the terminator but before output
+                 redirection, if any
+    :type suffix: str or None
+    :var pipe_to: if output was piped to a shell command, the shell command
+    :type pipe_to: str or None
+    :var output: if output was redirected, the redirection token, i.e. '>>'
+    :type output: str or None
+    :var output_to: if output was redirected, the destination, usually a filename
+    :type output_to: str or None
+
+    """
+    def __init__(self, obj):
+        super().__init__()
+        self.raw = str(obj)
+        self.command = None
+        self.multiline_command = None
+        self.args = None
+        self.argv = None
+        self.terminator = None
+        self.suffix = None
+        self.pipe_to = None
+        self.output = None
+        self.output_to = None
+
+    @property
+    def command_and_args(self):
+        """Combine command and args with a space separating them.
+
+        Quoted arguments remain quoted.
+        """
+        if self.command and self.args:
+            rtn = '{} {}'.format(self.command, self.args)
+        elif self.command:
+            # we are trusting that if we get here, self.args is None
+            rtn = self.command
+        else:
+            rtn = None
+        return rtn
+
+
+class StatementParser():
+    """Parse raw text into command components.
+
+    Shortcuts is a list of tuples with each tuple containing the shortcut and the expansion.
+    """
+    def __init__(
+            self,
+            allow_redirection=True,
+            terminators=None,
+            multiline_commands=None,
+            aliases=None,
+            shortcuts=None,
+    ):
+        self.allow_redirection = allow_redirection
+        if terminators is None:
+            self.terminators = [';']
+        else:
+            self.terminators = terminators
+        if multiline_commands is None:
+            self.multiline_commands = []
+        else:
+            self.multiline_commands = multiline_commands
+        if aliases is None:
+            self.aliases = {}
+        else:
+            self.aliases = aliases
+        if shortcuts is None:
+            self.shortcuts = []
+        else:
+            self.shortcuts = shortcuts
+
+        # this regular expression matches C-style comments and quoted
+        # strings, i.e. stuff between single or double quote marks
+        # it's used with _comment_replacer() to strip out the C-style
+        # comments, while leaving C-style comments that are inside either
+        # double or single quotes.
+        #
+        # this big regular expression can be broken down into 3 regular
+        # expressions that are OR'ed together.
+        #
+        #   /\*.*?(\*/|$)        matches C-style comments, with an optional
+        #                        closing '*/'. The optional closing '*/' is
+        #                        there to retain backward compatibility with
+        #                        the pyparsing implementation of cmd2 < 0.9.0
+        #   \'(?:\\.|[^\\\'])*\' matches a single quoted string, allowing
+        #                        for embedded backslash escaped single quote
+        #                        marks
+        #   "(?:\\.|[^\\"])*"    matches a double quoted string, allowing
+        #                        for embedded backslash escaped double quote
+        #                        marks
+        #
+        # by way of reminder, the (?:...) regular expression syntax is just
+        # a non-capturing version of regular parentheses. We need the non-
+        # capturing syntax because _comment_replacer() looks at match
+        # groups
+        self.comment_pattern = re.compile(
+            r'/\*.*?(\*/|$)|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+            re.DOTALL | re.MULTILINE
+        )
+
+        # aliases have to be a word, so make a regular expression
+        # that matches the first word in the line. This regex has two
+        # parts, the first parenthesis enclosed group matches one
+        # or more non-whitespace characters, and the second group
+        # matches either a whitespace character or the end of the
+        # string. We use \A and \Z to ensure we always match the
+        # beginning and end of a string that may have multiple
+        # lines
+        self.command_pattern = re.compile(r'\A(\S+)(\s|\Z)')
+
+    def tokenize(self, line: str) -> List[str]:
+        """Lex a string into a list of tokens.
+
+        Comments are removed, and shortcuts and aliases are expanded.
+        """
+
+        # strip C-style comments
+        # shlex will handle the python/shell style comments for us
+        line = re.sub(self.comment_pattern, self._comment_replacer, line)
+
+        # expand shortcuts and aliases
+        line = self._expand(line)
+
+        # split on whitespace
+        lexer = shlex.shlex(line, posix=False)
+        lexer.whitespace_split = True
+
+        # custom lexing
+        tokens = self._split_on_punctuation(list(lexer))
+        return tokens
+
+    def parse(self, rawinput: str) -> Statement:
+        """Tokenize the input and parse it into a Statement object, stripping
+        comments, expanding aliases and shortcuts, and extracting output
+        redirection directives.
+        """
+
+        # handle the special case/hardcoded terminator of a blank line
+        # we have to do this before we tokenize because tokenizing
+        # destroys all unquoted whitespace in the input
+        terminator = None
+        if rawinput[-1:] == LINE_FEED:
+            terminator = LINE_FEED
+
+        command = None
+        args = None
+        argv = None
+
+        # lex the input into a list of tokens
+        tokens = self.tokenize(rawinput)
+
+        # of the valid terminators, find the first one to occur in the input
+        terminator_pos = len(tokens)+1
+        for test_terminator in self.terminators:
+            try:
+                pos = tokens.index(test_terminator)
+                if pos < terminator_pos:
+                    terminator_pos = pos
+                    terminator = test_terminator
+                    break
+            except ValueError:
+                # the terminator is not in the tokens
+                pass
+
+        if terminator:
+            if terminator == LINE_FEED:
+                terminator_pos = len(tokens)+1
+            else:
+                terminator_pos = tokens.index(terminator)
+            # everything before the first terminator is the command and the args
+            argv = tokens[:terminator_pos]
+            (command, args) = self._command_and_args(argv)
+            # we will set the suffix later
+            # remove all the tokens before and including the terminator
+            tokens = tokens[terminator_pos+1:]
+        else:
+            (testcommand, testargs) = self._command_and_args(tokens)
+            if testcommand in self.multiline_commands:
+                # no terminator on this line but we have a multiline command
+                # everything else on the line is part of the args
+                # because redirectors can only be after a terminator
+                command = testcommand
+                args = testargs
+                argv = tokens
+                tokens = []
+
+        # check for output redirect
+        output = None
+        output_to = None
+        try:
+            output_pos = tokens.index('>')
+            output = '>'
+            output_to = ' '.join(tokens[output_pos+1:])
+            # remove all the tokens after the output redirect
+            tokens = tokens[:output_pos]
+        except ValueError:
+            pass
+
+        try:
+            output_pos = tokens.index('>>')
+            output = '>>'
+            output_to = ' '.join(tokens[output_pos+1:])
+            # remove all tokens after the output redirect
+            tokens = tokens[:output_pos]
+        except ValueError:
+            pass
+
+        # check for pipes
+        try:
+            # find the first pipe if it exists
+            pipe_pos = tokens.index('|')
+            # save everything after the first pipe
+            pipe_to = ' '.join(tokens[pipe_pos+1:])
+            # remove all the tokens after the pipe
+            tokens = tokens[:pipe_pos]
+        except ValueError:
+            # no pipe in the tokens
+            pipe_to = None
+
+        if terminator:
+            # whatever is left is the suffix
+            suffix = ' '.join(tokens)
+        else:
+            # no terminator, so whatever is left is the command and the args
+            suffix = None
+            if not command:
+                # command could already have been set, if so, don't set it again
+                argv = tokens
+                (command, args) = self._command_and_args(argv)
+
+        # set multiline
+        if command in self.multiline_commands:
+            multiline_command = command
+        else:
+            multiline_command = None
+
+        # build the statement
+        # string representation of args must be an empty string instead of
+        # None for compatibility with standard library cmd
+        statement = Statement('' if args is None else args)
+        statement.raw = rawinput
+        statement.command = command
+        # if there are no args we will use None since we don't have to worry
+        # about compatibility with standard library cmd
+        statement.args = args
+        statement.argv = list(map(lambda x: utils.strip_quotes(x), argv))
+        statement.terminator = terminator
+        statement.output = output
+        statement.output_to = output_to
+        statement.pipe_to = pipe_to
+        statement.suffix = suffix
+        statement.multiline_command = multiline_command
+        return statement
+
+    def parse_command_only(self, rawinput: str) -> Statement:
+        """Partially parse input into a Statement object. The command is
+        identified, and shortcuts and aliases are expanded.
+        Terminators, multiline commands, and output redirection are not
+        parsed.
+        """
+        # lex the input into a list of tokens
+        tokens = self.tokenize(rawinput)
+
+        # parse out the command and everything else
+        (command, args) = self._command_and_args(tokens)
+
+        # build the statement
+        # string representation of args must be an empty string instead of
+        # None for compatibility with standard library cmd
+        statement = Statement('' if args is None else args)
+        statement.raw = rawinput
+        statement.command = command
+        statement.args = args
+        statement.argv = tokens
+        return statement
+
+    def _expand(self, line: str) -> str:
+        """Expand shortcuts and aliases"""
+
+        # expand aliases
+        # make a copy of aliases so we can edit it
+        tmp_aliases = list(self.aliases.keys())
+        keep_expanding = bool(tmp_aliases)
+        while keep_expanding:
+            for cur_alias in tmp_aliases:
+                keep_expanding = False
+                # apply our regex to line
+                match = self.command_pattern.search(line)
+                if match:
+                    # we got a match, extract the command
+                    command = match.group(1)
+                    if command == cur_alias:
+                        # rebuild line with the expanded alias
+                        line = self.aliases[cur_alias] + match.group(2) + line[match.end(2):]
+                        tmp_aliases.remove(cur_alias)
+                        keep_expanding = bool(tmp_aliases)
+                        break
+
+        # expand shortcuts
+        for (shortcut, expansion) in self.shortcuts:
+            if line.startswith(shortcut):
+                # If the next character after the shortcut isn't a space, then insert one
+                shortcut_len = len(shortcut)
+                if len(line) == shortcut_len or line[shortcut_len] != ' ':
+                    expansion += ' '
+
+                # Expand the shortcut
+                line = line.replace(shortcut, expansion, 1)
+                break
+        return line
+
+    @staticmethod
+    def _command_and_args(tokens: List[str]) -> Tuple[str, str]:
+        """Given a list of tokens, return a tuple of the command
+        and the args as a string.
+
+        The args string will be '' instead of None to retain backwards compatibility
+        with cmd in the standard library.
+        """
+        command = None
+        args = None
+
+        if tokens:
+            command = tokens[0]
+
+            if len(tokens) > 1:
+                args = ' '.join(tokens[1:])
+
+        return (command, args)
+
+    @staticmethod
+    def _comment_replacer(match):
+        matched_string = match.group(0)
+        if matched_string.startswith('/'):
+            # the matched string was a comment, so remove it
+            return ''
+        # the matched string was a quoted string, return the match
+        return matched_string
+
+    def _split_on_punctuation(self, tokens: List[str]) -> List[str]:
+        """
+        Further splits tokens from a command line using punctuation characters
+        as word breaks when they are in unquoted strings. Each run of punctuation
+        characters is treated as a single token.
+
+        :param tokens: the tokens as parsed by shlex
+        :return: the punctuated tokens
+        """
+        punctuation = []
+        punctuation.extend(self.terminators)
+        if self.allow_redirection:
+            punctuation.extend(constants.REDIRECTION_CHARS)
+
+        punctuated_tokens = []
+
+        for cur_initial_token in tokens:
+
+            # Save tokens up to 1 character in length or quoted tokens. No need to parse these.
+            if len(cur_initial_token) <= 1 or cur_initial_token[0] in constants.QUOTES:
+                punctuated_tokens.append(cur_initial_token)
+                continue
+
+            # Iterate over each character in this token
+            cur_index = 0
+            cur_char = cur_initial_token[cur_index]
+
+            # Keep track of the token we are building
+            new_token = ''
+
+            while True:
+                if cur_char not in punctuation:
+
+                    # Keep appending to new_token until we hit a punctuation char
+                    while cur_char not in punctuation:
+                        new_token += cur_char
+                        cur_index += 1
+                        if cur_index < len(cur_initial_token):
+                            cur_char = cur_initial_token[cur_index]
+                        else:
+                            break
+
+                else:
+                    cur_punc = cur_char
+
+                    # Keep appending to new_token until we hit something other than cur_punc
+                    while cur_char == cur_punc:
+                        new_token += cur_char
+                        cur_index += 1
+                        if cur_index < len(cur_initial_token):
+                            cur_char = cur_initial_token[cur_index]
+                        else:
+                            break
+
+                # Save the new token
+                punctuated_tokens.append(new_token)
+                new_token = ''
+
+                # Check if we've viewed all characters
+                if cur_index >= len(cur_initial_token):
+                    break
+
+        return punctuated_tokens
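To give reviewers a quick feel for the new shlex-based parser, here is a minimal usage sketch. It is not part of this commit: it assumes a 0.9-era cmd2 is installed so that this module is importable as `cmd2.parsing`, and the command names, alias, and shortcut passed to the constructor are made-up example values. The attribute names come from the `Statement` class in the diff above.

```python
# Minimal sketch (not part of this commit) exercising the StatementParser added above.
# Assumes cmd2 is installed so `from cmd2.parsing import StatementParser` resolves.
from cmd2.parsing import StatementParser

parser = StatementParser(
    terminators=[';'],                  # same default the constructor applies
    multiline_commands=['orate'],       # hypothetical multiline command name
    aliases={'greet': 'speak hello'},   # hypothetical alias
    shortcuts=[('!', 'shell')],         # hypothetical shortcut
)

statement = parser.parse('speak hello world; now > /tmp/out.txt')

print(statement.command)      # 'speak'
print(statement.args)         # 'hello world'
print(statement.argv)         # ['speak', 'hello', 'world']
print(statement.terminator)   # ';'
print(statement.suffix)       # 'now'
print(statement.output)       # '>'
print(statement.output_to)    # '/tmp/out.txt'
print(str(statement))         # 'hello world' -- the string portion holds only the args
```

One behavior worth noting from the code: `_split_on_punctuation()` passes quoted tokens through untouched, so a terminator glued directly to a closing quote (e.g. `"hello world";`) stays inside that quoted token rather than being split out as a separate terminator.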