| author | Vik <vmuriart@gmail.com> | 2016-06-16 02:33:28 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-06-16 02:33:28 -0700 |
| commit | 92b5f2bb88ed1c1080ecf7eb7449f5c642ae196a (patch) | |
| tree | 30b53c5970fc01fab5a14c9a0298f4e8d4eba077 /sqlparse/engine/grouping.py | |
| parent | 451d6d5d380cb4246c47e374aa9c4034fc7f9805 (diff) | |
| parent | 9fcf1f2cda629cdf11a8a4ac596fb7cae0e89de9 (diff) | |
| download | sqlparse-92b5f2bb88ed1c1080ecf7eb7449f5c642ae196a.tar.gz | |
Merge pull request #260 from vmuriart/long_live_indexes
Long live indexes - Improve performance
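The PR's stated goal is speed: the old grouping passes located neighbours with `token_prev`/`token_next`/`tokens_between`, each of which re-scans the token list for the token object, while the new code walks each list once and tracks positions as plain integers. A rough way to check the end-to-end effect yourself (an illustrative harness, not part of the PR; numbers vary by machine and sqlparse version):

```python
import timeit

import sqlparse

# Any statement that exercises several grouping passes works here.
SQL = "SELECT t.col AS c, arr[1], x::int FROM tbl WHERE id = 1 AND y < 2;"

# sqlparse.parse() runs the full lexer + grouping pipeline on each call.
print(timeit.timeit(lambda: sqlparse.parse(SQL), number=1000))
```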
Diffstat (limited to 'sqlparse/engine/grouping.py')
| -rw-r--r-- | sqlparse/engine/grouping.py | 400 |
1 file changed, 243 insertions(+), 157 deletions(-)
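Two ideas carry most of the diff. First, the rewritten `_group_matching` below pairs open/close tokens with an explicit stack instead of calling `find_matching` for every opener. Second, because `group_tokens` collapses a span of the list into a single node, both it and the new `_group` helper keep a running `tidx_offset` so that indexes computed from the pre-mutation enumeration stay valid. Here is a self-contained sketch of that bookkeeping, using plain lists and one-character strings in place of sqlparse tokens (all names are illustrative, not library API):

```python
def group_matching(tokens, open_tok='(', close_tok=')'):
    """Collapse each balanced open/close span into a nested list."""
    opens = []       # stack of indexes of not-yet-closed open tokens
    tidx_offset = 0  # how far earlier collapses have shifted later tokens
    for idx, token in enumerate(list(tokens)):  # iterate over a snapshot
        tidx = idx - tidx_offset  # position of `token` in the mutated list

        if token == open_tok:
            opens.append(tidx)
        elif token == close_tok:
            if not opens:
                continue  # unbalanced input: skip and keep scanning
            open_idx = opens.pop()
            # "group": replace the span with a single nested element
            tokens[open_idx:tidx + 1] = [tokens[open_idx:tidx + 1]]
            tidx_offset += tidx - open_idx
    return tokens

print(group_matching(list("a(b(c)d)e")))
# ['a', ['(', 'b', ['(', 'c', ')'], 'd', ')'], 'e']
```

The real `_group_matching` additionally fast-paths whitespace and recurses into already-grouped sublists, but the index arithmetic is the same.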
```diff
diff --git a/sqlparse/engine/grouping.py b/sqlparse/engine/grouping.py
index 6e414b8..62f37a6 100644
--- a/sqlparse/engine/grouping.py
+++ b/sqlparse/engine/grouping.py
@@ -7,56 +7,58 @@
 from sqlparse import sql
 from sqlparse import tokens as T
-from sqlparse.utils import recurse, imt, find_matching
-
-M_ROLE = (T.Keyword, ('null', 'role'))
-M_SEMICOLON = (T.Punctuation, ';')
-M_COMMA = (T.Punctuation, ',')
+from sqlparse.utils import recurse, imt
 
 T_NUMERICAL = (T.Number, T.Number.Integer, T.Number.Float)
 T_STRING = (T.String, T.String.Single, T.String.Symbol)
 T_NAME = (T.Name, T.Name.Placeholder)
 
 
-def _group_left_right(tlist, m, cls,
-                      valid_left=lambda t: t is not None,
-                      valid_right=lambda t: t is not None,
-                      semicolon=False):
-    """Groups together tokens that are joined by a middle token. ie. x < y"""
+def _group_matching(tlist, cls):
+    """Groups Tokens that have beginning and end."""
+    opens = []
+    tidx_offset = 0
+    for idx, token in enumerate(list(tlist)):
+        tidx = idx - tidx_offset
+
+        if token.is_whitespace():
+            # ~50% of tokens will be whitespace. Checking early for them
+            # avoids 3 comparisons each, but adds 1 more comparison
+            # for the other ~50% of tokens...
+            continue
 
-    for token in list(tlist):
         if token.is_group() and not isinstance(token, cls):
-            _group_left_right(token, m, cls, valid_left, valid_right,
-                              semicolon)
-
-        if not token.match(*m):
+            # Check inside previously grouped (ie. parenthesis) if group
+            # of different type is inside (ie, case). though ideally it
+            # should check for all open/close tokens at once to avoid recursion
+            _group_matching(token, cls)
             continue
 
-        left, right = tlist.token_prev(token), tlist.token_next(token)
+        if token.match(*cls.M_OPEN):
+            opens.append(tidx)
 
-        if valid_left(left) and valid_right(right):
-            if semicolon:
-                # only overwrite if a semicolon present.
-                sright = tlist.token_next_by(m=M_SEMICOLON, idx=right)
-                right = sright or right
-            tokens = tlist.tokens_between(left, right)
-            tlist.group_tokens(cls, tokens, extend=True)
+        elif token.match(*cls.M_CLOSE):
+            try:
+                open_idx = opens.pop()
+            except IndexError:
+                # this indicates invalid sql and unbalanced tokens.
+                # instead of break, continue in case other "valid" groups exist
+                continue
+            close_idx = tidx
+            tlist.group_tokens(cls, open_idx, close_idx)
+            tidx_offset += close_idx - open_idx
 
 
-def _group_matching(tlist, cls):
-    """Groups Tokens that have beginning and end."""
-    [_group_matching(sgroup, cls) for sgroup in tlist.get_sublists()
-     if not isinstance(sgroup, cls)]
-    idx = 1 if isinstance(tlist, cls) else 0
+def group_brackets(tlist):
+    _group_matching(tlist, sql.SquareBrackets)
 
-    token = tlist.token_next_by(m=cls.M_OPEN, idx=idx)
-    while token:
-        end = find_matching(tlist, token, cls.M_OPEN, cls.M_CLOSE)
-        if end is not None:
-            tokens = tlist.tokens_between(token, end)
-            token = tlist.group_tokens(cls, tokens)
-            _group_matching(token, cls)
-        token = tlist.token_next_by(m=cls.M_OPEN, idx=token)
+
+def group_parenthesis(tlist):
+    _group_matching(tlist, sql.Parenthesis)
+
+
+def group_case(tlist):
+    _group_matching(tlist, sql.Case)
 
 
 def group_if(tlist):
@@ -67,149 +69,202 @@ def group_for(tlist):
     _group_matching(tlist, sql.For)
 
 
-def group_foreach(tlist):
-    _group_matching(tlist, sql.For)
-
-
 def group_begin(tlist):
     _group_matching(tlist, sql.Begin)
 
 
+def group_typecasts(tlist):
+    def match(token):
+        return token.match(T.Punctuation, '::')
+
+    def valid(token):
+        return token is not None
+
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
+
+    valid_prev = valid_next = valid
+    _group(tlist, sql.Identifier, match, valid_prev, valid_next, post)
+
+
+def group_period(tlist):
+    def match(token):
+        return token.match(T.Punctuation, '.')
+
+    def valid_prev(token):
+        sqlcls = sql.SquareBrackets, sql.Identifier
+        ttypes = T.Name, T.String.Symbol
+        return imt(token, i=sqlcls, t=ttypes)
+
+    def valid_next(token):
+        sqlcls = sql.SquareBrackets, sql.Function
+        ttypes = T.Name, T.String.Symbol, T.Wildcard
+        return imt(token, i=sqlcls, t=ttypes)
+
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
+
+    _group(tlist, sql.Identifier, match, valid_prev, valid_next, post)
+
+
 def group_as(tlist):
-    lfunc = lambda tk: not imt(tk, t=T.Keyword) or tk.value == 'NULL'
-    rfunc = lambda tk: not imt(tk, t=(T.DML, T.DDL))
-    _group_left_right(tlist, (T.Keyword, 'AS'), sql.Identifier,
-                      valid_left=lfunc, valid_right=rfunc)
+    def match(token):
+        return token.is_keyword and token.normalized == 'AS'
+
+    def valid_prev(token):
+        return token.normalized == 'NULL' or not token.is_keyword
+
+    def valid_next(token):
+        ttypes = T.DML, T.DDL
+        return not imt(token, t=ttypes)
+
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
+
+    _group(tlist, sql.Identifier, match, valid_prev, valid_next, post)
 
 
 def group_assignment(tlist):
-    _group_left_right(tlist, (T.Assignment, ':='), sql.Assignment,
-                      semicolon=True)
+    def match(token):
+        return token.match(T.Assignment, ':=')
 
+    def valid(token):
+        return token is not None
 
-def group_comparison(tlist):
-    I_COMPERABLE = (sql.Parenthesis, sql.Function, sql.Identifier,
-                    sql.Operation)
-    T_COMPERABLE = T_NUMERICAL + T_STRING + T_NAME
+    def post(tlist, pidx, tidx, nidx):
+        m_semicolon = T.Punctuation, ';'
+        snidx, _ = tlist.token_next_by(m=m_semicolon, idx=nidx)
+        nidx = snidx or nidx
+        return pidx, nidx
 
-    func = lambda tk: (imt(tk, t=T_COMPERABLE, i=I_COMPERABLE) or
-                       (tk and tk.is_keyword and tk.normalized == 'NULL'))
+    valid_prev = valid_next = valid
+    _group(tlist, sql.Assignment, match, valid_prev, valid_next, post)
 
-    _group_left_right(tlist, (T.Operator.Comparison, None), sql.Comparison,
-                      valid_left=func, valid_right=func)
 
+def group_comparison(tlist):
+    sqlcls = (sql.Parenthesis, sql.Function, sql.Identifier,
+              sql.Operation)
+    ttypes = T_NUMERICAL + T_STRING + T_NAME
+
+    def match(token):
+        return token.ttype == T.Operator.Comparison
+
+    def valid(token):
+        if imt(token, t=ttypes, i=sqlcls):
+            return True
+        elif token and token.is_keyword and token.normalized == 'NULL':
+            return True
+        else:
+            return False
 
-def group_case(tlist):
-    _group_matching(tlist, sql.Case)
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
+
+    valid_prev = valid_next = valid
+    _group(tlist, sql.Comparison, match,
+           valid_prev, valid_next, post, extend=False)
 
 
 @recurse(sql.Identifier)
 def group_identifier(tlist):
-    T_IDENT = (T.String.Symbol, T.Name)
+    ttypes = (T.String.Symbol, T.Name)
 
-    token = tlist.token_next_by(t=T_IDENT)
+    tidx, token = tlist.token_next_by(t=ttypes)
     while token:
-        token = tlist.group_tokens(sql.Identifier, [token, ])
-        token = tlist.token_next_by(t=T_IDENT, idx=token)
+        tlist.group_tokens(sql.Identifier, tidx, tidx)
+        tidx, token = tlist.token_next_by(t=ttypes, idx=tidx)
 
 
-def group_period(tlist):
-    lfunc = lambda tk: imt(tk, i=(sql.SquareBrackets, sql.Identifier),
-                           t=(T.Name, T.String.Symbol,))
+def group_arrays(tlist):
+    sqlcls = sql.SquareBrackets, sql.Identifier, sql.Function
+    ttypes = T.Name, T.String.Symbol
 
-    rfunc = lambda tk: imt(tk, i=(sql.SquareBrackets, sql.Function),
-                           t=(T.Name, T.String.Symbol, T.Wildcard))
+    def match(token):
+        return isinstance(token, sql.SquareBrackets)
 
-    _group_left_right(tlist, (T.Punctuation, '.'), sql.Identifier,
-                      valid_left=lfunc, valid_right=rfunc)
+    def valid_prev(token):
+        return imt(token, i=sqlcls, t=ttypes)
 
+    def valid_next(token):
+        return True
 
-def group_arrays(tlist):
-    token = tlist.token_next_by(i=sql.SquareBrackets)
-    while token:
-        prev = tlist.token_prev(token)
-        if imt(prev, i=(sql.SquareBrackets, sql.Identifier, sql.Function),
-               t=(T.Name, T.String.Symbol,)):
-            tokens = tlist.tokens_between(prev, token)
-            token = tlist.group_tokens(sql.Identifier, tokens, extend=True)
-        token = tlist.token_next_by(i=sql.SquareBrackets, idx=token)
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, tidx
 
+    _group(tlist, sql.Identifier, match,
+           valid_prev, valid_next, post, extend=True, recurse=False)
 
-@recurse(sql.Identifier)
-def group_operator(tlist):
-    I_CYCLE = (sql.SquareBrackets, sql.Parenthesis, sql.Function,
-               sql.Identifier, sql.Operation)
-    # wilcards wouldn't have operations next to them
-    T_CYCLE = T_NUMERICAL + T_STRING + T_NAME
-    func = lambda tk: imt(tk, i=I_CYCLE, t=T_CYCLE)
-    token = tlist.token_next_by(t=(T.Operator, T.Wildcard))
-    while token:
-        left, right = tlist.token_prev(token), tlist.token_next(token)
-
-        if func(left) and func(right):
-            token.ttype = T.Operator
-            tokens = tlist.tokens_between(left, right)
-            token = tlist.group_tokens(sql.Operation, tokens)
 
+def group_operator(tlist):
+    ttypes = T_NUMERICAL + T_STRING + T_NAME
+    sqlcls = (sql.SquareBrackets, sql.Parenthesis, sql.Function,
+              sql.Identifier, sql.Operation)
 
-        token = tlist.token_next_by(t=(T.Operator, T.Wildcard), idx=token)
+    def match(token):
+        return imt(token, t=(T.Operator, T.Wildcard))
 
+    def valid(token):
+        return imt(token, i=sqlcls, t=ttypes)
 
-@recurse(sql.IdentifierList)
-def group_identifier_list(tlist):
-    I_IDENT_LIST = (sql.Function, sql.Case, sql.Identifier, sql.Comparison,
-                    sql.IdentifierList, sql.Operation)
-    T_IDENT_LIST = (T_NUMERICAL + T_STRING + T_NAME +
-                    (T.Keyword, T.Comment, T.Wildcard))
+    def post(tlist, pidx, tidx, nidx):
+        tlist[tidx].ttype = T.Operator
+        return pidx, nidx
 
-    func = lambda t: imt(t, i=I_IDENT_LIST, m=M_ROLE, t=T_IDENT_LIST)
-    token = tlist.token_next_by(m=M_COMMA)
+    valid_prev = valid_next = valid
+    _group(tlist, sql.Operation, match,
+           valid_prev, valid_next, post, extend=False)
 
-    while token:
-        before, after = tlist.token_prev(token), tlist.token_next(token)
-        if func(before) and func(after):
-            tokens = tlist.tokens_between(before, after)
-            token = tlist.group_tokens(sql.IdentifierList, tokens, extend=True)
-        token = tlist.token_next_by(m=M_COMMA, idx=token)
 
+def group_identifier_list(tlist):
+    m_role = T.Keyword, ('null', 'role')
+    m_comma = T.Punctuation, ','
+    sqlcls = (sql.Function, sql.Case, sql.Identifier, sql.Comparison,
+              sql.IdentifierList, sql.Operation)
+    ttypes = (T_NUMERICAL + T_STRING + T_NAME +
+              (T.Keyword, T.Comment, T.Wildcard))
 
+    def match(token):
+        return imt(token, m=m_comma)
 
-def group_brackets(tlist):
-    _group_matching(tlist, sql.SquareBrackets)
+    def valid(token):
+        return imt(token, i=sqlcls, m=m_role, t=ttypes)
 
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
 
-def group_parenthesis(tlist):
-    _group_matching(tlist, sql.Parenthesis)
+    valid_prev = valid_next = valid
+    _group(tlist, sql.IdentifierList, match,
+           valid_prev, valid_next, post, extend=True)
 
 
 @recurse(sql.Comment)
 def group_comments(tlist):
-    token = tlist.token_next_by(t=T.Comment)
+    tidx, token = tlist.token_next_by(t=T.Comment)
     while token:
-        end = tlist.token_not_matching(
-            token, lambda tk: imt(tk, t=T.Comment) or tk.is_whitespace())
+        eidx, end = tlist.token_not_matching(
+            lambda tk: imt(tk, t=T.Comment) or tk.is_whitespace(), idx=tidx)
         if end is not None:
-            end = tlist.token_prev(end, False)
-            tokens = tlist.tokens_between(token, end)
-            token = tlist.group_tokens(sql.Comment, tokens)
+            eidx, end = tlist.token_prev(eidx, skip_ws=False)
+            tlist.group_tokens(sql.Comment, tidx, eidx)
 
-        token = tlist.token_next_by(t=T.Comment, idx=token)
+        tidx, token = tlist.token_next_by(t=T.Comment, idx=tidx)
 
 
 @recurse(sql.Where)
 def group_where(tlist):
-    token = tlist.token_next_by(m=sql.Where.M_OPEN)
+    tidx, token = tlist.token_next_by(m=sql.Where.M_OPEN)
     while token:
-        end = tlist.token_next_by(m=sql.Where.M_CLOSE, idx=token)
+        eidx, end = tlist.token_next_by(m=sql.Where.M_CLOSE, idx=tidx)
 
         if end is None:
-            tokens = tlist.tokens_between(token, tlist._groupable_tokens[-1])
+            end = tlist._groupable_tokens[-1]
         else:
-            tokens = tlist.tokens_between(
-                token, tlist.tokens[tlist.token_index(end) - 1])
-
-        token = tlist.group_tokens(sql.Where, tokens)
-        token = tlist.token_next_by(m=sql.Where.M_OPEN, idx=token)
+            end = tlist.tokens[eidx - 1]
+        # TODO: convert this to eidx instead of end token.
+        # i think above values are len(tlist) and eidx-1
+        eidx = tlist.token_index(end)
+        tlist.group_tokens(sql.Where, tidx, eidx)
+        tidx, token = tlist.token_next_by(m=sql.Where.M_OPEN, idx=tidx)
 
 
 @recurse()
@@ -217,17 +272,12 @@ def group_aliased(tlist):
     I_ALIAS = (sql.Parenthesis, sql.Function, sql.Case, sql.Identifier,
               sql.Operation)
 
-    token = tlist.token_next_by(i=I_ALIAS, t=T.Number)
+    tidx, token = tlist.token_next_by(i=I_ALIAS, t=T.Number)
     while token:
-        next_ = tlist.token_next(token)
+        nidx, next_ = tlist.token_next(tidx)
         if imt(next_, i=sql.Identifier):
-            tokens = tlist.tokens_between(token, next_)
-            token = tlist.group_tokens(sql.Identifier, tokens, extend=True)
-        token = tlist.token_next_by(i=I_ALIAS, t=T.Number, idx=token)
-
-
-def group_typecasts(tlist):
-    _group_left_right(tlist, (T.Punctuation, '::'), sql.Identifier)
+            tlist.group_tokens(sql.Identifier, tidx, nidx, extend=True)
+        tidx, token = tlist.token_next_by(i=I_ALIAS, t=T.Number, idx=tidx)
 
 
 @recurse(sql.Function)
@@ -241,45 +291,51 @@ def group_functions(tlist):
             has_table = True
     if has_create and has_table:
         return
-    token = tlist.token_next_by(t=T.Name)
+
+    tidx, token = tlist.token_next_by(t=T.Name)
     while token:
-        next_ = tlist.token_next(token)
-        if imt(next_, i=sql.Parenthesis):
-            tokens = tlist.tokens_between(token, next_)
-            token = tlist.group_tokens(sql.Function, tokens)
-        token = tlist.token_next_by(t=T.Name, idx=token)
+        nidx, next_ = tlist.token_next(tidx)
+        if isinstance(next_, sql.Parenthesis):
+            tlist.group_tokens(sql.Function, tidx, nidx)
+        tidx, token = tlist.token_next_by(t=T.Name, idx=tidx)
 
 
 def group_order(tlist):
     """Group together Identifier and Asc/Desc token"""
-    token = tlist.token_next_by(t=T.Keyword.Order)
+    tidx, token = tlist.token_next_by(t=T.Keyword.Order)
     while token:
-        prev = tlist.token_prev(token)
-        if imt(prev, i=sql.Identifier, t=T.Number):
-            tokens = tlist.tokens_between(prev, token)
-            token = tlist.group_tokens(sql.Identifier, tokens)
-        token = tlist.token_next_by(t=T.Keyword.Order, idx=token)
+        pidx, prev_ = tlist.token_prev(tidx)
+        if imt(prev_, i=sql.Identifier, t=T.Number):
+            tlist.group_tokens(sql.Identifier, pidx, tidx)
+            tidx = pidx
+        tidx, token = tlist.token_next_by(t=T.Keyword.Order, idx=tidx)
 
 
 @recurse()
 def align_comments(tlist):
-    token = tlist.token_next_by(i=sql.Comment)
+    tidx, token = tlist.token_next_by(i=sql.Comment)
     while token:
-        before = tlist.token_prev(token)
-        if isinstance(before, sql.TokenList):
-            tokens = tlist.tokens_between(before, token)
-            token = tlist.group_tokens(sql.TokenList, tokens, extend=True)
-        token = tlist.token_next_by(i=sql.Comment, idx=token)
+        pidx, prev_ = tlist.token_prev(tidx)
+        if isinstance(prev_, sql.TokenList):
+            tlist.group_tokens(sql.TokenList, pidx, tidx, extend=True)
+            tidx = pidx
+        tidx, token = tlist.token_next_by(i=sql.Comment, idx=tidx)
 
 
 def group(stmt):
     for func in [
         group_comments,
+
+        # _group_matching
         group_brackets,
         group_parenthesis,
+        group_case,
+        group_if,
+        group_for,
+        group_begin,
+
         group_functions,
         group_where,
-        group_case,
         group_period,
         group_arrays,
         group_identifier,
@@ -290,12 +346,42 @@ def group(stmt):
         group_aliased,
         group_assignment,
         group_comparison,
+        align_comments,
         group_identifier_list,
-        group_if,
-        group_for,
-        group_foreach,
-        group_begin,
     ]:
         func(stmt)
 
     return stmt
+
+
+def _group(tlist, cls, match,
+           valid_prev=lambda t: True,
+           valid_next=lambda t: True,
+           post=None,
+           extend=True,
+           recurse=True
+           ):
+    """Groups together tokens that are joined by a middle token. ie. x < y"""
x < y""" + + tidx_offset = 0 + pidx, prev_ = None, None + for idx, token in enumerate(list(tlist)): + tidx = idx - tidx_offset + + if token.is_whitespace(): + continue + + if recurse and token.is_group() and not isinstance(token, cls): + _group(token, cls, match, valid_prev, valid_next, post, extend) + + if match(token): + nidx, next_ = tlist.token_next(tidx) + if valid_prev(prev_) and valid_next(next_): + from_idx, to_idx = post(tlist, pidx, tidx, nidx) + grp = tlist.group_tokens(cls, from_idx, to_idx, extend=extend) + + tidx_offset += to_idx - from_idx + pidx, prev_ = from_idx, grp + continue + + pidx, prev_ = tidx, token |
