| author | Vik <vmuriart@gmail.com> | 2016-06-16 02:33:28 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-06-16 02:33:28 -0700 |
| commit | 92b5f2bb88ed1c1080ecf7eb7449f5c642ae196a (patch) | |
| tree | 30b53c5970fc01fab5a14c9a0298f4e8d4eba077 /sqlparse/engine/grouping.py | |
| parent | 451d6d5d380cb4246c47e374aa9c4034fc7f9805 (diff) | |
| parent | 9fcf1f2cda629cdf11a8a4ac596fb7cae0e89de9 (diff) | |
| download | sqlparse-92b5f2bb88ed1c1080ecf7eb7449f5c642ae196a.tar.gz | |
Merge pull request #260 from vmuriart/long_live_indexes
Long live indexes - Improve performance
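The PR's stated goal is speed: the old grouping passes located neighbours with `token_prev`/`token_next`/`tokens_between`, each of which re-scans the token list for the token object, while the new code walks each list once and tracks positions as plain integers. A rough way to check the end-to-end effect yourself (an illustrative harness, not part of the PR; numbers vary by machine and sqlparse version):

```python
import timeit

import sqlparse

# Any statement that exercises several grouping passes works here.
SQL = "SELECT t.col AS c, arr[1], x::int FROM tbl WHERE id = 1 AND y < 2;"

# sqlparse.parse() runs the full lexer + grouping pipeline on each call.
print(timeit.timeit(lambda: sqlparse.parse(SQL), number=1000))
```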
Diffstat (limited to 'sqlparse/engine/grouping.py')
| -rw-r--r-- | sqlparse/engine/grouping.py | 400 |
1 file changed, 243 insertions(+), 157 deletions(-)
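Two ideas carry most of the diff. First, the rewritten `_group_matching` below pairs open/close tokens with an explicit stack instead of calling `find_matching` for every opener. Second, because `group_tokens` collapses a span of the list into a single node, both it and the new `_group` helper keep a running `tidx_offset` so that indexes computed from the pre-mutation enumeration stay valid. Here is a self-contained sketch of that bookkeeping, using plain lists and one-character strings in place of sqlparse tokens (all names are illustrative, not library API):

```python
def group_matching(tokens, open_tok='(', close_tok=')'):
    """Collapse each balanced open/close span into a nested list."""
    opens = []       # stack of indexes of not-yet-closed open tokens
    tidx_offset = 0  # how far earlier collapses have shifted later tokens
    for idx, token in enumerate(list(tokens)):  # iterate over a snapshot
        tidx = idx - tidx_offset  # position of `token` in the mutated list

        if token == open_tok:
            opens.append(tidx)
        elif token == close_tok:
            if not opens:
                continue  # unbalanced input: skip and keep scanning
            open_idx = opens.pop()
            # "group": replace the span with a single nested element
            tokens[open_idx:tidx + 1] = [tokens[open_idx:tidx + 1]]
            tidx_offset += tidx - open_idx
    return tokens

print(group_matching(list("a(b(c)d)e")))
# ['a', ['(', 'b', ['(', 'c', ')'], 'd', ')'], 'e']
```

The real `_group_matching` additionally fast-paths whitespace and recurses into already-grouped sublists, but the index arithmetic is the same.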
```diff
diff --git a/sqlparse/engine/grouping.py b/sqlparse/engine/grouping.py
index 6e414b8..62f37a6 100644
--- a/sqlparse/engine/grouping.py
+++ b/sqlparse/engine/grouping.py
@@ -7,56 +7,58 @@
 from sqlparse import sql
 from sqlparse import tokens as T
-from sqlparse.utils import recurse, imt, find_matching
-
-M_ROLE = (T.Keyword, ('null', 'role'))
-M_SEMICOLON = (T.Punctuation, ';')
-M_COMMA = (T.Punctuation, ',')
+from sqlparse.utils import recurse, imt
 
 T_NUMERICAL = (T.Number, T.Number.Integer, T.Number.Float)
 T_STRING = (T.String, T.String.Single, T.String.Symbol)
 T_NAME = (T.Name, T.Name.Placeholder)
 
 
-def _group_left_right(tlist, m, cls,
-                      valid_left=lambda t: t is not None,
-                      valid_right=lambda t: t is not None,
-                      semicolon=False):
-    """Groups together tokens that are joined by a middle token. ie. x < y"""
+def _group_matching(tlist, cls):
+    """Groups Tokens that have beginning and end."""
+    opens = []
+    tidx_offset = 0
+    for idx, token in enumerate(list(tlist)):
+        tidx = idx - tidx_offset
+
+        if token.is_whitespace():
+            # ~50% of tokens will be whitespace. Checking early for them
+            # avoids 3 comparisons each, but adds 1 more comparison
+            # for the other ~50% of tokens...
+            continue
 
-    for token in list(tlist):
         if token.is_group() and not isinstance(token, cls):
-            _group_left_right(token, m, cls, valid_left, valid_right,
-                              semicolon)
-
-        if not token.match(*m):
+            # Check inside previously grouped (ie. parenthesis) if group
+            # of different type is inside (ie, case). though ideally it
+            # should check for all open/close tokens at once to avoid recursion
+            _group_matching(token, cls)
             continue
 
-        left, right = tlist.token_prev(token), tlist.token_next(token)
+        if token.match(*cls.M_OPEN):
+            opens.append(tidx)
 
-        if valid_left(left) and valid_right(right):
-            if semicolon:
-                # only overwrite if a semicolon present.
-                sright = tlist.token_next_by(m=M_SEMICOLON, idx=right)
-                right = sright or right
-            tokens = tlist.tokens_between(left, right)
-            tlist.group_tokens(cls, tokens, extend=True)
+        elif token.match(*cls.M_CLOSE):
+            try:
+                open_idx = opens.pop()
+            except IndexError:
+                # this indicates invalid sql and unbalanced tokens.
+                # instead of break, continue in case other "valid" groups exist
+                continue
+            close_idx = tidx
+            tlist.group_tokens(cls, open_idx, close_idx)
+            tidx_offset += close_idx - open_idx
 
 
-def _group_matching(tlist, cls):
-    """Groups Tokens that have beginning and end."""
-    [_group_matching(sgroup, cls) for sgroup in tlist.get_sublists()
-     if not isinstance(sgroup, cls)]
-    idx = 1 if isinstance(tlist, cls) else 0
+def group_brackets(tlist):
+    _group_matching(tlist, sql.SquareBrackets)
 
-    token = tlist.token_next_by(m=cls.M_OPEN, idx=idx)
-    while token:
-        end = find_matching(tlist, token, cls.M_OPEN, cls.M_CLOSE)
-        if end is not None:
-            tokens = tlist.tokens_between(token, end)
-            token = tlist.group_tokens(cls, tokens)
-            _group_matching(token, cls)
-        token = tlist.token_next_by(m=cls.M_OPEN, idx=token)
+
+def group_parenthesis(tlist):
+    _group_matching(tlist, sql.Parenthesis)
+
+
+def group_case(tlist):
+    _group_matching(tlist, sql.Case)
 
 
 def group_if(tlist):
@@ -67,149 +69,202 @@ def group_for(tlist):
     _group_matching(tlist, sql.For)
 
 
-def group_foreach(tlist):
-    _group_matching(tlist, sql.For)
-
-
 def group_begin(tlist):
     _group_matching(tlist, sql.Begin)
 
 
+def group_typecasts(tlist):
+    def match(token):
+        return token.match(T.Punctuation, '::')
+
+    def valid(token):
+        return token is not None
+
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
+
+    valid_prev = valid_next = valid
+    _group(tlist, sql.Identifier, match, valid_prev, valid_next, post)
+
+
+def group_period(tlist):
+    def match(token):
+        return token.match(T.Punctuation, '.')
+
+    def valid_prev(token):
+        sqlcls = sql.SquareBrackets, sql.Identifier
+        ttypes = T.Name, T.String.Symbol
+        return imt(token, i=sqlcls, t=ttypes)
+
+    def valid_next(token):
+        sqlcls = sql.SquareBrackets, sql.Function
+        ttypes = T.Name, T.String.Symbol, T.Wildcard
+        return imt(token, i=sqlcls, t=ttypes)
+
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
+
+    _group(tlist, sql.Identifier, match, valid_prev, valid_next, post)
+
+
 def group_as(tlist):
-    lfunc = lambda tk: not imt(tk, t=T.Keyword) or tk.value == 'NULL'
-    rfunc = lambda tk: not imt(tk, t=(T.DML, T.DDL))
-    _group_left_right(tlist, (T.Keyword, 'AS'), sql.Identifier,
-                      valid_left=lfunc, valid_right=rfunc)
+    def match(token):
+        return token.is_keyword and token.normalized == 'AS'
+
+    def valid_prev(token):
+        return token.normalized == 'NULL' or not token.is_keyword
+
+    def valid_next(token):
+        ttypes = T.DML, T.DDL
+        return not imt(token, t=ttypes)
+
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
+
+    _group(tlist, sql.Identifier, match, valid_prev, valid_next, post)
 
 
 def group_assignment(tlist):
-    _group_left_right(tlist, (T.Assignment, ':='), sql.Assignment,
-                      semicolon=True)
+    def match(token):
+        return token.match(T.Assignment, ':=')
 
+    def valid(token):
+        return token is not None
 
-def group_comparison(tlist):
-    I_COMPERABLE = (sql.Parenthesis, sql.Function, sql.Identifier,
-                    sql.Operation)
-    T_COMPERABLE = T_NUMERICAL + T_STRING + T_NAME
+    def post(tlist, pidx, tidx, nidx):
+        m_semicolon = T.Punctuation, ';'
+        snidx, _ = tlist.token_next_by(m=m_semicolon, idx=nidx)
+        nidx = snidx or nidx
+        return pidx, nidx
 
-    func = lambda tk: (imt(tk, t=T_COMPERABLE, i=I_COMPERABLE) or
-                       (tk and tk.is_keyword and tk.normalized == 'NULL'))
+    valid_prev = valid_next = valid
+    _group(tlist, sql.Assignment, match, valid_prev, valid_next, post)
 
-    _group_left_right(tlist, (T.Operator.Comparison, None), sql.Comparison,
-                      valid_left=func, valid_right=func)
 
+def group_comparison(tlist):
+    sqlcls = (sql.Parenthesis, sql.Function, sql.Identifier,
+              sql.Operation)
+    ttypes = T_NUMERICAL + T_STRING + T_NAME
+
+    def match(token):
+        return token.ttype == T.Operator.Comparison
+
+    def valid(token):
+        if imt(token, t=ttypes, i=sqlcls):
+            return True
+        elif token and token.is_keyword and token.normalized == 'NULL':
+            return True
+        else:
+            return False
 
-def group_case(tlist):
-    _group_matching(tlist, sql.Case)
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
+
+    valid_prev = valid_next = valid
+    _group(tlist, sql.Comparison, match,
+           valid_prev, valid_next, post, extend=False)
 
 
 @recurse(sql.Identifier)
 def group_identifier(tlist):
-    T_IDENT = (T.String.Symbol, T.Name)
+    ttypes = (T.String.Symbol, T.Name)
 
-    token = tlist.token_next_by(t=T_IDENT)
+    tidx, token = tlist.token_next_by(t=ttypes)
     while token:
-        token = tlist.group_tokens(sql.Identifier, [token, ])
-        token = tlist.token_next_by(t=T_IDENT, idx=token)
+        tlist.group_tokens(sql.Identifier, tidx, tidx)
+        tidx, token = tlist.token_next_by(t=ttypes, idx=tidx)
 
 
-def group_period(tlist):
-    lfunc = lambda tk: imt(tk, i=(sql.SquareBrackets, sql.Identifier),
-                           t=(T.Name, T.String.Symbol,))
+def group_arrays(tlist):
+    sqlcls = sql.SquareBrackets, sql.Identifier, sql.Function
+    ttypes = T.Name, T.String.Symbol
 
-    rfunc = lambda tk: imt(tk, i=(sql.SquareBrackets, sql.Function),
-                           t=(T.Name, T.String.Symbol, T.Wildcard))
+    def match(token):
+        return isinstance(token, sql.SquareBrackets)
 
-    _group_left_right(tlist, (T.Punctuation, '.'), sql.Identifier,
-                      valid_left=lfunc, valid_right=rfunc)
+    def valid_prev(token):
+        return imt(token, i=sqlcls, t=ttypes)
 
+    def valid_next(token):
+        return True
 
-def group_arrays(tlist):
-    token = tlist.token_next_by(i=sql.SquareBrackets)
-    while token:
-        prev = tlist.token_prev(token)
-        if imt(prev, i=(sql.SquareBrackets, sql.Identifier, sql.Function),
-               t=(T.Name, T.String.Symbol,)):
-            tokens = tlist.tokens_between(prev, token)
-            token = tlist.group_tokens(sql.Identifier, tokens, extend=True)
-        token = tlist.token_next_by(i=sql.SquareBrackets, idx=token)
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, tidx
 
+    _group(tlist, sql.Identifier, match,
+           valid_prev, valid_next, post, extend=True, recurse=False)
 
-@recurse(sql.Identifier)
-def group_operator(tlist):
-    I_CYCLE = (sql.SquareBrackets, sql.Parenthesis, sql.Function,
-               sql.Identifier, sql.Operation)
-    # wilcards wouldn't have operations next to them
-    T_CYCLE = T_NUMERICAL + T_STRING + T_NAME
-    func = lambda tk: imt(tk, i=I_CYCLE, t=T_CYCLE)
-    token = tlist.token_next_by(t=(T.Operator, T.Wildcard))
-    while token:
-        left, right = tlist.token_prev(token), tlist.token_next(token)
-
-        if func(left) and func(right):
-            token.ttype = T.Operator
-            tokens = tlist.tokens_between(left, right)
-            token = tlist.group_tokens(sql.Operation, tokens)
 
+def group_operator(tlist):
+    ttypes = T_NUMERICAL + T_STRING + T_NAME
+    sqlcls = (sql.SquareBrackets, sql.Parenthesis, sql.Function,
+              sql.Identifier, sql.Operation)
 
-        token = tlist.token_next_by(t=(T.Operator, T.Wildcard), idx=token)
+    def match(token):
+        return imt(token, t=(T.Operator, T.Wildcard))
 
+    def valid(token):
+        return imt(token, i=sqlcls, t=ttypes)
 
-@recurse(sql.IdentifierList)
-def group_identifier_list(tlist):
-    I_IDENT_LIST = (sql.Function, sql.Case, sql.Identifier, sql.Comparison,
-                    sql.IdentifierList, sql.Operation)
-    T_IDENT_LIST = (T_NUMERICAL + T_STRING + T_NAME +
-                    (T.Keyword, T.Comment, T.Wildcard))
+    def post(tlist, pidx, tidx, nidx):
+        tlist[tidx].ttype = T.Operator
+        return pidx, nidx
 
-    func = lambda t: imt(t, i=I_IDENT_LIST, m=M_ROLE, t=T_IDENT_LIST)
-    token = tlist.token_next_by(m=M_COMMA)
+    valid_prev = valid_next = valid
+    _group(tlist, sql.Operation, match,
+           valid_prev, valid_next, post, extend=False)
 
-    while token:
-        before, after = tlist.token_prev(token), tlist.token_next(token)
-        if func(before) and func(after):
-            tokens = tlist.tokens_between(before, after)
-            token = tlist.group_tokens(sql.IdentifierList, tokens, extend=True)
-        token = tlist.token_next_by(m=M_COMMA, idx=token)
 
+def group_identifier_list(tlist):
+    m_role = T.Keyword, ('null', 'role')
+    m_comma = T.Punctuation, ','
+    sqlcls = (sql.Function, sql.Case, sql.Identifier, sql.Comparison,
+              sql.IdentifierList, sql.Operation)
+    ttypes = (T_NUMERICAL + T_STRING + T_NAME +
+              (T.Keyword, T.Comment, T.Wildcard))
 
+    def match(token):
+        return imt(token, m=m_comma)
 
-def group_brackets(tlist):
-    _group_matching(tlist, sql.SquareBrackets)
+    def valid(token):
+        return imt(token, i=sqlcls, m=m_role, t=ttypes)
 
+    def post(tlist, pidx, tidx, nidx):
+        return pidx, nidx
 
-def group_parenthesis(tlist):
-    _group_matching(tlist, sql.Parenthesis)
+    valid_prev = valid_next = valid
+    _group(tlist, sql.IdentifierList, match,
+           valid_prev, valid_next, post, extend=True)
 
 
 @recurse(sql.Comment)
 def group_comments(tlist):
-    token = tlist.token_next_by(t=T.Comment)
+    tidx, token = tlist.token_next_by(t=T.Comment)
     while token:
-        end = tlist.token_not_matching(
-            token, lambda tk: imt(tk, t=T.Comment) or tk.is_whitespace())
+        eidx, end = tlist.token_not_matching(
+            lambda tk: imt(tk, t=T.Comment) or tk.is_whitespace(), idx=tidx)
         if end is not None:
-            end = tlist.token_prev(end, False)
-            tokens = tlist.tokens_between(token, end)
-            token = tlist.group_tokens(sql.Comment, tokens)
+            eidx, end = tlist.token_prev(eidx, skip_ws=False)
+            tlist.group_tokens(sql.Comment, tidx, eidx)
 
-        token = tlist.token_next_by(t=T.Comment, idx=token)
+        tidx, token = tlist.token_next_by(t=T.Comment, idx=tidx)
 
 
 @recurse(sql.Where)
 def group_where(tlist):
-    token = tlist.token_next_by(m=sql.Where.M_OPEN)
+    tidx, token = tlist.token_next_by(m=sql.Where.M_OPEN)
     while token:
-        end = tlist.token_next_by(m=sql.Where.M_CLOSE, idx=token)
+        eidx, end = tlist.token_next_by(m=sql.Where.M_CLOSE, idx=tidx)
 
         if end is None:
-            tokens = tlist.tokens_between(token, tlist._groupable_tokens[-1])
+            end = tlist._groupable_tokens[-1]
         else:
-            tokens = tlist.tokens_between(
-                token, tlist.tokens[tlist.token_index(end) - 1])
-
-        token = tlist.group_tokens(sql.Where, tokens)
-        token = tlist.token_next_by(m=sql.Where.M_OPEN, idx=token)
+            end = tlist.tokens[eidx - 1]
+        # TODO: convert this to eidx instead of end token.
+        # i think above values are len(tlist) and eidx-1
+        eidx = tlist.token_index(end)
+        tlist.group_tokens(sql.Where, tidx, eidx)
+        tidx, token = tlist.token_next_by(m=sql.Where.M_OPEN, idx=tidx)
 
 
 @recurse()
@@ -217,17 +272,12 @@ def group_aliased(tlist):
     I_ALIAS = (sql.Parenthesis, sql.Function, sql.Case, sql.Identifier,
               sql.Operation)
 
-    token = tlist.token_next_by(i=I_ALIAS, t=T.Number)
+    tidx, token = tlist.token_next_by(i=I_ALIAS, t=T.Number)
     while token:
-        next_ = tlist.token_next(token)
+        nidx, next_ = tlist.token_next(tidx)
         if imt(next_, i=sql.Identifier):
-            tokens = tlist.tokens_between(token, next_)
-            token = tlist.group_tokens(sql.Identifier, tokens, extend=True)
-        token = tlist.token_next_by(i=I_ALIAS, t=T.Number, idx=token)
-
-
-def group_typecasts(tlist):
-    _group_left_right(tlist, (T.Punctuation, '::'), sql.Identifier)
+            tlist.group_tokens(sql.Identifier, tidx, nidx, extend=True)
+        tidx, token = tlist.token_next_by(i=I_ALIAS, t=T.Number, idx=tidx)
 
 
 @recurse(sql.Function)
@@ -241,45 +291,51 @@ def group_functions(tlist):
             has_table = True
     if has_create and has_table:
         return
-    token = tlist.token_next_by(t=T.Name)
+
+    tidx, token = tlist.token_next_by(t=T.Name)
     while token:
-        next_ = tlist.token_next(token)
-        if imt(next_, i=sql.Parenthesis):
-            tokens = tlist.tokens_between(token, next_)
-            token = tlist.group_tokens(sql.Function, tokens)
-        token = tlist.token_next_by(t=T.Name, idx=token)
+        nidx, next_ = tlist.token_next(tidx)
+        if isinstance(next_, sql.Parenthesis):
+            tlist.group_tokens(sql.Function, tidx, nidx)
+        tidx, token = tlist.token_next_by(t=T.Name, idx=tidx)
 
 
 def group_order(tlist):
     """Group together Identifier and Asc/Desc token"""
-    token = tlist.token_next_by(t=T.Keyword.Order)
+    tidx, token = tlist.token_next_by(t=T.Keyword.Order)
     while token:
-        prev = tlist.token_prev(token)
-        if imt(prev, i=sql.Identifier, t=T.Number):
-            tokens = tlist.tokens_between(prev, token)
-            token = tlist.group_tokens(sql.Identifier, tokens)
-        token = tlist.token_next_by(t=T.Keyword.Order, idx=token)
+        pidx, prev_ = tlist.token_prev(tidx)
+        if imt(prev_, i=sql.Identifier, t=T.Number):
+            tlist.group_tokens(sql.Identifier, pidx, tidx)
+            tidx = pidx
+        tidx, token = tlist.token_next_by(t=T.Keyword.Order, idx=tidx)
 
 
 @recurse()
 def align_comments(tlist):
-    token = tlist.token_next_by(i=sql.Comment)
+    tidx, token = tlist.token_next_by(i=sql.Comment)
     while token:
-        before = tlist.token_prev(token)
-        if isinstance(before, sql.TokenList):
-            tokens = tlist.tokens_between(before, token)
-            token = tlist.group_tokens(sql.TokenList, tokens, extend=True)
-        token = tlist.token_next_by(i=sql.Comment, idx=token)
+        pidx, prev_ = tlist.token_prev(tidx)
+        if isinstance(prev_, sql.TokenList):
+            tlist.group_tokens(sql.TokenList, pidx, tidx, extend=True)
+            tidx = pidx
+        tidx, token = tlist.token_next_by(i=sql.Comment, idx=tidx)
 
 
 def group(stmt):
     for func in [
         group_comments,
+
+        # _group_matching
         group_brackets,
         group_parenthesis,
+        group_case,
+        group_if,
+        group_for,
+        group_begin,
+
         group_functions,
         group_where,
-        group_case,
         group_period,
         group_arrays,
         group_identifier,
@@ -290,12 +346,42 @@ def group(stmt):
         group_aliased,
         group_assignment,
         group_comparison,
+        align_comments,
         group_identifier_list,
-        group_if,
-        group_for,
-        group_foreach,
-        group_begin,
     ]:
         func(stmt)
 
     return stmt
+
+
+def _group(tlist, cls, match,
+           valid_prev=lambda t: True,
+           valid_next=lambda t: True,
+           post=None,
+           extend=True,
+           recurse=True
+           ):
+    """Groups together tokens that are joined by a middle token. ie. x < y"""
x < y""" + + tidx_offset = 0 + pidx, prev_ = None, None + for idx, token in enumerate(list(tlist)): + tidx = idx - tidx_offset + + if token.is_whitespace(): + continue + + if recurse and token.is_group() and not isinstance(token, cls): + _group(token, cls, match, valid_prev, valid_next, post, extend) + + if match(token): + nidx, next_ = tlist.token_next(tidx) + if valid_prev(prev_) and valid_next(next_): + from_idx, to_idx = post(tlist, pidx, tidx, nidx) + grp = tlist.group_tokens(cls, from_idx, to_idx, extend=extend) + + tidx_offset += to_idx - from_idx + pidx, prev_ = from_idx, grp + continue + + pidx, prev_ = tidx, token |
