author     Vik <vmuriart@gmail.com>      2016-06-16 02:33:28 -0700
committer  GitHub <noreply@github.com>  2016-06-16 02:33:28 -0700
commit     92b5f2bb88ed1c1080ecf7eb7449f5c642ae196a (patch)
tree       30b53c5970fc01fab5a14c9a0298f4e8d4eba077 /sqlparse/engine/grouping.py
parent     451d6d5d380cb4246c47e374aa9c4034fc7f9805 (diff)
parent     9fcf1f2cda629cdf11a8a4ac596fb7cae0e89de9 (diff)
download   sqlparse-92b5f2bb88ed1c1080ecf7eb7449f5c642ae196a.tar.gz
Merge pull request #260 from vmuriart/long_live_indexes
Long live indexes - Improve performance
Diffstat (limited to 'sqlparse/engine/grouping.py')
-rw-r--r--  sqlparse/engine/grouping.py  400
1 file changed, 243 insertions(+), 157 deletions(-)
diff --git a/sqlparse/engine/grouping.py b/sqlparse/engine/grouping.py
index 6e414b8..62f37a6 100644
--- a/sqlparse/engine/grouping.py
+++ b/sqlparse/engine/grouping.py
@@ -7,56 +7,58 @@
from sqlparse import sql
from sqlparse import tokens as T
-from sqlparse.utils import recurse, imt, find_matching
-
-M_ROLE = (T.Keyword, ('null', 'role'))
-M_SEMICOLON = (T.Punctuation, ';')
-M_COMMA = (T.Punctuation, ',')
+from sqlparse.utils import recurse, imt
T_NUMERICAL = (T.Number, T.Number.Integer, T.Number.Float)
T_STRING = (T.String, T.String.Single, T.String.Symbol)
T_NAME = (T.Name, T.Name.Placeholder)
-def _group_left_right(tlist, m, cls,
- valid_left=lambda t: t is not None,
- valid_right=lambda t: t is not None,
- semicolon=False):
- """Groups together tokens that are joined by a middle token. ie. x < y"""
+def _group_matching(tlist, cls):
+ """Groups Tokens that have beginning and end."""
+ opens = []
+ tidx_offset = 0
+ for idx, token in enumerate(list(tlist)):
+ tidx = idx - tidx_offset
+
+ if token.is_whitespace():
+ # ~50% of tokens will be whitespace. Checking for them early
+ # avoids 3 comparisons apiece, at the cost of 1 extra comparison
+ # for the other ~50% of tokens...
+ continue
- for token in list(tlist):
if token.is_group() and not isinstance(token, cls):
- _group_left_right(token, m, cls, valid_left, valid_right,
- semicolon)
-
- if not token.match(*m):
+ # Check inside a previously grouped token (e.g. a parenthesis) for
+ # groups of a different type (e.g. a case). Ideally this would
+ # check for all open/close tokens at once to avoid the recursion.
+ _group_matching(token, cls)
continue
- left, right = tlist.token_prev(token), tlist.token_next(token)
+ if token.match(*cls.M_OPEN):
+ opens.append(tidx)
- if valid_left(left) and valid_right(right):
- if semicolon:
- # only overwrite if a semicolon present.
- sright = tlist.token_next_by(m=M_SEMICOLON, idx=right)
- right = sright or right
- tokens = tlist.tokens_between(left, right)
- tlist.group_tokens(cls, tokens, extend=True)
+ elif token.match(*cls.M_CLOSE):
+ try:
+ open_idx = opens.pop()
+ except IndexError:
+ # This indicates invalid SQL with unbalanced tokens.
+ # Instead of breaking, continue in case other valid groups exist.
+ continue
+ close_idx = tidx
+ tlist.group_tokens(cls, open_idx, close_idx)
+ tidx_offset += close_idx - open_idx
-def _group_matching(tlist, cls):
- """Groups Tokens that have beginning and end."""
- [_group_matching(sgroup, cls) for sgroup in tlist.get_sublists()
- if not isinstance(sgroup, cls)]
- idx = 1 if isinstance(tlist, cls) else 0
+def group_brackets(tlist):
+ _group_matching(tlist, sql.SquareBrackets)
- token = tlist.token_next_by(m=cls.M_OPEN, idx=idx)
- while token:
- end = find_matching(tlist, token, cls.M_OPEN, cls.M_CLOSE)
- if end is not None:
- tokens = tlist.tokens_between(token, end)
- token = tlist.group_tokens(cls, tokens)
- _group_matching(token, cls)
- token = tlist.token_next_by(m=cls.M_OPEN, idx=token)
+
+def group_parenthesis(tlist):
+ _group_matching(tlist, sql.Parenthesis)
+
+
+def group_case(tlist):
+ _group_matching(tlist, sql.Case)
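
The rewritten _group_matching above replaces the old recursive find_matching() scan with a single pass: indexes of unmatched open tokens go on a stack, and every grouping shrinks the list, so tidx_offset compensates by shifting later raw indexes left. The bookkeeping is easier to see outside the tokenizer; a minimal standalone sketch (plain Python lists and characters, names hypothetical, not sqlparse's API):

def group_matching(items, open_ch='(', close_ch=')'):
    """Collapse each balanced open..close span into a nested list, one pass."""
    opens = []   # indexes of not-yet-matched open tokens
    offset = 0   # positions removed from the list by earlier groupings
    for raw_idx, item in enumerate(list(items)):
        idx = raw_idx - offset
        if item == open_ch:
            opens.append(idx)
        elif item == close_ch:
            if not opens:
                continue  # unbalanced input: skip, keep scanning
            start = opens.pop()
            items[start:idx + 1] = [items[start:idx + 1]]  # group in place
            offset += idx - start  # list just shrank by idx - start items
    return items

print(group_matching(list("a(b(c)d)e")))
# ['a', ['(', 'b', ['(', 'c', ')'], 'd', ')'], 'e']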
def group_if(tlist):
@@ -67,149 +69,202 @@ def group_for(tlist):
_group_matching(tlist, sql.For)
-def group_foreach(tlist):
- _group_matching(tlist, sql.For)
-
-
def group_begin(tlist):
_group_matching(tlist, sql.Begin)
+def group_typecasts(tlist):
+ def match(token):
+ return token.match(T.Punctuation, '::')
+
+ def valid(token):
+ return token is not None
+
+ def post(tlist, pidx, tidx, nidx):
+ return pidx, nidx
+
+ valid_prev = valid_next = valid
+ _group(tlist, sql.Identifier, match, valid_prev, valid_next, post)
+
+
+def group_period(tlist):
+ def match(token):
+ return token.match(T.Punctuation, '.')
+
+ def valid_prev(token):
+ sqlcls = sql.SquareBrackets, sql.Identifier
+ ttypes = T.Name, T.String.Symbol
+ return imt(token, i=sqlcls, t=ttypes)
+
+ def valid_next(token):
+ sqlcls = sql.SquareBrackets, sql.Function
+ ttypes = T.Name, T.String.Symbol, T.Wildcard
+ return imt(token, i=sqlcls, t=ttypes)
+
+ def post(tlist, pidx, tidx, nidx):
+ return pidx, nidx
+
+ _group(tlist, sql.Identifier, match, valid_prev, valid_next, post)
+
+
def group_as(tlist):
- lfunc = lambda tk: not imt(tk, t=T.Keyword) or tk.value == 'NULL'
- rfunc = lambda tk: not imt(tk, t=(T.DML, T.DDL))
- _group_left_right(tlist, (T.Keyword, 'AS'), sql.Identifier,
- valid_left=lfunc, valid_right=rfunc)
+ def match(token):
+ return token.is_keyword and token.normalized == 'AS'
+
+ def valid_prev(token):
+ return token.normalized == 'NULL' or not token.is_keyword
+
+ def valid_next(token):
+ ttypes = T.DML, T.DDL
+ return not imt(token, t=ttypes)
+
+ def post(tlist, pidx, tidx, nidx):
+ return pidx, nidx
+
+ _group(tlist, sql.Identifier, match, valid_prev, valid_next, post)
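
group_typecasts, group_period, and group_as all follow the same four-callback protocol expected by the new _group helper (defined at the bottom of this diff): match picks out the middle token, valid_prev/valid_next vet its neighbors, and post returns the index range to group. A hedged sketch of what an additional grouper could look like under this protocol; the '->' operator case is illustrative only and not part of this patch:

from sqlparse import sql
from sqlparse import tokens as T

def group_json_arrow(tlist):
    # Hypothetical grouper for Postgres' JSON '->' operator, written
    # against the _group helper introduced by this patch.
    def match(token):
        return token.match(T.Operator, '->')

    def valid(token):
        return token is not None

    def post(tlist, pidx, tidx, nidx):
        # group from the previous token through the next one
        return pidx, nidx

    valid_prev = valid_next = valid
    _group(tlist, sql.Operation, match, valid_prev, valid_next, post)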
def group_assignment(tlist):
- _group_left_right(tlist, (T.Assignment, ':='), sql.Assignment,
- semicolon=True)
+ def match(token):
+ return token.match(T.Assignment, ':=')
+ def valid(token):
+ return token is not None
-def group_comparison(tlist):
- I_COMPERABLE = (sql.Parenthesis, sql.Function, sql.Identifier,
- sql.Operation)
- T_COMPERABLE = T_NUMERICAL + T_STRING + T_NAME
+ def post(tlist, pidx, tidx, nidx):
+ m_semicolon = T.Punctuation, ';'
+ snidx, _ = tlist.token_next_by(m=m_semicolon, idx=nidx)
+ nidx = snidx or nidx
+ return pidx, nidx
- func = lambda tk: (imt(tk, t=T_COMPERABLE, i=I_COMPERABLE) or
- (tk and tk.is_keyword and tk.normalized == 'NULL'))
+ valid_prev = valid_next = valid
+ _group(tlist, sql.Assignment, match, valid_prev, valid_next, post)
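
Here post does real work: it scans forward for a semicolon and, if one exists, widens the right edge of the group to include it (the old code's "only overwrite if a semicolon present"). Illustrative values, not from the patch's test suite:

# 'foo := 1;'  ->  Assignment('foo := 1;')   semicolon pulled into the group
# 'foo := 1'   ->  Assignment('foo := 1')    nothing to extend to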
- _group_left_right(tlist, (T.Operator.Comparison, None), sql.Comparison,
- valid_left=func, valid_right=func)
+def group_comparison(tlist):
+ sqlcls = (sql.Parenthesis, sql.Function, sql.Identifier,
+ sql.Operation)
+ ttypes = T_NUMERICAL + T_STRING + T_NAME
+
+ def match(token):
+ return token.ttype == T.Operator.Comparison
+
+ def valid(token):
+ if imt(token, t=ttypes, i=sqlcls):
+ return True
+ elif token and token.is_keyword and token.normalized == 'NULL':
+ return True
+ else:
+ return False
-def group_case(tlist):
- _group_matching(tlist, sql.Case)
+ def post(tlist, pidx, tidx, nidx):
+ return pidx, nidx
+
+ valid_prev = valid_next = valid
+ _group(tlist, sql.Comparison, match,
+ valid_prev, valid_next, post, extend=False)
@recurse(sql.Identifier)
def group_identifier(tlist):
- T_IDENT = (T.String.Symbol, T.Name)
+ ttypes = (T.String.Symbol, T.Name)
- token = tlist.token_next_by(t=T_IDENT)
+ tidx, token = tlist.token_next_by(t=ttypes)
while token:
- token = tlist.group_tokens(sql.Identifier, [token, ])
- token = tlist.token_next_by(t=T_IDENT, idx=token)
+ tlist.group_tokens(sql.Identifier, tidx, tidx)
+ tidx, token = tlist.token_next_by(t=ttypes, idx=tidx)
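
This hunk shows the pattern the patch ("long live indexes") is named for: navigation helpers such as token_next_by, token_next, and token_prev now return an (index, token) pair, and group_tokens takes a start/end index range, so callers no longer pay for linear token_index() scans. Side by side, condensed from the hunk above:

# Before: helpers return the token; positions are re-derived internally.
token = tlist.token_next_by(t=ttypes)
while token:
    token = tlist.group_tokens(sql.Identifier, [token])
    token = tlist.token_next_by(t=ttypes, idx=token)

# After: helpers return (index, token); grouping works on index ranges.
tidx, token = tlist.token_next_by(t=ttypes)
while token:
    tlist.group_tokens(sql.Identifier, tidx, tidx)
    tidx, token = tlist.token_next_by(t=ttypes, idx=tidx)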
-def group_period(tlist):
- lfunc = lambda tk: imt(tk, i=(sql.SquareBrackets, sql.Identifier),
- t=(T.Name, T.String.Symbol,))
+def group_arrays(tlist):
+ sqlcls = sql.SquareBrackets, sql.Identifier, sql.Function
+ ttypes = T.Name, T.String.Symbol
- rfunc = lambda tk: imt(tk, i=(sql.SquareBrackets, sql.Function),
- t=(T.Name, T.String.Symbol, T.Wildcard))
+ def match(token):
+ return isinstance(token, sql.SquareBrackets)
- _group_left_right(tlist, (T.Punctuation, '.'), sql.Identifier,
- valid_left=lfunc, valid_right=rfunc)
+ def valid_prev(token):
+ return imt(token, i=sqlcls, t=ttypes)
+ def valid_next(token):
+ return True
-def group_arrays(tlist):
- token = tlist.token_next_by(i=sql.SquareBrackets)
- while token:
- prev = tlist.token_prev(token)
- if imt(prev, i=(sql.SquareBrackets, sql.Identifier, sql.Function),
- t=(T.Name, T.String.Symbol,)):
- tokens = tlist.tokens_between(prev, token)
- token = tlist.group_tokens(sql.Identifier, tokens, extend=True)
- token = tlist.token_next_by(i=sql.SquareBrackets, idx=token)
+ def post(tlist, pidx, tidx, nidx):
+ return pidx, tidx
+ _group(tlist, sql.Identifier, match,
+ valid_prev, valid_next, post, extend=True, recurse=False)
-@recurse(sql.Identifier)
-def group_operator(tlist):
- I_CYCLE = (sql.SquareBrackets, sql.Parenthesis, sql.Function,
- sql.Identifier, sql.Operation)
- # wilcards wouldn't have operations next to them
- T_CYCLE = T_NUMERICAL + T_STRING + T_NAME
- func = lambda tk: imt(tk, i=I_CYCLE, t=T_CYCLE)
- token = tlist.token_next_by(t=(T.Operator, T.Wildcard))
- while token:
- left, right = tlist.token_prev(token), tlist.token_next(token)
-
- if func(left) and func(right):
- token.ttype = T.Operator
- tokens = tlist.tokens_between(left, right)
- token = tlist.group_tokens(sql.Operation, tokens)
+def group_operator(tlist):
+ ttypes = T_NUMERICAL + T_STRING + T_NAME
+ sqlcls = (sql.SquareBrackets, sql.Parenthesis, sql.Function,
+ sql.Identifier, sql.Operation)
- token = tlist.token_next_by(t=(T.Operator, T.Wildcard), idx=token)
+ def match(token):
+ return imt(token, t=(T.Operator, T.Wildcard))
+ def valid(token):
+ return imt(token, i=sqlcls, t=ttypes)
-@recurse(sql.IdentifierList)
-def group_identifier_list(tlist):
- I_IDENT_LIST = (sql.Function, sql.Case, sql.Identifier, sql.Comparison,
- sql.IdentifierList, sql.Operation)
- T_IDENT_LIST = (T_NUMERICAL + T_STRING + T_NAME +
- (T.Keyword, T.Comment, T.Wildcard))
+ def post(tlist, pidx, tidx, nidx):
+ tlist[tidx].ttype = T.Operator
+ return pidx, nidx
- func = lambda t: imt(t, i=I_IDENT_LIST, m=M_ROLE, t=T_IDENT_LIST)
- token = tlist.token_next_by(m=M_COMMA)
+ valid_prev = valid_next = valid
+ _group(tlist, sql.Operation, match,
+ valid_prev, valid_next, post, extend=False)
- while token:
- before, after = tlist.token_prev(token), tlist.token_next(token)
- if func(before) and func(after):
- tokens = tlist.tokens_between(before, after)
- token = tlist.group_tokens(sql.IdentifierList, tokens, extend=True)
- token = tlist.token_next_by(m=M_COMMA, idx=token)
+def group_identifier_list(tlist):
+ m_role = T.Keyword, ('null', 'role')
+ m_comma = T.Punctuation, ','
+ sqlcls = (sql.Function, sql.Case, sql.Identifier, sql.Comparison,
+ sql.IdentifierList, sql.Operation)
+ ttypes = (T_NUMERICAL + T_STRING + T_NAME +
+ (T.Keyword, T.Comment, T.Wildcard))
+ def match(token):
+ return imt(token, m=m_comma)
-def group_brackets(tlist):
- _group_matching(tlist, sql.SquareBrackets)
+ def valid(token):
+ return imt(token, i=sqlcls, m=m_role, t=ttypes)
+ def post(tlist, pidx, tidx, nidx):
+ return pidx, nidx
-def group_parenthesis(tlist):
- _group_matching(tlist, sql.Parenthesis)
+ valid_prev = valid_next = valid
+ _group(tlist, sql.IdentifierList, match,
+ valid_prev, valid_next, post, extend=True)
@recurse(sql.Comment)
def group_comments(tlist):
- token = tlist.token_next_by(t=T.Comment)
+ tidx, token = tlist.token_next_by(t=T.Comment)
while token:
- end = tlist.token_not_matching(
- token, lambda tk: imt(tk, t=T.Comment) or tk.is_whitespace())
+ eidx, end = tlist.token_not_matching(
+ lambda tk: imt(tk, t=T.Comment) or tk.is_whitespace(), idx=tidx)
if end is not None:
- end = tlist.token_prev(end, False)
- tokens = tlist.tokens_between(token, end)
- token = tlist.group_tokens(sql.Comment, tokens)
+ eidx, end = tlist.token_prev(eidx, skip_ws=False)
+ tlist.group_tokens(sql.Comment, tidx, eidx)
- token = tlist.token_next_by(t=T.Comment, idx=token)
+ tidx, token = tlist.token_next_by(t=T.Comment, idx=tidx)
@recurse(sql.Where)
def group_where(tlist):
- token = tlist.token_next_by(m=sql.Where.M_OPEN)
+ tidx, token = tlist.token_next_by(m=sql.Where.M_OPEN)
while token:
- end = tlist.token_next_by(m=sql.Where.M_CLOSE, idx=token)
+ eidx, end = tlist.token_next_by(m=sql.Where.M_CLOSE, idx=tidx)
if end is None:
- tokens = tlist.tokens_between(token, tlist._groupable_tokens[-1])
+ end = tlist._groupable_tokens[-1]
else:
- tokens = tlist.tokens_between(
- token, tlist.tokens[tlist.token_index(end) - 1])
-
- token = tlist.group_tokens(sql.Where, tokens)
- token = tlist.token_next_by(m=sql.Where.M_OPEN, idx=token)
+ end = tlist.tokens[eidx - 1]
+ # TODO: convert this to eidx instead of an end token.
+ # I think the above values are len(tlist) and eidx - 1.
+ eidx = tlist.token_index(end)
+ tlist.group_tokens(sql.Where, tidx, eidx)
+ tidx, token = tlist.token_next_by(m=sql.Where.M_OPEN, idx=tidx)
@recurse()
@@ -217,17 +272,12 @@ def group_aliased(tlist):
I_ALIAS = (sql.Parenthesis, sql.Function, sql.Case, sql.Identifier,
sql.Operation)
- token = tlist.token_next_by(i=I_ALIAS, t=T.Number)
+ tidx, token = tlist.token_next_by(i=I_ALIAS, t=T.Number)
while token:
- next_ = tlist.token_next(token)
+ nidx, next_ = tlist.token_next(tidx)
if imt(next_, i=sql.Identifier):
- tokens = tlist.tokens_between(token, next_)
- token = tlist.group_tokens(sql.Identifier, tokens, extend=True)
- token = tlist.token_next_by(i=I_ALIAS, t=T.Number, idx=token)
-
-
-def group_typecasts(tlist):
- _group_left_right(tlist, (T.Punctuation, '::'), sql.Identifier)
+ tlist.group_tokens(sql.Identifier, tidx, nidx, extend=True)
+ tidx, token = tlist.token_next_by(i=I_ALIAS, t=T.Number, idx=tidx)
@recurse(sql.Function)
@@ -241,45 +291,51 @@ def group_functions(tlist):
has_table = True
if has_create and has_table:
return
- token = tlist.token_next_by(t=T.Name)
+
+ tidx, token = tlist.token_next_by(t=T.Name)
while token:
- next_ = tlist.token_next(token)
- if imt(next_, i=sql.Parenthesis):
- tokens = tlist.tokens_between(token, next_)
- token = tlist.group_tokens(sql.Function, tokens)
- token = tlist.token_next_by(t=T.Name, idx=token)
+ nidx, next_ = tlist.token_next(tidx)
+ if isinstance(next_, sql.Parenthesis):
+ tlist.group_tokens(sql.Function, tidx, nidx)
+ tidx, token = tlist.token_next_by(t=T.Name, idx=tidx)
def group_order(tlist):
"""Group together Identifier and Asc/Desc token"""
- token = tlist.token_next_by(t=T.Keyword.Order)
+ tidx, token = tlist.token_next_by(t=T.Keyword.Order)
while token:
- prev = tlist.token_prev(token)
- if imt(prev, i=sql.Identifier, t=T.Number):
- tokens = tlist.tokens_between(prev, token)
- token = tlist.group_tokens(sql.Identifier, tokens)
- token = tlist.token_next_by(t=T.Keyword.Order, idx=token)
+ pidx, prev_ = tlist.token_prev(tidx)
+ if imt(prev_, i=sql.Identifier, t=T.Number):
+ tlist.group_tokens(sql.Identifier, pidx, tidx)
+ tidx = pidx
+ tidx, token = tlist.token_next_by(t=T.Keyword.Order, idx=tidx)
@recurse()
def align_comments(tlist):
- token = tlist.token_next_by(i=sql.Comment)
+ tidx, token = tlist.token_next_by(i=sql.Comment)
while token:
- before = tlist.token_prev(token)
- if isinstance(before, sql.TokenList):
- tokens = tlist.tokens_between(before, token)
- token = tlist.group_tokens(sql.TokenList, tokens, extend=True)
- token = tlist.token_next_by(i=sql.Comment, idx=token)
+ pidx, prev_ = tlist.token_prev(tidx)
+ if isinstance(prev_, sql.TokenList):
+ tlist.group_tokens(sql.TokenList, pidx, tidx, extend=True)
+ tidx = pidx
+ tidx, token = tlist.token_next_by(i=sql.Comment, idx=tidx)
def group(stmt):
for func in [
group_comments,
+
+ # _group_matching
group_brackets,
group_parenthesis,
+ group_case,
+ group_if,
+ group_for,
+ group_begin,
+
group_functions,
group_where,
- group_case,
group_period,
group_arrays,
group_identifier,
@@ -290,12 +346,42 @@ def group(stmt):
group_aliased,
group_assignment,
group_comparison,
+
align_comments,
group_identifier_list,
- group_if,
- group_for,
- group_foreach,
- group_begin,
]:
func(stmt)
return stmt
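
group() is the pipeline the engine applies to every parsed statement, so the reordering above (matching-pair groupers first, identifier-list grouping last) is observable from ordinary sqlparse usage, for instance:

import sqlparse

stmt = sqlparse.parse("SELECT a, b FROM t WHERE x = 1")[0]
for token in stmt.tokens:
    # after grouping, 'a, b' arrives as one IdentifierList and the
    # trailing clause as a single Where group
    print(type(token).__name__, repr(str(token)))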
+
+
+def _group(tlist, cls, match,
+ valid_prev=lambda t: True,
+ valid_next=lambda t: True,
+ post=None,
+ extend=True,
+ recurse=True
+ ):
+ """Groups together tokens that are joined by a middle token. ie. x < y"""
+
+ tidx_offset = 0
+ pidx, prev_ = None, None
+ for idx, token in enumerate(list(tlist)):
+ tidx = idx - tidx_offset
+
+ if token.is_whitespace():
+ continue
+
+ if recurse and token.is_group() and not isinstance(token, cls):
+ _group(token, cls, match, valid_prev, valid_next, post, extend)
+
+ if match(token):
+ nidx, next_ = tlist.token_next(tidx)
+ if valid_prev(prev_) and valid_next(next_):
+ from_idx, to_idx = post(tlist, pidx, tidx, nidx)
+ grp = tlist.group_tokens(cls, from_idx, to_idx, extend=extend)
+
+ tidx_offset += to_idx - from_idx
+ pidx, prev_ = from_idx, grp
+ continue
+
+ pidx, prev_ = tidx, token
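
The closing pidx, prev_ = from_idx, grp assignment is what lets chained constructs collapse in a single pass: after a grouping, the new group's index becomes the recorded left neighbor for the next middle token, while tidx_offset keeps the snapshot's raw indexes aligned with the mutated list. A hedged trace for 'a.b.c' under group_period, whitespace omitted:

# snapshot: a . b . c   (raw indexes 0..4)
# raw idx 1 ('.'): neighbors 'a' and 'b' pass valid_prev/valid_next;
#   post() -> (0, 2), so group_tokens(Identifier, 0, 2) builds
#   Identifier('a.b') and tidx_offset grows by 2.
# raw idx 3 ('.'): corrected to index 3 - 2 = 1, which is where '.'
#   now sits in the mutated list [Identifier('a.b'), '.', 'c'];
#   post() -> (0, 2) again and, since extend defaults to True,
#   the existing Identifier stretches into Identifier('a.b.c').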