From 7631460377274f6a9e074c974e2d63dafb1938eb Mon Sep 17 00:00:00 2001
From: quest
Date: Sat, 21 Apr 2012 19:57:26 +0200
Subject: We can now work with file-like objects.

---
 sqlparse/__init__.py |  8 ++++++++
 sqlparse/lexer.py    | 57 +++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 47 insertions(+), 18 deletions(-)

(limited to 'sqlparse')

diff --git a/sqlparse/__init__.py b/sqlparse/__init__.py
index f924c04..bacaa78 100644
--- a/sqlparse/__init__.py
+++ b/sqlparse/__init__.py
@@ -54,6 +54,14 @@ def split(sql):
     stack.split_statements = True
     return [unicode(stmt) for stmt in stack.run(sql)]

+def splitstream(sql):
+    """Split *sql* into single statements.
+
+    Returns a list of strings.
+    """
+    stack = engine.FilterStack()
+    stack.split_statements = True
+    return stack.run(sql)

 from sqlparse.engine.filter import StatementFilter
 def split2(stream):
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 321669d..67dbc29 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -159,6 +159,7 @@ class Lexer(object):
     stripnl = False
     tabsize = 0
     flags = re.IGNORECASE
+    bufsize = 4096

     tokens = {
         'root': [
@@ -214,6 +215,21 @@ class Lexer(object):
             filter_ = filter_(**options)
         self.filters.append(filter_)

+    def _decode(self, text):
+        if self.encoding == 'guess':
+            try:
+                text = text.decode('utf-8')
+                if text.startswith(u'\ufeff'):
+                    text = text[len(u'\ufeff'):]
+            except UnicodeDecodeError:
+                text = text.decode('latin1')
+        else:
+            text = text.decode(self.encoding)
+
+        if self.tabsize > 0:
+            text = text.expandtabs(self.tabsize)
+        return text
+
     def get_tokens(self, text, unfiltered=False):
         """
         Return an iterable of (tokentype, value) pairs generated from
@@ -223,24 +239,14 @@
         Also preprocess the text, i.e. expand tabs and strip it
         if wanted and applies registered filters.
         """
-        if not isinstance(text, unicode):
-            if self.encoding == 'guess':
-                try:
-                    text = text.decode('utf-8')
-                    if text.startswith(u'\ufeff'):
-                        text = text[len(u'\ufeff'):]
-                except UnicodeDecodeError:
-                    text = text.decode('latin1')
-            else:
-                text = text.decode(self.encoding)
-        if self.stripall:
-            text = text.strip()
-        elif self.stripnl:
-            text = text.strip('\n')
-        if self.tabsize > 0:
-            text = text.expandtabs(self.tabsize)
-#        if not text.endswith('\n'):
-#            text += '\n'
+        if isinstance(text, str):
+            text = self._decode(text)
+
+        if isinstance(text, basestring):
+            if self.stripall:
+                text = text.strip()
+            elif self.stripnl:
+                text = text.strip('\n')

         def streamer():
             for i, t, v in self.get_tokens_unprocessed(text):
@@ -261,10 +267,19 @@ class Lexer(object):
         statestack = list(stack)
         statetokens = tokendefs[statestack[-1]]
         known_names = {}
+
+        hasmore = False
+        if hasattr(text, 'read'):
+            o, text = text, self._decode(text.read(self.bufsize))
+            hasmore = len(text) == self.bufsize
+
         while 1:
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, pos)
                 if m:
+                    if hasmore and m.end() == len(text):
+                        continue
+
                     # print rex.pattern
                     value = m.group()
                     if value in known_names:
@@ -307,6 +322,12 @@ class Lexer(object):
                         statetokens = tokendefs['root']
                         yield pos, tokens.Text, u'\n'
                         continue
+                    if hasmore:
+                        buf = self._decode(o.read(self.bufsize))
+                        hasmore = len(buf) == self.bufsize
+                        text = text[pos:] + buf
+                        pos = 0
+                        continue
                     yield pos, tokens.Error, text[pos]
                     pos += 1
             except IndexError:
--
cgit v1.2.1
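
With this patch the lexer can pull its input from a file-like object in `bufsize` chunks, and `splitstream()` exposes that at the top level. A rough usage sketch, not part of the patch itself; the file name is invented and the exact type of the yielded items (strings per the docstring, Statement objects per `stack.run()`) is an assumption:

    # Hypothetical usage of the new splitstream() with a file-like object.
    import sqlparse

    with open('queries.sql') as f:           # 'queries.sql' is an invented example file
        for stmt in sqlparse.splitstream(f):
            print unicode(stmt)              # each statement as it is split off the stream
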
From 13274ebb0d66b64363145bf23b6aa824845f61be Mon Sep 17 00:00:00 2001
From: quest
Date: Sat, 21 Apr 2012 20:18:42 +0200
Subject: Always use a stream internally; makes things more readable.

---
 sqlparse/lexer.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'sqlparse')

diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 67dbc29..82a6169 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -16,7 +16,7 @@ import re

 from sqlparse import tokens
 from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON
-
+from cStringIO import StringIO

 class include(str):
     pass
@@ -239,15 +239,18 @@ class Lexer(object):
         Also preprocess the text, i.e. expand tabs and strip it
         if wanted and applies registered filters.
         """
-        if isinstance(text, str):
-            text = self._decode(text)
-
         if isinstance(text, basestring):
             if self.stripall:
                 text = text.strip()
             elif self.stripnl:
                 text = text.strip('\n')

+        if isinstance(text, unicode):
+            text = StringIO(text.encode('utf-8'))
+            self.encoding = 'utf-8'
+        else:
+            text = StringIO(text)
+
         def streamer():
             for i, t, v in self.get_tokens_unprocessed(text):
                 yield t, v
@@ -256,7 +259,7 @@ class Lexer(object):
         stream = apply_filters(stream, self.filters, self)
         return stream

-    def get_tokens_unprocessed(self, text, stack=('root',)):
+    def get_tokens_unprocessed(self, stream, stack=('root',)):
         """
         Split ``text`` into (tokentype, text) pairs.

@@ -268,10 +271,8 @@ class Lexer(object):
         statetokens = tokendefs[statestack[-1]]
         known_names = {}

-        hasmore = False
-        if hasattr(text, 'read'):
-            o, text = text, self._decode(text.read(self.bufsize))
-            hasmore = len(text) == self.bufsize
+        text = self._decode(stream.read(self.bufsize))
+        hasmore = len(text) == self.bufsize

         while 1:
             for rexmatch, action, new_state in statetokens:
@@ -315,6 +316,12 @@ class Lexer(object):
                 break
             else:
                 try:
+                    if hasmore:
+                        buf = self._decode(stream.read(self.bufsize))
+                        hasmore = len(buf) == self.bufsize
+                        text = text[pos:] + buf
+                        pos = 0
+                        continue
                     if text[pos] == '\n':
                         # at EOL, reset state to "root"
                         pos += 1
@@ -322,12 +329,6 @@ class Lexer(object):
                         statetokens = tokendefs['root']
                         yield pos, tokens.Text, u'\n'
                         continue
-                    if hasmore:
-                        buf = self._decode(o.read(self.bufsize))
-                        hasmore = len(buf) == self.bufsize
-                        text = text[pos:] + buf
-                        pos = 0
-                        continue
                     yield pos, tokens.Error, text[pos]
                     pos += 1
             except IndexError:
--
cgit v1.2.1
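
The second patch removes the string/stream special cases from `get_tokens()` by always wrapping the input in a `cStringIO.StringIO`; unicode input is encoded to UTF-8 first and `self.encoding` is forced to `'utf-8'` so that `_decode()` can reverse it chunk by chunk. A small stand-alone illustration of that round trip, not taken from the patch:

    # Sketch of the normalization step: unicode -> UTF-8 bytes -> StringIO,
    # decoded again one buffer-sized read at a time.
    from cStringIO import StringIO

    sql = u'SELECT * FROM caf\xe9;'            # non-ASCII identifier, invented example
    stream = StringIO(sql.encode('utf-8'))     # what get_tokens() now does internally
    chunk = stream.read(4096)                  # one bufsize-sized read
    assert chunk.decode('utf-8') == sql
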
""" stack = engine.FilterStack() stack.split_statements = True - return stack.run(sql) + return stack.run(stream) from sqlparse.engine.filter import StatementFilter def split2(stream): diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 82a6169..a0ce5d2 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -279,6 +279,7 @@ class Lexer(object): m = rexmatch(text, pos) if m: if hasmore and m.end() == len(text): + # Since this is end, token may be truncated continue # print rex.pattern @@ -315,13 +316,14 @@ class Lexer(object): statetokens = tokendefs[statestack[-1]] break else: + if hasmore: + buf = self._decode(stream.read(self.bufsize)) + hasmore = len(buf) == self.bufsize + text = text[pos:] + buf + pos = 0 + continue + try: - if hasmore: - buf = self._decode(stream.read(self.bufsize)) - hasmore = len(buf) == self.bufsize - text = text[pos:] + buf - pos = 0 - continue if text[pos] == '\n': # at EOL, reset state to "root" pos += 1 -- cgit v1.2.1 From 1e3fbade7c80d917b4d727e9cef781b21be2fdf6 Mon Sep 17 00:00:00 2001 From: quest Date: Sat, 21 Apr 2012 22:37:30 +0200 Subject: splitstream -> parsestream --- sqlparse/__init__.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/__init__.py b/sqlparse/__init__.py index e622292..58a560c 100644 --- a/sqlparse/__init__.py +++ b/sqlparse/__init__.py @@ -31,6 +31,16 @@ def parse(sql): return tuple(stack.run(sql)) +def parsestream(stream): + """Pares sql statements from file-like object. + + Returns a generator of Statement instances. + """ + stack = engine.FilterStack() + stack.full_analyze() + return stack.run(stream) + + def format(sql, **options): """Format *sql* according to *options*. @@ -54,15 +64,6 @@ def split(sql): stack.split_statements = True return [unicode(stmt) for stmt in stack.run(sql)] -def splitstream(stream): - """Split sql statements from file-like object . - - Returns a list of Statement objects. 
- """ - stack = engine.FilterStack() - stack.split_statements = True - return stack.run(stream) - from sqlparse.engine.filter import StatementFilter def split2(stream): splitter = StatementFilter() -- cgit v1.2.1 From 1f8dfd8723dd7aa9610fd9249775dc3b403d7e77 Mon Sep 17 00:00:00 2001 From: quest Date: Sun, 22 Apr 2012 00:27:15 +0200 Subject: Oops; doesnt handle UTF-8 correctly when reading from stream --- sqlparse/lexer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index a0ce5d2..dc794ab 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -271,8 +271,9 @@ class Lexer(object): statetokens = tokendefs[statestack[-1]] known_names = {} - text = self._decode(stream.read(self.bufsize)) + text = stream.read(self.bufsize) hasmore = len(text) == self.bufsize + text = self._decode(text) while 1: for rexmatch, action, new_state in statetokens: @@ -317,9 +318,9 @@ class Lexer(object): break else: if hasmore: - buf = self._decode(stream.read(self.bufsize)) + buf = stream.read(self.bufsize) hasmore = len(buf) == self.bufsize - text = text[pos:] + buf + text = text[pos:] + self._decode(buf) pos = 0 continue -- cgit v1.2.1 From a16c08703c8eb213a8b570bb16636fbe7a2b4a28 Mon Sep 17 00:00:00 2001 From: quest Date: Sun, 22 Apr 2012 01:41:22 +0200 Subject: various optimizations in sql.py --- sqlparse/engine/grouping.py | 3 ++- sqlparse/sql.py | 27 +++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/engine/grouping.py b/sqlparse/engine/grouping.py index 55ec7e2..1487c24 100644 --- a/sqlparse/engine/grouping.py +++ b/sqlparse/engine/grouping.py @@ -55,7 +55,8 @@ def _group_matching(tlist, start_ttype, start_value, end_ttype, end_value, cls, include_semicolon=False, recurse=False): def _find_matching(i, tl, stt, sva, ett, eva): depth = 1 - for t in tl.tokens[i:]: + for n in xrange(i, len(tl.tokens)): + t = tl.tokens[n] if t.match(stt, sva): depth += 1 elif t.match(ett, eva): diff --git a/sqlparse/sql.py b/sqlparse/sql.py index 9c7aeee..31fa34d 100644 --- a/sqlparse/sql.py +++ b/sqlparse/sql.py @@ -15,11 +15,13 @@ class Token(object): the type of the token. 
""" - __slots__ = ('value', 'ttype', 'parent') + __slots__ = ('value', 'ttype', 'parent', 'normalized', 'is_keyword') def __init__(self, ttype, value): self.value = value + self.normalized = value.upper() if ttype in T.Keyword else value self.ttype = ttype + self.is_keyword = ttype in T.Keyword self.parent = None def __str__(self): @@ -71,9 +73,9 @@ class Token(object): type_matched = self.ttype is ttype if not type_matched or values is None: return type_matched - if isinstance(values, basestring): - values = set([values]) if regex: + if isinstance(values, basestring): + values = set([values]) if self.ttype is T.Keyword: values = set([re.compile(v, re.IGNORECASE) for v in values]) else: @@ -83,10 +85,18 @@ class Token(object): return True return False else: - if self.ttype in T.Keyword: - values = set([v.upper() for v in values]) - return self.value.upper() in values + if isinstance(values, basestring): + if self.is_keyword: + return values.upper() == self.normalized + else: + return values == self.value + if self.is_keyword: + for v in values: + if v.upper() == self.normalized: + return True + return False else: + print len(values) return self.value in values def is_group(self): @@ -227,7 +237,8 @@ class TokenList(Token): if not isinstance(idx, int): idx = self.token_index(idx) - for token in self.tokens[idx:]: + for n in xrange(idx, len(self.tokens)): + token = self.tokens[n] if token.match(ttype, value, regex): return token @@ -395,7 +406,7 @@ class Statement(TokenList): return 'UNKNOWN' elif first_token.ttype in (T.Keyword.DML, T.Keyword.DDL): - return first_token.value.upper() + return first_token.normalized return 'UNKNOWN' -- cgit v1.2.1