Diffstat (limited to 'sqlparse/lexer.py')
-rw-r--r--  sqlparse/lexer.py  63
1 file changed, 44 insertions, 19 deletions
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 321669d..dc794ab 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -16,7 +16,7 @@ import re
 
 from sqlparse import tokens
 from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON
-
+from cStringIO import StringIO
 
 class include(str):
     pass
@@ -159,6 +159,7 @@ class Lexer(object):
     stripnl = False
     tabsize = 0
     flags = re.IGNORECASE
+    bufsize = 4096
 
     tokens = {
         'root': [
@@ -214,6 +215,21 @@
             filter_ = filter_(**options)
         self.filters.append(filter_)
 
+    def _decode(self, text):
+        if self.encoding == 'guess':
+            try:
+                text = text.decode('utf-8')
+                if text.startswith(u'\ufeff'):
+                    text = text[len(u'\ufeff'):]
+            except UnicodeDecodeError:
+                text = text.decode('latin1')
+        else:
+            text = text.decode(self.encoding)
+
+        if self.tabsize > 0:
+            text = text.expandtabs(self.tabsize)
+        return text
+
     def get_tokens(self, text, unfiltered=False):
         """
         Return an iterable of (tokentype, value) pairs generated from
@@ -223,24 +239,17 @@
         Also preprocess the text, i.e. expand tabs and strip it
         if wanted and applies registered filters.
         """
-        if not isinstance(text, unicode):
-            if self.encoding == 'guess':
-                try:
-                    text = text.decode('utf-8')
-                    if text.startswith(u'\ufeff'):
-                        text = text[len(u'\ufeff'):]
-                except UnicodeDecodeError:
-                    text = text.decode('latin1')
+        if isinstance(text, basestring):
+            if self.stripall:
+                text = text.strip()
+            elif self.stripnl:
+                text = text.strip('\n')
+
+            if isinstance(text, unicode):
+                text = StringIO(text.encode('utf-8'))
+                self.encoding = 'utf-8'
             else:
-                text = text.decode(self.encoding)
-        if self.stripall:
-            text = text.strip()
-        elif self.stripnl:
-            text = text.strip('\n')
-        if self.tabsize > 0:
-            text = text.expandtabs(self.tabsize)
-#        if not text.endswith('\n'):
-#            text += '\n'
+                text = StringIO(text)
 
         def streamer():
             for i, t, v in self.get_tokens_unprocessed(text):
@@ -250,7 +259,7 @@
             stream = apply_filters(stream, self.filters, self)
         return stream
 
-    def get_tokens_unprocessed(self, text, stack=('root',)):
+    def get_tokens_unprocessed(self, stream, stack=('root',)):
         """
         Split ``text`` into (tokentype, text) pairs.
 
@@ -261,10 +270,19 @@
         statestack = list(stack)
         statetokens = tokendefs[statestack[-1]]
         known_names = {}
+
+        text = stream.read(self.bufsize)
+        hasmore = len(text) == self.bufsize
+        text = self._decode(text)
+
         while 1:
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, pos)
                 if m:
+                    if hasmore and m.end() == len(text):
+                        # Since this is end, token may be truncated
+                        continue
+
                     # print rex.pattern
                     value = m.group()
                     if value in known_names:
@@ -299,6 +317,13 @@
                     statetokens = tokendefs[statestack[-1]]
                     break
             else:
+                if hasmore:
+                    buf = stream.read(self.bufsize)
+                    hasmore = len(buf) == self.bufsize
+                    text = text[pos:] + self._decode(buf)
+                    pos = 0
+                    continue
+
                 try:
                     if text[pos] == '\n':
                         # at EOL, reset state to "root"
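The substance of the change is in get_tokens_unprocessed: it no longer receives the whole text but a file-like stream, which it reads in bufsize chunks. A regex match that runs exactly to the end of the current buffer is rejected while more input remains, since the token might be truncated at the chunk boundary; when no rule produces a safe match, the for-else branch keeps the unconsumed tail, appends the next decoded chunk, and rescans from position 0. The following is a minimal, self-contained sketch of that match/refill/rescan loop; the rule table and token names are hypothetical stand-ins for the Lexer's compiled state machine, and io.StringIO replaces the patch's Python 2 cStringIO:

    import re
    from io import StringIO

    # Hypothetical rule table standing in for the Lexer's compiled state rules.
    RULES = [
        (re.compile(r'\s+').match, 'Whitespace'),
        (re.compile(r'[0-9]+').match, 'Number'),
        (re.compile(r'[A-Za-z_][A-Za-z0-9_]*').match, 'Name'),
    ]

    def tokenize(stream, bufsize=4096):
        # Read the first chunk; a full read means more data may follow,
        # just like the patch's ``hasmore = len(text) == self.bufsize``.
        text = stream.read(bufsize)
        hasmore = len(text) == bufsize
        pos = 0
        while True:
            for rexmatch, tokentype in RULES:
                m = rexmatch(text, pos)
                if m:
                    if hasmore and m.end() == len(text):
                        # Match runs to the buffer's edge: the token may be
                        # truncated, so skip this rule; if no rule matches
                        # safely, the refill branch below runs.
                        continue
                    yield tokentype, m.group()
                    pos = m.end()
                    break
            else:
                if hasmore:
                    # No safe match: keep the unconsumed tail, append the
                    # next chunk, and rescan from the start of the buffer.
                    buf = stream.read(bufsize)
                    hasmore = len(buf) == bufsize
                    text = text[pos:] + buf
                    pos = 0
                    continue
                if pos >= len(text):
                    return                    # input exhausted
                yield 'Error', text[pos]      # no rule matched this character
                pos += 1

    for ttype, value in tokenize(StringIO('SELECT 42'), bufsize=4):
        print(ttype, repr(value))
    # -> Name 'SELECT', Whitespace ' ', Number '42'

Run with bufsize=4, the Name token that straddles the first chunk boundary ('SELE' / 'CT 4') is only emitted after a refill, which is exactly the behaviour the hasmore check buys.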

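The new _decode helper complements this by decoding each chunk right after it is read. With encoding='guess' it tries UTF-8 first, strips a leading byte-order mark, and falls back to latin-1, which accepts any byte sequence. A rough Python 3 rendering of that fallback chain (the function name is mine; the patch itself works on Python 2 byte strings inside the Lexer class):

    def decode_guess(data):
        # Mirrors the 'guess' branch of the patch's _decode helper: try
        # UTF-8 first and drop a byte-order mark if one survives the decode.
        try:
            text = data.decode('utf-8')
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]
            return text
        except UnicodeDecodeError:
            # latin-1 maps every byte to a code point, so it cannot fail.
            return data.decode('latin1')

    assert decode_guess('café'.encode('utf-8')) == 'café'
    assert decode_guess(b'caf\xe9') == 'café'   # invalid UTF-8 falls back to latin-1

One caveat this sketch shares with the patch: decoding chunk by chunk assumes a multi-byte UTF-8 sequence never straddles a read boundary; if one does, the UTF-8 attempt for that chunk fails and the latin-1 fallback would mangle the character.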