From 13274ebb0d66b64363145bf23b6aa824845f61be Mon Sep 17 00:00:00 2001
From: quest
Date: Sat, 21 Apr 2012 20:18:42 +0200
Subject: Always use a stream internally; makes things more readable.

---
 sqlparse/lexer.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 67dbc29..82a6169 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -16,7 +16,7 @@ import re
 
 from sqlparse import tokens
 from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON
-
+from cStringIO import StringIO
 
 class include(str):
     pass
@@ -239,15 +239,18 @@ class Lexer(object):
         Also preprocess the text, i.e. expand tabs and strip it if wanted and
         applies registered filters.
         """
-        if isinstance(text, str):
-            text = self._decode(text)
-
         if isinstance(text, basestring):
             if self.stripall:
                 text = text.strip()
             elif self.stripnl:
                 text = text.strip('\n')
 
+        if isinstance(text, unicode):
+            text = StringIO(text.encode('utf-8'))
+            self.encoding = 'utf-8'
+        else:
+            text = StringIO(text)
+
         def streamer():
             for i, t, v in self.get_tokens_unprocessed(text):
                 yield t, v
@@ -256,7 +259,7 @@
             stream = apply_filters(stream, self.filters, self)
         return stream
 
-    def get_tokens_unprocessed(self, text, stack=('root',)):
+    def get_tokens_unprocessed(self, stream, stack=('root',)):
         """
         Split ``text`` into (tokentype, text) pairs.
 
@@ -268,10 +271,8 @@
         statetokens = tokendefs[statestack[-1]]
         known_names = {}
 
-        hasmore = False
-        if hasattr(text, 'read'):
-            o, text = text, self._decode(text.read(self.bufsize))
-            hasmore = len(text) == self.bufsize
+        text = self._decode(stream.read(self.bufsize))
+        hasmore = len(text) == self.bufsize
 
         while 1:
             for rexmatch, action, new_state in statetokens:
@@ -315,6 +316,12 @@
                     break
             else:
                 try:
+                    if hasmore:
+                        buf = self._decode(stream.read(self.bufsize))
+                        hasmore = len(buf) == self.bufsize
+                        text = text[pos:] + buf
+                        pos = 0
+                        continue
                     if text[pos] == '\n':
                         # at EOL, reset state to "root"
                         pos += 1
@@ -322,12 +329,6 @@
                         statetokens = tokendefs['root']
                         yield pos, tokens.Text, u'\n'
                         continue
-                    if hasmore:
-                        buf = self._decode(o.read(self.bufsize))
-                        hasmore = len(buf) == self.bufsize
-                        text = text[pos:] + buf
-                        pos = 0
-                        continue
                     yield pos, tokens.Error, text[pos]
                     pos += 1
                 except IndexError:
-- 
cgit v1.2.1
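
The core move in this patch is easiest to see in isolation: wrap any input in a
StringIO so the tokenizer always reads from a file-like object in bufsize-sized
chunks, and whenever matching stalls while more input may remain, splice the
unconsumed tail of the buffer onto the next chunk and restart at the front.
Below is a minimal, standalone sketch of that loop under stated assumptions:
the toy RULES table and all names other than bufsize/hasmore are hypothetical
stand-ins for sqlparse's real token definitions, and it is written for
Python 3 rather than the Python 2 (cStringIO/unicode/basestring) the patch
itself targets.

import re
from io import StringIO

# Hypothetical toy rules; sqlparse's real tokendefs are far richer.
RULES = [
    (re.compile(r'\s+'), 'Whitespace'),
    (re.compile(r'[0-9]+'), 'Number'),
    (re.compile(r'[A-Za-z_][A-Za-z0-9_]*'), 'Name'),
    (re.compile(r'[(),;*=]'), 'Punctuation'),
]

def get_tokens_unprocessed(stream, bufsize=8):
    """Yield (pos, tokentype, value) pairs read from a file-like object.

    Note: as in the patch, pos is an offset into the current buffer,
    not into the stream as a whole.
    """
    text = stream.read(bufsize)
    hasmore = len(text) == bufsize  # a short read means we hit EOF
    pos = 0
    while True:
        for regex, tokentype in RULES:
            m = regex.match(text, pos)
            # A match that runs up to the buffer edge may be truncated
            # ('12' now, '34' in the next chunk), so refill before
            # trusting it.
            if m and not (hasmore and m.end() == len(text)):
                yield pos, tokentype, m.group()
                pos = m.end()
                break
        else:
            if hasmore:
                # Same move as the patch: splice the unconsumed tail onto
                # the next chunk and restart matching at the front.
                buf = stream.read(bufsize)
                hasmore = len(buf) == bufsize
                text = text[pos:] + buf
                pos = 0
                continue
            if pos >= len(text):
                return  # buffer and stream both exhausted
            yield pos, 'Error', text[pos]  # no rule matched this character
            pos += 1

for tok in get_tokens_unprocessed(StringIO('select 42 from dual;')):
    print(tok)

The len(...) == bufsize test is also why neither the patch nor the sketch
needs an explicit EOF flag: any short read (including an empty one) signals
that the stream is exhausted without a separate probing read.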