Diffstat (limited to 'sqlparse/lexer.py')
-rw-r--r--  sqlparse/lexer.py | 63
1 files changed, 44 insertions, 19 deletions
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 321669d..dc794ab 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -16,7 +16,7 @@ import re
from sqlparse import tokens
from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON
-
+from cStringIO import StringIO
class include(str):
pass
@@ -159,6 +159,7 @@ class Lexer(object):
stripnl = False
tabsize = 0
flags = re.IGNORECASE
+ bufsize = 4096
tokens = {
'root': [
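
bufsize is a plain class attribute, so the chunk size of the new buffered
reader is tunable per subclass. A minimal sketch (Python 2, matching this
codebase); SmallBufferLexer is an illustrative name, not part of the commit:

    from sqlparse.lexer import Lexer

    class SmallBufferLexer(Lexer):
        # Tiny chunks force frequent buffer refills; handy for testing
        # the truncated-token handling below.
        bufsize = 64
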
@@ -214,6 +215,21 @@ class Lexer(object):
filter_ = filter_(**options)
self.filters.append(filter_)
+ def _decode(self, text):
+ if self.encoding == 'guess':
+ try:
+ text = text.decode('utf-8')
+ if text.startswith(u'\ufeff'):
+ text = text[len(u'\ufeff'):]
+ except UnicodeDecodeError:
+ text = text.decode('latin1')
+ else:
+ text = text.decode(self.encoding)
+
+ if self.tabsize > 0:
+ text = text.expandtabs(self.tabsize)
+ return text
+
def get_tokens(self, text, unfiltered=False):
"""
Return an iterable of (tokentype, value) pairs generated from
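
The new _decode helper centralizes charset handling: with encoding='guess'
it tries strict UTF-8 first, strips a leading byte-order mark, and falls
back to latin1, which accepts any byte sequence. A standalone sketch of that
branch (Python 2; guess_decode is an illustrative name):

    def guess_decode(data):
        # Try strict UTF-8 first; drop a byte-order mark if present.
        try:
            text = data.decode('utf-8')
            if text.startswith(u'\ufeff'):
                text = text[len(u'\ufeff'):]
        except UnicodeDecodeError:
            # latin1 maps every byte to a code point, so this cannot fail.
            text = data.decode('latin1')
        return text

    assert guess_decode('\xef\xbb\xbfSELECT 1') == u'SELECT 1'  # BOM stripped
    assert guess_decode('caf\xe9') == u'caf\xe9'                # latin1 fallback
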
@@ -223,24 +239,17 @@ class Lexer(object):
Also preprocess the text, i.e. expand tabs and strip it if
wanted, and apply registered filters.
"""
- if not isinstance(text, unicode):
- if self.encoding == 'guess':
- try:
- text = text.decode('utf-8')
- if text.startswith(u'\ufeff'):
- text = text[len(u'\ufeff'):]
- except UnicodeDecodeError:
- text = text.decode('latin1')
+ if isinstance(text, basestring):
+ if self.stripall:
+ text = text.strip()
+ elif self.stripnl:
+ text = text.strip('\n')
+
+ if isinstance(text, unicode):
+ text = StringIO(text.encode('utf-8'))
+ self.encoding = 'utf-8'
else:
- text = text.decode(self.encoding)
- if self.stripall:
- text = text.strip()
- elif self.stripnl:
- text = text.strip('\n')
- if self.tabsize > 0:
- text = text.expandtabs(self.tabsize)
-# if not text.endswith('\n'):
-# text += '\n'
+ text = StringIO(text)
def streamer():
for i, t, v in self.get_tokens_unprocessed(text):
@@ -250,7 +259,7 @@ class Lexer(object):
stream = apply_filters(stream, self.filters, self)
return stream
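
With this change get_tokens() normalizes its input to a byte stream before
handing it to get_tokens_unprocessed(): unicode text is re-encoded as UTF-8,
plain byte strings are wrapped as-is. A hypothetical helper mirroring that
logic (Python 2; to_stream is not part of the commit):

    from cStringIO import StringIO

    def to_stream(text, encoding):
        # Unicode input is re-encoded so the buffered reader always sees bytes.
        if isinstance(text, unicode):
            return StringIO(text.encode('utf-8')), 'utf-8'
        return StringIO(text), encoding

    stream, enc = to_stream(u'SELECT * FROM foo', 'guess')
    print stream.read()  # -> 'SELECT * FROM foo'
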
- def get_tokens_unprocessed(self, text, stack=('root',)):
+ def get_tokens_unprocessed(self, stream, stack=('root',)):
"""
Split ``text`` into (tokentype, text) pairs.
@@ -261,10 +270,19 @@ class Lexer(object):
statestack = list(stack)
statetokens = tokendefs[statestack[-1]]
known_names = {}
+
+ text = stream.read(self.bufsize)
+ hasmore = len(text) == self.bufsize
+ text = self._decode(text)
+
while 1:
for rexmatch, action, new_state in statetokens:
m = rexmatch(text, pos)
if m:
+ if hasmore and m.end() == len(text):
+ # Match reached the end of this buffer; the token may be truncated
+ continue
+
# print rex.pattern
value = m.group()
if value in known_names:
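
The buffered loop above reads bufsize bytes at a time and refuses to trust a
match that runs to the very end of the current chunk, since the token might
continue in the next one. A self-contained sketch of that pattern (Python 2;
names are illustrative, not the commit's):

    import re
    from cStringIO import StringIO

    def chunked_tokens(stream, pattern, bufsize=8):
        rex = re.compile(pattern)
        text = stream.read(bufsize)
        hasmore = len(text) == bufsize
        pos = 0
        while pos < len(text) or hasmore:
            m = rex.match(text, pos)
            if m and not (hasmore and m.end() == len(text)):
                # Safe to emit: the stream is done or the match ends mid-buffer.
                yield m.group()
                pos = m.end()
            elif hasmore:
                # Possible truncation (or no match yet): pull in more input.
                buf = stream.read(bufsize)
                hasmore = len(buf) == bufsize
                text = text[pos:] + buf
                pos = 0
            else:
                pos += 1  # stream exhausted and nothing matched here; skip

    print list(chunked_tokens(StringIO('alpha beta gamma'), r'\w+|\s+'))
    # -> ['alpha', ' ', 'beta', ' ', 'gamma']
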
@@ -299,6 +317,13 @@ class Lexer(object):
statetokens = tokendefs[statestack[-1]]
break
else:
+ if hasmore:
+ buf = stream.read(self.bufsize)
+ hasmore = len(buf) == self.bufsize
+ text = text[pos:] + self._decode(buf)
+ pos = 0
+ continue
+
try:
if text[pos] == '\n':
# at EOL, reset state to "root"
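
Taken together, the lexer now accepts byte strings or unicode and tokenizes
them in bufsize chunks, refilling whenever a rule fails to match at the end
of a buffer. A usage sketch, assuming a default-constructed Lexer as used
elsewhere in sqlparse (Python 2):

    from sqlparse.lexer import Lexer

    lexer = Lexer()
    for tokentype, value in lexer.get_tokens('SELECT * FROM "foo";'):
        print tokentype, repr(value)
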