Decode bytes to unicode in Lexer.get_tokens().

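Raise TypeError if the input is not bytes, unicode, or a file-like object (file, StringIO). Bytes are decoded with the default codec first, falling back to the given encoding (or 'unicode-escape' if none is given). Remove function u(). Add bytes_type to compat. Add tests for non-ASCII input.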
author: Oleg Broytman <phd@phdru.name> 2016-08-31 16:10:35 +0300
committer: Oleg Broytman <phd@phdru.name> 2016-08-31 16:11:22 +0300
commit: 843499915e91e0ee324a0407c78ac6f570806370 (patch)
tree: cda75d04543a02e20bf04ab01c8dfc5e670a269d /sqlparse
parent: b05bc5ab586cb06d89c38e2eee7f77e1d3fc03c5 (diff)
download: sqlparse-843499915e91e0ee324a0407c78ac6f570806370.tar.gz
2 files changed, 18 insertions, 19 deletions
diff --git a/sqlparse/compat.py b/sqlparse/compat.py
index d6a9144..933e0be 100644
--- a/sqlparse/compat.py
+++ b/sqlparse/compat.py
@@ -23,14 +23,10 @@ PY3 = sys.version_info[0] == 3
 
 
 if PY3:
-    def u(s, encoding=None):
-        return str(s)
-
-
     def unicode_compatible(cls):
         return cls
 
-
+    bytes_type = bytes
     text_type = str
     string_types = (str,)
     from io import StringIO
@@ -38,20 +34,12 @@ if PY3:
 
 
 elif PY2:
-    def u(s, encoding=None):
-        encoding = encoding or 'unicode-escape'
-        try:
-            return unicode(s)
-        except UnicodeDecodeError:
-            return unicode(s, encoding)
-
-
     def unicode_compatible(cls):
         cls.__unicode__ = cls.__str__
         cls.__str__ = lambda x: x.__unicode__().encode('utf-8')
         return cls
 
-
+    bytes_type = str
     text_type = unicode
     string_types = (str, unicode,)
     from StringIO import StringIO
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index e7996b2..15a9aef 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -14,7 +14,7 @@
 
 from sqlparse import tokens
 from sqlparse.keywords import SQL_REGEX
-from sqlparse.compat import file_types, string_types, u
+from sqlparse.compat import bytes_type, text_type, file_types
 from sqlparse.utils import consume
 
 
@@ -37,10 +37,21 @@ class Lexer(object):
 
         ``stack`` is the inital stack (default: ``['root']``)
         """
-        if isinstance(text, string_types):
-            text = u(text, encoding)
-        elif isinstance(text, file_types):
-            text = u(text.read(), encoding)
+        if isinstance(text, file_types):
+            text = text.read()
+
+        if isinstance(text, text_type):
+            pass
+        elif isinstance(text, bytes_type):
+            try:
+                text = text.decode()
+            except UnicodeDecodeError:
+                if not encoding:
+                    encoding = 'unicode-escape'
+                text = text.decode(encoding)
+        else:
+            raise TypeError(u"Expected text or file-like object, got {!r}".
+                            format(type(text)))
 
         iterable = enumerate(text)
         for pos, char in iterable:
author	Oleg Broytman <phd@phdru.name>	2016-08-31 16:10:35 +0300
committer	Oleg Broytman <phd@phdru.name>	2016-08-31 16:11:22 +0300
commit	843499915e91e0ee324a0407c78ac6f570806370 (patch)
tree	cda75d04543a02e20bf04ab01c8dfc5e670a269d /sqlparse
parent	b05bc5ab586cb06d89c38e2eee7f77e1d3fc03c5 (diff)
download	sqlparse-843499915e91e0ee324a0407c78ac6f570806370.tar.gz