Add encoding parameter to top-level functions (fixes issue20).

author: Andi Albrecht <albrecht.andi@gmail.com> 2013-04-04 05:54:43 +0200
committer: Andi Albrecht <albrecht.andi@gmail.com> 2013-04-04 05:54:43 +0200
commit: 081b23757c1a534baf42f7c099bab471bf20fe83 (patch)
tree: f751d0eb5f754117bcdf6767cf2600b1f67def62
parent: e664ae1da02f87f720878c7699cc26d0a8e9e659 (diff)
download: sqlparse-081b23757c1a534baf42f7c099bab471bf20fe83.tar.gz
7 files changed, 49 insertions, 18 deletions
diff --git a/CHANGES b/CHANGES
index ac3d26c..bca59c1 100644
--- a/CHANGES
+++ b/CHANGES
@@ -10,6 +10,11 @@ Bug Fixes
 
 Enhancements
  * Improve parsing speed when SQL contains CLOBs or BLOBs (issue86).
+ * Top-level API functions now accept an encoding keyword to parse
+   statements in certain encodings more reliably (issue20).
+
+Other
+ * Documentation updates.
 
 
 Release 0.1.6 (Jan 01, 2013)
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 2531c9b..99e50e2 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -12,6 +12,10 @@ The :mod:`sqlparse` module provides the following functions on module-level.
 
 .. autofunction:: sqlparse.parse
 
+In most cases there's no need to set the `encoding` parameter. If
+`encoding` is not set, sqlparse assumes that the given SQL statement
+is encoded either in utf-8 or latin-1.
+
 
 .. _formatting:
 
diff --git a/sqlparse/__init__.py b/sqlparse/__init__.py
index b338350..e4de928 100644
--- a/sqlparse/__init__.py
+++ b/sqlparse/__init__.py
@@ -18,24 +18,26 @@ from sqlparse import formatter
 from sqlparse.exceptions import SQLParseError
 
 
-def parse(sql):
+def parse(sql, encoding=None):
     """Parse sql and return a list of statements.
 
-    *sql* is a single string containting one or more SQL statements.
-
-    Returns a tuple of :class:`~sqlparse.sql.Statement` instances.
+    :param sql: A string containing one or more SQL statements.
+    :param encoding: The encoding of the statement (optional).
+    :returns: A tuple of :class:`~sqlparse.sql.Statement` instances.
     """
-    return tuple(parsestream(sql))
+    return tuple(parsestream(sql, encoding))
 
 
-def parsestream(stream):
+def parsestream(stream, encoding=None):
     """Parses sql statements from file-like object.
 
-    Returns a generator of Statement instances.
+    :param stream: A file-like object.
+    :param encoding: The encoding of the stream contents (optional).
+    :returns: A generator of :class:`~sqlparse.sql.Statement` instances.
     """
     stack = engine.FilterStack()
     stack.full_analyze()
-    return stack.run(stream)
+    return stack.run(stream, encoding)
 
 
 def format(sql, **options):
@@ -43,23 +45,29 @@ def format(sql, **options):
 
     Available options are documented in :ref:`formatting`.
 
-    Returns the formatted SQL statement as string.
+    In addition to the formatting options this function accepts the
+    keyword "encoding" which determines the encoding of the statement.
+
+    :returns: The formatted SQL statement as string.
     """
+    encoding = options.pop('encoding', None)
     stack = engine.FilterStack()
     options = formatter.validate_options(options)
     stack = formatter.build_filter_stack(stack, options)
     stack.postprocess.append(filters.SerializerUnicode())
-    return ''.join(stack.run(sql))
+    return ''.join(stack.run(sql, encoding))
 
 
-def split(sql):
+def split(sql, encoding=None):
     """Split *sql* into single statements.
 
-    Returns a list of strings.
+    :param sql: A string containing one or more SQL statements.
+    :param encoding: The encoding of the statement (optional).
+    :returns: A list of strings.
     """
     stack = engine.FilterStack()
     stack.split_statements = True
-    return [unicode(stmt) for stmt in stack.run(sql)]
+    return [unicode(stmt) for stmt in stack.run(sql, encoding)]
 
 
 from sqlparse.engine.filter import StatementFilter
diff --git a/sqlparse/engine/__init__.py b/sqlparse/engine/__init__.py
index 3e2822b..62c82b8 100644
--- a/sqlparse/engine/__init__.py
+++ b/sqlparse/engine/__init__.py
@@ -36,8 +36,8 @@ class FilterStack(object):
     def full_analyze(self):
         self.enable_grouping()
 
-    def run(self, sql):
-        stream = lexer.tokenize(sql)
+    def run(self, sql, encoding=None):
+        stream = lexer.tokenize(sql, encoding)
         # Process token stream
         if self.preprocess:
             for filter_ in self.preprocess:
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index e769d7b..4d200a6 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -224,7 +224,8 @@ class Lexer(object):
 
     def _decode(self, text):
         if sys.version_info[0] == 3:
-            return text
+            if isinstance(text, str):
+                return text
         if self.encoding == 'guess':
             try:
                 text = text.decode('utf-8')
@@ -355,11 +356,13 @@ class Lexer(object):
                     break
 
 
-def tokenize(sql):
+def tokenize(sql, encoding=None):
     """Tokenize sql.
 
     Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
     of ``(token type, value)`` items.
     """
     lexer = Lexer()
+    if encoding is not None:
+        lexer.encoding = encoding
     return lexer.get_tokens(sql)
diff --git a/tests/files/test_cp1251.sql b/tests/files/test_cp1251.sql
new file mode 100644
index 0000000..6c0228b
--- /dev/null
+++ b/tests/files/test_cp1251.sql
@@ -0,0 +1 @@
+insert into foo values (1); -- Песня про надежду
diff --git a/tests/test_regressions.py b/tests/test_regressions.py
index 94d644f..e9d890b 100644
--- a/tests/test_regressions.py
+++ b/tests/test_regressions.py
@@ -2,7 +2,7 @@
 
 import sys
 
-from tests.utils import TestCaseBase
+from tests.utils import TestCaseBase, load_file
 
 import sqlparse
 from sqlparse import sql
@@ -188,3 +188,13 @@ def test_dont_alias_keywords():
     assert len(p.tokens) == 5
     assert p.tokens[0].ttype is T.Keyword
     assert p.tokens[2].ttype is T.Keyword
+
+
+def test_format_accepts_encoding():  # issue20
+    sql = load_file('test_cp1251.sql', 'cp1251')
+    formatted = sqlparse.format(sql, reindent=True, encoding='cp1251')
+    if sys.version_info < (3,):
+        tformatted = u'insert into foo\nvalues (1); -- Песня про надежду\n'
+    else:
+        tformatted = 'insert into foo\nvalues (1); -- Песня про надежду\n'
+    assert formatted == tformatted
author	Andi Albrecht <albrecht.andi@gmail.com>	2013-04-04 05:54:43 +0200
committer	Andi Albrecht <albrecht.andi@gmail.com>	2013-04-04 05:54:43 +0200
commit	081b23757c1a534baf42f7c099bab471bf20fe83 (patch)
tree	f751d0eb5f754117bcdf6767cf2600b1f67def62
parent	e664ae1da02f87f720878c7699cc26d0a8e9e659 (diff)
download	sqlparse-081b23757c1a534baf42f7c099bab471bf20fe83.tar.gz