summary | refs | log | tree | commit | diff
path: root/sqlparse
diff options
context:
space:
mode:
author: Michael Schuller <michael.schuller@artlogic.net> 2014-03-07 18:01:41 +0000
committer: Michael Schuller <michael.schuller@artlogic.net> 2014-03-10 12:20:38 +0000
commit: 480e52fddf28fad591f3214ee28c2d2af8842ce1 (patch)
tree: ad9fe3c141c22113769a7f50b588f1d8c6156819 /sqlparse
parent: ff7ba6404342898616be24115f7be4744520289d (diff)
download: sqlparse-480e52fddf28fad591f3214ee28c2d2af8842ce1.tar.gz
Fix SerializerUnicode to split unquoted newlines
This fixes issue #131. The `split_unquoted_newlines()` function added to the utils module handles the splitting by iterating once over the string passed in and splitting on each unquoted CR, LF, or CR+LF sequence as it is found.
Diffstat (limited to 'sqlparse')
-rw-r--r-- sqlparse/filters.py 7
-rw-r--r-- sqlparse/utils.py 43
2 files changed, 46 insertions, 4 deletions
diff --git a/sqlparse/filters.py b/sqlparse/filters.py
index 40caf51..5a613a0 100644
--- a/sqlparse/filters.py
+++ b/sqlparse/filters.py
@@ -11,6 +11,7 @@ from sqlparse.pipeline import Pipeline
from sqlparse.tokens import (Comment, Comparison, Keyword, Name, Punctuation,
String, Whitespace)
from sqlparse.utils import memoize_generator
+from sqlparse.utils import split_unquoted_newlines
# --------------------------
@@ -534,10 +535,8 @@ class SerializerUnicode:
def process(self, stack, stmt):
raw = unicode(stmt)
- add_nl = raw.endswith('\n')
- res = '\n'.join(line.rstrip() for line in raw.splitlines())
- if add_nl:
- res += '\n'
+ lines = split_unquoted_newlines(raw)
+ res = '\n'.join(line.rstrip() for line in lines)
return res
diff --git a/sqlparse/utils.py b/sqlparse/utils.py
index cdf27b1..2a7fb46 100644
--- a/sqlparse/utils.py
+++ b/sqlparse/utils.py
@@ -94,3 +94,46 @@ def memoize_generator(func):
yield item
return wrapped_func
+
+def split_unquoted_newlines(text):
+ """Split a string on all unquoted newlines
+
+ This is a fairly simplistic implementation of splitting a string on all
+ unescaped CR, LF, or CR+LF occurrences. Only iterates the string once. Seemed
+ easier than a complex regular expression.
+ """
+ lines = ['']
+ quoted = None
+ escape_next = False
+ last_char = None
+ for c in text:
+ escaped = False
+ # If the previous character was an unescaped '\', this character is
+ # escaped.
+ if escape_next:
+ escaped = True
+ escape_next = False
+ # If the current character is '\' and it is not escaped, the next
+ # character is escaped.
+ if c == '\\' and not escaped:
+ escape_next = True
+ # Start a quoted portion if a) we aren't in one already, and b) the
+ # quote isn't escaped.
+ if c in '"\'' and not escaped and not quoted:
+ quoted = c
+ # Escaped quotes (obvs) don't count as a closing match.
+ elif c == quoted and not escaped:
+ quoted = None
+
+ if not quoted and c in ['\r', '\n']:
+ if c == '\n' and last_char == '\r':
+ # It's a CR+LF, so don't append another line
+ pass
+ else:
+ lines.append('')
+ else:
+ lines[-1] += c
+
+ last_char = c
+
+ return lines
\ No newline at end of file