From 480e52fddf28fad591f3214ee28c2d2af8842ce1 Mon Sep 17 00:00:00 2001 From: Michael Schuller Date: Fri, 7 Mar 2014 18:01:41 +0000 Subject: Fix SerializerUnicode to split unquoted newlines This provides a fix to issue #131. The `split_unquoted_newlines()` function added to the utils module handles the splitting of the string by performing a simple iteration of the string passed in and splitting on unquoted CR, LF, or CR+LFs as they are found. --- sqlparse/filters.py | 7 +++---- sqlparse/utils.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/filters.py b/sqlparse/filters.py index 40caf51..5a613a0 100644 --- a/sqlparse/filters.py +++ b/sqlparse/filters.py @@ -11,6 +11,7 @@ from sqlparse.pipeline import Pipeline from sqlparse.tokens import (Comment, Comparison, Keyword, Name, Punctuation, String, Whitespace) from sqlparse.utils import memoize_generator +from sqlparse.utils import split_unquoted_newlines # -------------------------- @@ -534,10 +535,8 @@ class SerializerUnicode: def process(self, stack, stmt): raw = unicode(stmt) - add_nl = raw.endswith('\n') - res = '\n'.join(line.rstrip() for line in raw.splitlines()) - if add_nl: - res += '\n' + lines = split_unquoted_newlines(raw) + res = '\n'.join(line.rstrip() for line in lines) return res diff --git a/sqlparse/utils.py b/sqlparse/utils.py index cdf27b1..2a7fb46 100644 --- a/sqlparse/utils.py +++ b/sqlparse/utils.py @@ -94,3 +94,46 @@ def memoize_generator(func): yield item return wrapped_func + +def split_unquoted_newlines(text): + """Split a string on all unquoted newlines + + This is a fairly simplistic implementation of splitting a string on all + unescaped CR, LF, or CR+LF occurrences. Only iterates the string once. Seemed + easier than a complex regular expression. 
+ """ + lines = [''] + quoted = None + escape_next = False + last_char = None + for c in text: + escaped = False + # If the previous character was an unescaped '\', this character is + # escaped. + if escape_next: + escaped = True + escape_next = False + # If the current character is '\' and it is not escaped, the next + # character is escaped. + if c == '\\' and not escaped: + escape_next = True + # Start a quoted portion if a) we aren't in one already, and b) the + # quote isn't escaped. + if c in '"\'' and not escaped and not quoted: + quoted = c + # Escaped quotes (obvs) don't count as a closing match. + elif c == quoted and not escaped: + quoted = None + + if not quoted and c in ['\r', '\n']: + if c == '\n' and last_char == '\r': + # It's a CR+LF, so don't append another line + pass + else: + lines.append('') + else: + lines[-1] += c + + last_char = c + + return lines \ No newline at end of file -- cgit v1.2.1