diff options
Diffstat (limited to 'git')
-rw-r--r-- | git/compat.py | 192 | ||||
m--------- | git/ext/gitdb | 0 | ||||
-rw-r--r-- | git/objects/fun.py | 7 | ||||
-rw-r--r-- | git/test/performance/test_commit.py | 2 | ||||
-rw-r--r-- | git/test/test_fun.py | 18 |
5 files changed, 205 insertions, 14 deletions
diff --git a/git/compat.py b/git/compat.py
index e7243e25..a2403d69 100644
--- a/git/compat.py
+++ b/git/compat.py
@@ -10,6 +10,8 @@
 import locale
 import os
 import sys
+import codecs
+
 
 from gitdb.utils.compat import (
     xrange,
@@ -67,7 +69,7 @@ def safe_decode(s):
     if isinstance(s, unicode):
         return s
     elif isinstance(s, bytes):
-        return s.decode(defenc, 'replace')
+        return s.decode(defenc, 'surrogateescape')
     elif s is not None:
         raise TypeError('Expected bytes or text, but got %r' % (s,))
 
@@ -121,3 +123,191 @@ class UnicodeMixin(object):
     else:  # Python 2
         def __str__(self):
             return self.__unicode__().encode(defenc)
+
+
+"""
+This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
+handler of Python 3.
+Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
+"""
+
+# This code is released under the Python license and the BSD 2-clause license
+
+
+FS_ERRORS = 'surrogateescape'
+
+# # -- Python 2/3 compatibility -------------------------------------
+# FS_ERRORS = 'my_surrogateescape'
+
+def u(text):
+    if PY3:
+        return text
+    else:
+        return text.decode('unicode_escape')
+
+def b(data):
+    if PY3:
+        return data.encode('latin1')
+    else:
+        return data
+
+if PY3:
+    _unichr = chr
+    bytes_chr = lambda code: bytes((code,))
+else:
+    _unichr = unichr
+    bytes_chr = chr
+
+def surrogateescape_handler(exc):
+    """
+    Pure Python implementation of the PEP 383: the "surrogateescape" error
+    handler of Python 3. Undecodable bytes will be replaced by a Unicode
+    character U+DCxx on decoding, and these are translated into the
+    original bytes on encoding.
+    """
+    mystring = exc.object[exc.start:exc.end]
+
+    try:
+        if isinstance(exc, UnicodeDecodeError):
+            # mystring is a byte-string in this case
+            decoded = replace_surrogate_decode(mystring)
+        elif isinstance(exc, UnicodeEncodeError):
+            # In the case of u'\udcc3'.encode('ascii',
+            # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
+            # exception anyway after this function is called, even though I think
+            # it's doing what it should. It seems that the strict encoder is called
+            # to encode the unicode string that this function returns ...
+            decoded = replace_surrogate_encode(mystring)
+        else:
+            raise exc
+    except NotASurrogateError:
+        raise exc
+    return (decoded, exc.end)
+
+
+class NotASurrogateError(Exception):
+    pass
+
+
+def replace_surrogate_encode(mystring):
+    """
+    Returns a (unicode) string, not the more logical bytes, because the codecs
+    register_error functionality expects this.
+    """
+    decoded = []
+    for ch in mystring:
+        # if PY3:
+        #     code = ch
+        # else:
+        code = ord(ch)
+
+        # The following magic comes from Py3.3's Python/codecs.c file:
+        if not 0xDC00 <= code <= 0xDCFF:
+            # Not an escape surrogate: the handler re-raises the original error.
+            raise NotASurrogateError
+        # mybytes = [0xe0 | (code >> 12),
+        #            0x80 | ((code >> 6) & 0x3f),
+        #            0x80 | (code & 0x3f)]
+        # Is this a good idea?
+        if 0xDC00 <= code <= 0xDC7F:
+            decoded.append(_unichr(code - 0xDC00))
+        elif code <= 0xDCFF:
+            decoded.append(_unichr(code - 0xDC00))
+        else:
+            raise NotASurrogateError
+    return str().join(decoded)
+
+
+def replace_surrogate_decode(mybytes):
+    """
+    Returns a (unicode) string
+    """
+    decoded = []
+    for ch in mybytes:
+        # We may be parsing newbytes (in which case ch is an int) or a native
+        # str on Py2
+        if isinstance(ch, int):
+            code = ch
+        else:
+            code = ord(ch)
+        if 0x80 <= code <= 0xFF:
+            decoded.append(_unichr(0xDC00 + code))
+        elif code <= 0x7F:
+            decoded.append(_unichr(code))
+        else:
+            # # It may be a bad byte
+            # # Try swallowing it.
+            # continue
+            # print("RAISE!")
+            raise NotASurrogateError
+    return str().join(decoded)
+
+
+def encodefilename(fn):
+    if FS_ENCODING == 'ascii':
+        # ASCII encoder of Python 2 expects that the error handler returns a
+        # Unicode string encodable to ASCII, whereas our surrogateescape error
+        # handler has to return bytes in 0x80-0xFF range.
+        encoded = []
+        for index, ch in enumerate(fn):
+            code = ord(ch)
+            if code < 128:
+                ch = bytes_chr(code)
+            elif 0xDC80 <= code <= 0xDCFF:
+                ch = bytes_chr(code - 0xDC00)
+            else:
+                raise UnicodeEncodeError(FS_ENCODING,
+                    fn, index, index+1,
+                    'ordinal not in range(128)')
+            encoded.append(ch)
+        return bytes().join(encoded)
+    elif FS_ENCODING == 'utf-8':
+        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
+        # doesn't go through our error handler
+        encoded = []
+        for index, ch in enumerate(fn):
+            code = ord(ch)
+            if 0xD800 <= code <= 0xDFFF:
+                if 0xDC80 <= code <= 0xDCFF:
+                    ch = bytes_chr(code - 0xDC00)
+                    encoded.append(ch)
+                else:
+                    raise UnicodeEncodeError(
+                        FS_ENCODING,
+                        fn, index, index+1, 'surrogates not allowed')
+            else:
+                ch_utf8 = ch.encode('utf-8')
+                encoded.append(ch_utf8)
+        return bytes().join(encoded)
+    else:
+        return fn.encode(FS_ENCODING, FS_ERRORS)
+
+def decodefilename(fn):
+    return fn.decode(FS_ENCODING, FS_ERRORS)
+
+FS_ENCODING = 'ascii'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
+# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
+# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
+
+
+# normalize the filesystem encoding name.
+# For example, we expect "utf-8", not "UTF8".
+FS_ENCODING = codecs.lookup(FS_ENCODING).name
+
+
+def register_surrogateescape():
+    """
+    Registers the surrogateescape error handler on Python 2 (only)
+    """
+    if PY3:
+        return
+    try:
+        codecs.lookup_error(FS_ERRORS)
+    except LookupError:
+        codecs.register_error(FS_ERRORS, surrogateescape_handler)
+
+
+try:
+    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
+except LookupError:
+    register_surrogateescape()
diff --git a/git/ext/gitdb b/git/ext/gitdb
-Subproject 97035c64f429c229629c25becc54ae44dd95e49
+Subproject 38866bc7c4956170c681a62c4508f934ac82646
diff --git a/git/objects/fun.py b/git/objects/fun.py
index 5c0f4819..d5b3f902 100644
--- a/git/objects/fun.py
+++ b/git/objects/fun.py
@@ -2,6 +2,7 @@
 from stat import S_ISDIR
 from git.compat import (
     byte_ord,
+    safe_decode,
     defenc,
     xrange,
     text_type,
@@ -76,11 +77,7 @@ def tree_entries_from_data(data):
         # default encoding for strings in git is utf8
         # Only use the respective unicode object if the byte stream was encoded
         name = data[ns:i]
-        try:
-            name = name.decode(defenc)
-        except UnicodeDecodeError:
-            pass
-        # END handle encoding
+        name = safe_decode(name)
 
         # byte is NULL, get next 20
         i += 1
diff --git a/git/test/performance/test_commit.py b/git/test/performance/test_commit.py
index c60dc2fc..322d3c9f 100644
--- a/git/test/performance/test_commit.py
+++ b/git/test/performance/test_commit.py
@@ -52,7 +52,7 @@ class TestPerformance(TestBigRepoRW):
             # END for each object
         # END for each commit
         elapsed_time = time() - st
-        print("Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )"
+        print("Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )"
               % (nc, no, elapsed_time, no / elapsed_time), file=sys.stderr)
 
     def test_commit_traversal(self):
diff --git a/git/test/test_fun.py b/git/test/test_fun.py
index 3be25e3e..9d436653 100644
--- a/git/test/test_fun.py
+++ b/git/test/test_fun.py
@@ -1,10 +1,8 @@
 from io import BytesIO
-from stat import (
-    S_IFDIR,
-    S_IFREG,
-    S_IFLNK
-)
+from stat import S_IFDIR, S_IFREG, S_IFLNK
+from unittest.case import skipIf
 
+from git.compat import PY3
 from git.index import IndexFile
 from git.index.fun import (
     aggressive_tree_merge
@@ -253,6 +251,12 @@ class TestFun(TestBase):
             assert entries
         # END for each commit
 
-    def test_tree_entries_from_data_with_failing_name_decode(self):
+    @skipIf(PY3, 'odd types returned ... maybe figure it out one day')
+    def test_tree_entries_from_data_with_failing_name_decode_py2(self):
+        r = tree_entries_from_data(b'100644 \x9f\0aaa')
+        assert r == [('aaa', 33188, u'\udc9f')], r
+
+    @skipIf(not PY3, 'odd types returned ... maybe figure it out one day')
+    def test_tree_entries_from_data_with_failing_name_decode_py3(self):
         r = tree_entries_from_data(b'100644 \x9f\0aaa')
-        assert r == [(b'aaa', 33188, b'\x9f')], r
+        assert r == [(b'aaa', 33188, '\udc9f')], r