Diffstat (limited to 'lib/git/odb/utils.py')
-rw-r--r--  lib/git/odb/utils.py  147
1 file changed, 145 insertions(+), 2 deletions(-)
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
index d88dca1a..8a054201 100644
--- a/lib/git/odb/utils.py
+++ b/lib/git/odb/utils.py
@@ -1,7 +1,10 @@
import binascii
import os
import zlib
+from cStringIO import StringIO
from git.utils import make_sha
+import errno
+from fun import chunk_size
__all__ = ('FDCompressedSha1Writer', 'DecompressMemMapReader')
@@ -21,8 +24,10 @@ def to_bin_sha(sha):
return sha
return hex_to_bin(sha)
+# errors
+ENOENT = errno.ENOENT
+
# os shortcuts
-getsize = os.path.getsize
exists = os.path.exists
mkdir = os.mkdir
isdir = os.path.isdir
@@ -32,6 +37,11 @@ join = os.path.join
read = os.read
write = os.write
close = os.close
+
+# ZLIB configuration
+# used when compressing objects
+Z_BEST_SPEED = 1
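+# same value as zlib.Z_BEST_SPEED: level 1 trades compression ratio for speed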
+
#} END Routines
@@ -50,7 +60,7 @@ class FDCompressedSha1Writer(object):
def __init__(self, fd):
self.fd = fd
self.sha1 = make_sha("")
- self.zip = zlib.compressobj()
+ self.zip = zlib.compressobj(Z_BEST_SPEED)
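+ # level 1 rather than the zlib default (level 6): favor speed when writing objects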
def write(self, data):
""":raise IOError: If not all bytes could be written
@@ -76,4 +86,137 @@ class FDCompressedSha1Writer(object):
return close(self.fd)
+class DecompressMemMapReader(object):
+ """Reads data in chunks from a memory map and decompresses it. The client sees
+ only the uncompressed data, respective file-like read calls are handling on-demand
+ buffered decompression accordingly
+
+ A constraint on the total number of bytes is enforced, simulating
+ a logical file within a possibly larger physical memory area
+
+ To read efficiently, you clearly don't want to read individual bytes; instead,
+ read at least a few kilobytes.
+
+ :note: The chunk-size should be carefully selected, as it will involve quite a bit
+ of string copying due to the way zlib is implemented. It's very wasteful,
+ hence we try to find a good tradeoff between allocation time and the number of
+ times we actually allocate. A custom zlib implementation would be good here
+ to better support streamed reading - it would only need to keep the mmap
+ and decompress it into chunks, that's all ... """
+ __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_cs', '_close')
+
+ def __init__(self, m, close_on_deletion, cs = 128*1024):
+ """Initialize with mmap and chunk_size for stream reading"""
+ self._m = m
+ self._zip = zlib.decompressobj()
+ self._buf = None # buffer of decompressed bytes
+ self._buflen = 0 # length of bytes in buffer
+ self._s = 0 # size of uncompressed data to read in total
+ self._br = 0 # num uncompressed bytes read
+ self._cws = 0 # start byte of compression window
+ self._cwe = 0 # end byte of compression window
+ self._cs = cs # chunk size (when reading from zip)
+ self._close = close_on_deletion # close the memmap on deletion ?
+
+ def __del__(self):
+ if self._close:
+ self._m.close()
+ # END handle resource freeing
+
+ def initialize(self, size=0):
+ """Initialize this instance for acting as a read-only stream for size bytes.
+ :param size: size in bytes to be decompressed before being depleted.
+ If 0, the object header is parsed from the data, and a tuple of
+ (type_string, uncompressed_size) is returned.
+ If not 0, the given size will be used, and None is returned.
+ :note: must be called exactly once"""
+ if size:
+ self._s = size
+ return
+ # END handle size
+
+ # read header
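+ # a loose object begins with the header "<type> <size>\0", e.g. "blob 42\0"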
+ maxb = 8192
+ self._s = maxb
+ hdr = self.read(maxb)
+ hdrend = hdr.find("\0")
+ type_string, size = hdr[:hdrend].split(" ")
+ self._s = size = int(size) # the size was parsed as a string
+
+ # adjust internal state to account for the header bytes, which we skip over.
+ # The buffer will be depleted first on future reads
+ self._br = 0
+ hdrend += 1 # count terminating \0
+ self._buf = StringIO(hdr[hdrend:])
+ self._buflen = len(hdr) - hdrend
+
+ return type_string, size
+
+ def read(self, size=-1):
+ if size < 1:
+ size = self._s - self._br
+ else:
+ size = min(size, self._s - self._br)
+ # END clamp size
+
+ if size == 0:
+ return str()
+ # END handle depletion
+
+ # deplete the buffer, then just continue using the decompress object,
+ # which has a buffer of its own. We only need ours to transparently parse the
+ # header from the zlib stream
+ dat = str()
+ if self._buf:
+ if self._buflen >= size:
+ # have enough data
+ dat = self._buf.read(size)
+ self._buflen -= size
+ self._br += size
+ return dat
+ else:
+ dat = self._buf.getvalue() # ouch, duplicates data
+ size -= self._buflen
+ self._br += self._buflen
+
+ self._buflen = 0
+ self._buf = None
+ # END handle buffer len
+ # END handle buffer
+
+ # decompress some data
+ # Abstract: zlib needs to operate on chunks of our memory map (which may
+ # be large), as it will otherwise always fill in the 'unconsumed_tail'
+ # attribute, which may span our whole map to the end, forcing
+ # everything to be read from disk even though just a portion was requested.
+ # As this would be a no-go, we work around it by passing only chunks of data,
+ # moving the window into the memory map along as we decompress, which keeps
+ # the tail smaller than our chunk-size. This causes 'only' the chunk to be
+ # copied once, and another copy of a part of it when it creates the unconsumed
+ # tail. We have to use the tail to hand in the appropriate amount of bytes during
+ # the next read.
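+ # For illustration, the zlib behaviour relied upon here (a sketch; 'chunk'
+ # stands for some compressed input bytes):
+ #   do = zlib.decompressobj()
+ #   out = do.decompress(chunk, 10) # yields at most 10 decompressed bytes
+ #   do.unconsumed_tail # input that was not consumed due to the size limit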
+ if self._zip.unconsumed_tail:
+ # move the window, making it as large as size demands. For code clarity,
+ # we just take the chunk from our map again instead of reusing the unconsumed
+ # tail. The latter would save some memory copying, but we could end up
+ # not getting enough data uncompressed, so we would have to sort that out as well.
+ # Now we just assume the worst case, i.e. the data is uncompressed, and the window
+ # needs to be as large as the number of uncompressed bytes we want to read.
+ self._cws = self._cwe - len(self._zip.unconsumed_tail)
+ self._cwe = self._cws + size
+ indata = self._m[self._cws:self._cwe] # another copy ... :(
+ else:
+ # the previous window was fully consumed, so the new one starts at its end
+ self._cws = self._cwe
+ self._cwe = self._cws + size
+ indata = self._m[self._cws:self._cwe] # ... copy it again :(
+ # END handle tail
+
+ dcompdat = self._zip.decompress(indata, size)
+ self._br += len(dcompdat)
+
+ if dat:
+ return dat + dcompdat
+ return dcompdat
+
#} END classes
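
A minimal usage sketch of the new reader, for illustration (the object path is
hypothetical, and the mmap is assumed to start at the beginning of a
zlib-compressed loose object):

    import os
    import mmap
    from git.odb.utils import DecompressMemMapReader

    path = '.git/objects/ab/cd'  # hypothetical loose-object file
    fd = os.open(path, os.O_RDONLY)
    m = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
    os.close(fd)  # the mmap keeps its own reference to the file

    reader = DecompressMemMapReader(m, close_on_deletion=True)
    type_string, size = reader.initialize()  # parses the "<type> <size>\0" header
    data = reader.read(8192)  # decompressed bytes, buffered on demand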