author    Sebastian Thiel <byronimo@gmail.com>  2010-06-04 17:30:31 +0200
committer Sebastian Thiel <byronimo@gmail.com>  2010-06-04 17:30:31 +0200
commit    6fbb69306c0e14bacb8dcb92a89af27d3d5d631f (patch)
tree      4a6e0c14b412315c13cc4ac6466f4888644195a3 /lib/git/odb
parent    25dca42bac17d511b7e2ebdd9d1d679e7626db5f (diff)
parent    e746f96bcc29238b79118123028ca170adc4ff0f (diff)
download  gitpython-6fbb69306c0e14bacb8dcb92a89af27d3d5d631f.tar.gz
Merge branch 'odb'

Conflicts:
	lib/git/cmd.py
Diffstat (limited to 'lib/git/odb')
-rw-r--r--  lib/git/odb/__init__.py    6
-rw-r--r--  lib/git/odb/db.py        337
-rw-r--r--  lib/git/odb/fun.py       108
-rw-r--r--  lib/git/odb/stream.py    446
-rw-r--r--  lib/git/odb/utils.py      38
5 files changed, 935 insertions, 0 deletions
diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py
new file mode 100644
index 00000000..5789d7eb
--- /dev/null
+++ b/lib/git/odb/__init__.py
@@ -0,0 +1,6 @@
+"""Initialize the object database module"""
+
+# default imports
+from db import *
+from stream import *
+
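The wildcard imports above make the database and stream types available directly from the package. A minimal usage sketch, assuming GitPython's layout so that lib/git/odb imports as git.odb, and using a purely hypothetical repository path::

    from git.odb import LooseObjectDB, IStream

    # hypothetical path - point this at an existing .git/objects directory
    odb = LooseObjectDB("/path/to/repo/.git/objects")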
diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py
new file mode 100644
index 00000000..a8de28ec
--- /dev/null
+++ b/lib/git/odb/db.py
@@ -0,0 +1,337 @@
+"""Contains implementations of database retrieveing objects"""
+from git.utils import IndexFileSHA1Writer
+from git.errors import (
+ InvalidDBRoot,
+ BadObject,
+ BadObjectType
+ )
+
+from stream import (
+ DecompressMemMapReader,
+ FDCompressedSha1Writer,
+ Sha1Writer,
+ OStream,
+ OInfo
+ )
+
+from utils import (
+ ENOENT,
+ to_hex_sha,
+ exists,
+ hex_to_bin,
+ isdir,
+ mkdir,
+ rename,
+ dirname,
+ join
+ )
+
+from fun import (
+ chunk_size,
+ loose_object_header_info,
+ write_object
+ )
+
+import tempfile
+import mmap
+import os
+
+
+__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'LooseObjectDB', 'PackedDB',
+ 'CompoundDB', 'ReferenceDB', 'GitObjectDB' )
+
+class ObjectDBR(object):
+ """Defines an interface for object database lookup.
+ Objects are identified either by hex-sha (40 bytes) or
+ by sha (20 bytes)"""
+
+ def __contains__(self, sha):
+ return self.has_object(sha)
+
+ #{ Query Interface
+ def has_object(self, sha):
+ """
+ :return: True if the object identified by the given 40 byte hexsha or 20 byte
+ binary sha is contained in the database
+ :raise BadObject:"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def info(self, sha):
+ """ :return: OInfo instance
+ :param sha: 40 byte hexsha or 20 byte binary sha
+ :raise BadObject:"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def info_async(self, input_channel):
+ """Retrieve information of a multitude of objects asynchronously
+ :param input_channel: Channel yielding the sha's of the objects of interest
+ :return: Channel yielding OInfo|InvalidOInfo, in any order"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def stream(self, sha):
+ """:return: OStream instance
+ :param sha: 40 byte hexsha or 20 byte binary sha
+ :raise BadObject:"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def stream_async(self, input_channel):
+ """Retrieve the OStream of multiple objects
+ :param input_channel: see ``info``
+ :param max_threads: see ``ObjectDBW.store``
+ :return: Channel yielding OStream|InvalidOStream instances in any order"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ #} END query interface
+
+class ObjectDBW(object):
+ """Defines an interface to create objects in the database"""
+
+ def __init__(self, *args, **kwargs):
+ self._ostream = None
+
+ #{ Edit Interface
+ def set_ostream(self, stream):
+ """Adjusts the stream to which all data should be sent when storing new objects
+ :param stream: if not None, the stream to use, if None the default stream
+ will be used.
+ :return: previously installed stream, or None if there was no override
+ :raise TypeError: if the stream doesn't have the supported functionality"""
+ cstream = self._ostream
+ self._ostream = stream
+ return cstream
+
+ def ostream(self):
+ """:return: overridden output stream this instance will write to, or None
+ if it will write to the default stream"""
+ return self._ostream
+
+ def store(self, istream):
+ """Create a new object in the database
+ :return: the input istream object with its sha set to its corresponding value
+ :param istream: IStream compatible instance. If its sha is already set
+ to a value, the object will just be stored in our database format,
+ in which case the input stream is expected to be in object format ( header + contents ).
+ :raise IOError: if data could not be written"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def store_async(self, input_channel):
+ """Create multiple new objects in the database asynchronously. The method will
+ return right away, returning an output channel which receives the results as
+ they are computed.
+
+ :return: Channel yielding your IStream which served as input, in any order.
+ The IStream's sha will be set to the sha it received during the process,
+ or its error attribute will be set to the exception informing about the error.
+ :param input_channel: Channel yielding IStream instances.
+ As the same instances will be used in the output channel, you can create a map
+ between id(istream) -> istream
+ :note: As some ODB implementations implement this operation atomically, they might
+ abort the whole operation if one item could not be processed. Hence check how
+ many items have actually been produced."""
+ raise NotImplementedError("To be implemented in subclass")
+
+ #} END edit interface
+
+
+class FileDBBase(object):
+ """Provides basic facilities to retrieve files of interest, including
+ caching facilities to help map hexshas to objects"""
+
+ def __init__(self, root_path):
+ """Initialize this instance to look for its files at the given root path
+ All subsequent operations will be relative to this path
+ :raise InvalidDBRoot:
+ :note: The base will perform basic checking for accessibility, but the subclass
+ is required to verify that the root_path contains the database structure it needs"""
+ super(FileDBBase, self).__init__()
+ if not os.path.isdir(root_path):
+ raise InvalidDBRoot(root_path)
+ self._root_path = root_path
+
+
+ #{ Interface
+ def root_path(self):
+ """:return: path at which this db operates"""
+ return self._root_path
+
+ def db_path(self, rela_path):
+ """
+ :return: the given relative path relative to our database root, allowing
+ potential access to data files"""
+ return join(self._root_path, rela_path)
+ #} END interface
+
+
+
+class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW):
+ """A database which operates on loose object files"""
+
+ # CONFIGURATION
+ # chunks in which data will be copied between streams
+ stream_chunk_size = chunk_size
+
+
+ def __init__(self, root_path):
+ super(LooseObjectDB, self).__init__(root_path)
+ self._hexsha_to_file = dict()
+ # Additional Flags - might be set to 0 after the first failure
+ # Depending on the root, this might work for some mounts, for others not, which
+ # is why it is per instance
+ self._fd_open_flags = getattr(os, 'O_NOATIME', 0)
+
+ #{ Interface
+ def object_path(self, hexsha):
+ """
+ :return: path at which the object with the given hexsha would be stored,
+ relative to the database root"""
+ return join(hexsha[:2], hexsha[2:])
+
+ def readable_db_object_path(self, hexsha):
+ """
+ :return: readable object path to the object identified by hexsha
+ :raise BadObject: If the object file does not exist"""
+ try:
+ return self._hexsha_to_file[hexsha]
+ except KeyError:
+ pass
+ # END ignore cache misses
+
+ # try filesystem
+ path = self.db_path(self.object_path(hexsha))
+ if exists(path):
+ self._hexsha_to_file[hexsha] = path
+ return path
+ # END handle cache
+ raise BadObject(hexsha)
+
+ #} END interface
+
+ def _map_loose_object(self, sha):
+ """
+ :return: memory map of that file to allow random read access
+ :raise BadObject: if object could not be located"""
+ db_path = self.db_path(self.object_path(to_hex_sha(sha)))
+ try:
+ fd = os.open(db_path, os.O_RDONLY|self._fd_open_flags)
+ except OSError,e:
+ if e.errno != ENOENT:
+ # try again without noatime
+ try:
+ fd = os.open(db_path, os.O_RDONLY)
+ except OSError:
+ raise BadObject(to_hex_sha(sha))
+ # didn't work because of our flag, don't try it again
+ self._fd_open_flags = 0
+ else:
+ raise BadObject(to_hex_sha(sha))
+ # END handle error
+ # END exception handling
+ try:
+ return mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
+ finally:
+ os.close(fd)
+ # END assure file is closed
+
+ def set_ostream(self, stream):
+ """:raise TypeError: if the stream does not support the Sha1Writer interface"""
+ if stream is not None and not isinstance(stream, Sha1Writer):
+ raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__)
+ return super(LooseObjectDB, self).set_ostream(stream)
+
+ def info(self, sha):
+ m = self._map_loose_object(sha)
+ try:
+ type, size = loose_object_header_info(m)
+ return OInfo(sha, type, size)
+ finally:
+ m.close()
+ # END assure release of system resources
+
+ def stream(self, sha):
+ m = self._map_loose_object(sha)
+ type, size, stream = DecompressMemMapReader.new(m, close_on_deletion = True)
+ return OStream(sha, type, size, stream)
+
+ def has_object(self, sha):
+ try:
+ self.readable_db_object_path(to_hex_sha(sha))
+ return True
+ except BadObject:
+ return False
+ # END check existence
+
+ def store(self, istream):
+ """note: The sha we produce will be hex by nature"""
+ assert istream.sha is None, "Direct istream writing not yet implemented"
+ tmp_path = None
+ writer = self.ostream()
+ if writer is None:
+ # open a tmp file to write the data to
+ fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path)
+ writer = FDCompressedSha1Writer(fd)
+ # END handle custom writer
+
+ try:
+ try:
+ write_object(istream.type, istream.size, istream.read, writer.write,
+ chunk_size=self.stream_chunk_size)
+ except:
+ if tmp_path:
+ os.remove(tmp_path)
+ raise
+ # END assure tmpfile removal on error
+ finally:
+ if tmp_path:
+ writer.close()
+ # END assure target stream is closed
+
+ sha = writer.sha(as_hex=True)
+
+ if tmp_path:
+ obj_path = self.db_path(self.object_path(sha))
+ obj_dir = dirname(obj_path)
+ if not isdir(obj_dir):
+ mkdir(obj_dir)
+ # END handle destination directory
+ rename(tmp_path, obj_path)
+ # END handle dry_run
+
+ istream.sha = sha
+ return istream
+
+
+class PackedDB(FileDBBase, ObjectDBR):
+ """A database operating on a set of object packs"""
+
+
+class CompoundDB(ObjectDBR):
+ """A database which delegates calls to sub-databases"""
+
+
+class ReferenceDB(CompoundDB):
+ """A database consisting of database referred to in a file"""
+
+
+#class GitObjectDB(CompoundDB, ObjectDBW):
+class GitObjectDB(LooseObjectDB):
+ """A database representing the default git object store, which includes loose
+ objects, pack files and an alternates file
+
+ It will create objects only in the loose object database.
+ :note: for now, we use the git command to do all the lookup, just until we
+ have packs and the other implementations
+ """
+ def __init__(self, root_path, git):
+ """Initialize this instance with the root and a git command"""
+ super(GitObjectDB, self).__init__(root_path)
+ self._git = git
+
+ def info(self, sha):
+ t = self._git.get_object_header(sha)
+ return OInfo(t[0], t[1], t[2])
+
+ def stream(self, sha):
+ """For now, all lookup is done by git itself"""
+ t = self._git.stream_object_data(sha)
+ return OStream(t[0], t[1], t[2], t[3])
+
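Taken together, the interfaces above give a simple store-and-read round trip: an IStream carries type, size and a readable stream into store(), which fills in the sha, and info()/stream() read the object back. A sketch of that flow, assuming the same git.odb import layout and a writable, purely hypothetical objects directory::

    from cStringIO import StringIO
    from git.odb.db import LooseObjectDB
    from git.odb.stream import IStream

    # hypothetical path - any existing, writable loose object directory will do
    ldb = LooseObjectDB("/path/to/repo/.git/objects")

    data = "hello world"
    istream = IStream("blob", len(data), StringIO(data))
    ldb.store(istream)                # writes the loose object and sets istream.sha (hex)

    info = ldb.info(istream.sha)      # OInfo tuple: (sha, type, size)
    assert info.type == "blob" and info.size == len(data)

    ostream = ldb.stream(istream.sha) # OStream with a file-like read()
    assert ostream.read() == data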
diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py
new file mode 100644
index 00000000..870a6f02
--- /dev/null
+++ b/lib/git/odb/fun.py
@@ -0,0 +1,108 @@
+"""Contains basic c-functions which usually contain performance critical code
+Keeping this code separate from the beginning makes it easier to out-source
+it into c later, if required"""
+
+from git.errors import (
+ BadObjectType
+ )
+
+import zlib
+decompressobj = zlib.decompressobj
+
+
+# INVARIANTS
+type_id_to_type_map = {
+ 1 : "commit",
+ 2 : "tree",
+ 3 : "blob",
+ 4 : "tag"
+ }
+
+# used when dealing with larger streams
+chunk_size = 1000*1000
+
+__all__ = ('is_loose_object', 'loose_object_header_info', 'object_header_info',
+ 'write_object' )
+
+#{ Routines
+
+def is_loose_object(m):
+ """:return: True the file contained in memory map m appears to be a loose object.
+ Only the first two bytes are needed"""
+ b0, b1 = map(ord, m[:2])
+ word = (b0 << 8) + b1
+ return b0 == 0x78 and (word % 31) == 0
+
+def loose_object_header_info(m):
+ """:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
+ object as well as its uncompressed size in bytes.
+ :param m: memory map from which to read the compressed object data"""
+ decompress_size = 8192 # is used in cgit as well
+ hdr = decompressobj().decompress(m, decompress_size)
+ type_name, size = hdr[:hdr.find("\0")].split(" ")
+ return type_name, int(size)
+
+def object_header_info(m):
+ """:return: tuple(type_string, uncompressed_size_in_bytes
+ :param mmap: mapped memory map. It will be
+ seeked to the actual start of the object contents, which can be used
+ to initialize a zlib decompress object.
+ :note: This routine can only handle new-style objects which are assumably contained
+ in packs
+ """
+ assert not is_loose_object(m), "Use loose_object_header_info instead"
+
+ c = ord(m[0]) # first byte
+ i = 1 # next char to read
+ type_id = (c >> 4) & 7 # numeric type
+ size = c & 15 # starting size
+ s = 4 # starting bit-shift size
+ while c & 0x80:
+ c = ord(m[i])
+ i += 1
+ size += (c & 0x7f) << s
+ s += 7
+ # END character loop
+
+ # finally seek the map to the start of the data stream
+ m.seek(i)
+ try:
+ return (type_id_to_type_map[type_id], size)
+ except KeyError:
+ # invalid object type - we could try to be smart now and decode part
+ # of the stream to get the info, problem is that we had trouble finding
+ # the exact start of the content stream
+ raise BadObjectType(type_id)
+ # END handle exceptions
+
+def write_object(type, size, read, write, chunk_size=chunk_size):
+ """Write the object as identified by type, size and source_stream into the
+ target_stream
+
+ :param type: type string of the object
+ :param size: amount of bytes to write from source_stream
+ :param read: read method of a stream providing the content data
+ :param write: write method of the output stream
+ :return: The actual number of bytes written to the stream, including the header"""
+ tbw = 0 # total num bytes written
+ dbw = 0 # num data bytes written
+
+ # WRITE HEADER: type SP size NULL
+ tbw += write("%s %i\0" % (type, size))
+
+ # WRITE ALL DATA UP TO SIZE
+ while True:
+ cs = min(chunk_size, size-dbw)
+ data_len = write(read(cs))
+ dbw += data_len
+ if data_len < cs or dbw == size:
+ tbw += dbw
+ break
+ # END check for stream end
+ # END duplicate data
+ return tbw
+
+
+#} END routines
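write_object emits the canonical loose object header ('<type> <size>\0') before the content, and is_loose_object only inspects the two-byte zlib marker. A small standalone sketch of both facts, using nothing but the standard library::

    import zlib
    import hashlib

    content = "hello world\n"
    hdr = "%s %i\0" % ("blob", len(content))

    # hashing header + content reproduces the object id git would assign to this blob
    sha = hashlib.sha1(hdr + content).hexdigest()

    # a loose object file is the zlib-compressed header + content; its first byte
    # is 0x78 and the first two bytes form a multiple of 31, which is exactly
    # what is_loose_object() checks
    compressed = zlib.compress(hdr + content)
    assert ord(compressed[0]) == 0x78
    assert ((ord(compressed[0]) << 8) + ord(compressed[1])) % 31 == 0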
diff --git a/lib/git/odb/stream.py b/lib/git/odb/stream.py
new file mode 100644
index 00000000..d1181382
--- /dev/null
+++ b/lib/git/odb/stream.py
@@ -0,0 +1,446 @@
+import zlib
+from cStringIO import StringIO
+from git.utils import make_sha
+import errno
+
+from utils import (
+ to_hex_sha,
+ to_bin_sha,
+ write,
+ close
+ )
+
+__all__ = ('OInfo', 'OStream', 'IStream', 'InvalidOInfo', 'InvalidOStream',
+ 'DecompressMemMapReader', 'FDCompressedSha1Writer')
+
+
+# ZLIB configuration
+# used when compressing objects - compression level 1 (fastest) to 9 (slowest, best compression)
+Z_BEST_SPEED = 1
+
+
+#{ ODB Bases
+
+class OInfo(tuple):
+ """Carries information about an object in an ODB, provdiing information
+ about the sha of the object, the type_string as well as the uncompressed size
+ in bytes.
+
+ It can be accessed using tuple notation and using attribute access notation::
+
+ assert dbi[0] == dbi.sha
+ assert dbi[1] == dbi.type
+ assert dbi[2] == dbi.size
+
+ The type is designed to be as lightweight as possible."""
+ __slots__ = tuple()
+
+ def __new__(cls, sha, type, size):
+ return tuple.__new__(cls, (sha, type, size))
+
+ def __init__(self, *args):
+ tuple.__init__(self)
+
+ #{ Interface
+ @property
+ def sha(self):
+ return self[0]
+
+ @property
+ def type(self):
+ return self[1]
+
+ @property
+ def size(self):
+ return self[2]
+ #} END interface
+
+
+class OStream(OInfo):
+ """Base for object streams retrieved from the database, providing additional
+ information about the stream.
+ Generally, ODB streams are read-only as objects are immutable"""
+ __slots__ = tuple()
+
+ def __new__(cls, sha, type, size, stream, *args, **kwargs):
+ """Helps with the initialization of subclasses"""
+ return tuple.__new__(cls, (sha, type, size, stream))
+
+
+ def __init__(self, *args, **kwargs):
+ tuple.__init__(self)
+ #{ Interface
+
+ def is_compressed(self):
+ """:return: True if reads of this stream yield zlib compressed data. Default False
+ :note: this does not imply anything about the actual internal storage.
+ Hence the data could be uncompressed, but read compressed, or vice versa"""
+ return False
+
+ #} END interface
+
+ #{ Stream Reader Interface
+
+ def read(self, size=-1):
+ return self[3].read(size)
+
+ #} END stream reader interface
+
+
+class IStream(list):
+ """Represents an input content stream to be fed into the ODB. It is mutable to allow
+ the ODB to record information about the operation's outcome right in this instance.
+
+ It provides interfaces for the OStream and a StreamReader to allow the instance
+ to blend in without prior conversion.
+
+ The only method your content stream must support is 'read'"""
+ __slots__ = tuple()
+
+ def __new__(cls, type, size, stream, sha=None, compressed=False):
+ return list.__new__(cls, (sha, type, size, stream, compressed, None))
+
+ def __init__(self, type, size, stream, sha=None, compressed=False):
+ list.__init__(self, (sha, type, size, stream, compressed, None))
+
+ #{ Interface
+
+ def hexsha(self):
+ """:return: our sha, hex encoded, 40 bytes"""
+ return to_hex_sha(self[0])
+
+ def binsha(self):
+ """:return: our sha as binary, 20 bytes"""
+ return to_bin_sha(self[0])
+
+ def _error(self):
+ """:return: the error that occurred when processing the stream, or None"""
+ return self[5]
+
+ def _set_error(self, exc):
+ """Set this input stream to the given exc, may be None to reset the error"""
+ self[5] = exc
+
+ error = property(_error, _set_error)
+
+ #} END interface
+
+ #{ Stream Reader Interface
+
+ def read(self, size=-1):
+ """Implements a simple stream reader interface, passing the read call on
+ to our internal stream"""
+ return self[3].read(size)
+
+ #} END stream reader interface
+
+ #{ ODB info interface
+
+ def _set_sha(self, sha):
+ self[0] = sha
+
+ def _sha(self):
+ return self[0]
+
+ sha = property(_sha, _set_sha)
+
+
+ def _type(self):
+ return self[1]
+
+ def _set_type(self, type):
+ self[1] = type
+
+ type = property(_type, _set_type)
+
+ def _size(self):
+ return self[2]
+
+ def _set_size(self, size):
+ self[2] = size
+
+ size = property(_size, _set_size)
+
+ def _stream(self):
+ return self[3]
+
+ def _set_stream(self, stream):
+ self[3] = stream
+
+ stream = property(_stream, _set_stream)
+
+ #} END odb info interface
+
+ #{ OStream interface
+
+ def is_compressed(self):
+ return self[4]
+
+ #} END OStream interface
+
+
+class InvalidOInfo(tuple):
+ """Carries information about a sha identifying an object which is invalid in
+ the queried database. The exception attribute provides more information about
+ the cause of the issue"""
+ __slots__ = tuple()
+
+ def __new__(cls, sha, exc):
+ return tuple.__new__(cls, (sha, exc))
+
+ def __init__(self, sha, exc):
+ tuple.__init__(self, (sha, exc))
+
+ @property
+ def sha(self):
+ return self[0]
+
+ @property
+ def error(self):
+ """:return: exception instance explaining the failure"""
+ return self[1]
+
+
+class InvalidOStream(InvalidOInfo):
+ """Carries information about an invalid ODB stream"""
+ __slots__ = tuple()
+
+#} END ODB Bases
+
+
+#{ RO Streams
+
+class DecompressMemMapReader(object):
+ """Reads data in chunks from a memory map and decompresses it. The client sees
+ only the uncompressed data; the respective file-like read calls handle on-demand
+ buffered decompression accordingly
+
+ A constraint on the total size of bytes is activated, simulating
+ a logical file within a possibly larger physical memory area
+
+ To read efficiently, you clearly don't want to read individual bytes, instead,
+ read a few kilobytes at least.
+
+ :note: The chunk-size should be carefully selected as it will involve quite a bit
+ of string copying due to the way the zlib is implemented. It's very wasteful,
+ hence we try to find a good tradeoff between allocation time and the number of
+ times we actually allocate. Our own zlib implementation would be good here
+ to better support streamed reading - it would only need to keep the mmap
+ and decompress it into chunks, that's all ... """
+ __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close')
+
+ max_read_size = 512*1024
+
+ def __init__(self, m, close_on_deletion, size):
+ """Initialize with mmap for stream reading"""
+ self._m = m
+ self._zip = zlib.decompressobj()
+ self._buf = None # buffer of decompressed bytes
+ self._buflen = 0 # length of bytes in buffer
+ self._s = size # size of uncompressed data to read in total
+ self._br = 0 # num uncompressed bytes read
+ self._cws = 0 # start byte of compression window
+ self._cwe = 0 # end byte of compression window
+ self._close = close_on_deletion # close the memmap on deletion ?
+
+ def __del__(self):
+ if self._close:
+ self._m.close()
+ # END handle resource freeing
+
+ @classmethod
+ def new(cls, m, close_on_deletion=False):
+ """Create a new DecompressMemMapReader instance for acting as a read-only stream
+ This method parses the object header from m and returns the parsed
+ type and size, as well as the created stream instance.
+ :param m: memory map on which to operate
+ :param close_on_deletion: if True, the memory map will be closed once we are
+ being deleted"""
+ inst = cls(m, close_on_deletion, 0)
+
+ # read header
+ maxb = 512 # should really be enough, cgit uses 8192 I believe
+ inst._s = maxb
+ hdr = inst.read(maxb)
+ hdrend = hdr.find("\0")
+ type, size = hdr[:hdrend].split(" ")
+ size = int(size)
+ inst._s = size
+
+ # adjust internal state to match actual header length that we ignore
+ # The buffer will be depleted first on future reads
+ inst._br = 0
+ hdrend += 1 # count terminating \0
+ inst._buf = StringIO(hdr[hdrend:])
+ inst._buflen = len(hdr) - hdrend
+
+ return type, size, inst
+
+ def read(self, size=-1):
+ if size < 1:
+ size = self._s - self._br
+ else:
+ size = min(size, self._s - self._br)
+ # END clamp size
+
+ if size == 0:
+ return str()
+ # END handle depletion
+
+ # protect from memory peaks
+ # If the user tries to read large chunks, our memory patterns get really bad
+ # as we end up copying a possibly huge chunk from our memory map right into
+ # memory. This might not even be possible. Nonetheless, try to dampen the
+ # effect a bit by reading in chunks, returning a huge string in the end.
+ # Our performance now depends on StringIO. This way we don't need two large
+ # buffers in peak times, but only one large one in the end which is
+ # the return buffer
+ # NO: We don't do it - if the user thinks it's best, he is right. If he
+ # has trouble, he will start reading in chunks. According to our tests,
+ # it's still faster if we read 10 Mb at once instead of chunking it.
+
+ # if size > self.max_read_size:
+ # sio = StringIO()
+ # while size:
+ # read_size = min(self.max_read_size, size)
+ # data = self.read(read_size)
+ # sio.write(data)
+ # size -= len(data)
+ # if len(data) < read_size:
+ # break
+ # # END data loop
+ # sio.seek(0)
+ # return sio.getvalue()
+ # # END handle maxread
+ #
+ # deplete the buffer, then just continue using the decompress object
+ # which has an own buffer. We just need this to transparently parse the
+ # header from the zlib stream
+ dat = str()
+ if self._buf:
+ if self._buflen >= size:
+ # have enough data
+ dat = self._buf.read(size)
+ self._buflen -= size
+ self._br += size
+ return dat
+ else:
+ dat = self._buf.read() # ouch, duplicates data
+ size -= self._buflen
+ self._br += self._buflen
+
+ self._buflen = 0
+ self._buf = None
+ # END handle buffer len
+ # END handle buffer
+
+ # decompress some data
+ # Abstract: zlib needs to operate on chunks of our memory map ( which may
+ # be large ), as it will otherwise always fill in the 'unconsumed_tail'
+ # attribute, which possibly reads our whole map to the end, forcing
+ # everything to be read from disk even though just a portion was requested.
+ # As this would be a no-go, we work around it by passing only chunks of data,
+ # moving the window into the memory map along as we decompress, which keeps
+ # the tail smaller than our chunk-size. This causes 'only' the chunk to be
+ # copied once, and another copy of a part of it when it creates the unconsumed
+ # tail. We have to use it to hand in the appropriate amount of bytes during
+ # the next read.
+ tail = self._zip.unconsumed_tail
+ if tail:
+ # move the window, make it as large as size demands. For code-clarity,
+ # we just take the chunk from our map again instead of reusing the unconsumed
+ # tail. The latter one would save some memory copying, but we could end up
+ # with not getting enough data uncompressed, so we had to sort that out as well.
+ # Now we just assume the worst case, hence the data is uncompressed and the window
+ # needs to be as large as the uncompressed bytes we want to read.
+ self._cws = self._cwe - len(tail)
+ self._cwe = self._cws + size
+
+
+ indata = self._m[self._cws:self._cwe] # another copy ... :(
+ # get the actual window end to be sure we don't use it for computations
+ self._cwe = self._cws + len(indata)
+ else:
+ cws = self._cws
+ self._cws = self._cwe
+ self._cwe = cws + size
+ indata = self._m[self._cws:self._cwe] # ... copy it again :(
+ # END handle tail
+
+ dcompdat = self._zip.decompress(indata, size)
+
+ self._br += len(dcompdat)
+ if dat:
+ dcompdat = dat + dcompdat
+
+ return dcompdat
+
+#} END RO streams
+
+
+#{ W Streams
+
+class Sha1Writer(object):
+ """Simple stream writer which produces a sha whenever you like as it degests
+ everything it is supposed to write"""
+
+ def __init__(self):
+ self.sha1 = make_sha("")
+
+ #{ Stream Interface
+
+ def write(self, data):
+ """:raise IOError: If not all bytes could be written
+ :return: length of incoming data"""
+ self.sha1.update(data)
+ return len(data)
+
+ #} END stream interface
+
+ #{ Interface
+
+ def sha(self, as_hex = False):
+ """:return: sha so far
+ :param as_hex: if True, sha will be hex-encoded, binary otherwise"""
+ if as_hex:
+ return self.sha1.hexdigest()
+ return self.sha1.digest()
+
+ #} END interface
+
+class FDCompressedSha1Writer(Sha1Writer):
+ """Digests data written to it, making the sha available, then compress the
+ data and write it to the file descriptor
+ :note: operates on raw file descriptors
+ :note: for this to work, you have to use the close-method of this instance"""
+ __slots__ = ("fd", "sha1", "zip")
+
+ # default exception
+ exc = IOError("Failed to write all bytes to filedescriptor")
+
+ def __init__(self, fd):
+ super(FDCompressedSha1Writer, self).__init__()
+ self.fd = fd
+ self.zip = zlib.compressobj(Z_BEST_SPEED)
+
+ #{ Stream Interface
+
+ def write(self, data):
+ """:raise IOError: If not all bytes could be written
+ :return: length of incoming data"""
+ self.sha1.update(data)
+ cdata = self.zip.compress(data)
+ bytes_written = write(self.fd, cdata)
+ if bytes_written != len(cdata):
+ raise self.exc
+ return len(data)
+
+ def close(self):
+ remainder = self.zip.flush()
+ if write(self.fd, remainder) != len(remainder):
+ raise self.exc
+ return close(self.fd)
+
+ #} END stream interface
+
+#} END W streams
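Sha1Writer can also be used on its own to compute an object id without touching the filesystem; combined with write_object from fun.py it digests the same header-plus-content sequence that the loose object database writes. A sketch, assuming both modules import as git.odb.stream and git.odb.fun::

    from cStringIO import StringIO
    from git.odb.stream import Sha1Writer
    from git.odb.fun import write_object

    data = "some blob content"
    writer = Sha1Writer()
    write_object("blob", len(data), StringIO(data).read, writer.write)

    hexsha = writer.sha(as_hex=True)  # 40 character hex object id
    binsha = writer.sha()             # 20 byte binary digest
    assert len(hexsha) == 40 and len(binsha) == 20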
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
new file mode 100644
index 00000000..6863e97b
--- /dev/null
+++ b/lib/git/odb/utils.py
@@ -0,0 +1,38 @@
+import binascii
+import os
+import errno
+
+#{ Routines
+
+hex_to_bin = binascii.a2b_hex
+bin_to_hex = binascii.b2a_hex
+
+def to_hex_sha(sha):
+ """:return: hexified version of sha"""
+ if len(sha) == 40:
+ return sha
+ return bin_to_hex(sha)
+
+def to_bin_sha(sha):
+ if len(sha) == 20:
+ return sha
+ return hex_to_bin(sha)
+
+# errors
+ENOENT = errno.ENOENT
+
+# os shortcuts
+exists = os.path.exists
+mkdir = os.mkdir
+isdir = os.path.isdir
+rename = os.rename
+dirname = os.path.dirname
+join = os.path.join
+read = os.read
+write = os.write
+close = os.close
+
+
+#} END Routines
+
+
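The sha helpers dispatch purely on string length, so hex and binary forms convert back and forth losslessly - a quick illustration, assuming git.odb.utils is importable::

    from git.odb.utils import to_hex_sha, to_bin_sha

    hexsha = "3b18e512dba79e4c8300dd08aeb37f8e728b8dad"
    binsha = to_bin_sha(hexsha)
    assert len(binsha) == 20
    assert to_hex_sha(binsha) == hexsha   # 20 byte input gets hexified
    assert to_hex_sha(hexsha) == hexsha   # 40 byte input passes through unchanged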