author    Sebastian Thiel <byronimo@gmail.com>  2010-06-04 17:30:31 +0200
committer Sebastian Thiel <byronimo@gmail.com>  2010-06-04 17:30:31 +0200
commit    6fbb69306c0e14bacb8dcb92a89af27d3d5d631f (patch)
tree      4a6e0c14b412315c13cc4ac6466f4888644195a3 /lib/git/odb
parent    25dca42bac17d511b7e2ebdd9d1d679e7626db5f (diff)
parent    e746f96bcc29238b79118123028ca170adc4ff0f (diff)
download  gitpython-6fbb69306c0e14bacb8dcb92a89af27d3d5d631f.tar.gz
Merge branch 'odb'

Conflicts:
	lib/git/cmd.py
Diffstat (limited to 'lib/git/odb')
-rw-r--r--  lib/git/odb/__init__.py    6
-rw-r--r--  lib/git/odb/db.py        337
-rw-r--r--  lib/git/odb/fun.py       108
-rw-r--r--  lib/git/odb/stream.py    446
-rw-r--r--  lib/git/odb/utils.py      38
5 files changed, 935 insertions, 0 deletions
diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py
new file mode 100644
index 00000000..5789d7eb
--- /dev/null
+++ b/lib/git/odb/__init__.py
@@ -0,0 +1,6 @@
+"""Initialize the object database module"""
+
+# default imports
+from db import *
+from stream import *
+
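The wildcard imports above make the database and stream types available directly from the package. A minimal usage sketch, assuming GitPython's layout so that lib/git/odb imports as git.odb, and using a purely hypothetical repository path::

    from git.odb import LooseObjectDB, IStream

    # hypothetical path - point this at an existing .git/objects directory
    odb = LooseObjectDB("/path/to/repo/.git/objects")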
diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py
new file mode 100644
index 00000000..a8de28ec
--- /dev/null
+++ b/lib/git/odb/db.py
@@ -0,0 +1,337 @@
+"""Contains implementations of database retrieveing objects"""
+from git.utils import IndexFileSHA1Writer
+from git.errors import (
+ InvalidDBRoot,
+ BadObject,
+ BadObjectType
+ )
+
+from stream import (
+ DecompressMemMapReader,
+ FDCompressedSha1Writer,
+ Sha1Writer,
+ OStream,
+ OInfo
+ )
+
+from utils import (
+ ENOENT,
+ to_hex_sha,
+ exists,
+ hex_to_bin,
+ isdir,
+ mkdir,
+ rename,
+ dirname,
+ join
+ )
+
+from fun import (
+ chunk_size,
+ loose_object_header_info,
+ write_object
+ )
+
+import tempfile
+import mmap
+import os
+
+
+__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'LooseObjectDB', 'PackedDB',
+ 'CompoundDB', 'ReferenceDB', 'GitObjectDB' )
+
+class ObjectDBR(object):
+ """Defines an interface for object database lookup.
+ Objects are identified either by hex-sha (40 bytes) or
+ by sha (20 bytes)"""
+
+ def __contains__(self, sha):
+ return self.has_object(sha)
+
+ #{ Query Interface
+ def has_object(self, sha):
+ """
+ :return: True if the object identified by the given 40 byte hexsha or 20 byte
+ binary sha is contained in the database
+ :raise BadObject:"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def info(self, sha):
+ """ :return: OInfo instance
+ :param sha: 40 byte hexsha or 20 byte binary sha
+ :raise BadObject:"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def info_async(self, input_channel):
+ """Retrieve information of a multitude of objects asynchronously
+ :param input_channel: Channel yielding the sha's of the objects of interest
+ :return: Channel yielding OInfo|InvalidOInfo, in any order"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def stream(self, sha):
+ """:return: OStream instance
+ :param sha: 40 byte hexsha or 20 byte binary sha
+ :raise BadObject:"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def stream_async(self, input_channel):
+ """Retrieve the OStream of multiple objects
+ :param input_channel: see ``info``
+ :param max_threads: see ``ObjectDBW.store``
+ :return: Channel yielding OStream|InvalidOStream instances in any order"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ #} END query interface
+
+class ObjectDBW(object):
+ """Defines an interface to create objects in the database"""
+
+ def __init__(self, *args, **kwargs):
+ self._ostream = None
+
+ #{ Edit Interface
+ def set_ostream(self, stream):
+ """Adjusts the stream to which all data should be sent when storing new objects
+ :param stream: if not None, the stream to use, if None the default stream
+ will be used.
+ :return: previously installed stream, or None if there was no override
+ :raise TypeError: if the stream doesn't have the supported functionality"""
+ cstream = self._ostream
+ self._ostream = stream
+ return cstream
+
+ def ostream(self):
+ """:return: overridden output stream this instance will write to, or None
+ if it will write to the default stream"""
+ return self._ostream
+
+ def store(self, istream):
+ """Create a new object in the database
+ :return: the input istream object with its sha set to its corresponding value
+ :param istream: IStream compatible instance. If its sha is already set
+ to a value, the object will just be stored in our database format,
+ in which case the input stream is expected to be in object format ( header + contents ).
+ :raise IOError: if data could not be written"""
+ raise NotImplementedError("To be implemented in subclass")
+
+ def store_async(self, input_channel):
+ """Create multiple new objects in the database asynchronously. The method will
+ return right away, returning an output channel which receives the results as
+ they are computed.
+
+ :return: Channel yielding your IStream which served as input, in any order.
+ The IStream's sha will be set to the sha it received during the process,
+ or its error attribute will be set to the exception informing about the error.
+ :param input_channel: Channel yielding IStream instances.
+ As the same instances will be used in the output channel, you can create a map
+ between id(istream) -> istream
+ :note: As some ODB implementations implement this operation atomically, they might
+ abort the whole operation if one item could not be processed. Hence check how
+ many items have actually been produced."""
+ raise NotImplementedError("To be implemented in subclass")
+
+ #} END edit interface
+
+
+class FileDBBase(object):
+ """Provides basic facilities to retrieve files of interest, including
+ caching facilities to help map hexshas to objects"""
+
+ def __init__(self, root_path):
+ """Initialize this instance to look for its files at the given root path
+ All subsequent operations will be relative to this path
+ :raise InvalidDBRoot:
+ :note: The base will perform basic checking for accessibility, but the subclass
+ is required to verify that the root_path contains the database structure it needs"""
+ super(FileDBBase, self).__init__()
+ if not os.path.isdir(root_path):
+ raise InvalidDBRoot(root_path)
+ self._root_path = root_path
+
+
+ #{ Interface
+ def root_path(self):
+ """:return: path at which this db operates"""
+ return self._root_path
+
+ def db_path(self, rela_path):
+ """
+ :return: the given relative path relative to our database root, allowing
+ potential access to data files"""
+ return join(self._root_path, rela_path)
+ #} END interface
+
+
+
+class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW):
+ """A database which operates on loose object files"""
+
+ # CONFIGURATION
+ # chunks in which data will be copied between streams
+ stream_chunk_size = chunk_size
+
+
+ def __init__(self, root_path):
+ super(LooseObjectDB, self).__init__(root_path)
+ self._hexsha_to_file = dict()
+ # Additional Flags - might be set to 0 after the first failure
+ # Depending on the root, this might work for some mounts, for others not, which
+ # is why it is per instance
+ self._fd_open_flags = getattr(os, 'O_NOATIME', 0)
+
+ #{ Interface
+ def object_path(self, hexsha):
+ """
+ :return: path at which the object with the given hexsha would be stored,
+ relative to the database root"""
+ return join(hexsha[:2], hexsha[2:])
+
+ def readable_db_object_path(self, hexsha):
+ """
+ :return: readable object path to the object identified by hexsha
+ :raise BadObject: If the object file does not exist"""
+ try:
+ return self._hexsha_to_file[hexsha]
+ except KeyError:
+ pass
+ # END ignore cache misses
+
+ # try filesystem
+ path = self.db_path(self.object_path(hexsha))
+ if exists(path):
+ self._hexsha_to_file[hexsha] = path
+ return path
+ # END handle cache
+ raise BadObject(hexsha)
+
+ #} END interface
+
+ def _map_loose_object(self, sha):
+ """
+ :return: memory map of that file to allow random read access
+ :raise BadObject: if object could not be located"""
+ db_path = self.db_path(self.object_path(to_hex_sha(sha)))
+ try:
+ fd = os.open(db_path, os.O_RDONLY|self._fd_open_flags)
+ except OSError,e:
+ if e.errno != ENOENT:
+ # try again without noatime
+ try:
+ fd = os.open(db_path, os.O_RDONLY)
+ except OSError:
+ raise BadObject(to_hex_sha(sha))
+ # didn't work because of our flag, don't try it again
+ self._fd_open_flags = 0
+ else:
+ raise BadObject(to_hex_sha(sha))
+ # END handle error
+ # END exception handling
+ try:
+ return mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
+ finally:
+ os.close(fd)
+ # END assure file is closed
+
+ def set_ostream(self, stream):
+ """:raise TypeError: if the stream does not support the Sha1Writer interface"""
+ if stream is not None and not isinstance(stream, Sha1Writer):
+ raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__)
+ return super(LooseObjectDB, self).set_ostream(stream)
+
+ def info(self, sha):
+ m = self._map_loose_object(sha)
+ try:
+ type, size = loose_object_header_info(m)
+ return OInfo(sha, type, size)
+ finally:
+ m.close()
+ # END assure release of system resources
+
+ def stream(self, sha):
+ m = self._map_loose_object(sha)
+ type, size, stream = DecompressMemMapReader.new(m, close_on_deletion = True)
+ return OStream(sha, type, size, stream)
+
+ def has_object(self, sha):
+ try:
+ self.readable_db_object_path(to_hex_sha(sha))
+ return True
+ except BadObject:
+ return False
+ # END check existence
+
+ def store(self, istream):
+ """note: The sha we produce will be hex by nature"""
+ assert istream.sha is None, "Direct istream writing not yet implemented"
+ tmp_path = None
+ writer = self.ostream()
+ if writer is None:
+ # open a tmp file to write the data to
+ fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path)
+ writer = FDCompressedSha1Writer(fd)
+ # END handle custom writer
+
+ try:
+ try:
+ write_object(istream.type, istream.size, istream.read, writer.write,
+ chunk_size=self.stream_chunk_size)
+ except:
+ if tmp_path:
+ os.remove(tmp_path)
+ raise
+ # END assure tmpfile removal on error
+ finally:
+ if tmp_path:
+ writer.close()
+ # END assure target stream is closed
+
+ sha = writer.sha(as_hex=True)
+
+ if tmp_path:
+ obj_path = self.db_path(self.object_path(sha))
+ obj_dir = dirname(obj_path)
+ if not isdir(obj_dir):
+ mkdir(obj_dir)
+ # END handle destination directory
+ rename(tmp_path, obj_path)
+ # END handle dry_run
+
+ istream.sha = sha
+ return istream
+
+
+class PackedDB(FileDBBase, ObjectDBR):
+ """A database operating on a set of object packs"""
+
+
+class CompoundDB(ObjectDBR):
+ """A database which delegates calls to sub-databases"""
+
+
+class ReferenceDB(CompoundDB):
+ """A database consisting of database referred to in a file"""
+
+
+#class GitObjectDB(CompoundDB, ObjectDBW):
+class GitObjectDB(LooseObjectDB):
+ """A database representing the default git object store, which includes loose
+ objects, pack files and an alternates file
+
+ It will create objects only in the loose object database.
+ :note: for now, we use the git command to do all the lookup, just until we
+ have packs and the other implementations
+ """
+ def __init__(self, root_path, git):
+ """Initialize this instance with the root and a git command"""
+ super(GitObjectDB, self).__init__(root_path)
+ self._git = git
+
+ def info(self, sha):
+ t = self._git.get_object_header(sha)
+ return OInfo(t[0], t[1], t[2])
+
+ def stream(self, sha):
+ """For now, all lookup is done by git itself"""
+ t = self._git.stream_object_data(sha)
+ return OStream(t[0], t[1], t[2], t[3])
+
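Taken together, the interfaces above give a simple store-and-read round trip: an IStream carries type, size and a readable stream into store(), which fills in the sha, and info()/stream() read the object back. A sketch of that flow, assuming the same git.odb import layout and a writable, purely hypothetical objects directory::

    from cStringIO import StringIO
    from git.odb.db import LooseObjectDB
    from git.odb.stream import IStream

    # hypothetical path - any existing, writable loose object directory will do
    ldb = LooseObjectDB("/path/to/repo/.git/objects")

    data = "hello world"
    istream = IStream("blob", len(data), StringIO(data))
    ldb.store(istream)                # writes the loose object and sets istream.sha (hex)

    info = ldb.info(istream.sha)      # OInfo tuple: (sha, type, size)
    assert info.type == "blob" and info.size == len(data)

    ostream = ldb.stream(istream.sha) # OStream with a file-like read()
    assert ostream.read() == data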
diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py
new file mode 100644
index 00000000..870a6f02
--- /dev/null
+++ b/lib/git/odb/fun.py
@@ -0,0 +1,108 @@
+"""Contains basic c-functions which usually contain performance critical code
+Keeping this code separate from the beginning makes it easier to out-source
+it into c later, if required"""
+
+from git.errors import (
+ BadObjectType
+ )
+
+import zlib
+decompressobj = zlib.decompressobj
+
+
+# INVARIANTS
+type_id_to_type_map = {
+ 1 : "commit",
+ 2 : "tree",
+ 3 : "blob",
+ 4 : "tag"
+ }
+
+# used when dealing with larger streams
+chunk_size = 1000*1000
+
+__all__ = ('is_loose_object', 'loose_object_header_info', 'object_header_info',
+ 'write_object' )
+
+#{ Routines
+
+def is_loose_object(m):
+ """:return: True the file contained in memory map m appears to be a loose object.
+ Only the first two bytes are needed"""
+ b0, b1 = map(ord, m[:2])
+ word = (b0 << 8) + b1
+ return b0 == 0x78 and (word % 31) == 0
+
+def loose_object_header_info(m):
+ """:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
+ object as well as its uncompressed size in bytes.
+ :param m: memory map from which to read the compressed object data"""
+ decompress_size = 8192 # is used in cgit as well
+ hdr = decompressobj().decompress(m, decompress_size)
+ type_name, size = hdr[:hdr.find("\0")].split(" ")
+ return type_name, int(size)
+
+def object_header_info(m):
+ """:return: tuple(type_string, uncompressed_size_in_bytes
+ :param mmap: mapped memory map. It will be
+ seeked to the actual start of the object contents, which can be used
+ to initialize a zlib decompress object.
+ :note: This routine can only handle new-style objects which are assumably contained
+ in packs
+ """
+ assert not is_loose_object(m), "Use loose_object_header_info instead"
+
+ c = ord(m[0]) # first byte
+ i = 1 # next char to read
+ type_id = (c >> 4) & 7 # numeric type
+ size = c & 15 # starting size
+ s = 4 # starting bit-shift size
+ while c & 0x80:
+ c = ord(m[i])
+ i += 1
+ size += (c & 0x7f) << s
+ s += 7
+ # END character loop
+
+ # finally seek the map to the start of the data stream
+ m.seek(i)
+ try:
+ return (type_id_to_type_map[type_id], size)
+ except KeyError:
+ # invalid object type - we could try to be smart now and decode part
+ # of the stream to get the info, problem is that we had trouble finding
+ # the exact start of the content stream
+ raise BadObjectType(type_id)
+ # END handle exceptions
+
+def write_object(type, size, read, write, chunk_size=chunk_size):
+ """Write the object as identified by type, size and source_stream into the
+ target_stream
+
+ :param type: type string of the object
+ :param size: amount of bytes to write from source_stream
+ :param read: read method of a stream providing the content data
+ :param write: write method of the output stream
+ :return: The actual number of bytes written to the stream, including the header"""
+ tbw = 0 # total num bytes written
+ dbw = 0 # num data bytes written
+
+ # WRITE HEADER: type SP size NULL
+ tbw += write("%s %i\0" % (type, size))
+
+ # WRITE ALL DATA UP TO SIZE
+ while True:
+ cs = min(chunk_size, size-dbw)
+ data_len = write(read(cs))
+ dbw += data_len
+ if data_len < cs or dbw == size:
+ tbw += dbw
+ break
+ # END check for stream end
+ # END duplicate data
+ return tbw
+
+
+#} END routines
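write_object emits the canonical loose object header ('<type> <size>\0') before the content, and is_loose_object only inspects the two-byte zlib marker. A small standalone sketch of both facts, using nothing but the standard library::

    import zlib
    import hashlib

    content = "hello world\n"
    hdr = "%s %i\0" % ("blob", len(content))

    # hashing header + content reproduces the object id git would assign to this blob
    sha = hashlib.sha1(hdr + content).hexdigest()

    # a loose object file is the zlib-compressed header + content; its first byte
    # is 0x78 and the first two bytes form a multiple of 31, which is exactly
    # what is_loose_object() checks
    compressed = zlib.compress(hdr + content)
    assert ord(compressed[0]) == 0x78
    assert ((ord(compressed[0]) << 8) + ord(compressed[1])) % 31 == 0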
diff --git a/lib/git/odb/stream.py b/lib/git/odb/stream.py
new file mode 100644
index 00000000..d1181382
--- /dev/null
+++ b/lib/git/odb/stream.py
@@ -0,0 +1,446 @@
+import zlib
+from cStringIO import StringIO
+from git.utils import make_sha
+import errno
+
+from utils import (
+ to_hex_sha,
+ to_bin_sha,
+ write,
+ close
+ )
+
+__all__ = ('OInfo', 'OStream', 'IStream', 'InvalidOInfo', 'InvalidOStream',
+ 'DecompressMemMapReader', 'FDCompressedSha1Writer')
+
+
+# ZLIB configuration
+# used when compressing objects - compression level 1 (fastest) to 9 (slowest, best compression)
+Z_BEST_SPEED = 1
+
+
+#{ ODB Bases
+
+class OInfo(tuple):
+ """Carries information about an object in an ODB, provdiing information
+ about the sha of the object, the type_string as well as the uncompressed size
+ in bytes.
+
+ It can be accessed using tuple notation and using attribute access notation::
+
+ assert dbi[0] == dbi.sha
+ assert dbi[1] == dbi.type
+ assert dbi[2] == dbi.size
+
+ The type is designed to be as lightweight as possible."""
+ __slots__ = tuple()
+
+ def __new__(cls, sha, type, size):
+ return tuple.__new__(cls, (sha, type, size))
+
+ def __init__(self, *args):
+ tuple.__init__(self)
+
+ #{ Interface
+ @property
+ def sha(self):
+ return self[0]
+
+ @property
+ def type(self):
+ return self[1]
+
+ @property
+ def size(self):
+ return self[2]
+ #} END interface
+
+
+class OStream(OInfo):
+ """Base for object streams retrieved from the database, providing additional
+ information about the stream.
+ Generally, ODB streams are read-only as objects are immutable"""
+ __slots__ = tuple()
+
+ def __new__(cls, sha, type, size, stream, *args, **kwargs):
+ """Helps with the initialization of subclasses"""
+ return tuple.__new__(cls, (sha, type, size, stream))
+
+
+ def __init__(self, *args, **kwargs):
+ tuple.__init__(self)
+ #{ Interface
+
+ def is_compressed(self):
+ """:return: True if reads of this stream yield zlib compressed data. Default False
+ :note: this does not imply anything about the actual internal storage.
+ Hence the data could be uncompressed, but read compressed, or vice versa"""
+ return False
+
+ #} END interface
+
+ #{ Stream Reader Interface
+
+ def read(self, size=-1):
+ return self[3].read(size)
+
+ #} END stream reader interface
+
+
+class IStream(list):
+ """Represents an input content stream to be fed into the ODB. It is mutable to allow
+ the ODB to record information about the operation's outcome right in this instance.
+
+ It provides interfaces for the OStream and a StreamReader to allow the instance
+ to blend in without prior conversion.
+
+ The only method your content stream must support is 'read'"""
+ __slots__ = tuple()
+
+ def __new__(cls, type, size, stream, sha=None, compressed=False):
+ return list.__new__(cls, (sha, type, size, stream, compressed, None))
+
+ def __init__(self, type, size, stream, sha=None, compressed=False):
+ list.__init__(self, (sha, type, size, stream, compressed, None))
+
+ #{ Interface
+
+ def hexsha(self):
+ """:return: our sha, hex encoded, 40 bytes"""
+ return to_hex_sha(self[0])
+
+ def binsha(self):
+ """:return: our sha as binary, 20 bytes"""
+ return to_bin_sha(self[0])
+
+ def _error(self):
+ """:return: the error that occurred when processing the stream, or None"""
+ return self[5]
+
+ def _set_error(self, exc):
+ """Set this input stream to the given exc, may be None to reset the error"""
+ self[5] = exc
+
+ error = property(_error, _set_error)
+
+ #} END interface
+
+ #{ Stream Reader Interface
+
+ def read(self, size=-1):
+ """Implements a simple stream reader interface, passing the read call on
+ to our internal stream"""
+ return self[3].read(size)
+
+ #} END stream reader interface
+
+ #{ ODB info interface
+
+ def _set_sha(self, sha):
+ self[0] = sha
+
+ def _sha(self):
+ return self[0]
+
+ sha = property(_sha, _set_sha)
+
+
+ def _type(self):
+ return self[1]
+
+ def _set_type(self, type):
+ self[1] = type
+
+ type = property(_type, _set_type)
+
+ def _size(self):
+ return self[2]
+
+ def _set_size(self, size):
+ self[2] = size
+
+ size = property(_size, _set_size)
+
+ def _stream(self):
+ return self[3]
+
+ def _set_stream(self, stream):
+ self[3] = stream
+
+ stream = property(_stream, _set_stream)
+
+ #} END odb info interface
+
+ #{ OStream interface
+
+ def is_compressed(self):
+ return self[4]
+
+ #} END OStream interface
+
+
+class InvalidOInfo(tuple):
+ """Carries information about a sha identifying an object which is invalid in
+ the queried database. The exception attribute provides more information about
+ the cause of the issue"""
+ __slots__ = tuple()
+
+ def __new__(cls, sha, exc):
+ return tuple.__new__(cls, (sha, exc))
+
+ def __init__(self, sha, exc):
+ tuple.__init__(self, (sha, exc))
+
+ @property
+ def sha(self):
+ return self[0]
+
+ @property
+ def error(self):
+ """:return: exception instance explaining the failure"""
+ return self[1]
+
+
+class InvalidOStream(InvalidOInfo):
+ """Carries information about an invalid ODB stream"""
+ __slots__ = tuple()
+
+#} END ODB Bases
+
+
+#{ RO Streams
+
+class DecompressMemMapReader(object):
+ """Reads data in chunks from a memory map and decompresses it. The client sees
+ only the uncompressed data; the respective file-like read calls handle on-demand
+ buffered decompression accordingly
+
+ A constraint on the total size of bytes is activated, simulating
+ a logical file within a possibly larger physical memory area
+
+ To read efficiently, you clearly don't want to read individual bytes, instead,
+ read a few kilobytes at least.
+
+ :note: The chunk-size should be carefully selected as it will involve quite a bit
+ of string copying due to the way the zlib is implemented. It's very wasteful,
+ hence we try to find a good tradeoff between allocation time and the number of
+ times we actually allocate. Our own zlib implementation would be good here
+ to better support streamed reading - it would only need to keep the mmap
+ and decompress it into chunks, that's all ... """
+ __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close')
+
+ max_read_size = 512*1024
+
+ def __init__(self, m, close_on_deletion, size):
+ """Initialize with mmap for stream reading"""
+ self._m = m
+ self._zip = zlib.decompressobj()
+ self._buf = None # buffer of decompressed bytes
+ self._buflen = 0 # length of bytes in buffer
+ self._s = size # size of uncompressed data to read in total
+ self._br = 0 # num uncompressed bytes read
+ self._cws = 0 # start byte of compression window
+ self._cwe = 0 # end byte of compression window
+ self._close = close_on_deletion # close the memmap on deletion ?
+
+ def __del__(self):
+ if self._close:
+ self._m.close()
+ # END handle resource freeing
+
+ @classmethod
+ def new(cls, m, close_on_deletion=False):
+ """Create a new DecompressMemMapReader instance for acting as a read-only stream
+ This method parses the object header from m and returns the parsed
+ type and size, as well as the created stream instance.
+ :param m: memory map on which to operate
+ :param close_on_deletion: if True, the memory map will be closed once we are
+ being deleted"""
+ inst = cls(m, close_on_deletion, 0)
+
+ # read header
+ maxb = 512 # should really be enough, cgit uses 8192 I believe
+ inst._s = maxb
+ hdr = inst.read(maxb)
+ hdrend = hdr.find("\0")
+ type, size = hdr[:hdrend].split(" ")
+ size = int(size)
+ inst._s = size
+
+ # adjust internal state to match actual header length that we ignore
+ # The buffer will be depleted first on future reads
+ inst._br = 0
+ hdrend += 1 # count terminating \0
+ inst._buf = StringIO(hdr[hdrend:])
+ inst._buflen = len(hdr) - hdrend
+
+ return type, size, inst
+
+ def read(self, size=-1):
+ if size < 1:
+ size = self._s - self._br
+ else:
+ size = min(size, self._s - self._br)
+ # END clamp size
+
+ if size == 0:
+ return str()
+ # END handle depletion
+
+ # protect from memory peaks
+ # If the user tries to read large chunks, our memory patterns get really bad
+ # as we end up copying a possibly huge chunk from our memory map right into
+ # memory. This might not even be possible. Nonetheless, try to dampen the
+ # effect a bit by reading in chunks, returning a huge string in the end.
+ # Our performance now depends on StringIO. This way we don't need two large
+ # buffers in peak times, but only one large one in the end which is
+ # the return buffer
+ # NO: We don't do it - if the user thinks it's best, he is right. If he
+ # has trouble, he will start reading in chunks. According to our tests,
+ # it's still faster if we read 10 Mb at once instead of chunking it.
+
+ # if size > self.max_read_size:
+ # sio = StringIO()
+ # while size:
+ # read_size = min(self.max_read_size, size)
+ # data = self.read(read_size)
+ # sio.write(data)
+ # size -= len(data)
+ # if len(data) < read_size:
+ # break
+ # # END data loop
+ # sio.seek(0)
+ # return sio.getvalue()
+ # # END handle maxread
+ #
+ # deplete the buffer, then just continue using the decompress object
+ # which has an own buffer. We just need this to transparently parse the
+ # header from the zlib stream
+ dat = str()
+ if self._buf:
+ if self._buflen >= size:
+ # have enough data
+ dat = self._buf.read(size)
+ self._buflen -= size
+ self._br += size
+ return dat
+ else:
+ dat = self._buf.read() # ouch, duplicates data
+ size -= self._buflen
+ self._br += self._buflen
+
+ self._buflen = 0
+ self._buf = None
+ # END handle buffer len
+ # END handle buffer
+
+ # decompress some data
+ # Abstract: zlib needs to operate on chunks of our memory map ( which may
+ # be large ), as it will otherwise always fill in the 'unconsumed_tail'
+ # attribute, which possibly reads our whole map to the end, forcing
+ # everything to be read from disk even though just a portion was requested.
+ # As this would be a no-go, we work around it by passing only chunks of data,
+ # moving the window into the memory map along as we decompress, which keeps
+ # the tail smaller than our chunk-size. This causes 'only' the chunk to be
+ # copied once, and another copy of a part of it when it creates the unconsumed
+ # tail. We have to use it to hand in the appropriate amount of bytes during
+ # the next read.
+ tail = self._zip.unconsumed_tail
+ if tail:
+ # move the window, make it as large as size demands. For code-clarity,
+ # we just take the chunk from our map again instead of reusing the unconsumed
+ # tail. The latter one would save some memory copying, but we could end up
+ # with not getting enough data uncompressed, so we had to sort that out as well.
+ # Now we just assume the worst case, hence the data is uncompressed and the window
+ # needs to be as large as the uncompressed bytes we want to read.
+ self._cws = self._cwe - len(tail)
+ self._cwe = self._cws + size
+
+
+ indata = self._m[self._cws:self._cwe] # another copy ... :(
+ # get the actual window end to be sure we don't use it for computations
+ self._cwe = self._cws + len(indata)
+ else:
+ cws = self._cws
+ self._cws = self._cwe
+ self._cwe = cws + size
+ indata = self._m[self._cws:self._cwe] # ... copy it again :(
+ # END handle tail
+
+ dcompdat = self._zip.decompress(indata, size)
+
+ self._br += len(dcompdat)
+ if dat:
+ dcompdat = dat + dcompdat
+
+ return dcompdat
+
+#} END RO streams
+
+
+#{ W Streams
+
+class Sha1Writer(object):
+ """Simple stream writer which produces a sha whenever you like as it degests
+ everything it is supposed to write"""
+
+ def __init__(self):
+ self.sha1 = make_sha("")
+
+ #{ Stream Interface
+
+ def write(self, data):
+ """:raise IOError: If not all bytes could be written
+ :return: length of incoming data"""
+ self.sha1.update(data)
+ return len(data)
+
+ #} END stream interface
+
+ #{ Interface
+
+ def sha(self, as_hex = False):
+ """:return: sha so far
+ :param as_hex: if True, sha will be hex-encoded, binary otherwise"""
+ if as_hex:
+ return self.sha1.hexdigest()
+ return self.sha1.digest()
+
+ #} END interface
+
+class FDCompressedSha1Writer(Sha1Writer):
+ """Digests data written to it, making the sha available, then compress the
+ data and write it to the file descriptor
+ :note: operates on raw file descriptors
+ :note: for this to work, you have to use the close-method of this instance"""
+ __slots__ = ("fd", "sha1", "zip")
+
+ # default exception
+ exc = IOError("Failed to write all bytes to filedescriptor")
+
+ def __init__(self, fd):
+ super(FDCompressedSha1Writer, self).__init__()
+ self.fd = fd
+ self.zip = zlib.compressobj(Z_BEST_SPEED)
+
+ #{ Stream Interface
+
+ def write(self, data):
+ """:raise IOError: If not all bytes could be written
+ :return: length of incoming data"""
+ self.sha1.update(data)
+ cdata = self.zip.compress(data)
+ bytes_written = write(self.fd, cdata)
+ if bytes_written != len(cdata):
+ raise self.exc
+ return len(data)
+
+ def close(self):
+ remainder = self.zip.flush()
+ if write(self.fd, remainder) != len(remainder):
+ raise self.exc
+ return close(self.fd)
+
+ #} END stream interface
+
+#} END W streams
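Sha1Writer can also be used on its own to compute an object id without touching the filesystem; combined with write_object from fun.py it digests the same header-plus-content sequence that the loose object database writes. A sketch, assuming both modules import as git.odb.stream and git.odb.fun::

    from cStringIO import StringIO
    from git.odb.stream import Sha1Writer
    from git.odb.fun import write_object

    data = "some blob content"
    writer = Sha1Writer()
    write_object("blob", len(data), StringIO(data).read, writer.write)

    hexsha = writer.sha(as_hex=True)  # 40 character hex object id
    binsha = writer.sha()             # 20 byte binary digest
    assert len(hexsha) == 40 and len(binsha) == 20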
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
new file mode 100644
index 00000000..6863e97b
--- /dev/null
+++ b/lib/git/odb/utils.py
@@ -0,0 +1,38 @@
+import binascii
+import os
+import errno
+
+#{ Routines
+
+hex_to_bin = binascii.a2b_hex
+bin_to_hex = binascii.b2a_hex
+
+def to_hex_sha(sha):
+ """:return: hexified version of sha"""
+ if len(sha) == 40:
+ return sha
+ return bin_to_hex(sha)
+
+def to_bin_sha(sha):
+ if len(sha) == 20:
+ return sha
+ return hex_to_bin(sha)
+
+# errors
+ENOENT = errno.ENOENT
+
+# os shortcuts
+exists = os.path.exists
+mkdir = os.mkdir
+isdir = os.path.isdir
+rename = os.rename
+dirname = os.path.dirname
+join = os.path.join
+read = os.read
+write = os.write
+close = os.close
+
+
+#} END Routines
+
+
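The sha helpers dispatch purely on string length, so hex and binary forms convert back and forth losslessly - a quick illustration, assuming git.odb.utils is importable::

    from git.odb.utils import to_hex_sha, to_bin_sha

    hexsha = "3b18e512dba79e4c8300dd08aeb37f8e728b8dad"
    binsha = to_bin_sha(hexsha)
    assert len(binsha) == 20
    assert to_hex_sha(binsha) == hexsha   # 20 byte input gets hexified
    assert to_hex_sha(hexsha) == hexsha   # 40 byte input passes through unchanged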