Diffstat (limited to 'lib')
-rw-r--r--   lib/git/errors.py        |   3
-rw-r--r--   lib/git/objects/base.py  |   4
-rw-r--r--   lib/git/odb/db.py        | 114
-rw-r--r--   lib/git/odb/fun.py       | 114
-rw-r--r--   lib/git/odb/utils.py     | 147
5 files changed, 327 insertions, 55 deletions
diff --git a/lib/git/errors.py b/lib/git/errors.py
index 956e007f..d8a35e02 100644
--- a/lib/git/errors.py
+++ b/lib/git/errors.py
@@ -18,6 +18,9 @@ class InvalidDBRoot(ODBError):
 
 class BadObject(ODBError):
     """The object with the given SHA does not exist"""
+
+class BadObjectType(ODBError):
+    """The object had an unsupported type"""
 
 class NoSuchPathError(OSError):
     """ Thrown if a path could not be access by the system. """
diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py
index f7043199..64a5678e 100644
--- a/lib/git/objects/base.py
+++ b/lib/git/objects/base.py
@@ -125,8 +125,8 @@ class Object(LazyMixin):
         Returns
             File Object compatible stream to the uncompressed raw data of the object
         """
-        proc = self.repo.git.cat_file(self.type, self.sha, as_process=True)
-        return utils.ProcessStreamAdapter(proc, "stdout")
+        sha, type, size, stream = self.repo.git.stream_object_data(self.sha)
+        return stream
 
     def stream_data(self, ostream):
         """
diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py
index 1248a3f4..5c50a512 100644
--- a/lib/git/odb/db.py
+++ b/lib/git/odb/db.py
@@ -1,17 +1,18 @@
 """Contains implementations of database retrieveing objects"""
-import os
+from git.utils import IndexFileSHA1Writer
 from git.errors import (
     InvalidDBRoot,
-    BadObject
+    BadObject,
+    BadObjectType
     )
-from git.utils import IndexFileSHA1Writer
 
 from utils import (
-    getsize,
+    DecompressMemMapReader,
+    FDCompressedSha1Writer,
+    ENOENT,
     to_hex_sha,
     exists,
     hex_to_bin,
-    FDCompressedSha1Writer,
     isdir,
     mkdir,
     rename,
@@ -19,8 +20,15 @@ from utils import (
     join
     )
 
+from fun import (
+    chunk_size,
+    loose_object_header_info,
+    write_object
+    )
+
 import tempfile
 import mmap
+import os
 
 
 class iObjectDBR(object):
@@ -36,7 +44,8 @@
     def has_object(self, sha):
         """
         :return: True if the object identified by the given 40 byte hexsha or 20 bytes
-            binary sha is contained in the database"""
+            binary sha is contained in the database
+        :raise BadObject:"""
         raise NotImplementedError("To be implemented in subclass")
 
     def object(self, sha):
         """
         :return: tuple(type_string, size_in_bytes, stream) a tuple with object
             information including its type, its size as well as a stream from which its
             contents can be read
-        :param sha: 40 bytes hexsha or 20 bytes binary sha """
+        :param sha: 40 bytes hexsha or 20 bytes binary sha
+        :raise BadObject:"""
         raise NotImplementedError("To be implemented in subclass")
 
     def object_info(self, sha):
         """
         :return: tuple(type_string, size_in_bytes) tuple with the object's type string
             as well as its size in bytes
-        :param sha: 40 bytes hexsha or 20 bytes binary sha"""
+        :param sha: 40 bytes hexsha or 20 bytes binary sha
+        :raise BadObject:"""
         raise NotImplementedError("To be implemented in subclass")
 
     #} END query interface
@@ -70,7 +81,8 @@ class iObjectDBW(object):
         :param stream: stream providing the data
         :param dry_run: if True, the object database will not actually be changed
         :param sha_as_hex: if True, the returned sha identifying the object will be
-            hex encoded, not binary"""
+            hex encoded, not binary
+        :raise IOError: if data could not be written"""
         raise NotImplementedError("To be implemented in subclass")
 
     def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0):
@@ -82,7 +94,8 @@
         :param dry_run: see ``to_obj``
         :param sha_as_hex: see ``to_obj``
         :param max_threads: if < 1, any number of threads may be started while processing
-            the request, otherwise the given number of threads will be started."""
+            the request, otherwise the given number of threads will be started.
+        :raise IOError: if data could not be written"""
         # a trivial implementation, ignoring the threads for now
         # TODO: add configuration to the class to determine whether we may
         # actually use multiple threads, default False of course. If the add
@@ -130,15 +143,19 @@ class FileDBBase(object):
 
 
 class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW):
     """A database which operates on loose object files"""
-    __slots__ = ('_hexsha_to_file', )
-
+    __slots__ = ('_hexsha_to_file', '_fd_open_flags')
     # CONFIGURATION
     # chunks in which data will be copied between streams
-    stream_chunk_size = 1000*1000
+    stream_chunk_size = chunk_size
+
     def __init__(self, root_path):
         super(LooseObjectDB, self).__init__(root_path)
         self._hexsha_to_file = dict()
+        # Additional Flags - might be set to 0 after the first failure
+        # Depending on the root, this might work for some mounts, for others not, which
+        # is why it is per instance
+        self._fd_open_flags = os.O_NOATIME
 
     #{ Interface
     def object_path(self, hexsha):
@@ -167,36 +184,46 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW):
 
     #} END interface
 
-    def _object_header_info(self, mmap):
-        """:return: tuple(type_string, uncompressed_size_in_bytes
-        :param mmap: newly mapped memory map at position 0. It will be
-            seeked to the actual start of the object contents, which can be used
-            to initialize a zlib decompress object."""
-        raise NotImplementedError("todo")
-
-    def _map_object(self, sha):
+    def _map_loose_object(self, sha):
         """
-        :return: tuple(file, mmap) tuple with an opened file for reading, and
-            a memory map of that file"""
-        db_path = self.readable_db_object_path(to_hex_sha(sha))
-        f = open(db_path, 'rb')
-        m = mmap.mmap(f.fileno(), getsize(db_path), access=mmap.ACCESS_READ)
-        return (f, m)
+        :return: memory map of that file to allow random read access
+        :raise BadObject: if object could not be located"""
+        db_path = self.db_path(self.object_path(to_hex_sha(sha)))
+        try:
+            fd = os.open(db_path, os.O_RDONLY|self._fd_open_flags)
+        except OSError,e:
+            if e.errno != ENOENT:
+                # try again without noatime
+                try:
+                    fd = os.open(db_path, os.O_RDONLY)
+                except OSError:
+                    raise BadObject(to_hex_sha(sha))
+                # didn't work because of our flag, don't try it again
+                self._fd_open_flags = 0
+            else:
+                raise BadObject(to_hex_sha(sha))
+            # END handle error
+        # END exception handling
+        try:
+            return mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
+        finally:
+            os.close(fd)
+        # END assure file is closed
 
     def object_info(self, sha):
-        f, m = self._map_object(sha)
+        m = self._map_loose_object(sha)
         try:
-            type, size = self._object_header_info(m)
+            return loose_object_header_info(m)
         finally:
-            f.close()
             m.close()
         # END assure release of system resources
 
     def object(self, sha):
-        f, m = self._map_object(sha)
-        type, size = self._object_header_info(m)
-        # TODO: init a dynamic decompress stream from our memory map
+        m = self._map_loose_object(sha)
+        reader = DecompressMemMapReader(m, close_on_deletion = True)
+        type, size = reader.initialize()
+        return type, size, reader
 
     def has_object(self, sha):
         try:
@@ -210,25 +237,10 @@
         # open a tmp file to write the data to
         fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path)
         writer = FDCompressedSha1Writer(fd)
-
-        # WRITE HEADER: type SP size NULL
-        writer.write("%s %i%s" % (type, size, chr(0)))
-
-        # WRITE ALL DATA
-        chunksize = self.stream_chunk_size
+
         try:
-            try:
-                while True:
-                    data_len = writer.write(stream.read(chunksize))
-                    if data_len < chunksize:
-                        # WRITE FOOTER
-                        writer.write('\n')
-                        break
-                    # END check for stream end
-                # END duplicate data
-            finally:
-                writer.close()
-            # END assure file was closed
+            write_object(type, size, stream, writer,
+                         close_target_stream=True, chunk_size=self.stream_chunk_size)
         except:
             os.remove(tmp_path)
             raise
diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py
new file mode 100644
index 00000000..ee7144dd
--- /dev/null
+++ b/lib/git/odb/fun.py
@@ -0,0 +1,114 @@
+"""Contains basic c-functions which usually contain performance critical code
+Keeping this code separate from the beginning makes it easier to out-source
+it into c later, if required"""
+
+from git.errors import (
+    BadObjectType
+    )
+
+import zlib
+decompressobj = zlib.decompressobj
+
+
+# INVARIANTS
+type_id_to_type_map = {
+    1 : "commit",
+    2 : "tree",
+    3 : "blob",
+    4 : "tag"
+    }
+
+# used when dealing with larger streams
+chunk_size = 1000*1000
+
+
+#{ Routines
+
+def is_loose_object(m):
+    """:return: True the file contained in memory map m appears to be a loose object.
+    Only the first two bytes are needed"""
+    b0, b1 = map(ord, m[:2])
+    word = (b0 << 8) + b1
+    return b0 == 0x78 and (word % 31) == 0
+
+def loose_object_header_info(m):
+    """:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
+    object as well as its uncompressed size in bytes.
+    :param m: memory map from which to read the compressed object data"""
+    decompress_size = 8192      # is used in cgit as well
+    hdr = decompressobj().decompress(m, decompress_size)
+    type_name, size = hdr[:hdr.find("\0")].split(" ")
+    return type_name, int(size)
+
+def object_header_info(m):
+    """:return: tuple(type_string, uncompressed_size_in_bytes
+    :param mmap: mapped memory map. It will be
+        seeked to the actual start of the object contents, which can be used
+        to initialize a zlib decompress object.
+    :note: This routine can only handle new-style objects which are assumably contained
+        in packs
+    """
+    assert not is_loose_object(m), "Use loose_object_header_info instead"
+
+    c = b0                      # first byte
+    i = 1                       # next char to read
+    type_id = (c >> 4) & 7      # numeric type
+    size = c & 15               # starting size
+    s = 4                       # starting bit-shift size
+    while c & 0x80:
+        c = ord(m[i])
+        i += 1
+        size += (c & 0x7f) << s
+        s += 7
+    # END character loop
+
+    # finally seek the map to the start of the data stream
+    m.seek(i)
+    try:
+        return (type_id_to_type_map[type_id], size)
+    except KeyError:
+        # invalid object type - we could try to be smart now and decode part
+        # of the stream to get the info, problem is that we had trouble finding
+        # the exact start of the content stream
+        raise BadObjectType(type_id)
+    # END handle exceptions
+
+def write_object(type, size, source_stream, target_stream, close_target_stream=True,
+                 chunk_size=chunk_size):
+    """Write the object as identified by type, size and source_stream into the
+    target_stream
+
+    :param type: type string of the object
+    :param size: amount of bytes to write from source_stream
+    :param source_stream: stream as file-like object providing at least size bytes
+    :param target_stream: stream as file-like object to receive the data
+    :param close_target_stream: if True, the target stream will be closed when
+        the routine exits, even if an error is thrown
+    :param chunk_size: size of chunks to read from source. Larger values can be beneficial
+        for io performance, but cost more memory as well
+    :return: The actual amount of bytes written to stream, which includes the header and a trailing newline"""
+    tbw = 0                     # total num bytes written
+    dbw = 0                     # num data bytes written
+    try:
+        # WRITE HEADER: type SP size NULL
+        tbw += target_stream.write("%s %i\0" % (type, size))
+
+        # WRITE ALL DATA UP TO SIZE
+        while True:
+            cs = min(chunk_size, size-dbw)
+            data_len = target_stream.write(source_stream.read(cs))
+            dbw += data_len
+            if data_len < cs or dbw == size:
+                tbw += dbw
+                break
+            # END check for stream end
+        # END duplicate data
+        return tbw
+    finally:
+        if close_target_stream:
+            target_stream.close()
+        # END handle stream closing
+    # END assure file was closed
+
+
+#} END routines
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
index d88dca1a..8a054201 100644
--- a/lib/git/odb/utils.py
+++ b/lib/git/odb/utils.py
@@ -1,7 +1,10 @@
 import binascii
 import os
 import zlib
+from cStringIO import StringIO
 from git.utils import make_sha
+import errno
+from fun import chunk_size
 
 __all__ = ('FDSha1Writer', )
 
@@ -21,8 +24,10 @@ def to_bin_sha(sha):
         return sha
     return hex_to_bin(sha)
 
+# errors
+ENOENT = errno.ENOENT
+
 # os shortcuts
-getsize = os.path.getsize
 exists = os.path.exists
 mkdir = os.mkdir
 isdir = os.path.isdir
@@ -32,6 +37,11 @@ join = os.path.join
 read = os.read
 write = os.write
 close = os.close
+
+# ZLIB configuration
+# used when compressing objects
+Z_BEST_SPEED = 1
+
 
 #} END Routines
 
@@ -50,7 +60,7 @@ class FDCompressedSha1Writer(object):
     def __init__(self, fd):
         self.fd = fd
         self.sha1 = make_sha("")
-        self.zip = zlib.compressobj()
+        self.zip = zlib.compressobj(Z_BEST_SPEED)
 
     def write(self, data):
         """:raise IOError: If not all bytes could be written
@@ -76,4 +86,137 @@ class FDCompressedSha1Writer(object):
         return close(self.fd)
 
+
+class DecompressMemMapReader(object):
+    """Reads data in chunks from a memory map and decompresses it. The client sees
+    only the uncompressed data, respective file-like read calls are handling on-demand
+    buffered decompression accordingly
+
+    A constraint on the total size of bytes is activated, simulating
+    a logical file within a possibly larger physical memory area
+
+    To read efficiently, you clearly don't want to read individual bytes, instead,
+    read a few kilobytes at least.
+
+    :note: The chunk-size should be carefully selected as it will involve quite a bit
+        of string copying due to the way the zlib is implemented. Its very wasteful,
+        hence we try to find a good tradeoff between allocation time and number of
+        times we actually allocate. An own zlib implementation would be good here
+        to better support streamed reading - it would only need to keep the mmap
+        and decompress it into chunks, thats all ... """
+    __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_cs', '_close')
+
+    def __init__(self, m, close_on_deletion, cs = 128*1024):
+        """Initialize with mmap and chunk_size for stream reading"""
+        self._m = m
+        self._zip = zlib.decompressobj()
+        self._buf = None                        # buffer of decompressed bytes
+        self._buflen = 0                        # length of bytes in buffer
+        self._s = 0                             # size of uncompressed data to read in total
+        self._br = 0                            # num uncompressed bytes read
+        self._cws = 0                           # start byte of compression window
+        self._cwe = 0                           # end byte of compression window
+        self._cs = cs                           # chunk size (when reading from zip)
+        self._close = close_on_deletion         # close the memmap on deletion ?
+
+    def __del__(self):
+        if self._close:
+            self._m.close()
+        # END handle resource freeing
+
+    def initialize(self, size=0):
+        """Initialize this instance for acting as a read-only stream for size bytes.
+        :param size: size in bytes to be decompresed before being depleted.
+            If 0, default object header information is parsed from the data,
+            returning a tuple of (type_string, uncompressed_size)
+            If not 0, the size will be used, and None is returned.
+        :note: must only be called exactly once"""
+        if size:
+            self._s = size
+            return
+        # END handle size
+
+        # read header
+        maxb = 8192
+        self._s = maxb
+        hdr = self.read(maxb)
+        hdrend = hdr.find("\0")
+        type, size = hdr[:hdrend].split(" ")
+        self._s = int(size)
+
+        # adjust internal state to match actual header length that we ignore
+        # The buffer will be depleted first on future reads
+        self._br = 0
+        hdrend += 1                             # count terminating \0
+        self._buf = StringIO(hdr[hdrend:])
+        self._buflen = len(hdr) - hdrend
+
+        return type, size
+
+    def read(self, size=-1):
+        if size < 1:
+            size = self._s - self._br
+        else:
+            size = min(size, self._s - self._br)
+        # END clamp size
+
+        if size == 0:
+            return str()
+        # END handle depletion
+
+        # deplete the buffer, then just continue using the decompress object
+        # which has an own buffer. We just need this to transparently parse the
+        # header from the zlib stream
+        dat = str()
+        if self._buf:
+            if self._buflen >= size:
+                # have enough data
+                dat = self._buf.read(size)
+                self._buflen -= size
+                self._br += size
+                return dat
+            else:
+                dat = self._buf.getvalue()      # ouch, duplicates data
+                size -= self._buflen
+                self._br += self._buflen
+
+                self._buflen = 0
+                self._buf = None
+            # END handle buffer len
+        # END handle buffer
+
+        # decompress some data
+        # Abstract: zlib needs to operate on chunks of our memory map ( which may
+        # be large ), as it will otherwise and always fill in the 'unconsumed_tail'
+        # attribute which possible reads our whole map to the end, forcing
+        # everything to be read from disk even though just a portion was requested.
+        # As this would be a nogo, we workaround it by passing only chunks of data,
+        # moving the window into the memory map along as we decompress, which keeps
+        # the tail smaller than our chunk-size. This causes 'only' the chunk to be
+        # copied once, and another copy of a part of it when it creates the unconsumed
+        # tail. We have to use it to hand in the appropriate amount of bytes during
+        # the next read.
+        if self._zip.unconsumed_tail:
+            # move the window, make it as large as size demands. For code-clarity,
+            # we just take the chunk from our map again instead of reusing the unconsumed
+            # tail. The latter one would safe some memory copying, but we could end up
+            # with not getting enough data uncompressed, so we had to sort that out as well.
+            # Now we just assume the worst case, hence the data is uncompressed and the window
+            # needs to be as large as the uncompressed bytes we want to read.
+            self._cws = self._cwe - len(self._zip.unconsumed_tail)
+            self._cwe = self._cws + size
+            indata = self._m[self._cws:self._cwe]       # another copy ... :(
+        else:
+            cws = self._cws
+            self._cws = self._cwe
+            self._cwe = cws + size
+            indata = self._m[self._cws:self._cwe]       # ... copy it again :(
+        # END handle tail
+
+        dcompdat = self._zip.decompress(indata, size)
+        self._br += len(dcompdat)
+
+        if dat:
+            return dat + dcompdat
+        return dcompdat
+
 
 #} END classes
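
For orientation, the loose-object layout that write_object and DecompressMemMapReader deal with in this patch is git's standard one: a "type SP size NUL" header followed by the payload, with the SHA-1 computed over the uncompressed header plus data and the result stored zlib-deflated (Z_BEST_SPEED here). The following standalone Python sketch is not part of the patch; the function names and blob content are illustrative only. It round-trips a small blob through that format in memory:

import hashlib
import zlib

def write_loose(data, type_name="blob", level=1):
    # level 1 corresponds to Z_BEST_SPEED used by the patch
    # header "type SP size NUL" followed by the raw payload
    body = ("%s %d\0" % (type_name, len(data))).encode() + data
    sha = hashlib.sha1(body).hexdigest()    # object id: hashed before compression
    return sha, zlib.compress(body, level)

def read_loose(compressed):
    # inverse direction: inflate, split off the header, verify the size field
    body = zlib.decompress(compressed)
    header, _, payload = body.partition(b"\0")
    type_name, size = header.decode().split(" ")
    assert int(size) == len(payload)
    return type_name, int(size), payload

sha, stored = write_loose(b"hello world\n")
print(sha)                   # prints the object id git would compute for this blob
print(read_loose(stored))    # ('blob', 12, b'hello world\n')

Unlike this sketch, the patch streams the data in chunk_size pieces through FDCompressedSha1Writer and DecompressMemMapReader, so neither the uncompressed body nor the compressed result ever has to be held in memory as a whole.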