From 38d59fc8ccccae8882fa48671377bf40a27915a7 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Thu, 3 Jun 2010 16:35:35 +0200
Subject: odb: implemented loose object streaming, which is impossible to do
 efficiently considering that it copies string buffers all the time
---
 lib/git/odb/fun.py | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 lib/git/odb/fun.py

diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py
new file mode 100644
index 00000000..ee7144dd
--- /dev/null
+++ b/lib/git/odb/fun.py
@@ -0,0 +1,114 @@
+"""Contains basic C-style functions which usually contain performance critical code.
+Keeping this code separate from the beginning makes it easier to port it to C
+later, if required"""
+
+from git.errors import (
+    BadObjectType
+    )
+
+import zlib
+decompressobj = zlib.decompressobj
+
+
+# INVARIANTS
+type_id_to_type_map = {
+    1 : "commit",
+    2 : "tree",
+    3 : "blob",
+    4 : "tag"
+    }
+
+# used when dealing with larger streams
+chunk_size = 1000*1000
+
+
+#{ Routines
+
+def is_loose_object(m):
+    """:return: True if the data contained in memory map m appears to be a loose object.
+    Only the first two bytes are needed"""
+    b0, b1 = map(ord, m[:2])
+    word = (b0 << 8) + b1
+    return b0 == 0x78 and (word % 31) == 0
+
+def loose_object_header_info(m):
+    """:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
+    object as well as its uncompressed size in bytes.
+    :param m: memory map from which to read the compressed object data"""
+    decompress_size = 8192      # is used in cgit as well
+    hdr = decompressobj().decompress(m, decompress_size)
+    type_name, size = hdr[:hdr.find("\0")].split(" ")
+    return type_name, int(size)
+
+def object_header_info(m):
+    """:return: tuple(type_string, uncompressed_size_in_bytes)
+    :param m: mapped memory map. It will be
+        seeked to the actual start of the object contents, which can be used
+        to initialize a zlib decompress object.
+    :note: This routine can only handle new-style objects which are presumably contained
+        in packs
+    """
+    assert not is_loose_object(m), "Use loose_object_header_info instead"
+
+    c = ord(m[0])               # first byte
+    i = 1                       # next char to read
+    type_id = (c >> 4) & 7      # numeric type
+    size = c & 15               # starting size
+    s = 4                       # starting bit-shift size
+    while c & 0x80:
+        c = ord(m[i])
+        i += 1
+        size += (c & 0x7f) << s
+        s += 7
+    # END character loop
+
+    # finally seek the map to the start of the data stream
+    m.seek(i)
+    try:
+        return (type_id_to_type_map[type_id], size)
+    except KeyError:
+        # invalid object type - we could try to be smart now and decode part
+        # of the stream to get the info, problem is that we had trouble finding
+        # the exact start of the content stream
+        raise BadObjectType(type_id)
+    # END handle exceptions
+
+def write_object(type, size, source_stream, target_stream, close_target_stream=True,
+                 chunk_size=chunk_size):
+    """Write the object as identified by type, size and source_stream into the
+    target_stream
+
+    :param type: type string of the object
+    :param size: amount of bytes to write from source_stream
+    :param source_stream: stream as file-like object providing at least size bytes
+    :param target_stream: stream as file-like object to receive the data
+    :param close_target_stream: if True, the target stream will be closed when
+        the routine exits, even if an error is thrown
+    :param chunk_size: size of chunks to read from source. Larger values can be beneficial
+        for io performance, but cost more memory as well
+    :return: The actual amount of bytes written to stream, which includes the header"""
+    tbw = 0                                         # total num bytes written
+    dbw = 0                                         # num data bytes written
+    try:
+        # WRITE HEADER: type SP size NULL
+        tbw += target_stream.write("%s %i\0" % (type, size))
+
+        # WRITE ALL DATA UP TO SIZE
+        while True:
+            cs = min(chunk_size, size-dbw)
+            data_len = target_stream.write(source_stream.read(cs))
+            dbw += data_len
+            if data_len < cs or dbw == size:
+                tbw += dbw
+                break
+            # END check for stream end
+        # END duplicate data
+        return tbw
+    finally:
+        if close_target_stream:
+            target_stream.close()
+        # END handle stream closing
+    # END assure file was closed
+
+
+#} END routines
-- 
cgit v1.2.1
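
The two loose-object helpers above operate on any buffer that supports slicing, since a loose object is simply "<type> <size>\0<data>" run through zlib. The following is a minimal sketch, not part of the patches, of how they could be exercised; it assumes the module is importable as git.odb.fun (Python 2, matching the codebase of the time) and uses a plain byte string in place of the memory map:

    import zlib
    from git.odb.fun import is_loose_object, loose_object_header_info

    # build a loose-object style buffer by hand: "<type> <size>\0<data>", zlib-compressed
    body = "hello world\n"
    buf = zlib.compress("blob %i\0%s" % (len(body), body))

    # is_loose_object only looks at the zlib header: first byte 0x78, and the
    # first two bytes read as a 16-bit value must be divisible by 31
    assert is_loose_object(buf)

    # decompress just enough of the stream to parse the "<type> <size>" header
    type_name, size = loose_object_header_info(buf)
    assert (type_name, size) == ("blob", len(body))
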
From e746f96bcc29238b79118123028ca170adc4ff0f Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Fri, 4 Jun 2010 17:22:08 +0200
Subject: Fixed implementation after design change to deal with it - all tests
 run, but next there will have to be more thorough testing
---
 lib/git/odb/fun.py | 46 ++++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py
index ee7144dd..870a6f02 100644
--- a/lib/git/odb/fun.py
+++ b/lib/git/odb/fun.py
@@ -21,6 +21,8 @@ type_id_to_type_map = {
 # used when dealing with larger streams
 chunk_size = 1000*1000
 
+__all__ = ('is_loose_object', 'loose_object_header_info', 'object_header_info',
+           'write_object' )
 
 #{ Routines
 
@@ -73,42 +75,34 @@
     raise BadObjectType(type_id)
     # END handle exceptions
 
-def write_object(type, size, source_stream, target_stream, close_target_stream=True,
-                 chunk_size=chunk_size):
+def write_object(type, size, read, write, chunk_size=chunk_size):
     """Write the object as identified by type, size and source_stream into the
     target_stream
 
     :param type: type string of the object
     :param size: amount of bytes to write from source_stream
-    :param source_stream: stream as file-like object providing at least size bytes
-    :param target_stream: stream as file-like object to receive the data
+    :param read: read method of a stream providing the content data
+    :param write: write method of the output stream
     :param close_target_stream: if True, the target stream will be closed when
         the routine exits, even if an error is thrown
-    :param chunk_size: size of chunks to read from source. Larger values can be beneficial
-        for io performance, but cost more memory as well
     :return: The actual amount of bytes written to stream, which includes the header"""
     tbw = 0                                         # total num bytes written
     dbw = 0                                         # num data bytes written
-    try:
-        # WRITE HEADER: type SP size NULL
-        tbw += target_stream.write("%s %i\0" % (type, size))
-
-        # WRITE ALL DATA UP TO SIZE
-        while True:
-            cs = min(chunk_size, size-dbw)
-            data_len = target_stream.write(source_stream.read(cs))
-            dbw += data_len
-            if data_len < cs or dbw == size:
-                tbw += dbw
-                break
-            # END check for stream end
-        # END duplicate data
-        return tbw
-    finally:
-        if close_target_stream:
-            target_stream.close()
-        # END handle stream closing
-    # END assure file was closed
+    # WRITE HEADER: type SP size NULL
+    tbw += write("%s %i\0" % (type, size))
+
+    # WRITE ALL DATA UP TO SIZE
+    while True:
+        cs = min(chunk_size, size-dbw)
+        data_len = write(read(cs))
+        dbw += data_len
+        if data_len < cs or dbw == size:
+            tbw += dbw
+            break
+        # END check for stream end
+    # END duplicate data
+    return tbw
+
 
 #} END routines
-- 
cgit v1.2.1
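
With the second commit, write_object no longer touches the streams themselves: it receives the read and write callables and leaves closing, and any error handling, to the caller. Since the implementation sums up the return values of write, the write callable must report the number of bytes it wrote; a plain file or StringIO write() returns None in Python 2, so the sketch below wraps one. The example is hypothetical and again assumes the git.odb.fun import path:

    from StringIO import StringIO
    from git.odb.fun import write_object

    class CountingWriter(object):
        """Sink whose write() reports the byte count, as write_object expects."""
        def __init__(self):
            self.buf = StringIO()
        def write(self, data):
            self.buf.write(data)
            return len(data)

    data = "x" * 5000
    source = StringIO(data)
    sink = CountingWriter()

    # writes "blob 5000\0" followed by the payload and returns the total byte count
    tbw = write_object("blob", len(data), source.read, sink.write)
    assert tbw == len("blob %i\0" % len(data)) + len(data)
    assert sink.buf.getvalue() == "blob %i\0%s" % (len(data), data)

As the close_target_stream parameter is gone from the signature, closing both streams once the call returns is now the caller's responsibility.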