15 files changed, 230 insertions, 82 deletions
diff --git a/CHANGES b/CHANGES
index e24e723d..e9e1257e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -5,6 +5,7 @@ CHANGES
 0.2 Beta 2
 ===========
  * Commit objects now carry the 'encoding' information of their message. It wasn't parsed previously, and defaults to UTF-8
+ * Commit.create_from_tree now uses a pure-python implementation, mimicking git-commit-tree
 
 0.2
 =====
diff --git a/lib/git/cmd.py b/lib/git/cmd.py
index aaa27adc..18d1c505 100644
--- a/lib/git/cmd.py
+++ b/lib/git/cmd.py
@@ -323,12 +323,7 @@ class Git(object):
 				stdout_value = proc.stdout.read().rstrip()		# strip trailing "\n"
 			else:
 				max_chunk_size = 1024*64
-				while True:
-					chunk = proc.stdout.read(max_chunk_size)
-					output_stream.write(chunk)
-					if len(chunk) < max_chunk_size:
-						break
-				# END reading output stream
+				stream_copy(proc.stdout, output_stream, max_chunk_size)
 				stdout_value = output_stream
 			# END stdout handling
 			stderr_value = proc.stderr.read().rstrip()			# strip trailing "\n"
diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py
index 64a5678e..f7043199 100644
--- a/lib/git/objects/base.py
+++ b/lib/git/objects/base.py
@@ -125,8 +125,8 @@ class Object(LazyMixin):
 		Returns 
 			File Object compatible stream to the uncompressed raw data of the object
 		"""
-		sha, type, size, stream = self.repo.git.stream_object_data(self.sha)
-		return stream 
+		proc = self.repo.git.cat_file(self.type, self.sha, as_process=True)
+		return utils.ProcessStreamAdapter(proc, "stdout") 
 
 	def stream_data(self, ostream):
 		"""
diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py
index 98aca360..d56ce306 100644
--- a/lib/git/objects/commit.py
+++ b/lib/git/objects/commit.py
@@ -91,15 +91,6 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
 		"""
 		super(Commit,self).__init__(repo, sha)
 		self._set_self_from_args_(locals())
-
-		if parents is not None:
-			cls = type(self)
-			self.parents = tuple(cls(repo, p) for p in parents if not isinstance(p, cls))
-		# END for each parent to convert
-			
-		if self.sha and tree is not None:
-			self.tree = Tree(repo, tree, path='')
-		# END id to tree conversion
 		
 	@classmethod
 	def _get_intermediate_items(cls, commit):
@@ -350,7 +341,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
 						committer, committer_time, committer_offset,
 						message, parent_commits, conf_encoding)
 		
-		# serialize !
+		stream = StringIO()
+		new_commit._serialize(stream)
+		streamlen = stream.tell()
+		stream.seek(0)
+		
+		new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True)
 		
 		if head:
 			try:
@@ -377,8 +373,28 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
 	#{ Serializable Implementation
 	
 	def _serialize(self, stream):
-		# for now, this is very inefficient and in fact shouldn't be used like this
-		return super(Commit, self)._serialize(stream)
+		write = stream.write
+		write("tree %s\n" % self.tree)
+		for p in self.parents:
+			write("parent %s\n" % p)
+			
+		a = self.author
+		c = self.committer
+		fmt = "%s %s <%s> %s %s\n"
+		write(fmt % ("author", a.name, a.email, 
+						self.authored_date, 
+						utils.altz_to_utctz_str(self.author_tz_offset)))
+			
+		write(fmt % ("committer", c.name, c.email, 
+						self.committed_date,
+						utils.altz_to_utctz_str(self.committer_tz_offset)))
+		
+		if self.encoding != self.default_encoding:
+			write("encoding %s\n" % self.encoding)
+		
+		write("\n")
+		write(self.message)
+		return self
 	
 	def _deserialize(self, stream):
 		""":param from_rev_list: if true, the stream format is coming from the rev-list command
@@ -416,7 +432,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
 		
 		# a stream from our data simply gives us the plain message
 		# The end of our message stream is marked with a newline that we strip
-		self.message = stream.read()[:-1]
+		self.message = stream.read()
 		return self
 		
 	#} END serializable implementation
diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py
index 6d378a72..c93f2091 100644
--- a/lib/git/objects/utils.py
+++ b/lib/git/objects/utils.py
@@ -16,7 +16,8 @@ import time
 import os
 
 __all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', 
-			'ProcessStreamAdapter', 'Traversable')
+			'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz', 
+			'verify_utctz')
 
 def get_object_type_by_name(object_type_name):
 	"""
@@ -57,14 +58,24 @@ def get_user_id():
 	return "%s@%s" % (username, platform.node())
 		
 
-def _utc_tz_to_altz(utctz):
+def utctz_to_altz(utctz):
 	"""we convert utctz to the timezone in seconds, it is the format time.altzone
 	returns. Git stores it as UTC timezon which has the opposite sign as well, 
 	which explains the -1 * ( that was made explicit here )
 	:param utctz: git utc timezone string, i.e. +0200"""
 	return -1 * int(float(utctz)/100*3600)
+	
+def altz_to_utctz_str(altz):
+	"""Inverse of utctz_to_altz: converts an offset in seconds, as time.altzone
+	returns it, into a git utc timezone string usable in commit objects, i.e. +0200"""
+	# float() keeps fractional hours: python 2 int division would floor them,
+	# so an altz parsed from +0530 would not convert back to +0530
+	utci = -1 * int(round(float(altz) / 3600 * 100))
+	prefix = (utci < 0 and '-') or '+'
+	return prefix + "%04d" % abs(utci)
+	
 
-def _verify_utctz(offset):
+def verify_utctz(offset):
 	""":raise ValueError: if offset is incorrect
 	:return: offset"""
 	fmt_exc = ValueError("Invalid timezone offset format: %s" % offset)
@@ -97,11 +108,11 @@ def parse_date(string_date):
 		if string_date.count(' ') == 1 and string_date.rfind(':') == -1:
 			timestamp, offset = string_date.split()
 			timestamp = int(timestamp)
-			return timestamp, _utc_tz_to_altz(_verify_utctz(offset))
+			return timestamp, utctz_to_altz(verify_utctz(offset))
 		else:
 			offset = "+0000"					# local time by default
 			if string_date[-5] in '-+':
-				offset = _verify_utctz(string_date[-5:])
+				offset = verify_utctz(string_date[-5:])
 				string_date = string_date[:-6]	# skip space as well
 			# END split timezone info
 			
@@ -139,7 +150,7 @@ def parse_date(string_date):
 					fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, 
 												tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec,
 												dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst))
-					return int(time.mktime(fstruct)), _utc_tz_to_altz(offset)
+					return int(time.mktime(fstruct)), utctz_to_altz(offset)
 				except ValueError:
 					continue
 				# END exception handling
@@ -167,7 +178,7 @@ def parse_actor_and_date(line):
 	"""
 	m = _re_actor_epoch.search(line)
 	actor, epoch, offset = m.groups()
-	return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset))
+	return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset))
 	
 	
 	
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
index 94d1cea8..fd340962 100644
--- a/lib/git/odb/utils.py
+++ b/lib/git/odb/utils.py
@@ -137,7 +137,7 @@ class DecompressMemMapReader(object):
 		# END handle size
 		
 		# read header
-		maxb = 8192
+		maxb = 512				# should really be enough, cgit uses 8192 I believe
 		self._s = maxb
 		hdr = self.read(maxb)
 		hdrend = hdr.find("\0")
@@ -172,20 +172,24 @@ class DecompressMemMapReader(object):
 		# Our performance now depends on StringIO. This way we don't need two large
 		# buffers in peak times, but only one large one in the end which is 
 		# the return buffer
-		if size > self.max_read_size:
-			sio = StringIO()
-			while size:
-				read_size = min(self.max_read_size, size)
-				data = self.read(read_size)
-				sio.write(data)
-				size -= len(data)
-				if len(data) < read_size:
-					break
-			# END data loop
-			sio.seek(0)
-			return sio.getvalue()
-		# END handle maxread
+		# NO: We don't do it - if the user thinks it's best, he is right. If he 
+		# has trouble, he will start reading in chunks. According to our tests
+		# it's still faster if we read 10 Mb at once instead of chunking it.
 		
+		# if size > self.max_read_size:
+			# sio = StringIO()
+			# while size:
+				# read_size = min(self.max_read_size, size)
+				# data = self.read(read_size)
+				# sio.write(data)
+				# size -= len(data)
+				# if len(data) < read_size:
+					# break
+			# # END data loop
+			# sio.seek(0)
+			# return sio.getvalue()
+		# # END handle maxread
+		# 
 		# deplete the buffer, then just continue using the decompress object 
 		# which has an own buffer. We just need this to transparently parse the 
 		# header from the zlib stream
diff --git a/lib/git/repo.py b/lib/git/repo.py
index f4caa3fb..0bd2249c 100644
--- a/lib/git/repo.py
+++ b/lib/git/repo.py
@@ -4,12 +4,6 @@
 # This module is part of GitPython and is released under
 # the BSD License: http://www.opensource.org/licenses/bsd-license.php
 
-import os
-import sys
-import re
-import gzip
-import StringIO
-
 from errors import InvalidGitRepositoryError, NoSuchPathError
 from cmd import Git
 from actor import Actor
@@ -19,6 +13,15 @@ from objects import *
 from config import GitConfigParser
 from remote import Remote
 
+from odb.db import LooseObjectDB
+
+import os
+import sys
+import re
+import gzip
+import StringIO
+
+
 def touch(filename):
     fp = open(filename, "a")
     fp.close()
@@ -53,7 +56,7 @@ class Repo(object):
     'git_dir' is the .git repository directoy, which is always set.
     """
     DAEMON_EXPORT_FILE = 'git-daemon-export-ok'
-    __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git" )
+    __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git", "odb" )
     
     # precompiled regex
     re_whitespace = re.compile(r'\s+')
@@ -65,27 +68,22 @@ class Repo(object):
     # represents the configuration level of a configuration file
     config_level = ("system", "global", "repository")
 
-    def __init__(self, path=None):
-        """
-        Create a new Repo instance
-
-        ``path``
-            is the path to either the root git directory or the bare git repo
+    def __init__(self, path=None, odbt = LooseObjectDB):
+        """ Create a new Repo instance
 
-        Examples::
+		:param path: is the path to either the root git directory or the bare git repo::
 
             repo = Repo("/Users/mtrier/Development/git-python")
             repo = Repo("/Users/mtrier/Development/git-python.git")
             repo = Repo("~/Development/git-python.git")
             repo = Repo("$REPOSITORIES/Development/git-python.git")
-            
-        Raises
-            InvalidGitRepositoryError or NoSuchPathError
-
-        Returns
-            ``git.Repo``
-        """
-
+        
+        :param odbt: Object DataBase type - a type which is constructed by providing 
+        	the directory containing the database objects, i.e. .git/objects. It will
+        	be used to access all object data
+        :raise InvalidGitRepositoryError:
+        :raise NoSuchPathError:
+		:return: git.Repo """
         epath = os.path.abspath(os.path.expandvars(os.path.expanduser(path or os.getcwd())))
 
         if not os.path.exists(epath):
@@ -130,6 +128,7 @@ class Repo(object):
         
         self.working_dir = self._working_tree_dir or self.git_dir
         self.git = Git(self.working_dir)
+        self.odb = odbt(os.path.join(self.git_dir, 'objects'))
 
     def __eq__(self, rhs):
     	if isinstance(rhs, Repo):
diff --git a/lib/git/utils.py b/lib/git/utils.py
index 360c77c9..60a7de48 100644
--- a/lib/git/utils.py
+++ b/lib/git/utils.py
@@ -27,6 +27,21 @@ def make_sha(source=''):
         sha1 = sha.sha(source)
         return sha1
 
+def stream_copy(source, destination, chunk_size=512*1024):
+	"""Copy all data from the source stream into the destination stream in chunks
+	of at most chunk_size bytes, stopping once source.read returns no more data
+	:return: amount of bytes written"""
+	br = 0
+	while True:
+		chunk = source.read(chunk_size)
+		if not chunk:		# only an empty read marks the end, short reads do not
+			break
+		destination.write(chunk)
+		br += len(chunk)
+	# END reading output stream
+	return br
+	
+
 def join_path(a, *p):
     """Join path tokens together similar to os.path.join, but always use 
     '/' instead of possibly '\' on windows."""
diff --git a/test/git/performance/lib.py b/test/git/performance/lib.py
index 4b552b20..650bea82 100644
--- a/test/git/performance/lib.py
+++ b/test/git/performance/lib.py
@@ -1,6 +1,8 @@
 """Contains library functions"""
 import os
 from test.testlib import *
+import shutil
+import tempfile
 
 from git import (
 	Repo
@@ -25,7 +27,7 @@ def resolve_or_fail(env_var):
 
 #{ Base Classes 
 
-class TestBigRepoReadOnly(TestBase):
+class TestBigRepoR(TestBase):
 	"""TestCase providing access to readonly 'big' repositories using the following 
 	member variables:
 	
@@ -40,7 +42,24 @@ class TestBigRepoReadOnly(TestBase):
 	
 	@classmethod
 	def setUpAll(cls):
-		super(TestBigRepoReadOnly, cls).setUpAll()
-		cls.gitrepo = Repo(resolve_or_fail(k_env_git_repo))
+		super(TestBigRepoR, cls).setUpAll()
+		cls.gitrorepo = Repo(resolve_or_fail(k_env_git_repo))
 
+
+class TestBigRepoRW(TestBigRepoR):
+	"""As above, but additionally provides a bare, shared clone of the big 
+	repository in a temporary directory that tests may write to.
+	Provides ``self.gitrwrepo``"""
+	
+	@classmethod
+	def setUpAll(cls):
+		super(TestBigRepoRW, cls).setUpAll()
+		# mkdtemp creates the directory itself, mktemp + mkdir is open to races
+		dirname = tempfile.mkdtemp()
+		cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True)
+	
+	@classmethod
+	def tearDownAll(cls):
+		shutil.rmtree(cls.gitrwrepo.working_dir)	# falls back to git_dir for the bare clone
+		
 #} END base classes
diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py
index b4a9d868..2398c93d 100644
--- a/test/git/performance/test_commit.py
+++ b/test/git/performance/test_commit.py
@@ -6,10 +6,12 @@
 
 from lib import *
 from git import *
+from test.git.test_commit import assert_commit_serialization
+from cStringIO import StringIO
 from time import time
 import sys
 
-class TestPerformance(TestBigRepoReadOnly):
+class TestPerformance(TestBigRepoRW):
 
 	# ref with about 100 commits in its history
 	ref_100 = '0.1.6'
@@ -48,7 +50,7 @@ class TestPerformance(TestBigRepoReadOnly):
 		# bound to cat-file parsing performance
 		nc = 0
 		st = time()
-		for c in self.gitrepo.commit(self.head_sha_2k).traverse(branch_first=False):
+		for c in self.gitrorepo.commit(self.head_sha_2k).traverse(branch_first=False):
 			nc += 1
 			self._query_commit_info(c)
 		# END for each traversed commit
@@ -59,10 +61,38 @@ class TestPerformance(TestBigRepoReadOnly):
 		# bound to stream parsing performance
 		nc = 0
 		st = time()
-		for c in Commit.iter_items(self.gitrepo, self.head_sha_2k):
+		for c in Commit.iter_items(self.gitrorepo, self.head_sha_2k):
 			nc += 1
 			self._query_commit_info(c)
 		# END for each traversed commit
 		elapsed_time = time() - st
 		print >> sys.stderr, "Iterated %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time)
 		
+	def test_commit_serialization(self):
+		assert_commit_serialization(self.gitrwrepo, self.head_sha_2k, True)
+		
+		rwrepo = self.gitrwrepo
+		make_object = rwrepo.odb.to_object
+		# time plain serialization: build nc commits modelled after the head commit
+		# and write each through the odb - this is probably limited by IO
+		hc = rwrepo.commit(self.head_sha_2k)
+		
+		commits = list()
+		nc = 5000
+		st = time()
+		for i in xrange(nc):
+			cm = Commit(	rwrepo, Commit.NULL_HEX_SHA, hc.tree, 
+							hc.author, hc.authored_date, hc.author_tz_offset, 
+							hc.committer, hc.committed_date, hc.committer_tz_offset, 
+							str(i), parents=hc.parents, encoding=hc.encoding)
+			
+			stream = StringIO()
+			cm._serialize(stream)
+			slen = stream.tell()
+			stream.seek(0)
+			
+			cm.sha = make_object(Commit.type, slen, stream)
+		# END commit creation
+		elapsed = time() - st
+		
+		print >> sys.stderr, "Serialized %i commits to loose objects in %f s ( %f commits / s )" % (nc, elapsed, nc / elapsed)
diff --git a/test/git/performance/test_odb.py b/test/git/performance/test_odb.py
index 0ad2ce33..7b1ee838 100644
--- a/test/git/performance/test_odb.py
+++ b/test/git/performance/test_odb.py
@@ -5,18 +5,18 @@ import sys
 import stat
 
 from lib import (
-	TestBigRepoReadOnly
+	TestBigRepoR
 	)
 
 
-class TestObjDBPerformance(TestBigRepoReadOnly):
+class TestObjDBPerformance(TestBigRepoR):
 	
 	def test_random_access(self):
 		
 		# GET COMMITS
 		# TODO: use the actual db for this
 		st = time()
-		root_commit = self.gitrepo.commit(self.head_sha_2k)
+		root_commit = self.gitrorepo.commit(self.head_sha_2k)
 		commits = list(root_commit.traverse())
 		nc = len(commits)
 		elapsed = time() - st
diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py
index 6c2834b3..d31bee14 100644
--- a/test/git/performance/test_streams.py
+++ b/test/git/performance/test_streams.py
@@ -14,7 +14,7 @@ import subprocess
 
 
 from lib import (
-	TestBigRepoReadOnly
+	TestBigRepoR
 	)
 
 
@@ -32,7 +32,7 @@ def make_memory_file(size_in_bytes, randomize=False):
 	return actual_size*4, StringIO(a.tostring())
 
 
-class TestObjDBPerformance(TestBigRepoReadOnly):
+class TestObjDBPerformance(TestBigRepoR):
 	
 	large_data_size_bytes = 1000*1000*10		# some MiB should do it
 	moderate_data_size_bytes = 1000*1000*1		# just 1 MiB
diff --git a/test/git/performance/test_utils.py b/test/git/performance/test_utils.py
index 381d7c8b..47366d34 100644
--- a/test/git/performance/test_utils.py
+++ b/test/git/performance/test_utils.py
@@ -4,11 +4,11 @@ import sys
 import stat
 
 from lib import (
-	TestBigRepoReadOnly
+	TestBigRepoR
 	)
 
 
-class TestUtilPerformance(TestBigRepoReadOnly):
+class TestUtilPerformance(TestBigRepoR):
 	
 	def test_access(self):
 		# compare dict vs. slot access
diff --git a/test/git/test_commit.py b/test/git/test_commit.py
index ad7a0082..a5f184e6 100644
--- a/test/git/test_commit.py
+++ b/test/git/test_commit.py
@@ -7,6 +7,56 @@
 from test.testlib import *
 from git import *
 
+from cStringIO import StringIO
+import time
+import sys
+
+
+def assert_commit_serialization(rwrepo, commit_id, print_performance_info=False):
+	"""Traverse all commits in the history of the commit identified by commit_id and 
+	assert that each one serializes to an object with the very same sha.
+	:param print_performance_info: if True, timing statistics are printed to stderr"""
+	ns = 0		# num serializations
+	nds = 0		# num deserializations
+	
+	st = time.time()
+	for cm in rwrepo.commit(commit_id).traverse():
+		nds += 1
+		
+		# serialize the commit as it was read from the repository - if it was 
+		# deserialized correctly, the resulting object has the very same sha
+		stream = StringIO()
+		cm._serialize(stream)
+		ns += 1
+		streamlen = stream.tell()
+		stream.seek(0)
+		
+		csha = rwrepo.odb.to_object(Commit.type, streamlen, stream)
+		assert csha == cm.sha
+		
+		nc = Commit(rwrepo, Commit.NULL_HEX_SHA, cm.tree.sha,
+						cm.author, cm.authored_date, cm.author_tz_offset, 
+						cm.committer, cm.committed_date, cm.committer_tz_offset, 
+						cm.message, cm.parents, cm.encoding)
+		
+		assert nc.parents == cm.parents
+		stream = StringIO()
+		nc._serialize(stream)
+		ns += 1
+		streamlen = stream.tell()
+		stream.seek(0)
+		nc.sha = rwrepo.odb.to_object(Commit.type, streamlen, stream)
+		
+		# a commit rebuilt from its individual fields must yield the same contents !
+		assert nc.sha == cm.sha
+	# END check commits
+	elapsed = time.time() - st
+	
+	if print_performance_info:
+		print >> sys.stderr, "Serialized %i and deserialized %i commits in %f s ( (%f, %f) commits / s" % (ns, nds, elapsed, ns/elapsed, nds/elapsed)
+	# END handle performance info
+	
+
 class TestCommit(TestBase):
 
 	def test_bake(self):
@@ -19,7 +69,7 @@ class TestCommit(TestBase):
 		assert commit.author == commit.committer
 		assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int)
 		assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int)
-		assert commit.message == "Added missing information to docstrings of commit and stats module"
+		assert commit.message == "Added missing information to docstrings of commit and stats module\n"
 
 
 	def test_stats(self):
@@ -49,7 +99,7 @@ class TestCommit(TestBase):
 		assert commit.committed_date == 1210193388
 		assert commit.author_tz_offset == 14400, commit.author_tz_offset
 		assert commit.committer_tz_offset == 14400, commit.committer_tz_offset
-		assert commit.message == "initial project"
+		assert commit.message == "initial project\n"
 		
 	def test_traversal(self):
 		start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff")
@@ -171,3 +221,8 @@ class TestCommit(TestBase):
 		name_rev = self.rorepo.head.commit.name_rev
 		assert isinstance(name_rev, basestring)
 		
+	@with_bare_rw_repo
+	def test_serialization(self, rwrepo):
+		# serialize every commit in the history of 0.1.6 and verify the shas match
+		assert_commit_serialization(rwrepo, '0.1.6')
+		
diff --git a/test/git/test_utils.py b/test/git/test_utils.py
index 2c3c392b..83ef7e4b 100644
--- a/test/git/test_utils.py
+++ b/test/git/test_utils.py
@@ -116,8 +116,6 @@ class TestUtils(TestCase):
 			os.remove(my_file)
 		# END final cleanup
 		
-	  
-		
 	def test_user_id(self):
 		assert '@' in get_user_id()
 		
@@ -127,7 +125,12 @@ class TestUtils(TestCase):
 			assert len(rval) == 2
 			assert isinstance(rval[0], int) and isinstance(rval[1], int)
 			assert rval[0] == veri_time
-			assert rval[1] == offset 
+			assert rval[1] == offset
+			
+			# now that we are here, test our conversion functions as well
+			utctz = altz_to_utctz_str(offset)
+			assert isinstance(utctz, basestring)
+			assert utctz_to_altz(verify_utctz(utctz)) == offset
 		# END assert rval utility
 		
 		rfc = ("Thu, 07 Apr 2005 22:13:11 +0000", 0)