diff options
-rw-r--r-- | CHANGES | 1 | ||||
-rw-r--r-- | lib/git/cmd.py | 7 | ||||
-rw-r--r-- | lib/git/objects/base.py | 4 | ||||
-rw-r--r-- | lib/git/objects/commit.py | 42 | ||||
-rw-r--r-- | lib/git/objects/utils.py | 25 | ||||
-rw-r--r-- | lib/git/odb/utils.py | 32 | ||||
-rw-r--r-- | lib/git/repo.py | 43 | ||||
-rw-r--r-- | lib/git/utils.py | 15 | ||||
-rw-r--r-- | test/git/performance/lib.py | 25 | ||||
-rw-r--r-- | test/git/performance/test_commit.py | 36 | ||||
-rw-r--r-- | test/git/performance/test_odb.py | 6 | ||||
-rw-r--r-- | test/git/performance/test_streams.py | 4 | ||||
-rw-r--r-- | test/git/performance/test_utils.py | 4 | ||||
-rw-r--r-- | test/git/test_commit.py | 59 | ||||
-rw-r--r-- | test/git/test_utils.py | 9 |
15 files changed, 230 insertions, 82 deletions
@@ -5,6 +5,7 @@ CHANGES 0.2 Beta 2 =========== * Commit objects now carry the 'encoding' information of their message. It wasn't parsed previously, and defaults to UTF-8 + * Commit.create_from_tree now uses a pure-python implementation, mimicing git-commit-tree 0.2 ===== diff --git a/lib/git/cmd.py b/lib/git/cmd.py index aaa27adc..18d1c505 100644 --- a/lib/git/cmd.py +++ b/lib/git/cmd.py @@ -323,12 +323,7 @@ class Git(object): stdout_value = proc.stdout.read().rstrip() # strip trailing "\n" else: max_chunk_size = 1024*64 - while True: - chunk = proc.stdout.read(max_chunk_size) - output_stream.write(chunk) - if len(chunk) < max_chunk_size: - break - # END reading output stream + stream_copy(proc.stdout, output_stream, max_chunk_size) stdout_value = output_stream # END stdout handling stderr_value = proc.stderr.read().rstrip() # strip trailing "\n" diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 64a5678e..f7043199 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -125,8 +125,8 @@ class Object(LazyMixin): Returns File Object compatible stream to the uncompressed raw data of the object """ - sha, type, size, stream = self.repo.git.stream_object_data(self.sha) - return stream + proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) + return utils.ProcessStreamAdapter(proc, "stdout") def stream_data(self, ostream): """ diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 98aca360..d56ce306 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -91,15 +91,6 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri """ super(Commit,self).__init__(repo, sha) self._set_self_from_args_(locals()) - - if parents is not None: - cls = type(self) - self.parents = tuple(cls(repo, p) for p in parents if not isinstance(p, cls)) - # END for each parent to convert - - if self.sha and tree is not None: - self.tree = Tree(repo, tree, path='') - # END id to tree conversion @classmethod def _get_intermediate_items(cls, commit): @@ -350,7 +341,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri committer, committer_time, committer_offset, message, parent_commits, conf_encoding) - # serialize ! + stream = StringIO() + new_commit._serialize(stream) + streamlen = stream.tell() + stream.seek(0) + + new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True) if head: try: @@ -377,8 +373,28 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri #{ Serializable Implementation def _serialize(self, stream): - # for now, this is very inefficient and in fact shouldn't be used like this - return super(Commit, self)._serialize(stream) + write = stream.write + write("tree %s\n" % self.tree) + for p in self.parents: + write("parent %s\n" % p) + + a = self.author + c = self.committer + fmt = "%s %s <%s> %s %s\n" + write(fmt % ("author", a.name, a.email, + self.authored_date, + utils.altz_to_utctz_str(self.author_tz_offset))) + + write(fmt % ("committer", c.name, c.email, + self.committed_date, + utils.altz_to_utctz_str(self.committer_tz_offset))) + + if self.encoding != self.default_encoding: + write("encoding %s\n" % self.encoding) + + write("\n") + write(self.message) + return self def _deserialize(self, stream): """:param from_rev_list: if true, the stream format is coming from the rev-list command @@ -416,7 +432,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri # a stream from our data simply gives us the plain message # The end of our message stream is marked with a newline that we strip - self.message = stream.read()[:-1] + self.message = stream.read() return self #} END serializable implementation diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 6d378a72..c93f2091 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -16,7 +16,8 @@ import time import os __all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', - 'ProcessStreamAdapter', 'Traversable') + 'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz', + 'verify_utctz') def get_object_type_by_name(object_type_name): """ @@ -57,14 +58,24 @@ def get_user_id(): return "%s@%s" % (username, platform.node()) -def _utc_tz_to_altz(utctz): +def utctz_to_altz(utctz): """we convert utctz to the timezone in seconds, it is the format time.altzone returns. Git stores it as UTC timezon which has the opposite sign as well, which explains the -1 * ( that was made explicit here ) :param utctz: git utc timezone string, i.e. +0200""" return -1 * int(float(utctz)/100*3600) + +def altz_to_utctz_str(altz): + """As above, but inverses the operation, returning a string that can be used + in commit objects""" + utci = -1 * int((altz / 3600)*100) + utcs = str(abs(utci)) + utcs = "0"*(4-len(utcs)) + utcs + prefix = (utci < 0 and '-') or '+' + return prefix + utcs + -def _verify_utctz(offset): +def verify_utctz(offset): """:raise ValueError: if offset is incorrect :return: offset""" fmt_exc = ValueError("Invalid timezone offset format: %s" % offset) @@ -97,11 +108,11 @@ def parse_date(string_date): if string_date.count(' ') == 1 and string_date.rfind(':') == -1: timestamp, offset = string_date.split() timestamp = int(timestamp) - return timestamp, _utc_tz_to_altz(_verify_utctz(offset)) + return timestamp, utctz_to_altz(verify_utctz(offset)) else: offset = "+0000" # local time by default if string_date[-5] in '-+': - offset = _verify_utctz(string_date[-5:]) + offset = verify_utctz(string_date[-5:]) string_date = string_date[:-6] # skip space as well # END split timezone info @@ -139,7 +150,7 @@ def parse_date(string_date): fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec, dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst)) - return int(time.mktime(fstruct)), _utc_tz_to_altz(offset) + return int(time.mktime(fstruct)), utctz_to_altz(offset) except ValueError: continue # END exception handling @@ -167,7 +178,7 @@ def parse_actor_and_date(line): """ m = _re_actor_epoch.search(line) actor, epoch, offset = m.groups() - return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset)) + return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset)) diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 94d1cea8..fd340962 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -137,7 +137,7 @@ class DecompressMemMapReader(object): # END handle size # read header - maxb = 8192 + maxb = 512 # should really be enough, cgit uses 8192 I believe self._s = maxb hdr = self.read(maxb) hdrend = hdr.find("\0") @@ -172,20 +172,24 @@ class DecompressMemMapReader(object): # Our performance now depends on StringIO. This way we don't need two large # buffers in peak times, but only one large one in the end which is # the return buffer - if size > self.max_read_size: - sio = StringIO() - while size: - read_size = min(self.max_read_size, size) - data = self.read(read_size) - sio.write(data) - size -= len(data) - if len(data) < read_size: - break - # END data loop - sio.seek(0) - return sio.getvalue() - # END handle maxread + # NO: We don't do it - if the user thinks its best, he is right. If he + # has trouble, he will start reading in chunks. According to our tests + # its still faster if we read 10 Mb at once instead of chunking it. + # if size > self.max_read_size: + # sio = StringIO() + # while size: + # read_size = min(self.max_read_size, size) + # data = self.read(read_size) + # sio.write(data) + # size -= len(data) + # if len(data) < read_size: + # break + # # END data loop + # sio.seek(0) + # return sio.getvalue() + # # END handle maxread + # # deplete the buffer, then just continue using the decompress object # which has an own buffer. We just need this to transparently parse the # header from the zlib stream diff --git a/lib/git/repo.py b/lib/git/repo.py index f4caa3fb..0bd2249c 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -4,12 +4,6 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php -import os -import sys -import re -import gzip -import StringIO - from errors import InvalidGitRepositoryError, NoSuchPathError from cmd import Git from actor import Actor @@ -19,6 +13,15 @@ from objects import * from config import GitConfigParser from remote import Remote +from odb.db import LooseObjectDB + +import os +import sys +import re +import gzip +import StringIO + + def touch(filename): fp = open(filename, "a") fp.close() @@ -53,7 +56,7 @@ class Repo(object): 'git_dir' is the .git repository directoy, which is always set. """ DAEMON_EXPORT_FILE = 'git-daemon-export-ok' - __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git" ) + __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git", "odb" ) # precompiled regex re_whitespace = re.compile(r'\s+') @@ -65,27 +68,22 @@ class Repo(object): # represents the configuration level of a configuration file config_level = ("system", "global", "repository") - def __init__(self, path=None): - """ - Create a new Repo instance - - ``path`` - is the path to either the root git directory or the bare git repo + def __init__(self, path=None, odbt = LooseObjectDB): + """ Create a new Repo instance - Examples:: + :param path: is the path to either the root git directory or the bare git repo:: repo = Repo("/Users/mtrier/Development/git-python") repo = Repo("/Users/mtrier/Development/git-python.git") repo = Repo("~/Development/git-python.git") repo = Repo("$REPOSITORIES/Development/git-python.git") - - Raises - InvalidGitRepositoryError or NoSuchPathError - - Returns - ``git.Repo`` - """ - + + :param odbt: Object DataBase type - a type which is constructed by providing + the directory containing the database objects, i.e. .git/objects. It will + be used to access all object data + :raise InvalidGitRepositoryError: + :raise NoSuchPathError: + :return: git.Repo """ epath = os.path.abspath(os.path.expandvars(os.path.expanduser(path or os.getcwd()))) if not os.path.exists(epath): @@ -130,6 +128,7 @@ class Repo(object): self.working_dir = self._working_tree_dir or self.git_dir self.git = Git(self.working_dir) + self.odb = odbt(os.path.join(self.git_dir, 'objects')) def __eq__(self, rhs): if isinstance(rhs, Repo): diff --git a/lib/git/utils.py b/lib/git/utils.py index 360c77c9..60a7de48 100644 --- a/lib/git/utils.py +++ b/lib/git/utils.py @@ -27,6 +27,21 @@ def make_sha(source=''): sha1 = sha.sha(source) return sha1 +def stream_copy(source, destination, chunk_size=512*1024): + """Copy all data from the source stream into the destination stream in chunks + of size chunk_size + :return: amount of bytes written""" + br = 0 + while True: + chunk = source.read(chunk_size) + destination.write(chunk) + br += len(chunk) + if len(chunk) < chunk_size: + break + # END reading output stream + return br + + def join_path(a, *p): """Join path tokens together similar to os.path.join, but always use '/' instead of possibly '\' on windows.""" diff --git a/test/git/performance/lib.py b/test/git/performance/lib.py index 4b552b20..650bea82 100644 --- a/test/git/performance/lib.py +++ b/test/git/performance/lib.py @@ -1,6 +1,8 @@ """Contains library functions""" import os from test.testlib import * +import shutil +import tempfile from git import ( Repo @@ -25,7 +27,7 @@ def resolve_or_fail(env_var): #{ Base Classes -class TestBigRepoReadOnly(TestBase): +class TestBigRepoR(TestBase): """TestCase providing access to readonly 'big' repositories using the following member variables: @@ -40,7 +42,24 @@ class TestBigRepoReadOnly(TestBase): @classmethod def setUpAll(cls): - super(TestBigRepoReadOnly, cls).setUpAll() - cls.gitrepo = Repo(resolve_or_fail(k_env_git_repo)) + super(TestBigRepoR, cls).setUpAll() + cls.gitrorepo = Repo(resolve_or_fail(k_env_git_repo)) + +class TestBigRepoRW(TestBigRepoR): + """As above, but provides a big repository that we can write to. + + Provides ``self.gitrwrepo``""" + + @classmethod + def setUpAll(cls): + super(TestBigRepoRW, cls).setUpAll() + dirname = tempfile.mktemp() + os.mkdir(dirname) + cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True) + + @classmethod + def tearDownAll(cls): + shutil.rmtree(cls.gitrwrepo.working_tree_dir) + #} END base classes diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py index b4a9d868..2398c93d 100644 --- a/test/git/performance/test_commit.py +++ b/test/git/performance/test_commit.py @@ -6,10 +6,12 @@ from lib import * from git import * +from test.git.test_commit import assert_commit_serialization +from cStringIO import StringIO from time import time import sys -class TestPerformance(TestBigRepoReadOnly): +class TestPerformance(TestBigRepoRW): # ref with about 100 commits in its history ref_100 = '0.1.6' @@ -48,7 +50,7 @@ class TestPerformance(TestBigRepoReadOnly): # bound to cat-file parsing performance nc = 0 st = time() - for c in self.gitrepo.commit(self.head_sha_2k).traverse(branch_first=False): + for c in self.gitrorepo.commit(self.head_sha_2k).traverse(branch_first=False): nc += 1 self._query_commit_info(c) # END for each traversed commit @@ -59,10 +61,38 @@ class TestPerformance(TestBigRepoReadOnly): # bound to stream parsing performance nc = 0 st = time() - for c in Commit.iter_items(self.gitrepo, self.head_sha_2k): + for c in Commit.iter_items(self.gitrorepo, self.head_sha_2k): nc += 1 self._query_commit_info(c) # END for each traversed commit elapsed_time = time() - st print >> sys.stderr, "Iterated %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time) + def test_commit_serialization(self): + assert_commit_serialization(self.gitrwrepo, self.head_sha_2k, True) + + rwrepo = self.gitrwrepo + make_object = rwrepo.odb.to_object + # direct serialization - deserialization can be tested afterwards + # serialization is probably limited on IO + hc = rwrepo.commit(self.head_sha_2k) + + commits = list() + nc = 5000 + st = time() + for i in xrange(nc): + cm = Commit( rwrepo, Commit.NULL_HEX_SHA, hc.tree, + hc.author, hc.authored_date, hc.author_tz_offset, + hc.committer, hc.committed_date, hc.committer_tz_offset, + str(i), parents=hc.parents, encoding=hc.encoding) + + stream = StringIO() + cm._serialize(stream) + slen = stream.tell() + stream.seek(0) + + cm.sha = make_object(Commit.type, slen, stream) + # END commit creation + elapsed = time() - st + + print >> sys.stderr, "Serialized %i commits to loose objects in %f s ( %f commits / s )" % (nc, elapsed, nc / elapsed) diff --git a/test/git/performance/test_odb.py b/test/git/performance/test_odb.py index 0ad2ce33..7b1ee838 100644 --- a/test/git/performance/test_odb.py +++ b/test/git/performance/test_odb.py @@ -5,18 +5,18 @@ import sys import stat from lib import ( - TestBigRepoReadOnly + TestBigRepoR ) -class TestObjDBPerformance(TestBigRepoReadOnly): +class TestObjDBPerformance(TestBigRepoR): def test_random_access(self): # GET COMMITS # TODO: use the actual db for this st = time() - root_commit = self.gitrepo.commit(self.head_sha_2k) + root_commit = self.gitrorepo.commit(self.head_sha_2k) commits = list(root_commit.traverse()) nc = len(commits) elapsed = time() - st diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py index 6c2834b3..d31bee14 100644 --- a/test/git/performance/test_streams.py +++ b/test/git/performance/test_streams.py @@ -14,7 +14,7 @@ import subprocess from lib import ( - TestBigRepoReadOnly + TestBigRepoR ) @@ -32,7 +32,7 @@ def make_memory_file(size_in_bytes, randomize=False): return actual_size*4, StringIO(a.tostring()) -class TestObjDBPerformance(TestBigRepoReadOnly): +class TestObjDBPerformance(TestBigRepoR): large_data_size_bytes = 1000*1000*10 # some MiB should do it moderate_data_size_bytes = 1000*1000*1 # just 1 MiB diff --git a/test/git/performance/test_utils.py b/test/git/performance/test_utils.py index 381d7c8b..47366d34 100644 --- a/test/git/performance/test_utils.py +++ b/test/git/performance/test_utils.py @@ -4,11 +4,11 @@ import sys import stat from lib import ( - TestBigRepoReadOnly + TestBigRepoR ) -class TestUtilPerformance(TestBigRepoReadOnly): +class TestUtilPerformance(TestBigRepoR): def test_access(self): # compare dict vs. slot access diff --git a/test/git/test_commit.py b/test/git/test_commit.py index ad7a0082..a5f184e6 100644 --- a/test/git/test_commit.py +++ b/test/git/test_commit.py @@ -7,6 +7,56 @@ from test.testlib import * from git import * +from cStringIO import StringIO +import time +import sys + + +def assert_commit_serialization(rwrepo, commit_id, print_performance_info=False): + """traverse all commits in the history of commit identified by commit_id and check + if the serialization works. + :param print_performance_info: if True, we will show how fast we are""" + ns = 0 # num serializations + nds = 0 # num deserializations + + st = time.time() + for cm in rwrepo.commit(commit_id).traverse(): + nds += 1 + + # assert that we deserialize commits correctly, hence we get the same + # sha on serialization + stream = StringIO() + cm._serialize(stream) + ns += 1 + streamlen = stream.tell() + stream.seek(0) + + csha = rwrepo.odb.to_object(Commit.type, streamlen, stream) + assert csha == cm.sha + + nc = Commit(rwrepo, Commit.NULL_HEX_SHA, cm.tree.sha, + cm.author, cm.authored_date, cm.author_tz_offset, + cm.committer, cm.committed_date, cm.committer_tz_offset, + cm.message, cm.parents, cm.encoding) + + assert nc.parents == cm.parents + stream = StringIO() + nc._serialize(stream) + ns += 1 + streamlen = stream.tell() + stream.seek(0) + nc.sha = rwrepo.odb.to_object(Commit.type, streamlen, stream) + + # if it worked, we have exactly the same contents ! + assert nc.sha == cm.sha + # END check commits + elapsed = time.time() - st + + if print_performance_info: + print >> sys.stderr, "Serialized %i and deserialized %i commits in %f s ( (%f, %f) commits / s" % (ns, nds, elapsed, ns/elapsed, nds/elapsed) + # END handle performance info + + class TestCommit(TestBase): def test_bake(self): @@ -19,7 +69,7 @@ class TestCommit(TestBase): assert commit.author == commit.committer assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int) assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int) - assert commit.message == "Added missing information to docstrings of commit and stats module" + assert commit.message == "Added missing information to docstrings of commit and stats module\n" def test_stats(self): @@ -49,7 +99,7 @@ class TestCommit(TestBase): assert commit.committed_date == 1210193388 assert commit.author_tz_offset == 14400, commit.author_tz_offset assert commit.committer_tz_offset == 14400, commit.committer_tz_offset - assert commit.message == "initial project" + assert commit.message == "initial project\n" def test_traversal(self): start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff") @@ -171,3 +221,8 @@ class TestCommit(TestBase): name_rev = self.rorepo.head.commit.name_rev assert isinstance(name_rev, basestring) + @with_bare_rw_repo + def test_serialization(self, rwrepo): + # create all commits of our repo + assert_commit_serialization(rwrepo, '0.1.6') + diff --git a/test/git/test_utils.py b/test/git/test_utils.py index 2c3c392b..83ef7e4b 100644 --- a/test/git/test_utils.py +++ b/test/git/test_utils.py @@ -116,8 +116,6 @@ class TestUtils(TestCase): os.remove(my_file) # END final cleanup - - def test_user_id(self): assert '@' in get_user_id() @@ -127,7 +125,12 @@ class TestUtils(TestCase): assert len(rval) == 2 assert isinstance(rval[0], int) and isinstance(rval[1], int) assert rval[0] == veri_time - assert rval[1] == offset + assert rval[1] == offset + + # now that we are here, test our conversion functions as well + utctz = altz_to_utctz_str(offset) + assert isinstance(utctz, basestring) + assert utctz_to_altz(verify_utctz(utctz)) == offset # END assert rval utility rfc = ("Thu, 07 Apr 2005 22:13:11 +0000", 0) |