diff options
author | Sebastian Thiel <byronimo@gmail.com> | 2010-06-22 21:23:47 +0200 |
---|---|---|
committer | Sebastian Thiel <byronimo@gmail.com> | 2010-06-22 21:23:47 +0200 |
commit | be97c4558992a437cde235aafc7ae2bd6df84ac8 (patch) | |
tree | 3e44a7c38e356817ca81721725709d7374f95012 | |
parent | 778234d544b3f58dd415aaf10679d15b01a5281f (diff) | |
download | gitpython-be97c4558992a437cde235aafc7ae2bd6df84ac8.tar.gz |
Initial framework for implementing read_tree in pure Python. As git-read-tree can do much more than we can (and is presumably faster), the .new method is used to create new index instances from up to 3 trees.
Implemented multi-tree traversal to facilitate building a stage list more efficiently (although I am not sure whether it would be faster to use a dictionary together with some intensive lookup), including a test.
Added performance tests to learn how fast certain operations are, and whether one should be preferred over another.
-rw-r--r-- | lib/git/index/base.py | 24 | ||||
-rw-r--r-- | lib/git/index/fun.py | 29 | ||||
-rw-r--r-- | lib/git/objects/fun.py | 118 | ||||
-rw-r--r-- | lib/git/repo.py | 2 | ||||
-rw-r--r-- | test/git/performance/test_utils.py | 94 | ||||
-rw-r--r-- | test/git/test_fun.py | 70 | ||||
-rw-r--r-- | test/git/test_index.py | 6 | ||||
-rw-r--r-- | test/git/test_tree.py | 5 |
8 files changed, 339 insertions, 9 deletions
diff --git a/lib/git/index/base.py b/lib/git/index/base.py index 06437702..f1be00e0 100644 --- a/lib/git/index/base.py +++ b/lib/git/index/base.py @@ -133,7 +133,6 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): def _index_path(self): return join_path_native(self.repo.git_dir, "index") - @property def path(self): """ :return: Path to the index file we are representing """ @@ -241,6 +240,26 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): return self @classmethod + def new(cls, repo, *tree_sha): + """ Merge the given treeish revisions into a new index which is returned. + This method behaves like git-read-tree --aggressive when doing the merge. + + :param repo: The repository treeish are located in. + + :param *tree_sha: + see ``from_tree`` + + :return: + New IndexFile instance. Its path will be undefined. + If you intend to write such a merged Index, supply an alternate file_path + to its 'write' method.""" + base_entries = aggressive_tree_merge(repo.odb, tree_sha) + + inst = cls(self.repo) + raise NotImplementedError("convert to entries") + + + @classmethod def from_tree(cls, repo, *treeish, **kwargs): """ Merge the given treeish revisions into a new index which is returned. @@ -275,8 +294,7 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): As the underlying git-read-tree command takes into account the current index, it will be temporarily moved out of the way to assure there are no unsuspected - interferences. 
- """ + interferences.""" if len(treeish) == 0 or len(treeish) > 3: raise ValueError("Please specify between 1 and 3 treeish, got %i" % len(treeish)) diff --git a/lib/git/index/fun.py b/lib/git/index/fun.py index 9f877a66..962e139a 100644 --- a/lib/git/index/fun.py +++ b/lib/git/index/fun.py @@ -12,8 +12,10 @@ from git.utils import ( ) from typ import ( + BaseIndexEntry, IndexEntry, - CE_NAMEMASK + CE_NAMEMASK, + CE_STAGESHIFT ) from util import ( @@ -23,7 +25,6 @@ from util import ( from gitdb.base import IStream from gitdb.typ import str_tree_type -from binascii import a2b_hex __all__ = ('write_cache', 'read_cache', 'write_tree_from_cache', 'entry_key' ) @@ -150,6 +151,7 @@ def write_tree_from_cache(entries, odb, sl, si=0): :return: tuple(binsha, list(tree_entry, ...)) a tuple of a sha and a list of tree entries being a tuple of hexsha, mode, name""" tree_items = list() + tree_items_append = tree_items.append ci = sl.start end = sl.stop while ci < end: @@ -161,7 +163,7 @@ def write_tree_from_cache(entries, odb, sl, si=0): rbound = entry.path.find('/', si) if rbound == -1: # its not a tree - tree_items.append((entry.binsha, entry.mode, entry.path[si:])) + tree_items_append((entry.binsha, entry.mode, entry.path[si:])) else: # find common base range base = entry.path[si:rbound] @@ -178,7 +180,7 @@ def write_tree_from_cache(entries, odb, sl, si=0): # enter recursion # ci - 1 as we want to count our current item as well sha, tree_entry_list = write_tree_from_cache(entries, odb, slice(ci-1, xi), rbound+1) - tree_items.append((sha, S_IFDIR, base)) + tree_items_append((sha, S_IFDIR, base)) # skip ahead ci = xi @@ -193,5 +195,24 @@ def write_tree_from_cache(entries, odb, sl, si=0): istream = odb.store(IStream(str_tree_type, len(sio.getvalue()), sio)) return (istream.binsha, tree_items) +def _tree_entry_to_baseindexentry(tree_entry, stage): + return BaseIndexEntry(tree_entry[1], tree_entry[0], stage <<CE_STAGESHIFT, tree_entry[2]) +def aggressive_tree_merge(odb, 
tree_shas): + """ + :return: list of BaseIndexEntries representing the aggressive merge of the given + trees. All valid entries are on stage 0, whereas the conflicting ones are left + on stage 1, 2 or 3, whereas stage 1 corresponds to the common ancestor tree, + 2 to our tree and 3 to 'their' tree. + :param tree_shas: 1, 2 or 3 trees as identified by their shas""" + out = list() + out_append = out.append + if len(tree_shas) == 1: + for entry in traverse_tree_recursive(odb, tree_shas[0]): + out_append(_tree_entry_to_baseindexentry(entry, 0)) + # END for each entry + else: + raise ValueError("Cannot handle %i trees at once" % len(tree_shas)) + # END handle tree shas + return out diff --git a/lib/git/objects/fun.py b/lib/git/objects/fun.py index 7882437d..d21a7dad 100644 --- a/lib/git/objects/fun.py +++ b/lib/git/objects/fun.py @@ -2,6 +2,9 @@ __all__ = ('tree_to_stream', 'tree_entries_from_data') +from stat import S_ISDIR + + def tree_to_stream(entries, write): """Write the give list of entries into a stream using its write method :param entries: **sorted** list of tuples with (binsha, mode, name) @@ -64,3 +67,118 @@ def tree_entries_from_data(data): out.append((sha, mode, name)) # END for each byte in data stream return out + + +def _find_by_name(tree_data, name, is_dir, start_at): + """return data entry matching the given name and tree mode + or None. 
+ Before the item is returned, the respective data item is set + None in the tree_data list to mark it done""" + try: + item = tree_data[start_at] + if item and item[2] == name and S_ISDIR(item[1]) == is_dir: + tree_data[start_at] = None + return item + except IndexError: + pass + # END exception handling + for index, item in enumerate(tree_data): + if item and item[2] == name and S_ISDIR(item[1]) == is_dir: + tree_data[index] = None + return item + # END if item matches + # END for each item + return None + +def _to_full_path(item, path_prefix): + """Rebuild entry with given path prefix""" + if not item: + return item + return (item[0], item[1], path_prefix+item[2]) + +def traverse_trees_recursive(odb, tree_shas, path_prefix): + """ + :return: list with entries according to the given tree-shas. + The result is encoded in a list + of n tuple|None per blob/commit, (n == len(tree_shas)), where + * [0] == 20 byte sha + * [1] == mode as int + * [2] == path relative to working tree root + The entry tuple is None if the respective blob/commit did not + exist in the given tree. + :param tree_shas: iterable of shas pointing to trees. All trees must + be on the same level. A tree-sha may be None in which case None + :param path_prefix: a prefix to be added to the returned paths on this level, + set it '' for the first iteration + :note: The ordering of the returned items will be partially lost""" + trees_data = list() + nt = len(tree_shas) + for tree_sha in tree_shas: + if tree_sha is None: + data = list() + else: + data = tree_entries_from_data(odb.stream(tree_sha).read()) + # END handle muted trees + trees_data.append(data) + # END for each sha to get data for + + out = list() + out_append = out.append + + # find all matching entries and recursively process them together if the match + # is a tree. If the match is a non-tree item, put it into the result. 
+ # Processed items will be set None + for ti, tree_data in enumerate(trees_data): + for ii, item in enumerate(tree_data): + if not item: + continue + # END skip already done items + entries = [ None for n in range(nt) ] + entries[ti] = item + sha, mode, name = item # its faster to unpack + is_dir = S_ISDIR(mode) # type mode bits + + # find this item in all other tree data items + # wrap around, but stop one before our current index, hence + # ti+nt, not ti+1+nt + for tio in range(ti+1, ti+nt): + tio = tio % nt + entries[tio] = _find_by_name(trees_data[tio], name, is_dir, ii) + # END for each other item data + + # if we are a directory, enter recursion + if is_dir: + out.extend(traverse_trees_recursive(odb, [ei[0] for ei in entries if ei], path_prefix+name+'/')) + else: + out_append(tuple(_to_full_path(e, path_prefix) for e in entries)) + # END handle recursion + + # finally mark it done + tree_data[ii] = None + # END for each item + + # we are done with one tree, set all its data empty + del(tree_data[:]) + # END for each tree_data chunk + return out + +def traverse_tree_recursive(odb, tree_sha, path_prefix): + """ + :return: list of entries of the tree pointed to by tree_sha. 
An entry + has the following format: + * [0] 20 byte sha + * [1] mode as int + * [2] path relative to the repository + :param path_prefix: prefix to prepend to the front of all returned paths""" + entries = list() + data = tree_entries_from_data(odb.stream(tree_sha).read()) + + # unpacking/packing is faster than accessing individual items + for sha, mode, name in data: + if S_ISDIR(mode): + entries.extend(traverse_tree_recursive(odb, sha, path_prefix+name+'/')) + else: + entries.append((sha, mode, path_prefix+name)) + # END for each item + + return entries diff --git a/lib/git/repo.py b/lib/git/repo.py index 74525403..f97126ea 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -71,7 +71,7 @@ class Repo(object): # represents the configuration level of a configuration file config_level = ("system", "global", "repository") - def __init__(self, path=None, odbt = GitCmdObjectDB): + def __init__(self, path=None, odbt = GitDB): """ Create a new Repo instance :param path: is the path to either the root git directory or the bare git repo:: diff --git a/test/git/performance/test_utils.py b/test/git/performance/test_utils.py index 76adffec..16100f8b 100644 --- a/test/git/performance/test_utils.py +++ b/test/git/performance/test_utils.py @@ -57,3 +57,97 @@ class TestUtilPerformance(TestBigRepoR): na = ni * 3 print >> sys.stderr, "Accessed %s[x] %i times in %s s ( %f acc / s)" % (cls.__name__, na, elapsed, na / elapsed) # END for each sequence + + def test_instantiation(self): + ni = 100000 + max_num_items = 4 + for mni in range(max_num_items+1): + for cls in (tuple, list): + st = time() + for i in xrange(ni): + if mni == 0: + cls() + elif mni == 1: + cls((1,)) + elif mni == 2: + cls((1,2)) + elif mni == 3: + cls((1,2,3)) + elif mni == 4: + cls((1,2,3,4)) + else: + cls(x for x in xrange(mni)) + # END handle empty cls + # END for each item + elapsed = time() - st + print >> sys.stderr, "Created %i %ss of size %i in %f s ( %f inst / s)" % (ni, cls.__name__, mni, elapsed, ni / 
elapsed) + # END for each type + # END for each item count + + # tuple and tuple direct + st = time() + for i in xrange(ni): + t = (1,2,3,4) + # END for each item + elapsed = time() - st + print >> sys.stderr, "Created %i tuples (1,2,3,4) in %f s ( %f tuples / s)" % (ni, elapsed, ni / elapsed) + + st = time() + for i in xrange(ni): + t = tuple((1,2,3,4)) + # END for each item + elapsed = time() - st + print >> sys.stderr, "Created %i tuples tuple((1,2,3,4)) in %f s ( %f tuples / s)" % (ni, elapsed, ni / elapsed) + + def test_unpacking_vs_indexing(self): + ni = 1000000 + list_items = [1,2,3,4] + tuple_items = (1,2,3,4) + + for sequence in (list_items, tuple_items): + st = time() + for i in xrange(ni): + one, two, three, four = sequence + # END for eac iteration + elapsed = time() - st + print >> sys.stderr, "Unpacked %i %ss of size %i in %f s ( %f acc / s)" % (ni, type(sequence).__name__, len(sequence), elapsed, ni / elapsed) + + st = time() + for i in xrange(ni): + one, two, three, four = sequence[0], sequence[1], sequence[2], sequence[3] + # END for eac iteration + elapsed = time() - st + print >> sys.stderr, "Unpacked %i %ss of size %i individually in %f s ( %f acc / s)" % (ni, type(sequence).__name__, len(sequence), elapsed, ni / elapsed) + + st = time() + for i in xrange(ni): + one, two = sequence[0], sequence[1] + # END for eac iteration + elapsed = time() - st + print >> sys.stderr, "Unpacked %i %ss of size %i individually (2 of 4) in %f s ( %f acc / s)" % (ni, type(sequence).__name__, len(sequence), elapsed, ni / elapsed) + # END for each sequence + + def test_large_list_vs_iteration(self): + # what costs more: alloc/realloc of lists, or the cpu strain of iterators ? 
+ def slow_iter(ni): + for i in xrange(ni): + yield i + # END slow iter - be closer to the real world + + # alloc doesn't play a role here it seems + for ni in (500, 1000, 10000, 20000, 40000): + st = time() + for i in list(xrange(ni)): + i + # END for each item + elapsed = time() - st + print >> sys.stderr, "Iterated %i items from list in %f s ( %f acc / s)" % (ni, elapsed, ni / elapsed) + + st = time() + for i in slow_iter(ni): + i + # END for each item + elapsed = time() - st + print >> sys.stderr, "Iterated %i items from iterator in %f s ( %f acc / s)" % (ni, elapsed, ni / elapsed) + # END for each number of iterations + diff --git a/test/git/test_fun.py b/test/git/test_fun.py new file mode 100644 index 00000000..ccf15c77 --- /dev/null +++ b/test/git/test_fun.py @@ -0,0 +1,70 @@ +from test.testlib import * +from git.objects.fun import ( + traverse_tree_recursive, + traverse_trees_recursive + ) + +from git.index.fun import ( + aggressive_tree_merge + ) + +class TestFun(TestBase): + + def test_aggressive_tree_merge(self): + # head tree with additions, removals and modification compared to its predecessor + HC = self.rorepo.commit("6c1faef799095f3990e9970bc2cb10aa0221cf9c") + H = HC.tree + B = HC.parents[0].tree + + # test new index from single tree + + def _assert_entries(self, entries, num_trees): + assert len(entries[0]) == num_trees + for entry in entries: + paths = set(e[2] for e in entry if e) + + # only one path per set of entries + assert len(paths) == 1 + # END verify entry + + def test_tree_traversal(self): + # low level tree tarversal + odb = self.rorepo.odb + H = self.rorepo.tree('29eb123beb1c55e5db4aa652d843adccbd09ae18') # head tree + M = self.rorepo.tree('e14e3f143e7260de9581aee27e5a9b2645db72de') # merge tree + B = self.rorepo.tree('f606937a7a21237c866efafcad33675e6539c103') # base tree + B_old = self.rorepo.tree('1f66cfbbce58b4b552b041707a12d437cc5f400a') # old base tree + + # two very different trees + entries = traverse_trees_recursive(odb, 
[B_old.sha, H.sha], '') + self._assert_entries(entries, 2) + + oentries = traverse_trees_recursive(odb, [H.sha, B_old.sha], '') + assert len(oentries) == len(entries) + self._assert_entries(oentries, 2) + + # single tree + is_no_tree = lambda i, d: i.type != 'tree' + entries = traverse_trees_recursive(odb, [B.sha], '') + assert len(entries) == len(list(B.traverse(predicate=is_no_tree))) + self._assert_entries(entries, 1) + + # two trees + entries = traverse_trees_recursive(odb, [B.sha, H.sha], '') + self._assert_entries(entries, 2) + + # tree trees + entries = traverse_trees_recursive(odb, [B.sha, H.sha, M.sha], '') + self._assert_entries(entries, 3) + + def test_tree_traversal_single(self): + max_count = 50 + count = 0 + odb = self.rorepo.odb + for commit in self.rorepo.commit("29eb123beb1c55e5db4aa652d843adccbd09ae18").traverse(): + if count >= max_count: + break + count += 1 + entries = traverse_tree_recursive(odb, commit.tree.sha, '') + assert entries + # END for each commit diff --git a/test/git/test_index.py b/test/git/test_index.py index d0063e89..ae754430 100644 --- a/test/git/test_index.py +++ b/test/git/test_index.py @@ -580,7 +580,7 @@ class TestIndex(TestBase): # write all trees and compare them # its important to have a few submodules in there too - max_count = 100 + max_count = 25 count = 0 for commit in rw_repo.head.commit.traverse(): if count >= max_count: @@ -593,3 +593,7 @@ class TestIndex(TestBase): assert index.write_tree() == orig_tree # END for each commit + def test_index_new(self): + self.fail("todo index new") + + diff --git a/test/git/test_tree.py b/test/git/test_tree.py index d983cb2f..a443bd97 100644 --- a/test/git/test_tree.py +++ b/test/git/test_tree.py @@ -7,6 +7,10 @@ import os from test.testlib import * from git import * +from git.objects.fun import ( + traverse_tree_recursive, + traverse_trees_recursive + ) from cStringIO import StringIO class TestTree(TestBase): @@ -136,3 +140,4 @@ class TestTree(TestBase): def test_repr(self): 
tree = Tree(self.rorepo, 'abc') assert_equal('<git.Tree "abc">', repr(tree)) + |