From a58a60ac5f322eb4bfd38741469ff21b5a33d2d5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 12 Oct 2009 23:18:43 +0200 Subject: tree: now behaves like a list with string indexing functionality - using a dict as cache is a problem as the tree is ordered, added blobs, trees and traverse method repo: remove blob function as blobs are created directly or iterated - primitve types should not clutter the repo interface --- lib/git/objects/tree.py | 170 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 136 insertions(+), 34 deletions(-) (limited to 'lib/git/objects/tree.py') diff --git a/lib/git/objects/tree.py b/lib/git/objects/tree.py index 273384a3..707cebaa 100644 --- a/lib/git/objects/tree.py +++ b/lib/git/objects/tree.py @@ -9,26 +9,57 @@ import blob import base class Tree(base.IndexObject): + """ + Tress represent a ordered list of Blobs and other Trees. Hence it can be + accessed like a list. + + Tree's will cache their contents after first retrieval to improve efficiency. + + ``Tree as a list``:: + + Access a specific blob using the + tree['filename'] notation. + + You may as well access by index + blob = tree[0] + + + """ type = "tree" - __slots__ = "_contents" + __slots__ = "_cache" def __init__(self, repo, id, mode=None, path=None): super(Tree, self).__init__(repo, id, mode, path) def _set_cache_(self, attr): - if attr == "_contents": - # Read the tree contents. - self._contents = {} - for line in self.repo.git.ls_tree(self.id).splitlines(): - obj = self.content__from_string(self.repo, line) - if obj is not None: - self._contents[obj.path] = obj + if attr == "_cache": + # Set the data when we need it + self._cache = self._get_tree_cache(self.repo, self.id) else: super(Tree, self)._set_cache_(attr) - @staticmethod - def content__from_string(repo, text): + @classmethod + def _get_tree_cache(cls, repo, treeish): + """ + Return + list(object_instance, ...) + + ``treeish`` + sha or ref identifying a tree + """ + out = list() + for line in repo.git.ls_tree(treeish).splitlines(): + obj = cls.content_from_string(repo, line) + if obj is not None: + out.append(obj) + # END if object was handled + # END for each line from ls-tree + return out + + + @classmethod + def content_from_string(cls, repo, text): """ Parse a content item and create the appropriate object @@ -40,6 +71,8 @@ class Tree(base.IndexObject): Returns ``git.Blob`` or ``git.Tree`` + + NOTE: Currently sub-modules are ignored ! """ try: mode, typ, id, path = text.expandtabs(1).split(" ", 3) @@ -51,6 +84,7 @@ class Tree(base.IndexObject): elif typ == "blob": return blob.Blob(repo, id, mode, path) elif typ == "commit": + # TODO: Return a submodule return None else: raise(TypeError, "Invalid type: %s" % typ) @@ -67,36 +101,104 @@ class Tree(base.IndexObject): Returns - ``git.Blob`` or ``git.Tree`` or ``None`` if not found + ``git.Blob`` or ``git.Tree`` + + Raise + KeyError if given file or tree does not exist in tree """ - return self.get(file) + return self[file] def __repr__(self): return '' % self.id + + @classmethod + def _iter_recursive(cls, repo, tree, cur_depth, max_depth, predicate ): + + for obj in tree: + # adjust path to be complete + obj.path = os.path.join(tree.path, obj.path) + if not predicate(obj): + continue + yield obj + if obj.type == "tree" and ( max_depth < 0 or cur_depth+1 <= max_depth ): + for recursive_obj in cls._iter_recursive( repo, obj, cur_depth+1, max_depth, predicate ): + yield recursive_obj + # END for each recursive object + # END if we may enter recursion + # END for each object + + def traverse(self, max_depth=-1, predicate = lambda i: True): + """ + Returns + Iterator to traverse the tree recursively up to the given level. + The iterator returns Blob and Tree objects + + ``max_depth`` + + if -1, the whole tree will be traversed + if 0, only the first level will be traversed which is the same as + the default non-recursive iterator + + ``predicate`` + + If predicate(item) returns True, item will be returned by iterator + """ + return self._iter_recursive( self.repo, self, 0, max_depth, predicate ) + + @property + def trees(self): + """ + Returns + list(Tree, ...) list of trees directly below this tree + """ + return [ i for i in self if i.type == "tree" ] + + @property + def blobs(self): + """ + Returns + list(Blob, ...) list of blobs directly below this tree + """ + return [ i for i in self if i.type == "blob" ] - # Implement the basics of the dict protocol: - # directories/trees can be seen as object dicts. - def __getitem__(self, key): - return self._contents[key] + # List protocol + def __getslice__(self,i,j): + return self._cache[i:j] + def __iter__(self): - return iter(self._contents) - + return iter(self._cache) + def __len__(self): - return len(self._contents) - - def __contains__(self, key): - return key in self._contents - - def get(self, key): - return self._contents.get(key) - - def items(self): - return self._contents.items() - - def keys(self): - return self._contents.keys() - - def values(self): - return self._contents.values() + return len(self._cache) + + def __getitem__(self,item): + if isinstance(item, int): + return self._cache[item] + + if isinstance(item, basestring): + # compatability + for obj in self._cache: + if obj.path == item: + return obj + # END for each obj + raise KeyError( "Blob or Tree named %s not found" % item ) + # END index is basestring + + raise TypeError( "Invalid index type: %r" % item ) + + + def __contains__(self,item): + if isinstance(item, base.IndexObject): + return item in self._cache + + # compatability + for obj in self._cache: + if item == obj.path: + return True + # END for each item + return False + + def __reversed__(self): + return reversed(self._cache) -- cgit v1.2.1 From 86fa577e135713e56b287169d69d976cde27ac97 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 13 Oct 2009 17:36:27 +0200 Subject: tree: renamed content_from_string to _from_string to make it private. Removed tests that were testing that method --- lib/git/objects/tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/git/objects/tree.py') diff --git a/lib/git/objects/tree.py b/lib/git/objects/tree.py index 707cebaa..1bc35d95 100644 --- a/lib/git/objects/tree.py +++ b/lib/git/objects/tree.py @@ -50,7 +50,7 @@ class Tree(base.IndexObject): """ out = list() for line in repo.git.ls_tree(treeish).splitlines(): - obj = cls.content_from_string(repo, line) + obj = cls._from_string(repo, line) if obj is not None: out.append(obj) # END if object was handled @@ -59,7 +59,7 @@ class Tree(base.IndexObject): @classmethod - def content_from_string(cls, repo, text): + def _from_string(cls, repo, text): """ Parse a content item and create the appropriate object -- cgit v1.2.1 From 6eeae8b24135b4de05f6d725b009c287577f053d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 14 Oct 2009 17:24:15 +0200 Subject: test: Added time-consuming test which could also be a benchmark in fact - currently it cause hundreds of command invocations which is slow Fixed issue with trees not properly initialized with their default mode _set_cache_: some objects checked whether the attribute was within their __slots__ although it should have been accessed through its class --- lib/git/objects/tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/git/objects/tree.py') diff --git a/lib/git/objects/tree.py b/lib/git/objects/tree.py index 1bc35d95..01dfb37b 100644 --- a/lib/git/objects/tree.py +++ b/lib/git/objects/tree.py @@ -29,7 +29,7 @@ class Tree(base.IndexObject): type = "tree" __slots__ = "_cache" - def __init__(self, repo, id, mode=None, path=None): + def __init__(self, repo, id, mode=0, path=None): super(Tree, self).__init__(repo, id, mode, path) def _set_cache_(self, attr): -- cgit v1.2.1 From 2e6d110fbfa1f2e6a96bc8329e936d0cf1192844 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 14 Oct 2009 23:37:45 +0200 Subject: tree: now reads tress directly by parsing the binary data, allowing it to safe possibly hundreds of command calls --- lib/git/objects/tree.py | 102 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 32 deletions(-) (limited to 'lib/git/objects/tree.py') diff --git a/lib/git/objects/tree.py b/lib/git/objects/tree.py index 01dfb37b..abfa9622 100644 --- a/lib/git/objects/tree.py +++ b/lib/git/objects/tree.py @@ -7,6 +7,13 @@ import os import blob import base +import binascii + +def sha_to_hex(sha): + """Takes a string and returns the hex of the sha within""" + hexsha = binascii.hexlify(sha) + assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha + return hexsha class Tree(base.IndexObject): """ @@ -29,18 +36,23 @@ class Tree(base.IndexObject): type = "tree" __slots__ = "_cache" + # using ascii codes for comparison + ascii_commit_id = (0x31 << 4) + 0x36 + ascii_blob_id = (0x31 << 4) + 0x30 + ascii_tree_id = (0x34 << 4) + 0x30 + + def __init__(self, repo, id, mode=0, path=None): super(Tree, self).__init__(repo, id, mode, path) def _set_cache_(self, attr): if attr == "_cache": # Set the data when we need it - self._cache = self._get_tree_cache(self.repo, self.id) + self._cache = self._get_tree_cache() else: super(Tree, self)._set_cache_(attr) - @classmethod - def _get_tree_cache(cls, repo, treeish): + def _get_tree_cache(self): """ Return list(object_instance, ...) @@ -49,45 +61,71 @@ class Tree(base.IndexObject): sha or ref identifying a tree """ out = list() - for line in repo.git.ls_tree(treeish).splitlines(): - obj = cls._from_string(repo, line) + for obj in self._iter_from_data(): if obj is not None: out.append(obj) # END if object was handled # END for each line from ls-tree return out - - @classmethod - def _from_string(cls, repo, text): + + def _iter_from_data(self): """ - Parse a content item and create the appropriate object - - ``repo`` - is the Repo - - ``text`` - is the single line containing the items data in `git ls-tree` format - + Reads the binary non-pretty printed representation of a tree and converts + it into Blob, Tree or Commit objects. + + Note: This method was inspired by the parse_tree method in dulwich. + Returns - ``git.Blob`` or ``git.Tree`` - - NOTE: Currently sub-modules are ignored ! + list(IndexObject, ...) """ - try: - mode, typ, id, path = text.expandtabs(1).split(" ", 3) - except: - return None + ord_zero = ord('0') + data = self.data + len_data = len(data) + i = 0 + while i < len_data: + mode = 0 + mode_boundary = i + 6 + + # keep it ascii - we compare against the respective values + type_id = (ord(data[i])<<4) + ord(data[i+1]) + i += 2 + + while data[i] != ' ': + # move existing mode integer up one level being 3 bits + # and add the actual ordinal value of the character + mode = (mode << 3) + (ord(data[i]) - ord_zero) + i += 1 + # END while reading mode + + # byte is space now, skip it + i += 1 + + # parse name, it is NULL separated + + ns = i + while data[i] != '\0': + i += 1 + # END while not reached NULL + name = data[ns:i] + + # byte is NULL, get next 20 + i += 1 + sha = data[i:i+20] + i = i + 20 + + hexsha = sha_to_hex(sha) + if type_id == self.ascii_blob_id: + yield blob.Blob(self.repo, hexsha, mode, name) + elif type_id == self.ascii_tree_id: + yield Tree(self.repo, hexsha, mode, name) + elif type_id == self.ascii_commit_id: + # todo + yield None + else: + raise TypeError( "Unknown type found in tree data: %i" % type_id ) + # END for each byte in data stream - if typ == "tree": - return Tree(repo, id, mode, path) - elif typ == "blob": - return blob.Blob(repo, id, mode, path) - elif typ == "commit": - # TODO: Return a submodule - return None - else: - raise(TypeError, "Invalid type: %s" % typ) def __div__(self, file): """ -- cgit v1.2.1