5 files changed, 268 insertions, 125 deletions
diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py
index 43aa8dd1..07538ada 100644
--- a/lib/git/objects/base.py
+++ b/lib/git/objects/base.py
@@ -4,32 +4,10 @@
 # This module is part of GitPython and is released under
 # the BSD License: http://www.opensource.org/licenses/bsd-license.php
 import os
-
-class LazyMixin(object):
-	lazy_properties = []
-	__slots__ = tuple()
+from git.utils import LazyMixin
 	
-	def __getattr__(self, attr):
-		"""
-		Whenever an attribute is requested that we do not know, we allow it 
-		to be created and set. Next time the same attribute is reqeusted, it is simply
-		returned from our dict/slots.
-		"""
-		self._set_cache_(attr)
-		# will raise in case the cache was not created
-		return object.__getattribute__(self, attr)
+_assertion_msg_format = "Created object %r whose actual git object type %r disagrees with its python type %r"  # args: (id, git typename, python type)
 
-	def _set_cache_(self, attr):
-		""" This method should be overridden in the derived class. 
-		It should check whether the attribute named by attr can be created
-		and cached. Do nothing if you do not know the attribute or call your subclass
-		
-		The derived class may create as many additional attributes as it deems 
-		necessary in case a git command returns more information than represented 
-		in the single attribute."""
-		pass
-	
-		
 class Object(LazyMixin):
 	"""
 	Implements an Object which may be Blobs, Trees, Commits and Tags
@@ -71,9 +49,13 @@ class Object(LazyMixin):
 		Retrieve object information
 		"""
 		if attr  == "size":
-			self.size = int(self.repo.git.cat_file(self.id, s=True).rstrip())
+			hexsha, typename, self.size = self.repo.git.get_object_header(self.id)
+			assert typename == self.type, _assertion_msg_format % (self.id, typename, self.type)
 		elif attr == "data":
-			self.data = self.repo.git.cat_file(self.id, p=True, with_raw_output=True)
+			hexsha, typename, self.size, self.data = self.repo.git.get_object_data(self.id)
+			assert typename == self.type, _assertion_msg_format % (self.id, typename, self.type)
+		else:
+			super(Object,self)._set_cache_(attr)
 		
 	def __eq__(self, other):
 		"""
@@ -143,8 +125,15 @@ class IndexObject(Object):
 		if isinstance(mode, basestring):
 			self.mode = self._mode_str_to_int(mode)
 	
+	def _set_cache_(self, attr):
+		if attr in IndexObject.__slots__:
+			# they cannot be retrieved later on (not without searching for them)
+			raise AttributeError( "path and mode attributes must have been set during %s object creation" % type(self).__name__ )
+		else:
+			super(IndexObject, self)._set_cache_(attr)
+	
 	@classmethod
-	def _mode_str_to_int( cls, modestr ):
+	def _mode_str_to_int(cls, modestr):
 		"""
 		``modestr``
 			string like 755 or 644 or 100644 - only the last 3 chars will be used
diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py
index c3e97bf9..101014ab 100644
--- a/lib/git/objects/commit.py
+++ b/lib/git/objects/commit.py
@@ -6,14 +6,14 @@
 
 import re
 import time
-
+from git.utils import Iterable
 from git.actor import Actor
-from tree import Tree
 import git.diff as diff
 import git.stats as stats
+from tree import Tree
 import base
 
-class Commit(base.Object):
+class Commit(base.Object, Iterable):
 	"""
 	Wraps a git Commit object.
 	
@@ -37,7 +37,7 @@ class Commit(base.Object):
 		The parameter documentation indicates the type of the argument after a colon ':'.
 
 		``id``
-			is the sha id of the commit
+			is the sha id of the commit or a ref
 
 		``parents`` : tuple( Commit, ... )
 			is a tuple of commit ids or actual Commits
@@ -71,7 +71,7 @@ class Commit(base.Object):
 		# END for each parent to convert
 			
 		if self.id and tree is not None:
-			self.tree = Tree(repo, id=tree)
+			self.tree = Tree(repo, id=tree, path='')
 		# END id to tree conversion
 
 	def _set_cache_(self, attr):
@@ -80,8 +80,11 @@ class Commit(base.Object):
 		to be set.
 		We set all values at once.
 		"""
-		if attr in self.__slots__:
-			temp = Commit.find_all(self.repo, self.id, max_count=1)[0]
+		if attr in Commit.__slots__:
+			# prepare our data lines to match rev-list
+			data_lines = self.data.splitlines()
+			data_lines.insert(0, "commit %s" % self.id)
+			temp = self._iter_from_process_or_stream(self.repo, iter(data_lines)).next()
 			self.parents = temp.parents
 			self.tree = temp.tree
 			self.author = temp.author
@@ -120,7 +123,7 @@ class Commit(base.Object):
 		return len(repo.git.rev_list(ref, '--', path).strip().splitlines())
 
 	@classmethod
-	def find_all(cls, repo, ref, path='', **kwargs):
+	def iter_items(cls, repo, ref, path='', **kwargs):
 		"""
 		Find all commits matching the given criteria.
 
@@ -128,7 +131,7 @@ class Commit(base.Object):
 			is the Repo
 
 		``ref``
-			is the ref from which to begin (SHA1 or name)
+			is the ref from which to begin (SHA1, Head or name)
 
 		``path``
 			is an optinal path, if set only Commits that include the path 
@@ -140,55 +143,67 @@ class Commit(base.Object):
 			``skip`` is the number of commits to skip
 
 		Returns
-			git.Commit[]
+			iterator yielding Commit items
 		"""
-		options = {'pretty': 'raw'}
+		options = {'pretty': 'raw', 'as_process' : True }
 		options.update(kwargs)
 
-		output = repo.git.rev_list(ref, '--', path, **options)
-		return cls._list_from_string(repo, output)
+		# the test system might confront us with string values - 
+		proc = repo.git.rev_list(ref, '--', path, **options)
+		return cls._iter_from_process_or_stream(repo, proc)
 
 	@classmethod
-	def _list_from_string(cls, repo, text):
+	def _iter_from_process_or_stream(cls, repo, proc_or_stream):
 		"""
 		Parse out commit information into a list of Commit objects
 
 		``repo``
 			is the Repo
 
-		``text``
-			is the text output from the git-rev-list command (raw format)
+		``proc_or_stream``
+			git-rev-list process instance (raw format) or an iterator over its output lines
 
 		Returns
-			git.Commit[]
+			iterator returning Commit objects
 		"""
-		lines =text.splitlines(False)
-		commits = []
-
-		while lines:
-			id = lines.pop(0).split()[1]
-			tree = lines.pop(0).split()[1]
+		stream = proc_or_stream
+		if not hasattr(stream,'next'):
+			stream = proc_or_stream.stdout
+			
+		for line in stream:
+			id = line.split()[1]
+			assert line.split()[0] == "commit"
+			tree = stream.next().split()[1]
 
 			parents = []
-			while lines and lines[0].startswith('parent'):
-				parents.append(lines.pop(0).split()[-1])
-			# END while there are parent lines
-			author, authored_date = cls._actor(lines.pop(0))
-			committer, committed_date = cls._actor(lines.pop(0))
+			next_line = None
+			for parent_line in stream:
+				if not parent_line.startswith('parent'):
+					next_line = parent_line
+					break
+				# END abort reading parents
+				parents.append(parent_line.split()[-1])
+			# END for each parent line
+			
+			author, authored_date = cls._actor(next_line)
+			committer, committed_date = cls._actor(stream.next())
 			
-			# free line
-			lines.pop(0)
+			# empty line
+			stream.next()
 			
 			message_lines = []
-			while lines and not lines[0].startswith('commit'):
-				message_lines.append(lines.pop(0).strip())
+			next_line = None
+			for msg_line in stream:
+				if not msg_line.startswith('    '):
+					break
+				# END abort message reading 
+				message_lines.append(msg_line.strip())
 			# END while there are message lines
-			message = '\n'.join(message_lines[:-1])	# last line is empty
-
-			commits.append(Commit(repo, id=id, parents=parents, tree=tree, author=author, authored_date=authored_date,
-								  committer=committer, committed_date=committed_date, message=message))
-		# END while lines
-		return commits
+			message = '\n'.join(message_lines)
+			
+			yield Commit(repo, id=id, parents=parents, tree=tree, author=author, authored_date=authored_date,
+						  committer=committer, committed_date=committed_date, message=message)
+		# END for each line in stream
 
 	@classmethod
 	def diff(cls, repo, a, b=None, paths=None):
diff --git a/lib/git/objects/tag.py b/lib/git/objects/tag.py
index af1022f0..ecf6349d 100644
--- a/lib/git/objects/tag.py
+++ b/lib/git/objects/tag.py
@@ -8,7 +8,7 @@ Module containing all object based types.
 """
 import base
 import commit
-from util import get_object_type_by_name
+from utils import get_object_type_by_name
 
 class TagObject(base.Object):
 	"""
@@ -48,9 +48,8 @@ class TagObject(base.Object):
 		"""
 		Cache all our attributes at once
 		"""
-		if attr in self.__slots__:
-			output = self.repo.git.cat_file(self.type,self.id)
-			lines = output.split("\n")
+		if attr in TagObject.__slots__:
+			lines = self.data.splitlines()
 			
 			obj, hexsha = lines[0].split(" ")		# object <hexsha>
 			type_token, type_name = lines[1].split(" ") # type <type_name>
diff --git a/lib/git/objects/tree.py b/lib/git/objects/tree.py
index 273384a3..abfa9622 100644
--- a/lib/git/objects/tree.py
+++ b/lib/git/objects/tree.py
@@ -7,53 +7,125 @@
 import os
 import blob
 import base
+import binascii
+
+def sha_to_hex(sha):
+    """Return the hex string of the given binary sha; asserts that it is 40 characters long"""
+    hexsha = binascii.hexlify(sha)
+    assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % len(hexsha)
+    return hexsha
 
 class Tree(base.IndexObject):
+	"""
+	Trees represent an ordered list of Blobs and other Trees. Hence they can be
+	accessed like a list.
+	
+	Trees cache their contents after first retrieval to improve efficiency.
+	
+	``Tree as a list``::
+		
+		Access a specific blob using the  
+		tree['filename'] notation.
+		
+		You may as well access by index
+		blob = tree[0]
+		
+		
+	"""
 	
 	type = "tree"
-	__slots__ = "_contents"
+	__slots__ = "_cache"
+	
+	# using ascii codes for comparison 
+	ascii_commit_id = (0x31 << 4) + 0x36		
+	ascii_blob_id = (0x31 << 4) + 0x30
+	ascii_tree_id = (0x34 << 4) + 0x30
 	
-	def __init__(self, repo, id, mode=None, path=None):
+	
+	def __init__(self, repo, id, mode=0, path=None):
 		super(Tree, self).__init__(repo, id, mode, path)
 
 	def _set_cache_(self, attr):
-		if attr == "_contents":
-			# Read the tree contents.
-			self._contents = {}
-			for line in self.repo.git.ls_tree(self.id).splitlines():
-				obj = self.content__from_string(self.repo, line)
-				if obj is not None:
-					self._contents[obj.path] = obj
+		if attr == "_cache":
+			# Set the data when we need it
+			self._cache = self._get_tree_cache()
 		else:
 			super(Tree, self)._set_cache_(attr)
 
-	@staticmethod
-	def content__from_string(repo, text):
+	def _get_tree_cache(self):
 		"""
-		Parse a content item and create the appropriate object
-
-		``repo``
-			is the Repo
-
-		 ``text``
-			is the single line containing the items data in `git ls-tree` format
-
+		Return
+			list(object_instance, ...)
+		
+		``Note``
+			entries for which _iter_from_data yields None are left out
+		"""
+		out = list()
+		for obj in self._iter_from_data():
+			if obj is not None:
+				out.append(obj)
+			# END if object was handled
+		# END for each object parsed from the tree data
+		return out
+		
+		
+	def _iter_from_data(self):
+		"""
+		Reads the binary non-pretty printed representation of a tree and converts
+		it into Blob, Tree or Commit objects.
+		
+		Note: This method was inspired by the parse_tree method in dulwich.
+		
 		Returns
-			``git.Blob`` or ``git.Tree``
+			iterator yielding IndexObject instances, or None for commit entries
 		"""
-		try:
-			mode, typ, id, path = text.expandtabs(1).split(" ", 3)
-		except:
-			return None
+		ord_zero = ord('0')
+		data = self.data
+		len_data = len(data)
+		i = 0
+		while i < len_data:
+			mode = 0
+			mode_boundary = i + 6
+			
+			# keep it ascii - we compare against the respective values
+			type_id = (ord(data[i])<<4) + ord(data[i+1])
+			i += 2
+			
+			while data[i] != ' ':
+				# move existing mode integer up one level being 3 bits
+				# and add the actual ordinal value of the character
+				mode = (mode << 3) + (ord(data[i]) - ord_zero)
+				i += 1
+			# END while reading mode
+			
+			# byte is space now, skip it
+			i += 1
+			
+			# parse name, it is NULL separated
+			
+			ns = i
+			while data[i] != '\0':
+				i += 1
+			# END while not reached NULL
+			name = data[ns:i]
+			
+			# byte is NULL, get next 20
+			i += 1
+			sha = data[i:i+20]
+			i = i + 20
+			
+			hexsha = sha_to_hex(sha)
+			if type_id == self.ascii_blob_id:
+				yield blob.Blob(self.repo, hexsha, mode, name)
+			elif type_id == self.ascii_tree_id:
+				yield Tree(self.repo, hexsha, mode, name)
+			elif type_id == self.ascii_commit_id:
+				# todo 
+				yield None
+			else:
+				raise TypeError( "Unknown type found in tree data: %i" % type_id )
+		# END for each byte in data stream
 
-		if typ == "tree":
-			return Tree(repo, id, mode, path)
-		elif typ == "blob":
-			return blob.Blob(repo, id, mode, path)
-		elif typ == "commit":
-			return None 
-		else:
-		  raise(TypeError, "Invalid type: %s" % typ)
 
 	def __div__(self, file):
 		"""
@@ -67,36 +139,104 @@ class Tree(base.IndexObject):
 			<git.Blob "8b1e02c0fb554eed2ce2ef737a68bb369d7527df">
 
 		Returns
-			``git.Blob`` or ``git.Tree`` or ``None`` if not found
+			``git.Blob`` or ``git.Tree``
+		
+		Raise 
+			KeyError if given file or tree does not exist in tree
 		"""
-		return self.get(file)
+		return self[file]
 
 
 	def __repr__(self):
 		return '<git.Tree "%s">' % self.id
+		
+	@classmethod
+	def _iter_recursive(cls, repo, tree, cur_depth, max_depth, predicate ):
+		
+		for obj in tree:
+			# adjust path to be complete
+			obj.path = os.path.join(tree.path, obj.path)
+			if not predicate(obj):
+				continue
+			yield obj
+			if obj.type == "tree" and ( max_depth < 0 or cur_depth+1 <= max_depth ):
+				for recursive_obj in cls._iter_recursive( repo, obj, cur_depth+1, max_depth, predicate ):
+					yield recursive_obj
+				# END for each recursive object
+			# END if we may enter recursion
+		# END for each object
+		
+	def traverse(self, max_depth=-1, predicate = lambda i: True):
+		"""
+		Returns
+			Iterator to traverse the tree recursively up to the given level.
+			The iterator returns Blob and Tree objects
+		
+		``max_depth``
+		
+			if -1, the whole tree will be traversed
+			if 0, only the first level will be traversed which is the same as 
+			the default non-recursive iterator
+			
+		``predicate``
+		
+			If predicate(item) returns True, item will be returned by iterator
+		"""
+		return self._iter_recursive( self.repo, self, 0, max_depth, predicate )
+		
+	@property
+	def trees(self):
+		"""
+		Returns
+			list(Tree, ...) list of trees directly below this tree
+		"""
+		return [ i for i in self if i.type == "tree" ]
+		
+	@property
+	def blobs(self):
+		"""
+		Returns
+			list(Blob, ...) list of blobs directly below this tree
+		"""
+		return [ i for i in self if i.type == "blob" ]
 
-	# Implement the basics of the dict protocol:
-	# directories/trees can be seen as object dicts.
-	def __getitem__(self, key):
-		return self._contents[key]
 
+	# List protocol
+	def __getslice__(self,i,j):
+		return self._cache[i:j]
+		
 	def __iter__(self):
-		return iter(self._contents)
-
+		return iter(self._cache)
+		
 	def __len__(self):
-		return len(self._contents)
-
-	def __contains__(self, key):
-		return key in self._contents
-
-	def get(self, key):
-		return self._contents.get(key)
-
-	def items(self):
-		return self._contents.items()
-
-	def keys(self):
-		return self._contents.keys()
-
-	def values(self):
-		return self._contents.values()
+		return len(self._cache)
+		
+	def __getitem__(self,item):
+		if isinstance(item, int):
+			return self._cache[item]
+		
+		if isinstance(item, basestring):
+			# compatibility: allow lookup by path name
+			for obj in self._cache:
+				if obj.path == item:
+					return obj
+			# END for each obj
+			raise KeyError( "Blob or Tree named %s not found" % item )
+		# END index is basestring 
+		
+		raise TypeError( "Invalid index type: %r" % item )
+		
+		
+	def __contains__(self,item):
+		if isinstance(item, base.IndexObject):
+			return item in self._cache
+		
+		# compatibility: allow membership test by path name
+		for obj in self._cache:
+			if item == obj.path:
+				return True
+		# END for each item
+		return False
+	
+	def __reversed__(self):
+		return reversed(self._cache)
diff --git a/lib/git/objects/util.py b/lib/git/objects/utils.py
index 15c1d114..15c1d114 100644
--- a/lib/git/objects/util.py
+++ b/lib/git/objects/utils.py