Added the first design and framework for the object database. As a first step, loose objects will be written using our own utilities, and certain object retrieval functionality moves into the GitObjectDatabase, which is used by the repo instance.

Added a performance test for object database access, which shows quite respectable tree parsing performance and acceptable blob access. Nonetheless, it will be hard to beat the C implementation's performance using a pure Python implementation, but it is still good practice to write it, as it allows more direct pack manipulation. Some users could benefit from the ability to write packs, as these can serve as a local cache if alternates are used.
author: Sebastian Thiel <byronimo@gmail.com> 2010-06-02 20:11:00 +0200
committer: Sebastian Thiel <byronimo@gmail.com> 2010-06-02 20:11:00 +0200
commit: 282018b79cc8df078381097cb3aeb29ff56e83c6 (patch)
tree: f4ad72281d65d062239fb571fdd693e328883e55
parent: 538820055ce1bf9dd07ecda48210832f96194504 (diff)
download: gitpython-282018b79cc8df078381097cb3aeb29ff56e83c6.tar.gz
5 files changed, 210 insertions, 9 deletions
diff --git a/lib/git/errors.py b/lib/git/errors.py
index f66fb528..ecb1c35b 100644
--- a/lib/git/errors.py
+++ b/lib/git/errors.py
@@ -8,19 +8,16 @@ Module containing all exceptions thrown througout the git package,
 """
 
 class InvalidGitRepositoryError(Exception):
-    """
-    Thrown if the given repository appears to have an invalid format. 
-    """
+    """ Thrown if the given repository appears to have an invalid format.  """
+
+class InvalidDBRoot(Exception):
+	"""Thrown if an object database cannot be initialized at the given path"""
 
 class NoSuchPathError(OSError):
-    """
-    Thrown if a path could not be access by the system.
-    """
+    """ Thrown if a path could not be access by the system. """
 
 class GitCommandError(Exception):
-    """
-    Thrown if execution of the git command fails with non-zero status code.
-    """
+    """ Thrown if execution of the git command fails with non-zero status code. """
     def __init__(self, command, status, stderr=None):
         self.stderr = stderr
         self.status = status
diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py
new file mode 100644
index 00000000..17000244
--- /dev/null
+++ b/lib/git/odb/__init__.py
@@ -0,0 +1,2 @@
+"""Initialize the object database module"""
+
diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py
new file mode 100644
index 00000000..fd1b640a
--- /dev/null
+++ b/lib/git/odb/db.py
@@ -0,0 +1,129 @@
+"""Contains implementations of databases retrieving objects"""
+import os
+from git.errors import InvalidDBRoot
+
+
+class iObjectDBR(object):
+	"""Defines an interface for object database lookup.
+	Objects are identified either by hex-sha (40 bytes) or 
+	by sha (20 bytes)"""
+	__slots__ = tuple()
+	
+	#{ Query Interface 
+	def has_obj_hex(self, hexsha):
+		""":return: True if the object identified by the given 40 byte hexsha is 
+		contained in the database"""
+		raise NotImplementedError("To be implemented in subclass")
+		
+	def has_obj_bin(self, sha):
+		""":return: as ``has_obj_hex``, but takes a 20 byte binary sha"""
+		raise NotImplementedError("To be implemented in subclass")
+		
+	def obj_hex(self, hexsha):
+		""":return: tuple(type_string, size_in_bytes, stream) a tuple with object
+		information including its type, its size as well as a stream from which its
+		contents can be read"""
+		raise NotImplementedError("To be implemented in subclass")
+		
+	def obj_bin(self, sha):
+		""":return: as in ``obj_hex``, but takes a binary sha"""
+		raise NotImplementedError("To be implemented in subclass")
+		
+	def obj_info_hex(self, hexsha):
+		""":return: tuple(type_string, size_in_bytes) tuple with the object's type 
+			string as well as its size in bytes"""
+		raise NotImplementedError("To be implemented in subclass")
+			
+	#} END query interface
+	
+class iObjectDBW(object):
+	"""Defines an interface to create objects in the database"""
+	__slots__ = tuple()
+	
+	#{ Edit Interface
+	
+	def to_obj(self, type, size, stream, dry_run=False, sha_as_hex=True):
+		"""Create a new object in the database
+		:return: the sha identifying the object in the database
+		:param type: type string identifying the object
+		:param size: size of the data to read from stream
+		:param stream: stream providing the data
+		:param dry_run: if True, the object database will not actually be changed
+		:param sha_as_hex: if True, the returned sha identifying the object will be 
+			hex encoded, not binary"""
+		raise NotImplementedError("To be implemented in subclass")
+	
+	def to_objs(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0):
+		"""Create multiple new objects in the database
+		:return: sequence of shas identifying the created objects in the order in which 
+			they were given.
+		:param iter_info: iterable yielding tuples containing the type_string,
+			size_in_bytes and the stream with the content data.
+		:param dry_run: see ``to_obj``
+		:param sha_as_hex: see ``to_obj``
+		:param max_threads: if < 1, any number of threads may be started while processing
+			the request, otherwise the given number of threads will be started."""
+		# a trivial implementation, ignoring the threads for now
+		# TODO: add configuration to the class to determine whether we may 
+		# actually use multiple threads, default False of course. If the add
+		shas = list()
+		for args in iter_info:
+			shas.append(self.to_obj(*args, dry_run=dry_run, sha_as_hex=sha_as_hex))
+		return shas
+		
+	#} END edit interface
+	
+
+class FileDBBase(object):
+	"""Provides basic facilities to retrieve files of interest, including 
+	caching facilities to help mapping hexsha's to objects"""
+	__slots__ = ('_root_path', )
+	
+	def __init__(self, root_path):
+		"""Initialize this instance to look for its files at the given root path
+		All subsequent operations will be relative to this path
+		:raise InvalidDBRoot: 
+		:note: The base will perform basic checking for accessibility, but the subclass
+			is required to verify that the root_path contains the database structure it needs"""
+		if not os.path.isdir(root_path):
+			raise InvalidDBRoot(root_path)
+		self._root_path = root_path
+		
+		
+	#{ Interface 
+	def root_path(self):
+		""":return: path at which this db operates"""
+		return self._root_path
+	
+	#} END interface
+		
+	#{ Utilities
+	def _root_rela_path(self, rela_path):
+		""":return: the given relative path relative to our database root"""
+		return os.path.join(self._root_path, rela_path)
+		
+	#} END utilities
+	
+	
+class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW):
+	"""A database which operates on loose object files"""
+	
+	
+class PackedDB(FileDBBase, iObjectDBR):
+	"""A database operating on a set of object packs"""
+	
+	
+class CompoundDB(iObjectDBR):
+	"""A database which delegates calls to sub-databases"""
+	
+
+class ReferenceDB(CompoundDB):
+	"""A database consisting of database referred to in a file"""
+	
+	
+class GitObjectDB(CompoundDB, iObjectDBW):
+	"""A database representing the default git object store, which includes loose 
+	objects, pack files and an alternates file
+	
+	It will create objects only in the loose object database."""
+	
diff --git a/test/git/performance/test_odb.py b/test/git/performance/test_odb.py
new file mode 100644
index 00000000..0ad2ce33
--- /dev/null
+++ b/test/git/performance/test_odb.py
@@ -0,0 +1,61 @@
+"""Performance tests for object store"""
+
+from time import time
+import sys
+import stat
+
+from lib import (
+	TestBigRepoReadOnly
+	)
+
+
+class TestObjDBPerformance(TestBigRepoReadOnly):
+	
+	def test_random_access(self):
+		
+		# GET COMMITS
+		# TODO: use the actual db for this
+		st = time()
+		root_commit = self.gitrepo.commit(self.head_sha_2k)
+		commits = list(root_commit.traverse())
+		nc = len(commits)
+		elapsed = time() - st
+		
+		print >> sys.stderr, "Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (nc, elapsed, nc / elapsed)
+			
+			
+		# GET TREES
+		# walk all trees of all commits
+		st = time()
+		blobs_per_commit = list()
+		nt = 0
+		for commit in commits:
+			tree = commit.tree
+			blobs = list()
+			for item in tree.traverse():
+				nt += 1
+				if item.type == 'blob':
+					blobs.append(item)
+				# direct access for speed
+			# END while trees are there for walking
+			blobs_per_commit.append(blobs)
+		# END for each commit
+		elapsed = time() - st
+		
+		print >> sys.stderr, "Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (nt, len(commits), elapsed, nt / elapsed)
+		
+		# GET BLOBS
+		st = time()
+		nb = 0
+		too_many = 15000
+		for blob_list in blobs_per_commit:
+			for blob in blob_list:
+				blob.data
+			# END for each blobsha
+			nb += len(blob_list)
+			if nb > too_many:
+				break
+		# END for each bloblist
+		elapsed = time() - st
+		
+		print >> sys.stderr, "Retrieved %i blob and their data in %g s ( %f blobs / s )" % (nb, elapsed, nb / elapsed)
diff --git a/test/git/test_odb.py b/test/git/test_odb.py
new file mode 100644
index 00000000..6f92a5c1
--- /dev/null
+++ b/test/git/test_odb.py
@@ -0,0 +1,12 @@
+"""Test for object db"""
+
+from test.testlib import *
+from git.odb.db import *
+
+
+class TestDB(TestBase):
+	"""Test the different db class implementations"""
+	
+	def test_loose_db(self):
+		self.fail("todo")
+
author	Sebastian Thiel <byronimo@gmail.com>	2010-06-02 20:11:00 +0200
committer	Sebastian Thiel <byronimo@gmail.com>	2010-06-02 20:11:00 +0200
commit	282018b79cc8df078381097cb3aeb29ff56e83c6 (patch)
tree	f4ad72281d65d062239fb571fdd693e328883e55
parent	538820055ce1bf9dd07ecda48210832f96194504 (diff)
download	gitpython-282018b79cc8df078381097cb3aeb29ff56e83c6.tar.gz