From 282018b79cc8df078381097cb3aeb29ff56e83c6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 20:11:00 +0200 Subject: Added first design and frame for object database. In a first step, loose objects will be written using our utilities, and certain object retrieval functionality moves into the GitObjectDatabase which is used by the repo instance Added performance test for object database access, which shows quite respectable tree parsing performance, and okay blob access. Nonetheless, it will be hard to beat the c performance using a pure python implementation, but it can be a nice practice to write it anyway to allow more direct pack manipulations. Some could benefit from the ability to write packs as these can serve as local cache if alternates are used --- lib/git/odb/db.py | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 lib/git/odb/db.py (limited to 'lib/git/odb/db.py') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py new file mode 100644 index 00000000..fd1b640a --- /dev/null +++ b/lib/git/odb/db.py @@ -0,0 +1,129 @@ +"""Contains implementations of database retrieveing objects""" +import os +from git.errors import InvalidDBRoot + + +class iObjectDBR(object): + """Defines an interface for object database lookup. 
+ Objects are identified either by hex-sha (40 bytes) or + by sha (20 bytes)""" + __slots__ = tuple() + + #{ Query Interface + def has_obj_hex(self, hexsha): + """:return: True if the object identified by the given 40 byte hexsha is + contained in the database""" + raise NotImplementedError("To be implemented in subclass") + + def has_obj_bin(self, sha): + """:return: as ``has_obj_hex``, but takes a 20 byte binary sha""" + raise NotImplementedError("To be implemented in subclass") + + def obj_hex(self, hexsha): + """:return: tuple(type_string, size_in_bytes, stream) a tuple with object + information including its type, its size as well as a stream from which its + contents can be read""" + raise NotImplementedError("To be implemented in subclass") + + def obj_bin(self, sha): + """:return: as in ``obj_hex``, but takes a binary sha""" + raise NotImplementedError("To be implemented in subclass") + + def obj_info_hex(self, hexsha): + """:return: tuple(type_string, size_in_bytes) tuple with the object's type + string as well as its size in bytes""" + raise NotImplementedError("To be implemented in subclass") + + #} END query interface + +class iObjectDBW(object): + """Defines an interface to create objects in the database""" + __slots__ = tuple() + + #{ Edit Interface + + def to_obj(self, type, size, stream, dry_run=False, sha_as_hex=True): + """Create a new object in the database + :return: the sha identifying the object in the database + :param type: type string identifying the object + :param size: size of the data to read from stream + :param stream: stream providing the data + :param dry_run: if True, the object database will not actually be changed + :param sha_as_hex: if True, the returned sha identifying the object will be + hex encoded, not binary""" + raise NotImplementedError("To be implemented in subclass") + + def to_objs(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): + """Create multiple new objects in the database + :return: sequence of 
shas identifying the created objects in the order in which + they where given. + :param iter_info: iterable yielding tuples containing the type_string + size_in_bytes and the steam with the content data. + :param dry_run: see ``to_obj`` + :param sha_as_hex: see ``to_obj`` + :param max_threads: if < 1, any number of threads may be started while processing + the request, otherwise the given number of threads will be started.""" + # a trivial implementation, ignoring the threads for now + # TODO: add configuration to the class to determine whether we may + # actually use multiple threads, default False of course. If the add + shas = list() + for args in iter_info: + shas.append(self.to_obj(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) + return shas + + #} END edit interface + + +class FileDBBase(object): + """Provides basic facilities to retrieve files of interest, including + caching facilities to help mapping hexsha's to objects""" + __slots__ = ('_root_path', ) + + def __init__(self, root_path): + """Initialize this instance to look for its files at the given root path + All subsequent operations will be relative to this path + :raise InvalidDBRoot: + :note: The base will perform basic checking for accessability, but the subclass + is required to verify that the root_path contains the database structure it needs""" + if not os.path.isdir(root_path): + raise InvalidDBRoot(root_path) + self._root_path = root_path + + + #{ Interface + def root_path(self): + """:return: path at which this db operates""" + return self._root_path + + #} END interface + + #{ Utiltities + def _root_rela_path(self, rela_path): + """:return: the given relative path relative to our database root""" + return os.path.join(self._root_path, rela_path) + + #} END utilities + + +class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): + """A database which operates on loose object files""" + + +class PackedDB(FileDBBase, iObjectDBR): + """A database operating on a set of object packs""" + + 
+class CompoundDB(iObjectDBR): + """A database which delegates calls to sub-databases""" + + +class ReferenceDB(CompoundDB): + """A database consisting of database referred to in a file""" + + +class GitObjectDB(CompoundDB, iObjectDBW): + """A database representing the default git object store, which includes loose + objects, pack files and an alternates file + + It will create objects only in the loose object database.""" + -- cgit v1.2.1 From 8b86f9b399a8f5af792a04025fdeefc02883f3e5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 22:40:52 +0200 Subject: initial version of loose object writing and simple cached object lookup appears to be working --- lib/git/odb/db.py | 148 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 124 insertions(+), 24 deletions(-) (limited to 'lib/git/odb/db.py') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index fd1b640a..204da9ad 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -1,6 +1,21 @@ """Contains implementations of database retrieveing objects""" import os from git.errors import InvalidDBRoot +from git.utils import IndexFileSHA1Writer + +from utils import ( + to_hex_sha, + exists, + hex_to_bin, + FDCompressedSha1Writer, + isdir, + mkdir, + rename, + dirname, + join + ) + +import tempfile class iObjectDBR(object): @@ -9,29 +24,29 @@ class iObjectDBR(object): by sha (20 bytes)""" __slots__ = tuple() + def __contains__(self, sha): + return self.has_obj + #{ Query Interface - def has_obj_hex(self, hexsha): - """:return: True if the object identified by the given 40 byte hexsha is - contained in the database""" - raise NotImplementedError("To be implemented in subclass") - - def has_obj_bin(self, sha): - """:return: as ``has_obj_hex``, but takes a 20 byte binary sha""" - raise NotImplementedError("To be implemented in subclass") - - def obj_hex(self, hexsha): - """:return: tuple(type_string, size_in_bytes, stream) a tuple with object - information including its type, its size 
as well as a stream from which its - contents can be read""" + def has_object(self, sha): + """ + :return: True if the object identified by the given 40 byte hexsha or 20 bytes + binary sha is contained in the database""" raise NotImplementedError("To be implemented in subclass") - def obj_bin(self, sha): - """:return: as in ``obj_hex``, but takes a binary sha""" + def object(self, sha): + """ + :return: tuple(type_string, size_in_bytes, stream) a tuple with object + information including its type, its size as well as a stream from which its + contents can be read + :param sha: 40 bytes hexsha or 20 bytes binary sha """ raise NotImplementedError("To be implemented in subclass") - def obj_info_hex(self, hexsha): - """:return: tuple(type_string, size_in_bytes) tuple with the object's type - string as well as its size in bytes""" + def object_info(self, sha): + """ + :return: tuple(type_string, size_in_bytes) tuple with the object's type + string as well as its size in bytes + :param sha: 40 bytes hexsha or 20 bytes binary sha""" raise NotImplementedError("To be implemented in subclass") #} END query interface @@ -42,7 +57,7 @@ class iObjectDBW(object): #{ Edit Interface - def to_obj(self, type, size, stream, dry_run=False, sha_as_hex=True): + def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): """Create a new object in the database :return: the sha identifying the object in the database :param type: type string identifying the object @@ -53,7 +68,7 @@ class iObjectDBW(object): hex encoded, not binary""" raise NotImplementedError("To be implemented in subclass") - def to_objs(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): + def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): """Create multiple new objects in the database :return: sequence of shas identifying the created objects in the order in which they where given. 
@@ -68,7 +83,7 @@ class iObjectDBW(object): # actually use multiple threads, default False of course. If the add shas = list() for args in iter_info: - shas.append(self.to_obj(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) + shas.append(self.to_object(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) return shas #} END edit interface @@ -95,18 +110,103 @@ class FileDBBase(object): """:return: path at which this db operates""" return self._root_path + def db_path(self, rela_path): + """ + :return: the given relative path relative to our database root, allowing + to pontentially access datafiles""" + return join(self._root_path, rela_path) #} END interface #{ Utiltities - def _root_rela_path(self, rela_path): - """:return: the given relative path relative to our database root""" - return os.path.join(self._root_path, rela_path) + #} END utilities class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): """A database which operates on loose object files""" + __slots__ = ('_hexsha_to_file', ) + + # CONFIGURATION + # chunks in which data will be copied between streams + stream_chunk_size = 1000*1000 + + def __init__(self, root_path): + super(LooseObjectDB, self).__init__(root_path) + self._hexsha_to_file = dict() + + #{ Interface + def hexsha_to_object_path(self, hexsha): + """ + :return: path at which the object with the given hexsha would be stored, + relative to the database root""" + return join(hexsha[:2], hexsha[2:]) + + #} END interface + + def has_object(self, sha): + sha = to_hex_sha(sha) + # try cache + if sha in self._hexsha_to_file: + return True + + # try filesystem + path = self.db_path(self.hexsha_to_object_path(sha)) + if exists(path): + self._hexsha_to_file[sha] = path + return True + # END handle cache + return False + + def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): + # open a tmp file to write the data to + fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) + writer = FDCompressedSha1Writer(fd) + + # WRITE 
HEADER: type SP size NULL + writer.write("%s %i%s" % (type, size, chr(0))) + + # WRITE ALL DATA + chunksize = self.stream_chunk_size + try: + try: + while True: + data_len = writer.write(stream.read(chunksize)) + if data_len < chunksize: + # WRITE FOOTER + writer.write('\n') + break + # END check for stream end + # END duplicate data + finally: + writer.close() + # END assure file was closed + except: + os.remove(tmp_path) + raise + # END assure tmpfile removal on error + + + # in dry-run mode, we delete the file afterwards + sha = writer.sha(as_hex=True) + + if dry_run: + os.remove(tmp_path) + else: + # rename the file into place + obj_path = self.db_path(self.hexsha_to_object_path(sha)) + obj_dir = dirname(obj_path) + if not isdir(obj_dir): + mkdir(obj_dir) + # END handle destination directory + rename(tmp_path, obj_path) + # END handle dry_run + + if not sha_as_hex: + sha = hex_to_bin(sha) + # END handle sha format + + return sha class PackedDB(FileDBBase, iObjectDBR): -- cgit v1.2.1 From 6f8ce8901e21587cd2320562df412e05b5ab1731 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 23:53:29 +0200 Subject: added frame for object reading, including simple test --- lib/git/odb/db.py | 76 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 14 deletions(-) (limited to 'lib/git/odb/db.py') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 204da9ad..1248a3f4 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -1,9 +1,13 @@ """Contains implementations of database retrieveing objects""" import os -from git.errors import InvalidDBRoot +from git.errors import ( + InvalidDBRoot, + BadObject + ) from git.utils import IndexFileSHA1Writer from utils import ( + getsize, to_hex_sha, exists, hex_to_bin, @@ -16,6 +20,7 @@ from utils import ( ) import tempfile +import mmap class iObjectDBR(object): @@ -136,27 +141,70 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): self._hexsha_to_file = dict() #{ 
Interface - def hexsha_to_object_path(self, hexsha): + def object_path(self, hexsha): """ :return: path at which the object with the given hexsha would be stored, relative to the database root""" return join(hexsha[:2], hexsha[2:]) - #} END interface - - def has_object(self, sha): - sha = to_hex_sha(sha) - # try cache - if sha in self._hexsha_to_file: - return True + def readable_db_object_path(self, hexsha): + """ + :return: readable object path to the object identified by hexsha + :raise BadObject: If the object file does not exist""" + try: + return self._hexsha_to_file[hexsha] + except KeyError: + pass + # END ignore cache misses # try filesystem - path = self.db_path(self.hexsha_to_object_path(sha)) + path = self.db_path(self.object_path(hexsha)) if exists(path): - self._hexsha_to_file[sha] = path - return True + self._hexsha_to_file[hexsha] = path + return path # END handle cache - return False + raise BadObject(hexsha) + + #} END interface + + def _object_header_info(self, mmap): + """:return: tuple(type_string, uncompressed_size_in_bytes + :param mmap: newly mapped memory map at position 0. 
It will be + seeked to the actual start of the object contents, which can be used + to initialize a zlib decompress object.""" + raise NotImplementedError("todo") + + def _map_object(self, sha): + """ + :return: tuple(file, mmap) tuple with an opened file for reading, and + a memory map of that file""" + db_path = self.readable_db_object_path(to_hex_sha(sha)) + f = open(db_path, 'rb') + m = mmap.mmap(f.fileno(), getsize(db_path), access=mmap.ACCESS_READ) + return (f, m) + + def object_info(self, sha): + f, m = self._map_object(sha) + try: + type, size = self._object_header_info(m) + finally: + f.close() + m.close() + # END assure release of system resources + + def object(self, sha): + f, m = self._map_object(sha) + type, size = self._object_header_info(m) + # TODO: init a dynamic decompress stream from our memory map + + + def has_object(self, sha): + try: + self.readable_db_object_path(to_hex_sha(sha)) + return True + except BadObject: + return False + # END check existance def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): # open a tmp file to write the data to @@ -194,7 +242,7 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): os.remove(tmp_path) else: # rename the file into place - obj_path = self.db_path(self.hexsha_to_object_path(sha)) + obj_path = self.db_path(self.object_path(sha)) obj_dir = dirname(obj_path) if not isdir(obj_dir): mkdir(obj_dir) -- cgit v1.2.1 From 38d59fc8ccccae8882fa48671377bf40a27915a7 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 16:35:35 +0200 Subject: odb: implemented loose object streaming, which is impossible to do efficiently considering that it copies string buffers all the time --- lib/git/odb/db.py | 114 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 63 insertions(+), 51 deletions(-) (limited to 'lib/git/odb/db.py') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 1248a3f4..5c50a512 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ 
-1,17 +1,18 @@ """Contains implementations of database retrieveing objects""" -import os +from git.utils import IndexFileSHA1Writer from git.errors import ( InvalidDBRoot, - BadObject + BadObject, + BadObjectType ) -from git.utils import IndexFileSHA1Writer from utils import ( - getsize, + DecompressMemMapReader, + FDCompressedSha1Writer, + ENOENT, to_hex_sha, exists, hex_to_bin, - FDCompressedSha1Writer, isdir, mkdir, rename, @@ -19,8 +20,15 @@ from utils import ( join ) +from fun import ( + chunk_size, + loose_object_header_info, + write_object + ) + import tempfile import mmap +import os class iObjectDBR(object): @@ -36,7 +44,8 @@ class iObjectDBR(object): def has_object(self, sha): """ :return: True if the object identified by the given 40 byte hexsha or 20 bytes - binary sha is contained in the database""" + binary sha is contained in the database + :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def object(self, sha): @@ -44,14 +53,16 @@ class iObjectDBR(object): :return: tuple(type_string, size_in_bytes, stream) a tuple with object information including its type, its size as well as a stream from which its contents can be read - :param sha: 40 bytes hexsha or 20 bytes binary sha """ + :param sha: 40 bytes hexsha or 20 bytes binary sha + :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def object_info(self, sha): """ :return: tuple(type_string, size_in_bytes) tuple with the object's type string as well as its size in bytes - :param sha: 40 bytes hexsha or 20 bytes binary sha""" + :param sha: 40 bytes hexsha or 20 bytes binary sha + :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") #} END query interface @@ -70,7 +81,8 @@ class iObjectDBW(object): :param stream: stream providing the data :param dry_run: if True, the object database will not actually be changed :param sha_as_hex: if True, the returned sha identifying the object will be - hex encoded, not binary""" + hex 
encoded, not binary + :raise IOError: if data could not be written""" raise NotImplementedError("To be implemented in subclass") def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): @@ -82,7 +94,8 @@ class iObjectDBW(object): :param dry_run: see ``to_obj`` :param sha_as_hex: see ``to_obj`` :param max_threads: if < 1, any number of threads may be started while processing - the request, otherwise the given number of threads will be started.""" + the request, otherwise the given number of threads will be started. + :raise IOError: if data could not be written""" # a trivial implementation, ignoring the threads for now # TODO: add configuration to the class to determine whether we may # actually use multiple threads, default False of course. If the add @@ -130,15 +143,19 @@ class FileDBBase(object): class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): """A database which operates on loose object files""" - __slots__ = ('_hexsha_to_file', ) - + __slots__ = ('_hexsha_to_file', '_fd_open_flags') # CONFIGURATION # chunks in which data will be copied between streams - stream_chunk_size = 1000*1000 + stream_chunk_size = chunk_size + def __init__(self, root_path): super(LooseObjectDB, self).__init__(root_path) self._hexsha_to_file = dict() + # Additional Flags - might be set to 0 after the first failure + # Depending on the root, this might work for some mounts, for others not, which + # is why it is per instance + self._fd_open_flags = os.O_NOATIME #{ Interface def object_path(self, hexsha): @@ -167,36 +184,46 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): #} END interface - def _object_header_info(self, mmap): - """:return: tuple(type_string, uncompressed_size_in_bytes - :param mmap: newly mapped memory map at position 0. 
It will be - seeked to the actual start of the object contents, which can be used - to initialize a zlib decompress object.""" - raise NotImplementedError("todo") - - def _map_object(self, sha): + def _map_loose_object(self, sha): """ - :return: tuple(file, mmap) tuple with an opened file for reading, and - a memory map of that file""" - db_path = self.readable_db_object_path(to_hex_sha(sha)) - f = open(db_path, 'rb') - m = mmap.mmap(f.fileno(), getsize(db_path), access=mmap.ACCESS_READ) - return (f, m) + :return: memory map of that file to allow random read access + :raise BadObject: if object could not be located""" + db_path = self.db_path(self.object_path(to_hex_sha(sha))) + try: + fd = os.open(db_path, os.O_RDONLY|self._fd_open_flags) + except OSError,e: + if e.errno != ENOENT: + # try again without noatime + try: + fd = os.open(db_path, os.O_RDONLY) + except OSError: + raise BadObject(to_hex_sha(sha)) + # didn't work because of our flag, don't try it again + self._fd_open_flags = 0 + else: + raise BadObject(to_hex_sha(sha)) + # END handle error + # END exception handling + try: + return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) + finally: + os.close(fd) + # END assure file is closed def object_info(self, sha): - f, m = self._map_object(sha) + m = self._map_loose_object(sha) try: - type, size = self._object_header_info(m) + return loose_object_header_info(m) finally: - f.close() m.close() # END assure release of system resources def object(self, sha): - f, m = self._map_object(sha) - type, size = self._object_header_info(m) - # TODO: init a dynamic decompress stream from our memory map + m = self._map_loose_object(sha) + reader = DecompressMemMapReader(m, close_on_deletion = True) + type, size = reader.initialize() + return type, size, reader def has_object(self, sha): try: @@ -210,25 +237,10 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): # open a tmp file to write the data to fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) 
writer = FDCompressedSha1Writer(fd) - - # WRITE HEADER: type SP size NULL - writer.write("%s %i%s" % (type, size, chr(0))) - - # WRITE ALL DATA - chunksize = self.stream_chunk_size + try: - try: - while True: - data_len = writer.write(stream.read(chunksize)) - if data_len < chunksize: - # WRITE FOOTER - writer.write('\n') - break - # END check for stream end - # END duplicate data - finally: - writer.close() - # END assure file was closed + write_object(type, size, stream, writer, + close_target_stream=True, chunk_size=self.stream_chunk_size) except: os.remove(tmp_path) raise -- cgit v1.2.1 From 26e138cb47dccc859ff219f108ce9b7d96cbcbcd Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 18:21:05 +0200 Subject: odb: fixed streamed decompression reader ( specific tests would still be missing ) and added performance tests which are extremely promising --- lib/git/odb/db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/git/odb/db.py') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 5c50a512..e656b2b5 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -91,8 +91,8 @@ class iObjectDBW(object): they where given. :param iter_info: iterable yielding tuples containing the type_string size_in_bytes and the steam with the content data. - :param dry_run: see ``to_obj`` - :param sha_as_hex: see ``to_obj`` + :param dry_run: see ``to_object`` + :param sha_as_hex: see ``to_object`` :param max_threads: if < 1, any number of threads may be started while processing the request, otherwise the given number of threads will be started. 
:raise IOError: if data could not be written""" -- cgit v1.2.1 From 1906ee4df9ae4e734288c5203cf79894dff76cab Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 23:27:09 +0200 Subject: Fixed compatibility issues with python 2.5, made sure all tests run --- lib/git/odb/db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/git/odb/db.py') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index e656b2b5..c970410d 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -101,7 +101,7 @@ class iObjectDBW(object): # actually use multiple threads, default False of course. If the add shas = list() for args in iter_info: - shas.append(self.to_object(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) + shas.append(self.to_object(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) return shas #} END edit interface @@ -155,7 +155,7 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): # Additional Flags - might be set to 0 after the first failure # Depending on the root, this might work for some mounts, for others not, which # is why it is per instance - self._fd_open_flags = os.O_NOATIME + self._fd_open_flags = getattr(os, 'O_NOATIME', 0) #{ Interface def object_path(self, hexsha): -- cgit v1.2.1 From b01ca6a3e4ae9d944d799743c8ff774e2a7a82b6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 4 Jun 2010 00:09:00 +0200 Subject: db: implemented GitObjectDB using the git command to make sure we can lookup everything.
Next is to implement pack-file reading, then alternates which should allow to resolve everything --- lib/git/odb/db.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'lib/git/odb/db.py') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index c970410d..1d1d4c40 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -281,9 +281,27 @@ class ReferenceDB(CompoundDB): """A database consisting of database referred to in a file""" -class GitObjectDB(CompoundDB, iObjectDBW): +#class GitObjectDB(CompoundDB, iObjectDBW): +class GitObjectDB(LooseObjectDB): """A database representing the default git object store, which includes loose objects, pack files and an alternates file - It will create objects only in the loose object database.""" + It will create objects only in the loose object database. + :note: for now, we use the git command to do all the lookup, just until he + have packs and the other implementations + """ + __slots__ = ('_git', ) + def __init__(self, root_path, git): + """Initialize this instance with the root and a git command""" + super(GitObjectDB, self).__init__(root_path) + self._git = git + + def object_info(self, sha): + discard, type, size = self._git.get_object_header(sha) + return type, size + + def object(self, sha): + """For now, all lookup is done by git itself""" + discard, type, size, stream = self._git.stream_object_data(sha) + return type, size, stream -- cgit v1.2.1 From a1e80445ad5cb6da4c0070d7cb8af89da3b0803b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 4 Jun 2010 14:41:15 +0200 Subject: initial version of new odb design to facilitate a channel based multi-threading implementation of all odb functions --- lib/git/odb/db.py | 114 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 72 insertions(+), 42 deletions(-) (limited to 'lib/git/odb/db.py') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 1d1d4c40..7ae8f446 100644 --- a/lib/git/odb/db.py +++ 
b/lib/git/odb/db.py @@ -6,9 +6,12 @@ from git.errors import ( BadObjectType ) -from utils import ( +from stream import ( DecompressMemMapReader, - FDCompressedSha1Writer, + FDCompressedSha1Writer + ) + +from utils import ( ENOENT, to_hex_sha, exists, @@ -31,7 +34,7 @@ import mmap import os -class iObjectDBR(object): +class ObjectDBR(object): """Defines an interface for object database lookup. Objects are identified either by hex-sha (40 bytes) or by sha (20 bytes)""" @@ -48,62 +51,87 @@ class iObjectDBR(object): :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") - def object(self, sha): - """ - :return: tuple(type_string, size_in_bytes, stream) a tuple with object - information including its type, its size as well as a stream from which its - contents can be read + def info(self, sha): + """ :return: ODB_Info instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") - def object_info(self, sha): - """ - :return: tuple(type_string, size_in_bytes) tuple with the object's type - string as well as its size in bytes + def info_async(self, input_channel): + """Retrieve information of a multitude of objects asynchronously + :param input_channel: Channel yielding the sha's of the objects of interest + :return: Channel yielding ODB_Info|InvalidODB_Info, in any order""" + raise NotImplementedError("To be implemented in subclass") + + def stream(self, sha): + """:return: ODB_OStream instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") + + def stream_async(self, input_channel): + """Retrieve the ODB_OStream of multiple objects + :param input_channel: see ``info`` + :param max_threads: see ``ObjectDBW.store`` + :return: Channel yielding ODB_OStream|InvalidODB_OStream instances in any order""" + raise NotImplementedError("To be implemented in subclass") #} END query interface -class 
iObjectDBW(object): +class ObjectDBW(object): """Defines an interface to create objects in the database""" - __slots__ = tuple() + __slots__ = "_ostream" + + def __init__(self, *args, **kwargs): + self._ostream = None #{ Edit Interface + def set_ostream(self, stream): + """Adjusts the stream to which all data should be sent when storing new objects + :param stream: if not None, the stream to use, if None the default stream + will be used. + :return: previously installed stream, or None if there was no override + :raise TypeError: if the stream doesn't have the supported functionality""" + cstream = self._ostream + self._ostream = stream + return cstream + + def ostream(self): + """:return: overridden output stream this instance will write to, or None + if it will write to the default stream""" + return self._ostream - def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): + def store(self, istream): """Create a new object in the database - :return: the sha identifying the object in the database - :param type: type string identifying the object - :param size: size of the data to read from stream - :param stream: stream providing the data - :param dry_run: if True, the object database will not actually be changed - :param sha_as_hex: if True, the returned sha identifying the object will be - hex encoded, not binary + :return: the input istream object with its sha set to its corresponding value + :param istream: ODB_IStream compatible instance. If its sha is already set + to a value, the object will just be stored in the our database format, + in which case the input stream is expected to be in object format ( header + contents ). 
:raise IOError: if data could not be written""" raise NotImplementedError("To be implemented in subclass") - def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): - """Create multiple new objects in the database - :return: sequence of shas identifying the created objects in the order in which - they where given. - :param iter_info: iterable yielding tuples containing the type_string - size_in_bytes and the steam with the content data. - :param dry_run: see ``to_object`` - :param sha_as_hex: see ``to_object`` - :param max_threads: if < 1, any number of threads may be started while processing - the request, otherwise the given number of threads will be started. - :raise IOError: if data could not be written""" + def store_async(self, input_channel): + """Create multiple new objects in the database asynchronously. The method will + return right away, returning an output channel which receives the results as + they are computed. + + :return: Channel yielding your ODB_IStream which served as input, in any order. + The IStreams sha will be set to the sha it received during the process, + or its error attribute will be set to the exception informing about the error. + :param input_channel: Channel yielding ODB_IStream instance. + As the same instances will be used in the output channel, you can create a map + between the id(istream) -> istream + :note:As some ODB implementations implement this operation as atomic, they might + abort the whole operation if one item could not be processed. Hence check how + many items have actually been produced.""" # a trivial implementation, ignoring the threads for now # TODO: add configuration to the class to determine whether we may # actually use multiple threads, default False of course. 
If the add shas = list() for args in iter_info: - shas.append(self.to_object(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) + shas.append(self.store(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) return shas - + #} END edit interface @@ -118,6 +146,7 @@ class FileDBBase(object): :raise InvalidDBRoot: :note: The base will perform basic checking for accessability, but the subclass is required to verify that the root_path contains the database structure it needs""" + super(FileDBBase, self).__init__() if not os.path.isdir(root_path): raise InvalidDBRoot(root_path) self._root_path = root_path @@ -141,7 +170,7 @@ class FileDBBase(object): #} END utilities -class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): +class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): """A database which operates on loose object files""" __slots__ = ('_hexsha_to_file', '_fd_open_flags') # CONFIGURATION @@ -210,7 +239,7 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): os.close(fd) # END assure file is closed - def object_info(self, sha): + def info(self, sha): m = self._map_loose_object(sha) try: return loose_object_header_info(m) @@ -233,8 +262,9 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): return False # END check existance - def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): + def store(self, istream): # open a tmp file to write the data to + # todo: implement ostream properly fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) writer = FDCompressedSha1Writer(fd) @@ -269,11 +299,11 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): return sha -class PackedDB(FileDBBase, iObjectDBR): +class PackedDB(FileDBBase, ObjectDBR): """A database operating on a set of object packs""" -class CompoundDB(iObjectDBR): +class CompoundDB(ObjectDBR): """A database which delegates calls to sub-databases""" @@ -281,7 +311,7 @@ class ReferenceDB(CompoundDB): """A database consisting of database referred to in a file""" -#class 
GitObjectDB(CompoundDB, iObjectDBW): +#class GitObjectDB(CompoundDB, ObjectDBW): class GitObjectDB(LooseObjectDB): """A database representing the default git object store, which includes loose objects, pack files and an alternates file @@ -296,7 +326,7 @@ class GitObjectDB(LooseObjectDB): super(GitObjectDB, self).__init__(root_path) self._git = git - def object_info(self, sha): + def info(self, sha): discard, type, size = self._git.get_object_header(sha) return type, size -- cgit v1.2.1 From e746f96bcc29238b79118123028ca170adc4ff0f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 4 Jun 2010 17:22:08 +0200 Subject: Fixed implementation after design change to deal with it - all tests run, but next there will have to be more through testing --- lib/git/odb/db.py | 116 +++++++++++++++++++++++++++--------------------------- 1 file changed, 58 insertions(+), 58 deletions(-) (limited to 'lib/git/odb/db.py') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 7ae8f446..a8de28ec 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -8,7 +8,10 @@ from git.errors import ( from stream import ( DecompressMemMapReader, - FDCompressedSha1Writer + FDCompressedSha1Writer, + Sha1Writer, + OStream, + OInfo ) from utils import ( @@ -34,11 +37,13 @@ import mmap import os +__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'LooseObjectDB', 'PackedDB', + 'CompoundDB', 'ReferenceDB', 'GitObjectDB' ) + class ObjectDBR(object): """Defines an interface for object database lookup. 
Objects are identified either by hex-sha (40 bytes) or by sha (20 bytes)""" - __slots__ = tuple() def __contains__(self, sha): return self.has_obj @@ -52,7 +57,7 @@ class ObjectDBR(object): raise NotImplementedError("To be implemented in subclass") def info(self, sha): - """ :return: ODB_Info instance + """ :return: OInfo instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") @@ -60,27 +65,26 @@ class ObjectDBR(object): def info_async(self, input_channel): """Retrieve information of a multitude of objects asynchronously :param input_channel: Channel yielding the sha's of the objects of interest - :return: Channel yielding ODB_Info|InvalidODB_Info, in any order""" + :return: Channel yielding OInfo|InvalidOInfo, in any order""" raise NotImplementedError("To be implemented in subclass") def stream(self, sha): - """:return: ODB_OStream instance + """:return: OStream instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def stream_async(self, input_channel): - """Retrieve the ODB_OStream of multiple objects + """Retrieve the OStream of multiple objects :param input_channel: see ``info`` :param max_threads: see ``ObjectDBW.store`` - :return: Channel yielding ODB_OStream|InvalidODB_OStream instances in any order""" + :return: Channel yielding OStream|InvalidOStream instances in any order""" raise NotImplementedError("To be implemented in subclass") #} END query interface class ObjectDBW(object): """Defines an interface to create objects in the database""" - __slots__ = "_ostream" def __init__(self, *args, **kwargs): self._ostream = None @@ -99,12 +103,12 @@ class ObjectDBW(object): def ostream(self): """:return: overridden output stream this instance will write to, or None if it will write to the default stream""" - return self._ostream + return self._ostream def store(self, istream): """Create a new 
object in the database :return: the input istream object with its sha set to its corresponding value - :param istream: ODB_IStream compatible instance. If its sha is already set + :param istream: IStream compatible instance. If its sha is already set to a value, the object will just be stored in the our database format, in which case the input stream is expected to be in object format ( header + contents ). :raise IOError: if data could not be written""" @@ -115,22 +119,16 @@ class ObjectDBW(object): return right away, returning an output channel which receives the results as they are computed. - :return: Channel yielding your ODB_IStream which served as input, in any order. + :return: Channel yielding your IStream which served as input, in any order. The IStreams sha will be set to the sha it received during the process, or its error attribute will be set to the exception informing about the error. - :param input_channel: Channel yielding ODB_IStream instance. + :param input_channel: Channel yielding IStream instance. As the same instances will be used in the output channel, you can create a map between the id(istream) -> istream :note:As some ODB implementations implement this operation as atomic, they might abort the whole operation if one item could not be processed. Hence check how many items have actually been produced.""" - # a trivial implementation, ignoring the threads for now - # TODO: add configuration to the class to determine whether we may - # actually use multiple threads, default False of course. 
If the add - shas = list() - for args in iter_info: - shas.append(self.store(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) - return shas + raise NotImplementedError("To be implemented in subclass") #} END edit interface @@ -138,7 +136,6 @@ class ObjectDBW(object): class FileDBBase(object): """Provides basic facilities to retrieve files of interest, including caching facilities to help mapping hexsha's to objects""" - __slots__ = ('_root_path', ) def __init__(self, root_path): """Initialize this instance to look for its files at the given root path @@ -164,15 +161,11 @@ class FileDBBase(object): return join(self._root_path, rela_path) #} END interface - #{ Utiltities - - - #} END utilities class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): """A database which operates on loose object files""" - __slots__ = ('_hexsha_to_file', '_fd_open_flags') + # CONFIGURATION # chunks in which data will be copied between streams stream_chunk_size = chunk_size @@ -238,21 +231,26 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): finally: os.close(fd) # END assure file is closed + + def set_ostream(self, stream): + """:raise TypeError: if the stream does not support the Sha1Writer interface""" + if stream is not None and not isinstance(stream, Sha1Writer): + raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) + return super(LooseObjectDB, self).set_ostream(stream) def info(self, sha): m = self._map_loose_object(sha) try: - return loose_object_header_info(m) + type, size = loose_object_header_info(m) + return OInfo(sha, type, size) finally: m.close() # END assure release of system resources - def object(self, sha): + def stream(self, sha): m = self._map_loose_object(sha) - reader = DecompressMemMapReader(m, close_on_deletion = True) - type, size = reader.initialize() - - return type, size, reader + type, size, stream = DecompressMemMapReader.new(m, close_on_deletion = True) + return OStream(sha, type, size, stream) def has_object(self, 
sha): try: @@ -263,27 +261,33 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): # END check existance def store(self, istream): - # open a tmp file to write the data to - # todo: implement ostream properly - fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) - writer = FDCompressedSha1Writer(fd) + """note: The sha we produce will be hex by nature""" + assert istream.sha is None, "Direct istream writing not yet implemented" + tmp_path = None + writer = self.ostream() + if writer is None: + # open a tmp file to write the data to + fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) + writer = FDCompressedSha1Writer(fd) + # END handle custom writer try: - write_object(type, size, stream, writer, - close_target_stream=True, chunk_size=self.stream_chunk_size) - except: - os.remove(tmp_path) - raise - # END assure tmpfile removal on error - + try: + write_object(istream.type, istream.size, istream.read, writer.write, + chunk_size=self.stream_chunk_size) + except: + if tmp_path: + os.remove(tmp_path) + raise + # END assure tmpfile removal on error + finally: + if tmp_path: + writer.close() + # END assure target stream is closed - # in dry-run mode, we delete the file afterwards sha = writer.sha(as_hex=True) - if dry_run: - os.remove(tmp_path) - else: - # rename the file into place + if tmp_path: obj_path = self.db_path(self.object_path(sha)) obj_dir = dirname(obj_path) if not isdir(obj_dir): @@ -292,11 +296,8 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): rename(tmp_path, obj_path) # END handle dry_run - if not sha_as_hex: - sha = hex_to_bin(sha) - # END handle sha format - - return sha + istream.sha = sha + return istream class PackedDB(FileDBBase, ObjectDBR): @@ -320,18 +321,17 @@ class GitObjectDB(LooseObjectDB): :note: for now, we use the git command to do all the lookup, just until he have packs and the other implementations """ - __slots__ = ('_git', ) def __init__(self, root_path, git): """Initialize this instance 
with the root and a git command""" super(GitObjectDB, self).__init__(root_path) self._git = git def info(self, sha): - discard, type, size = self._git.get_object_header(sha) - return type, size + t = self._git.get_object_header(sha) + return OInfo(t[0], t[1], t[2]) - def object(self, sha): + def stream(self, sha): """For now, all lookup is done by git itself""" - discard, type, size, stream = self._git.stream_object_data(sha) - return type, size, stream + t = self._git.stream_object_data(sha) + return OStream(t[0], t[1], t[2], t[3]) -- cgit v1.2.1