From 1044116d25f0311033e0951d2ab30579bba4b051 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 21 Jun 2010 20:16:22 +0200 Subject: index: put serialization methods into new 'fun' module, this makes the calls faster as it removes one level of indirection, and makes the main file smaller, improving maintainability --- lib/git/index/base.py | 182 +++++++++++++------------------------------------- 1 file changed, 45 insertions(+), 137 deletions(-) (limited to 'lib/git/index/base.py') diff --git a/lib/git/index/base.py b/lib/git/index/base.py index b003195c..a605c3ec 100644 --- a/lib/git/index/base.py +++ b/lib/git/index/base.py @@ -59,7 +59,8 @@ from git.utils import ( LazyMixin, LockedFD, join_path_native, - file_contents_ro + file_contents_ro, + LockFile ) @@ -67,6 +68,12 @@ from gitdb.base import ( IStream ) +from fun import ( + write_cache, + read_cache, + entry_key + ) + __all__ = ( 'IndexFile', 'CheckoutError' ) @@ -84,7 +91,7 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): to facilitate access. You may read the entries dict or manipulate it using IndexEntry instance, i.e.:: - index.entries[index.get_entries_key(index_entry_instance)] = index_entry_instance + index.entries[index.entry_key(index_entry_instance)] = index_entry_instance Otherwise changes to it will be lost when changing the index using its methods. """ __slots__ = ( "repo", "version", "entries", "_extension_data", "_file_path" ) @@ -147,123 +154,34 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): pass # END exception handling - @classmethod - def _read_entry(cls, stream): - """Return: One entry of the given stream""" - beginoffset = stream.tell() - ctime = unpack(">8s", stream.read(8))[0] - mtime = unpack(">8s", stream.read(8))[0] - (dev, ino, mode, uid, gid, size, sha, flags) = \ - unpack(">LLLLLL20sH", stream.read(20 + 4 * 6 + 2)) - path_size = flags & CE_NAMEMASK - path = stream.read(path_size) - - real_size = ((stream.tell() - beginoffset + 8) & ~7) - data = stream.read((beginoffset + real_size) - stream.tell()) - return IndexEntry((mode, binascii.hexlify(sha), flags, path, ctime, mtime, dev, ino, uid, gid, size)) - - @classmethod - def _read_header(cls, stream): - """Return tuple(version_long, num_entries) from the given stream""" - type_id = stream.read(4) - if type_id != "DIRC": - raise AssertionError("Invalid index file header: %r" % type_id) - version, num_entries = unpack(">LL", stream.read(4 * 2)) - assert version in (1, 2) - return version, num_entries - #{ Serializable Interface def _deserialize(self, stream): """ Initialize this instance with index values read from the given stream """ - self.version, num_entries = self._read_header(stream) - count = 0 - self.entries = dict() - while count < num_entries: - entry = self._read_entry(stream) - self.entries[self.get_entries_key(entry)] = entry - count += 1 - # END for each entry - - # the footer contains extension data and a sha on the content so far - # Keep the extension footer,and verify we have a sha in the end - # Extension data format is: - # 4 bytes ID - # 4 bytes length of chunk - # repeated 0 - N times - self._extension_data = stream.read(~0) - assert len(self._extension_data) > 19, "Index Footer was not at least a sha on content as it was only %i bytes in size" % len(self._extension_data) - - content_sha = self._extension_data[-20:] - - # truncate the sha in the end as we will dynamically create it anyway - self._extension_data = self._extension_data[:-20] - + self.version, self.entries, self._extension_data, conten_sha = read_cache(stream) return self def _serialize(self, stream, ignore_tree_extension_data=False): - - # wrap the stream into a compatible writer - stream = IndexFileSHA1Writer(stream) - - # header - stream.write("DIRC") - stream.write(pack(">LL", self.version, len(self.entries))) - - # body entries_sorted = self.entries.values() entries_sorted.sort(key=lambda e: (e[3], e.stage)) # use path/stage as sort key - for entry in entries_sorted: - self._write_cache_entry(stream, entry) - # END for each entry - - stored_ext_data = None - if ignore_tree_extension_data and self._extension_data and self._extension_data[:4] == 'TREE': - stored_ext_data = self._extension_data - self._extension_data = '' - # END extension data special handling - - # write previously cached extensions data - stream.write(self._extension_data) - - if stored_ext_data: - self._extension_data = stored_ext_data - # END reset previous ext data - - # write the sha over the content - stream.write_sha() + write_cache(entries_sorted, + stream, + (ignore_tree_extension_data and None) or self._extension_data) return self - + + #} END serializable interface - @classmethod - def _write_cache_entry(cls, stream, entry): - """ Write an IndexEntry to a stream """ - beginoffset = stream.tell() - write = stream.write - write(entry[4]) # ctime - write(entry[5]) # mtime - path = entry[3] - plen = len(path) & CE_NAMEMASK # path length - assert plen == len(path), "Path %s too long to fit into index" % entry[3] - flags = plen | entry[2] - write(pack(">LLLLLL20sH", entry[6], entry[7], entry[0], - entry[8], entry[9], entry[10], binascii.unhexlify(entry[1]), flags)) - write(path) - real_size = ((stream.tell() - beginoffset + 8) & ~7) - write("\0" * ((beginoffset + real_size) - stream.tell())) - def write(self, file_path = None, ignore_tree_extension_data=False): - """ - Write the current state to our file path or to the given one + """Write the current state to our file path or to the given one - ``file_path`` + :param file_path: If None, we will write to our stored file path from which we have been initialized. Otherwise we write to the given file path. Please note that this will change the file_path of this index to the one you gave. - ``ignore_tree_extension_data`` + :param ignore_tree_extension_data: If True, the TREE type extension data read in the index will not be written to disk. Use this if you have altered the index and would like to use git-write-tree afterwards to create a tree @@ -273,12 +191,10 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): Alternatively, use IndexFile.write_tree() to handle this case automatically - Returns - self - """ + :return: self""" lfd = LockedFD(file_path or self._file_path) stream = lfd.open(write=True, stream=True) - + self._serialize(stream, ignore_tree_extension_data) lfd.commit() @@ -516,19 +432,8 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): return path_map @classmethod - def get_entries_key(cls, *entry): - """ - Returns - Key suitable to be used for the index.entries dictionary - - ``entry`` - One instance of type BaseIndexEntry or the path and the stage - """ - if len(entry) == 1: - return (entry[0].path, entry[0].stage) - else: - return tuple(entry) - + def entry_key(cls, *entry): + return entry_key(*entry) def resolve_blobs(self, iter_blobs): """ @@ -585,26 +490,31 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): # allows to lazily reread on demand return self - def write_tree(self, missing_ok=False): - """ - Writes the Index in self to a corresponding Tree file into the repository - object database and returns it as corresponding Tree object. + def _write_tree(self, missing_ok=False): + """Writes this index to a corresponding Tree object into the repository's + object database and return it. - ``missing_ok`` + :param missing_ok: If True, missing objects referenced by this index will not result in an error. - Returns - Tree object representing this index - """ + :return: Tree object representing this index""" + # we obtain no lock as we just flush our contents to disk as tree + if not self.entries: + raise ValueError("Cannot write empty index") + + + + return Tree(self.repo, tree_sha, 0, '') + + def write_tree(self, missing_ok = False): index_path = self._index_path() tmp_index_mover = TemporaryFileSwap(index_path) - + self.write(index_path, ignore_tree_extension_data=True) tree_sha = self.repo.git.write_tree(missing_ok=missing_ok) - - del(tmp_index_mover) # as soon as possible - + + del(tmp_index_mover) # as soon as possible return Tree(self.repo, tree_sha, 0, '') def _process_diff_args(self, args): @@ -837,11 +747,10 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): @post_clear_cache @default_index def remove(self, items, working_tree=False, **kwargs): - """ - Remove the given items from the index and optionally from + """Remove the given items from the index and optionally from the working tree as well. - ``items`` + :param items: Multiple types of items are supported which may be be freely mixed. - path string @@ -859,21 +768,20 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): - BaseIndexEntry or compatible type The only relevant information here Yis the path. The stage is ignored. - ``working_tree`` + :param working_tree: If True, the entry will also be removed from the working tree, physically removing the respective file. This may fail if there are uncommited changes in it. - ``**kwargs`` + :param **kwargs: Additional keyword arguments to be passed to git-rm, such as 'r' to allow recurive removal of - Returns + :return: List(path_string, ...) list of repository relative paths that have been removed effectively. This is interesting to know in case you have provided a directory or - globs. Paths are relative to the repository. - """ + globs. Paths are relative to the repository. """ args = list() if not working_tree: args.append("--cached") -- cgit v1.2.1 From 69dd8750be1fbf55010a738dc1ced4655e727f23 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 22 Jun 2010 00:05:37 +0200 Subject: index.write_tree: initial version implemented, although its not yet working correctly, a test to explicitly compare the git version with the python implementation is still missing Tree and Index internally use 20 byte shas, converting them only as needed to reduce memory footprint and processing time objects: started own 'fun' module containing the most important tree functions, more are likely to be added soon --- lib/git/index/base.py | 75 ++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 40 deletions(-) (limited to 'lib/git/index/base.py') diff --git a/lib/git/index/base.py b/lib/git/index/base.py index a605c3ec..96a9430c 100644 --- a/lib/git/index/base.py +++ b/lib/git/index/base.py @@ -5,13 +5,13 @@ # the BSD License: http://www.opensource.org/licenses/bsd-license.php """Module containing Index implementation, allowing to perform all kinds of index manipulations such as querying and merging. """ -import binascii import tempfile import os import sys import subprocess import glob from cStringIO import StringIO +from binascii import b2a_hex from stat import ( S_ISLNK, @@ -25,16 +25,12 @@ from stat import ( from typ import ( BaseIndexEntry, IndexEntry, - CE_NAMEMASK, - CE_STAGESHIFT ) from util import ( TemporaryFileSwap, post_clear_cache, default_index, - pack, - unpack ) import git.objects @@ -60,20 +56,17 @@ from git.utils import ( LockedFD, join_path_native, file_contents_ro, - LockFile - ) - - -from gitdb.base import ( - IStream ) from fun import ( write_cache, read_cache, + write_tree_from_cache, entry_key ) +from gitdb.base import IStream + __all__ = ( 'IndexFile', 'CheckoutError' ) @@ -161,10 +154,15 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): self.version, self.entries, self._extension_data, conten_sha = read_cache(stream) return self - def _serialize(self, stream, ignore_tree_extension_data=False): + def _entries_sorted(self): + """:return: list of entries, in a sorted fashion, first by path, then by stage""" entries_sorted = self.entries.values() - entries_sorted.sort(key=lambda e: (e[3], e.stage)) # use path/stage as sort key - write_cache(entries_sorted, + entries_sorted.sort(key=lambda e: (e.path, e.stage)) # use path/stage as sort key + return entries_sorted + + def _serialize(self, stream, ignore_tree_extension_data=False): + entries = self._entries_sorted() + write_cache(entries, stream, (ignore_tree_extension_data and None) or self._extension_data) return self @@ -403,7 +401,7 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): # TODO: is it necessary to convert the mode ? We did that when adding # it to the index, right ? mode = self._stat_mode_to_index_mode(entry.mode) - blob = Blob(self.repo, entry.sha, mode, entry.path) + blob = Blob(self.repo, entry.hexsha, mode, entry.path) blob.size = entry.size output = (entry.stage, blob) if predicate(output): @@ -490,33 +488,31 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): # allows to lazily reread on demand return self - def _write_tree(self, missing_ok=False): + def write_tree(self): """Writes this index to a corresponding Tree object into the repository's object database and return it. - - :param missing_ok: - If True, missing objects referenced by this index will not result - in an error. - - :return: Tree object representing this index""" + + :return: Tree object representing this index + :note: The tree will be written even if one or more objects the tree refers to + does not yet exist in the object database. This could happen if you added + Entries to the index directly. + :raise ValueError: if there are no entries in the cache + :raise UnmergedEntriesError: """ # we obtain no lock as we just flush our contents to disk as tree if not self.entries: raise ValueError("Cannot write empty index") + # TODO: use memory db, this helps to prevent IO if the resulting tree + # already exists + entries = self._entries_sorted() + binsha, tree_items = write_tree_from_cache(entries, self.repo.odb, slice(0, len(entries))) + # note: additional deserialization could be saved if write_tree_from_cache + # would return sorted tree entries + root_tree = Tree(self.repo, b2a_hex(binsha), path='') + root_tree._cache = tree_items + return root_tree - return Tree(self.repo, tree_sha, 0, '') - - def write_tree(self, missing_ok = False): - index_path = self._index_path() - tmp_index_mover = TemporaryFileSwap(index_path) - - self.write(index_path, ignore_tree_extension_data=True) - tree_sha = self.repo.git.write_tree(missing_ok=missing_ok) - - del(tmp_index_mover) # as soon as possible - return Tree(self.repo, tree_sha, 0, '') - def _process_diff_args(self, args): try: args.pop(args.index(self)) @@ -525,7 +521,6 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): # END remove self return args - def _to_relative_path(self, path): """:return: Version of path relative to our git directory or raise ValueError if it is not within our git direcotory""" @@ -599,7 +594,7 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): - BaseIndexEntry or type Handling equals the one of Blob objects, but the stage may be - explicitly set. + explicitly set. Please note that Index Entries require binary sha's. :param force: If True, otherwise ignored or excluded files will be @@ -666,7 +661,7 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): fprogress(filepath, True, filepath) return BaseIndexEntry((self._stat_mode_to_index_mode(st.st_mode), - istream.sha, 0, filepath)) + istream.binsha, 0, filepath)) # END utility method @@ -691,14 +686,14 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): # HANLDE ENTRY OBJECT CREATION # create objects if required, otherwise go with the existing shas - null_entries_indices = [ i for i,e in enumerate(entries) if e.sha == Object.NULL_HEX_SHA ] + null_entries_indices = [ i for i,e in enumerate(entries) if e.binsha == Object.NULL_BIN_SHA ] if null_entries_indices: for ei in null_entries_indices: null_entry = entries[ei] new_entry = store_path(null_entry.path) # update null entry - entries[ei] = BaseIndexEntry((null_entry.mode, new_entry.sha, null_entry.stage, null_entry.path)) + entries[ei] = BaseIndexEntry((null_entry.mode, new_entry.binsha, null_entry.stage, null_entry.path)) # END for each entry index # END null_entry handling @@ -707,7 +702,7 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): # all object sha's if path_rewriter: for i,e in enumerate(entries): - entries[i] = BaseIndexEntry((e.mode, e.sha, e.stage, path_rewriter(e))) + entries[i] = BaseIndexEntry((e.mode, e.binsha, e.stage, path_rewriter(e))) # END for each entry # END handle path rewriting -- cgit v1.2.1 From d2d9197cfe5d3b43cb8aee182b2e65c73ef9ab7b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 22 Jun 2010 09:52:52 +0200 Subject: Tree-Writing now works after fixing an off-by-one error --- lib/git/index/base.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'lib/git/index/base.py') diff --git a/lib/git/index/base.py b/lib/git/index/base.py index 96a9430c..48b2d766 100644 --- a/lib/git/index/base.py +++ b/lib/git/index/base.py @@ -889,40 +889,41 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): @default_index def checkout(self, paths=None, force=False, fprogress=lambda *args: None, **kwargs): - """ - Checkout the given paths or all files from the version known to the index into + """Checkout the given paths or all files from the version known to the index into the working tree. + + :note: Be sure you have written pending changes using the ``write`` method + in case you have altered the enties dictionary directly - ``paths`` + :param paths: If None, all paths in the index will be checked out. Otherwise an iterable of relative or absolute paths or a single path pointing to files or directories in the index is expected. - ``force`` + :param force: If True, existing files will be overwritten even if they contain local modifications. If False, these will trigger a CheckoutError. - ``fprogress`` + :param fprogress: see Index.add_ for signature and explanation. The provided progress information will contain None as path and item if no explicit paths are given. Otherwise progress information will be send prior and after a file has been checked out - ``**kwargs`` + :param **kwargs: Additional arguments to be pasesd to git-checkout-index - Returns + :return: iterable yielding paths to files which have been checked out and are guaranteed to match the version stored in the index - Raise CheckoutError + :raise CheckoutError: If at least one file failed to be checked out. This is a summary, hence it will checkout as many files as it can anyway. If one of files or directories do not exist in the index ( as opposed to the original git command who ignores them ). Raise GitCommandError if error lines could not be parsed - this truly is - an exceptional state - """ + an exceptional state""" args = ["--index"] if force: args.append("--force") -- cgit v1.2.1 From c4f49fb232acb2c02761a82acc12c4040699685d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 22 Jun 2010 12:03:37 +0200 Subject: index.write_tree: now uses MemoryDB, making tree handling more efficient as IO will only be done when required. A possible disadvantage though is that time is spent on compressing the trees, although only the raw data and their shas would theoretically be needed. On the other hand, compressing their data uses less memory. An optimal implementation would just sha the data, check for existance, and compress it to write it to the database right away. This would mean more specialized code though, introducing redundancy. If IStreams would know whether they contain compressed or uncompressed data, and if there was a method to get a sha from data, this would work nicely in the existing framework though --- lib/git/index/base.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'lib/git/index/base.py') diff --git a/lib/git/index/base.py b/lib/git/index/base.py index 48b2d766..06437702 100644 --- a/lib/git/index/base.py +++ b/lib/git/index/base.py @@ -66,6 +66,7 @@ from fun import ( ) from gitdb.base import IStream +from gitdb.db import MemoryDB __all__ = ( 'IndexFile', 'CheckoutError' ) @@ -502,10 +503,13 @@ class IndexFile(LazyMixin, diff.Diffable, Serializable): if not self.entries: raise ValueError("Cannot write empty index") - # TODO: use memory db, this helps to prevent IO if the resulting tree - # already exists + mdb = MemoryDB() entries = self._entries_sorted() - binsha, tree_items = write_tree_from_cache(entries, self.repo.odb, slice(0, len(entries))) + binsha, tree_items = write_tree_from_cache(entries, mdb, slice(0, len(entries))) + + # copy changed trees only + mdb.stream_copy(mdb.sha_iter(), self.repo.odb) + # note: additional deserialization could be saved if write_tree_from_cache # would return sorted tree entries -- cgit v1.2.1