From df0892351a394d768489b5647d47b73c24d3ef5f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 00:48:16 +0200 Subject: commit: initial version of commit_from_tree which could create commit objects if it could serialize itself --- lib/git/objects/base.py | 1 + lib/git/objects/commit.py | 777 +++++++++++++++++++++++++--------------------- lib/git/objects/utils.py | 411 +++++++++++++++--------- 3 files changed, 679 insertions(+), 510 deletions(-) (limited to 'lib') diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 6a51eed3..bb15192d 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -141,6 +141,7 @@ class Object(LazyMixin): self.repo.git.cat_file(self.type, self.sha, output_stream=ostream) return self + class IndexObject(Object): """ Base for all objects that can be part of the index file , namely Tree, Blob and diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 826f684c..87eed49b 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -7,372 +7,425 @@ from git.utils import Iterable import git.diff as diff import git.stats as stats +from git.actor import Actor from tree import Tree import base import utils -import tempfile +import time import os class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): - """ - Wraps a git Commit object. - - This class will act lazily on some of its attributes and will query the - value on demand only if it involves calling the git binary. - """ - - # object configuration - type = "commit" - __slots__ = ("tree", - "author", "authored_date", "author_tz_offset", - "committer", "committed_date", "committer_tz_offset", - "message", "parents") - _id_attribute_ = "sha" - - def __init__(self, repo, sha, tree=None, author=None, authored_date=None, author_tz_offset=None, - committer=None, committed_date=None, committer_tz_offset=None, message=None, parents=None): - """ - Instantiate a new Commit. All keyword arguments taking None as default will - be implicitly set if id names a valid sha. - - The parameter documentation indicates the type of the argument after a colon ':'. - - ``sha`` - is the sha id of the commit or a ref - - ``parents`` : tuple( Commit, ... 
) - is a tuple of commit ids or actual Commits - - ``tree`` : Tree - is the corresponding tree id or an actual Tree - - ``author`` : Actor - is the author string ( will be implicitly converted into an Actor object ) - - ``authored_date`` : int_seconds_since_epoch - is the authored DateTime - use time.gmtime() to convert it into a - different format - - ``author_tz_offset``: int_seconds_west_of_utc - is the timezone that the authored_date is in - - ``committer`` : Actor - is the committer string - - ``committed_date`` : int_seconds_since_epoch - is the committed DateTime - use time.gmtime() to convert it into a - different format - - ``committer_tz_offset``: int_seconds_west_of_utc - is the timezone that the authored_date is in - - ``message`` : string - is the commit message - - Returns - git.Commit - """ - super(Commit,self).__init__(repo, sha) - self._set_self_from_args_(locals()) - - if parents is not None: - self.parents = tuple( self.__class__(repo, p) for p in parents ) - # END for each parent to convert - - if self.sha and tree is not None: - self.tree = Tree(repo, tree, path='') - # END id to tree conversion - - @classmethod - def _get_intermediate_items(cls, commit): - return commit.parents - - def _set_cache_(self, attr): - """ - Called by LazyMixin superclass when the given uninitialized member needs - to be set. - We set all values at once. - """ - if attr in Commit.__slots__: - # prepare our data lines to match rev-list - data_lines = self.data.splitlines() - data_lines.insert(0, "commit %s" % self.sha) - temp = self._iter_from_process_or_stream(self.repo, iter(data_lines), False).next() - self.parents = temp.parents - self.tree = temp.tree - self.author = temp.author - self.authored_date = temp.authored_date - self.author_tz_offset = temp.author_tz_offset - self.committer = temp.committer - self.committed_date = temp.committed_date - self.committer_tz_offset = temp.committer_tz_offset - self.message = temp.message - else: - super(Commit, self)._set_cache_(attr) - - @property - def summary(self): - """ - Returns - First line of the commit message. - """ - return self.message.split('\n', 1)[0] - - def count(self, paths='', **kwargs): - """ - Count the number of commits reachable from this commit - - ``paths`` - is an optinal path or a list of paths restricting the return value - to commits actually containing the paths - - ``kwargs`` - Additional options to be passed to git-rev-list. They must not alter - the ouput style of the command, or parsing will yield incorrect results - Returns - int - """ - # yes, it makes a difference whether empty paths are given or not in our case - # as the empty paths version will ignore merge commits for some reason. - if paths: - return len(self.repo.git.rev_list(self.sha, '--', paths, **kwargs).splitlines()) - else: - return len(self.repo.git.rev_list(self.sha, **kwargs).splitlines()) - - - @property - def name_rev(self): - """ - Returns - String describing the commits hex sha based on the closest Reference. - Mostly useful for UI purposes - """ - return self.repo.git.name_rev(self) - - @classmethod - def iter_items(cls, repo, rev, paths='', **kwargs): - """ - Find all commits matching the given criteria. 
- - ``repo`` - is the Repo - - ``rev`` - revision specifier, see git-rev-parse for viable options - - ``paths`` - is an optinal path or list of paths, if set only Commits that include the path - or paths will be considered - - ``kwargs`` - optional keyword arguments to git rev-list where - ``max_count`` is the maximum number of commits to fetch - ``skip`` is the number of commits to skip - ``since`` all commits since i.e. '1970-01-01' - - Returns - iterator yielding Commit items - """ - options = {'pretty': 'raw', 'as_process' : True } - options.update(kwargs) - - args = list() - if paths: - args.extend(('--', paths)) - # END if paths - - proc = repo.git.rev_list(rev, args, **options) - return cls._iter_from_process_or_stream(repo, proc, True) - - def iter_parents(self, paths='', **kwargs): - """ - Iterate _all_ parents of this commit. - - ``paths`` - Optional path or list of paths limiting the Commits to those that - contain at least one of the paths - - ``kwargs`` - All arguments allowed by git-rev-list - - Return: - Iterator yielding Commit objects which are parents of self - """ - # skip ourselves - skip = kwargs.get("skip", 1) - if skip == 0: # skip ourselves - skip = 1 - kwargs['skip'] = skip - - return self.iter_items( self.repo, self, paths, **kwargs ) - - @property - def stats(self): - """ - Create a git stat from changes between this commit and its first parent - or from all changes done if this is the very first commit. - - Return - git.Stats - """ - if not self.parents: - text = self.repo.git.diff_tree(self.sha, '--', numstat=True, root=True) - text2 = "" - for line in text.splitlines()[1:]: - (insertions, deletions, filename) = line.split("\t") - text2 += "%s\t%s\t%s\n" % (insertions, deletions, filename) - text = text2 - else: - text = self.repo.git.diff(self.parents[0].sha, self.sha, '--', numstat=True) - return stats.Stats._list_from_string(self.repo, text) - - @classmethod - def _iter_from_process_or_stream(cls, repo, proc_or_stream, from_rev_list): - """ - Parse out commit information into a list of Commit objects - - ``repo`` - is the Repo - - ``proc`` - git-rev-list process instance (raw format) - - ``from_rev_list`` - If True, the stream was created by rev-list in which case we parse - the message differently - Returns - iterator returning Commit objects - """ - stream = proc_or_stream - if not hasattr(stream,'next'): - stream = proc_or_stream.stdout - - for line in stream: - commit_tokens = line.split() - id = commit_tokens[1] - assert commit_tokens[0] == "commit" - tree = stream.next().split()[1] - - parents = [] - next_line = None - for parent_line in stream: - if not parent_line.startswith('parent'): - next_line = parent_line - break - # END abort reading parents - parents.append(parent_line.split()[-1]) - # END for each parent line - - author, authored_date, author_tz_offset = utils.parse_actor_and_date(next_line) - committer, committed_date, committer_tz_offset = utils.parse_actor_and_date(stream.next()) - - # empty line - stream.next() - - message_lines = [] - if from_rev_list: - for msg_line in stream: - if not msg_line.startswith(' '): - # and forget about this empty marker - break - # END abort message reading - # strip leading 4 spaces - message_lines.append(msg_line[4:]) - # END while there are message lines - else: - # a stream from our data simply gives us the plain message - for msg_line in stream: - message_lines.append(msg_line) - # END message parsing - message = '\n'.join(message_lines) - - yield Commit(repo, id, parents=tuple(parents), tree=tree, - 
author=author, authored_date=authored_date, author_tz_offset=author_tz_offset, - committer=committer, committed_date=committed_date, committer_tz_offset=committer_tz_offset, - message=message) - # END for each line in stream - - - @classmethod - def create_from_tree(cls, repo, tree, message, parent_commits=None, head=False): - """ - Commit the given tree, creating a commit object. - - ``repo`` - is the Repo - - ``tree`` - Sha of a tree or a tree object to become the tree of the new commit - - ``message`` - Commit message. It may be an empty string if no message is provided. - It will be converted to a string in any case. - - ``parent_commits`` - Optional Commit objects to use as parents for the new commit. - If empty list, the commit will have no parents at all and become - a root commit. - If None , the current head commit will be the parent of the - new commit object - - ``head`` - If True, the HEAD will be advanced to the new commit automatically. - Else the HEAD will remain pointing on the previous commit. This could - lead to undesired results when diffing files. - - Returns - Commit object representing the new commit - - Note: - Additional information about hte committer and Author are taken from the - environment or from the git configuration, see git-commit-tree for - more information - """ - parents = parent_commits - if parent_commits is None: - try: - parent_commits = [ repo.head.commit ] - except ValueError: - # empty repositories have no head commit - parent_commits = list() - # END handle parent commits - # END if parent commits are unset - - parent_args = [ ("-p", str(commit)) for commit in parent_commits ] - - # create message stream - tmp_file_path = tempfile.mktemp() - fp = open(tmp_file_path,"wb") - fp.write(str(message)) - fp.close() - fp = open(tmp_file_path,"rb") - fp.seek(0) - - try: - # write the current index as tree - commit_sha = repo.git.commit_tree(tree, parent_args, istream=fp) - new_commit = cls(repo, commit_sha) - - if head: - try: - repo.head.commit = new_commit - except ValueError: - # head is not yet set to the ref our HEAD points to. - import git.refs - master = git.refs.Head.create(repo, repo.head.ref, commit=new_commit) - repo.head.reference = master - # END handle empty repositories - # END advance head handling - - return new_commit - finally: - fp.close() - os.remove(tmp_file_path) - - def __str__(self): - """ Convert commit to string which is SHA1 """ - return self.sha - - def __repr__(self): - return '' % self.sha + """ + Wraps a git Commit object. + + This class will act lazily on some of its attributes and will query the + value on demand only if it involves calling the git binary. 
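# --- Editor's illustrative sketch, not part of this patch ---
# Minimal example of the lazy behaviour described above. The repository path is
# an assumption; constructing the Commit is cheap, and the first attribute
# access runs a single `git cat-file` that fills all cached slots at once.
import git

repo = git.Repo("/path/to/repo")              # assumed repository location
c = git.Commit(repo, repo.head.commit.sha)    # no git call yet
first_line = c.summary                        # first access loads author, dates, message, ...
author, when = c.author, c.authored_date      # already cached, no further git calls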
+ """ + + # ENVIRONMENT VARIABLES + # read when creating new commits + env_author_name = "GIT_AUTHOR_NAME" + env_author_email = "GIT_AUTHOR_EMAIL" + env_author_date = "GIT_AUTHOR_DATE" + env_committer_name = "GIT_COMMITTER_NAME" + env_committer_email = "GIT_COMMITTER_EMAIL" + env_committer_date = "GIT_COMMITTER_DATE" + env_email = "EMAIL" + + # CONFIGURATION KEYS + conf_email = 'email' + conf_name = 'name' + conf_encoding = 'i18n.commitencoding' + + # INVARIANTS + default_encoding = "UTF-8" + + + # object configuration + type = "commit" + __slots__ = ("tree", + "author", "authored_date", "author_tz_offset", + "committer", "committed_date", "committer_tz_offset", + "message", "parents", "encoding") + _id_attribute_ = "sha" + + def __init__(self, repo, sha, tree=None, author=None, authored_date=None, author_tz_offset=None, + committer=None, committed_date=None, committer_tz_offset=None, + message=None, parents=None, encoding=None): + """ + Instantiate a new Commit. All keyword arguments taking None as default will + be implicitly set if id names a valid sha. + + The parameter documentation indicates the type of the argument after a colon ':'. + + :param sha: is the sha id of the commit or a ref + :param parents: tuple( Commit, ... ) + is a tuple of commit ids or actual Commits + :param tree: Tree + is the corresponding tree id or an actual Tree + :param author: Actor + is the author string ( will be implicitly converted into an Actor object ) + :param authored_date: int_seconds_since_epoch + is the authored DateTime - use time.gmtime() to convert it into a + different format + :param author_tz_offset: int_seconds_west_of_utc + is the timezone that the authored_date is in + :param committer: Actor + is the committer string + :param committed_date: int_seconds_since_epoch + is the committed DateTime - use time.gmtime() to convert it into a + different format + :param committer_tz_offset: int_seconds_west_of_utc + is the timezone that the authored_date is in + :param message: string + is the commit message + :param encoding: string + encoding of the message, defaults to UTF-8 + :return: git.Commit + + :note: Timezone information is in the same format and in the same sign + as what time.altzone returns. The sign is inverted compared to git's + UTC timezone. + """ + super(Commit,self).__init__(repo, sha) + self._set_self_from_args_(locals()) + + if parents is not None: + self.parents = tuple( self.__class__(repo, p) for p in parents ) + # END for each parent to convert + + if self.sha and tree is not None: + self.tree = Tree(repo, tree, path='') + # END id to tree conversion + + @classmethod + def _get_intermediate_items(cls, commit): + return commit.parents + + def _set_cache_(self, attr): + """ + Called by LazyMixin superclass when the given uninitialized member needs + to be set. + We set all values at once. 
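# --- Editor's illustrative sketch, not part of this patch ---
# Interpreting the date/offset fields documented above. Offsets use the
# time.altzone convention (seconds west of UTC), so git's "-0700" is stored
# as +25200; the example values mirror the parse_actor_and_date docstring.
import time

authored_date = 1191999972        # seconds since epoch
author_tz_offset = 25200          # 7 hours west of UTC, i.e. git offset "-0700"

utc = time.gmtime(authored_date)                         # commit time in UTC
local = time.gmtime(authored_date - author_tz_offset)    # wall clock time in the author's zone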
+ """ + if attr in Commit.__slots__: + # prepare our data lines to match rev-list + data_lines = self.data.splitlines() + data_lines.insert(0, "commit %s" % self.sha) + temp = self._iter_from_process_or_stream(self.repo, iter(data_lines), False).next() + self.parents = temp.parents + self.tree = temp.tree + self.author = temp.author + self.authored_date = temp.authored_date + self.author_tz_offset = temp.author_tz_offset + self.committer = temp.committer + self.committed_date = temp.committed_date + self.committer_tz_offset = temp.committer_tz_offset + self.message = temp.message + self.encoding = temp.encoding + else: + super(Commit, self)._set_cache_(attr) + + @property + def summary(self): + """ + Returns + First line of the commit message. + """ + return self.message.split('\n', 1)[0] + + def count(self, paths='', **kwargs): + """ + Count the number of commits reachable from this commit + + ``paths`` + is an optinal path or a list of paths restricting the return value + to commits actually containing the paths + + ``kwargs`` + Additional options to be passed to git-rev-list. They must not alter + the ouput style of the command, or parsing will yield incorrect results + Returns + int + """ + # yes, it makes a difference whether empty paths are given or not in our case + # as the empty paths version will ignore merge commits for some reason. + if paths: + return len(self.repo.git.rev_list(self.sha, '--', paths, **kwargs).splitlines()) + else: + return len(self.repo.git.rev_list(self.sha, **kwargs).splitlines()) + + + @property + def name_rev(self): + """ + Returns + String describing the commits hex sha based on the closest Reference. + Mostly useful for UI purposes + """ + return self.repo.git.name_rev(self) + + @classmethod + def iter_items(cls, repo, rev, paths='', **kwargs): + """ + Find all commits matching the given criteria. + + ``repo`` + is the Repo + + ``rev`` + revision specifier, see git-rev-parse for viable options + + ``paths`` + is an optinal path or list of paths, if set only Commits that include the path + or paths will be considered + + ``kwargs`` + optional keyword arguments to git rev-list where + ``max_count`` is the maximum number of commits to fetch + ``skip`` is the number of commits to skip + ``since`` all commits since i.e. '1970-01-01' + + Returns + iterator yielding Commit items + """ + options = {'pretty': 'raw', 'as_process' : True } + options.update(kwargs) + + args = list() + if paths: + args.extend(('--', paths)) + # END if paths + + proc = repo.git.rev_list(rev, args, **options) + return cls._iter_from_process_or_stream(repo, proc, True) + + def iter_parents(self, paths='', **kwargs): + """ + Iterate _all_ parents of this commit. + + ``paths`` + Optional path or list of paths limiting the Commits to those that + contain at least one of the paths + + ``kwargs`` + All arguments allowed by git-rev-list + + Return: + Iterator yielding Commit objects which are parents of self + """ + # skip ourselves + skip = kwargs.get("skip", 1) + if skip == 0: # skip ourselves + skip = 1 + kwargs['skip'] = skip + + return self.iter_items( self.repo, self, paths, **kwargs ) + + @property + def stats(self): + """ + Create a git stat from changes between this commit and its first parent + or from all changes done if this is the very first commit. 
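# --- Editor's illustrative sketch, not part of this patch ---
# Typical use of the query helpers above; `repo` points at an assumed existing
# repository. Keyword arguments are forwarded to git-rev-list, so max_count,
# skip, since and friends work as documented.
import git
from git.objects.commit import Commit

repo = git.Repo("/path/to/repo")                            # assumed repository location
recent = list(Commit.iter_items(repo, 'master', max_count=10))
readme_commits = list(Commit.iter_items(repo, 'master', paths='README'))

head = repo.head.commit
reachable = head.count()                                    # commits reachable from HEAD
ancestors = list(head.iter_parents(max_count=5))            # skips `head` itself by default
diffstat = head.stats                                       # git.Stats vs. the first parent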
+ + Return + git.Stats + """ + if not self.parents: + text = self.repo.git.diff_tree(self.sha, '--', numstat=True, root=True) + text2 = "" + for line in text.splitlines()[1:]: + (insertions, deletions, filename) = line.split("\t") + text2 += "%s\t%s\t%s\n" % (insertions, deletions, filename) + text = text2 + else: + text = self.repo.git.diff(self.parents[0].sha, self.sha, '--', numstat=True) + return stats.Stats._list_from_string(self.repo, text) + + @classmethod + def _iter_from_process_or_stream(cls, repo, proc_or_stream, from_rev_list): + """ + Parse out commit information into a list of Commit objects + + ``repo`` + is the Repo + + ``proc`` + git-rev-list process instance (raw format) + + ``from_rev_list`` + If True, the stream was created by rev-list in which case we parse + the message differently + Returns + iterator returning Commit objects + """ + stream = proc_or_stream + if not hasattr(stream,'next'): + stream = proc_or_stream.stdout + + for line in stream: + commit_tokens = line.split() + id = commit_tokens[1] + assert commit_tokens[0] == "commit" + tree = stream.next().split()[1] + + parents = [] + next_line = None + for parent_line in stream: + if not parent_line.startswith('parent'): + next_line = parent_line + break + # END abort reading parents + parents.append(parent_line.split()[-1]) + # END for each parent line + + author, authored_date, author_tz_offset = utils.parse_actor_and_date(next_line) + committer, committed_date, committer_tz_offset = utils.parse_actor_and_date(stream.next()) + + + # empty line + encoding = stream.next() + encoding.strip() + if encoding: + encoding = encoding[encoding.find(' ')+1:] + # END parse encoding + + message_lines = list() + if from_rev_list: + for msg_line in stream: + if not msg_line.startswith(' '): + # and forget about this empty marker + break + # END abort message reading + # strip leading 4 spaces + message_lines.append(msg_line[4:]) + # END while there are message lines + else: + # a stream from our data simply gives us the plain message + for msg_line in stream: + message_lines.append(msg_line) + # END message parsing + message = '\n'.join(message_lines) + + + yield Commit(repo, id, tree, + author, authored_date, author_tz_offset, + committer, committed_date, committer_tz_offset, + message, tuple(parents), + encoding or cls.default_encoding) + # END for each line in stream + + + @classmethod + def create_from_tree(cls, repo, tree, message, parent_commits=None, head=False): + """Commit the given tree, creating a commit object. + + :param repo: Repo object the commit should be part of + :param tree: Sha of a tree or a tree object to become the tree of the new commit + :param message: Commit message. It may be an empty string if no message is provided. + It will be converted to a string in any case. + :param parent_commits: + Optional Commit objects to use as parents for the new commit. + If empty list, the commit will have no parents at all and become + a root commit. + If None , the current head commit will be the parent of the + new commit object + :param head: + If True, the HEAD will be advanced to the new commit automatically. + Else the HEAD will remain pointing on the previous commit. This could + lead to undesired results when diffing files. 
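# --- Editor's illustrative sketch, not part of this patch ---
# Minimal call to create_from_tree as described above. Reusing the tree of the
# current head commit is only for illustration; author/committer data are
# resolved from the environment or git configuration as noted further down.
import git
from git.objects.commit import Commit

repo = git.Repo("/path/to/repo")              # assumed repository location
tree = repo.head.commit.tree                  # a tree sha string would work as well

# parent_commits=None -> the current head commit becomes the parent,
# head=True -> HEAD is advanced to the new commit.
new_commit = Commit.create_from_tree(repo, tree, "illustrative commit", head=True)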
+ + :return: Commit object representing the new commit + + :note: + Additional information about the committer and Author are taken from the + environment or from the git configuration, see git-commit-tree for + more information + """ + parents = parent_commits + if parent_commits is None: + try: + parent_commits = [ repo.head.commit ] + except ValueError: + # empty repositories have no head commit + parent_commits = list() + # END handle parent commits + # END if parent commits are unset + + # retrieve all additional information, create a commit object, and + # serialize it + # Generally: + # * Environment variables override configuration values + # * Sensible defaults are set according to the git documentation + + # COMMITER AND AUTHOR INFO + cr = repo.config_reader() + env = os.environ + default_email = utils.get_user_id() + default_name = default_email.split('@')[0] + + conf_name = cr.get_value('user', cls.conf_name, default_name) + conf_email = cr.get_value('user', cls.conf_email, default_email) + + author_name = env.get(cls.env_author_name, conf_name) + author_email = env.get(cls.env_author_email, default_email) + + committer_name = env.get(cls.env_committer_name, conf_name) + committer_email = env.get(cls.env_committer_email, conf_email) + + # PARSE THE DATES + unix_time = int(time.time()) + offset = time.altzone + + author_date_str = env.get(cls.env_author_date, '') + if author_date_str: + author_time, author_offset = utils.parse_date(author_date_str) + else: + author_time, author_offset = unix_time, offset + # END set author time + + committer_date_str = env.get(cls.env_committer_date, '') + if committer_date_str: + committer_time, committer_offset = utils.parse_date(committer_date_str) + else: + committer_time, committer_offset = unix_time, offset + # END set committer time + + # assume utf8 encoding + enc_section, enc_option = cls.conf_encoding.split('.') + conf_encoding = cr.get_value(enc_section, enc_option, default_encoding) + + author = Actor(author_name, author_email) + committer = Actor(committer_name, committer_email) + + + # CREATE NEW COMMIT + new_commit = cls(repo, cls.NULL_HEX_SHA, tree, + author, author_time, author_offset, + committer, committer_time, committer_offset, + message, parent_commits, conf_encoding) + + # serialize ! + + if head: + try: + repo.head.commit = new_commit + except ValueError: + # head is not yet set to the ref our HEAD points to + # Happens on first commit + import git.refs + master = git.refs.Head.create(repo, repo.head.ref, commit=new_commit) + repo.head.reference = master + # END handle empty repositories + # END advance head handling + + return new_commit + + + def __str__(self): + """ Convert commit to string which is SHA1 """ + return self.sha + + def __repr__(self): + return '' % self.sha diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 4f17b652..7060e293 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -9,159 +9,274 @@ Module for general utility functions import re from collections import deque as Deque from git.actor import Actor +import platform + +from string import digits +import time +import os + +__all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', + 'ProcessStreamAdapter', 'Traversable') def get_object_type_by_name(object_type_name): - """ - Returns - type suitable to handle the given object type name. - Use the type to create new instances. 
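# --- Editor's illustrative sketch, not part of this patch ---
# The lookup documented above maps a raw git type name to the handling class;
# unknown names raise ValueError.
from git.objects.utils import get_object_type_by_name

cls = get_object_type_by_name("commit")
assert cls.type == "commit"                   # git.objects.commit.Commit

try:
    get_object_type_by_name("bogus")
except ValueError:
    pass                                      # only blob, tree, commit and tag are known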
- - ``object_type_name`` - Member of TYPES - - Raises - ValueError: In case object_type_name is unknown - """ - if object_type_name == "commit": - import commit - return commit.Commit - elif object_type_name == "tag": - import tag - return tag.TagObject - elif object_type_name == "blob": - import blob - return blob.Blob - elif object_type_name == "tree": - import tree - return tree.Tree - else: - raise ValueError("Cannot handle unknown object type: %s" % object_type_name) - - + """ + Returns + type suitable to handle the given object type name. + Use the type to create new instances. + + ``object_type_name`` + Member of TYPES + + Raises + ValueError: In case object_type_name is unknown + """ + if object_type_name == "commit": + import commit + return commit.Commit + elif object_type_name == "tag": + import tag + return tag.TagObject + elif object_type_name == "blob": + import blob + return blob.Blob + elif object_type_name == "tree": + import tree + return tree.Tree + else: + raise ValueError("Cannot handle unknown object type: %s" % object_type_name) + + +def get_user_id(): + """:return: string identifying the currently active system user as name@node + :note: user can be set with the 'USER' environment variable, usually set on windows""" + ukn = 'UNKNOWN' + username = os.environ.get('USER', ukn) + if username == ukn and hasattr(os, 'getlogin'): + username = os.getlogin() + # END get username from login + return "%s@%s" % (username, platform.node()) + + +def _utc_tz_to_altz(utctz): + """we convert utctz to the timezone in seconds, it is the format time.altzone + returns. Git stores it as UTC timezon which has the opposite sign as well, + which explains the -1 * ( that was made explicit here ) + :param utctz: git utc timezone string, i.e. +0200""" + return -1 * int(float(utctz)/100*3600) + +def _verify_utctz(offset): + """:raise ValueError: if offset is incorrect + :return: offset""" + fmt_exc = ValueError("Invalid timezone offset format: %s" % offset) + if len(offset) != 5: + raise fmt_exc + if offset[0] not in "+-": + raise fmt_exc + if offset[1] not in digits or \ + offset[2] not in digits or \ + offset[3] not in digits or \ + offset[4] not in digits: + raise fmt_exc + # END for each char + return offset + +def parse_date(string_date): + """ + Parse the given date as one of the following + * Git internal format: timestamp offset + * RFC 2822: Thu, 07 Apr 2005 22:13:13 +0200. 
+ * ISO 8601 2005-04-07T22:13:13 + The T can be a space as well + + :return: Tuple(int(timestamp), int(offset), both in seconds since epoch + :raise ValueError: If the format could not be understood + :note: Date can also be YYYY.MM.DD, MM/DD/YYYY and DD.MM.YYYY + """ + # git time + try: + if string_date.count(' ') == 1 and string_date.rfind(':') == -1: + timestamp, offset = string_date.split() + timestamp = int(timestamp) + return timestamp, _utc_tz_to_altz(_verify_utctz(offset)) + else: + offset = "+0000" # local time by default + if string_date[-5] in '-+': + offset = _verify_utctz(string_date[-5:]) + string_date = string_date[:-6] # skip space as well + # END split timezone info + + # now figure out the date and time portion - split time + date_formats = list() + splitter = -1 + if ',' in string_date: + date_formats.append("%a, %d %b %Y") + splitter = string_date.rfind(' ') + else: + # iso plus additional + date_formats.append("%Y-%m-%d") + date_formats.append("%Y.%m.%d") + date_formats.append("%m/%d/%Y") + date_formats.append("%d.%m.%Y") + + splitter = string_date.rfind('T') + if splitter == -1: + splitter = string_date.rfind(' ') + # END handle 'T' and ' ' + # END handle rfc or iso + + assert splitter > -1 + + # split date and time + time_part = string_date[splitter+1:] # skip space + date_part = string_date[:splitter] + + # parse time + tstruct = time.strptime(time_part, "%H:%M:%S") + + for fmt in date_formats: + try: + dtstruct = time.strptime(date_part, fmt) + fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, + tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec, + dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst)) + return int(time.mktime(fstruct)), _utc_tz_to_altz(offset) + except ValueError: + continue + # END exception handling + # END for each fmt + + # still here ? fail + raise ValueError("no format matched") + # END handle format + except Exception: + raise ValueError("Unsupported date format: %s" % string_date) + # END handle exceptions + + # precompiled regex _re_actor_epoch = re.compile(r'^.+? (.*) (\d+) ([+-]\d+).*$') def parse_actor_and_date(line): - """ - Parse out the actor (author or committer) info from a line like:: - - author Tom Preston-Werner 1191999972 -0700 - - Returns - [Actor, int_seconds_since_epoch, int_timezone_offset] - """ - m = _re_actor_epoch.search(line) - actor, epoch, offset = m.groups() - return (Actor._from_string(actor), int(epoch), -int(float(offset)/100*3600)) - - - + """ + Parse out the actor (author or committer) info from a line like:: + + author Tom Preston-Werner 1191999972 -0700 + + Returns + [Actor, int_seconds_since_epoch, int_timezone_offset] + """ + m = _re_actor_epoch.search(line) + actor, epoch, offset = m.groups() + return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset)) + + + class ProcessStreamAdapter(object): - """ - Class wireing all calls to the contained Process instance. - - Use this type to hide the underlying process to provide access only to a specified - stream. The process is usually wrapped into an AutoInterrupt class to kill - it if the instance goes out of scope. - """ - __slots__ = ("_proc", "_stream") - def __init__(self, process, stream_name): - self._proc = process - self._stream = getattr(process, stream_name) - - def __getattr__(self, attr): - return getattr(self._stream, attr) - - + """ + Class wireing all calls to the contained Process instance. + + Use this type to hide the underlying process to provide access only to a specified + stream. 
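# --- Editor's illustrative sketch, not part of this patch ---
# The formats accepted by parse_date above, and the offset convention used
# throughout this module: seconds west of UTC (time.altzone style), which is
# the inverse sign of git's "+HHMM" notation.
from git.objects.utils import parse_date

ts, offset = parse_date("1191999972 -0700")                    # git internal format
assert offset == 25200                                          # 7 hours west of UTC

ts, offset = parse_date("Thu, 07 Apr 2005 22:13:13 +0200")      # RFC 2822
assert offset == -7200

ts, offset = parse_date("2005-04-07T22:13:13")                  # ISO 8601, defaults to UTC
assert offset == 0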
The process is usually wrapped into an AutoInterrupt class to kill + it if the instance goes out of scope. + """ + __slots__ = ("_proc", "_stream") + def __init__(self, process, stream_name): + self._proc = process + self._stream = getattr(process, stream_name) + + def __getattr__(self, attr): + return getattr(self._stream, attr) + + class Traversable(object): - """Simple interface to perforam depth-first or breadth-first traversals - into one direction. - Subclasses only need to implement one function. - Instances of the Subclass must be hashable""" - __slots__ = tuple() - - @classmethod - def _get_intermediate_items(cls, item): - """ - Returns: - List of items connected to the given item. - Must be implemented in subclass - """ - raise NotImplementedError("To be implemented in subclass") - - - def traverse( self, predicate = lambda i,d: True, - prune = lambda i,d: False, depth = -1, branch_first=True, - visit_once = True, ignore_self=1, as_edge = False ): - """ - ``Returns`` - iterator yieling of items found when traversing self - - ``predicate`` - f(i,d) returns False if item i at depth d should not be included in the result - - ``prune`` - f(i,d) return True if the search should stop at item i at depth d. - Item i will not be returned. - - ``depth`` - define at which level the iteration should not go deeper - if -1, there is no limit - if 0, you would effectively only get self, the root of the iteration - i.e. if 1, you would only get the first level of predessessors/successors - - ``branch_first`` - if True, items will be returned branch first, otherwise depth first - - ``visit_once`` - if True, items will only be returned once, although they might be encountered - several times. Loops are prevented that way. - - ``ignore_self`` - if True, self will be ignored and automatically pruned from - the result. Otherwise it will be the first item to be returned. - If as_edge is True, the source of the first edge is None - - ``as_edge`` - if True, return a pair of items, first being the source, second the - destinatination, i.e. tuple(src, dest) with the edge spanning from - source to destination""" - visited = set() - stack = Deque() - stack.append( ( 0 ,self, None ) ) # self is always depth level 0 - - def addToStack( stack, item, branch_first, depth ): - lst = self._get_intermediate_items( item ) - if not lst: - return - if branch_first: - stack.extendleft( ( depth , i, item ) for i in lst ) - else: - reviter = ( ( depth , lst[i], item ) for i in range( len( lst )-1,-1,-1) ) - stack.extend( reviter ) - # END addToStack local method - - while stack: - d, item, src = stack.pop() # depth of item, item, item_source - - if visit_once and item in visited: - continue - - if visit_once: - visited.add(item) - - rval = ( as_edge and (src, item) ) or item - if prune( rval, d ): - continue - - skipStartItem = ignore_self and ( item == self ) - if not skipStartItem and predicate( rval, d ): - yield rval - - # only continue to next level if this is appropriate ! - nd = d + 1 - if depth > -1 and nd > depth: - continue - - addToStack( stack, item, branch_first, nd ) - # END for each item on work stack + """Simple interface to perforam depth-first or breadth-first traversals + into one direction. + Subclasses only need to implement one function. + Instances of the Subclass must be hashable""" + __slots__ = tuple() + + @classmethod + def _get_intermediate_items(cls, item): + """ + Returns: + List of items connected to the given item. 
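# --- Editor's illustrative sketch, not part of this patch ---
# Commit implements this interface (_get_intermediate_items returns the
# parents), so ancestry can be walked through traverse(). Assuming `head` is a
# Commit object, the call below yields merge commits among up to three
# generations of ancestors.
merge_ancestors = [c for c in head.traverse(predicate=lambda c, d: len(c.parents) > 1,
                                            depth=3, branch_first=True)]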
+ Must be implemented in subclass + """ + raise NotImplementedError("To be implemented in subclass") + + + def traverse( self, predicate = lambda i,d: True, + prune = lambda i,d: False, depth = -1, branch_first=True, + visit_once = True, ignore_self=1, as_edge = False ): + """ + ``Returns`` + iterator yieling of items found when traversing self + + ``predicate`` + f(i,d) returns False if item i at depth d should not be included in the result + + ``prune`` + f(i,d) return True if the search should stop at item i at depth d. + Item i will not be returned. + + ``depth`` + define at which level the iteration should not go deeper + if -1, there is no limit + if 0, you would effectively only get self, the root of the iteration + i.e. if 1, you would only get the first level of predessessors/successors + + ``branch_first`` + if True, items will be returned branch first, otherwise depth first + + ``visit_once`` + if True, items will only be returned once, although they might be encountered + several times. Loops are prevented that way. + + ``ignore_self`` + if True, self will be ignored and automatically pruned from + the result. Otherwise it will be the first item to be returned. + If as_edge is True, the source of the first edge is None + + ``as_edge`` + if True, return a pair of items, first being the source, second the + destinatination, i.e. tuple(src, dest) with the edge spanning from + source to destination""" + visited = set() + stack = Deque() + stack.append( ( 0 ,self, None ) ) # self is always depth level 0 + + def addToStack( stack, item, branch_first, depth ): + lst = self._get_intermediate_items( item ) + if not lst: + return + if branch_first: + stack.extendleft( ( depth , i, item ) for i in lst ) + else: + reviter = ( ( depth , lst[i], item ) for i in range( len( lst )-1,-1,-1) ) + stack.extend( reviter ) + # END addToStack local method + + while stack: + d, item, src = stack.pop() # depth of item, item, item_source + + if visit_once and item in visited: + continue + + if visit_once: + visited.add(item) + + rval = ( as_edge and (src, item) ) or item + if prune( rval, d ): + continue + + skipStartItem = ignore_self and ( item == self ) + if not skipStartItem and predicate( rval, d ): + yield rval + + # only continue to next level if this is appropriate ! + nd = d + 1 + if depth > -1 and nd > depth: + continue + + addToStack( stack, item, branch_first, nd ) + # END for each item on work stack -- cgit v1.2.1 From 8c1a87d11df666d308d14e4ae7ee0e9d614296b6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 12:30:33 +0200 Subject: commit: refactored existing code to decode commits from streams - performance is slightly better git.cmd: added method to provide access to the content stream directly. 
This is more efficient if large objects are handled, if it is actually used test.helpers: removed unnecessary code --- lib/git/cmd.py | 901 +++++++++++++++++++++++++--------------------- lib/git/objects/base.py | 418 ++++++++++----------- lib/git/objects/commit.py | 139 +++---- lib/git/objects/tree.py | 2 +- lib/git/objects/utils.py | 17 + 5 files changed, 797 insertions(+), 680 deletions(-) (limited to 'lib') diff --git a/lib/git/cmd.py b/lib/git/cmd.py index ef2fdf4e..cef4ea60 100644 --- a/lib/git/cmd.py +++ b/lib/git/cmd.py @@ -13,414 +13,505 @@ from errors import GitCommandError GIT_PYTHON_TRACE = os.environ.get("GIT_PYTHON_TRACE", False) execute_kwargs = ('istream', 'with_keep_cwd', 'with_extended_output', - 'with_exceptions', 'as_process', - 'output_stream' ) + 'with_exceptions', 'as_process', + 'output_stream' ) def dashify(string): - return string.replace('_', '-') + return string.replace('_', '-') class Git(object): - """ - The Git class manages communication with the Git binary. - - It provides a convenient interface to calling the Git binary, such as in:: - - g = Git( git_dir ) - g.init() # calls 'git init' program - rval = g.ls_files() # calls 'git ls-files' program - - ``Debugging`` - Set the GIT_PYTHON_TRACE environment variable print each invocation - of the command to stdout. - Set its value to 'full' to see details about the returned values. - """ - __slots__ = ("_working_dir", "cat_file_all", "cat_file_header") - - class AutoInterrupt(object): - """ - Kill/Interrupt the stored process instance once this instance goes out of scope. It is - used to prevent processes piling up in case iterators stop reading. - Besides all attributes are wired through to the contained process object. - - The wait method was overridden to perform automatic status code checking - and possibly raise. - """ - __slots__= ("proc", "args") - - def __init__(self, proc, args ): - self.proc = proc - self.args = args - - def __del__(self): - # did the process finish already so we have a return code ? - if self.proc.poll() is not None: - return - - # can be that nothing really exists anymore ... - if os is None: - return - - # try to kill it - try: - os.kill(self.proc.pid, 2) # interrupt signal - except AttributeError: - # try windows - # for some reason, providing None for stdout/stderr still prints something. This is why - # we simply use the shell and redirect to nul. Its slower than CreateProcess, question - # is whether we really want to see all these messages. Its annoying no matter what. - subprocess.call(("TASKKILL /F /T /PID %s 2>nul 1>nul" % str(self.proc.pid)), shell=True) - # END exception handling - - def __getattr__(self, attr): - return getattr(self.proc, attr) - - def wait(self): - """ - Wait for the process and return its status code. - - Raise - GitCommandError if the return status is not 0 - """ - status = self.proc.wait() - if status != 0: - raise GitCommandError(self.args, status, self.proc.stderr.read()) - # END status handling - return status - - - - def __init__(self, working_dir=None): - """ - Initialize this instance with: - - ``working_dir`` - Git directory we should work in. If None, we always work in the current - directory as returned by os.getcwd(). - It is meant to be the working tree directory if available, or the - .git directory in case of bare repositories. 
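# --- Editor's illustrative sketch, not part of this patch ---
# Using the Git wrapper directly, as in the class docstring above. Unknown
# attribute names become subcommands via __getattr__ and dashify(), and
# GIT_PYTHON_TRACE is read at import time, so set it before the import.
import os
os.environ.setdefault("GIT_PYTHON_TRACE", "full")   # must be set before git.cmd is imported

from git.cmd import Git

g = Git("/path/to/working/tree")    # assumed working tree (or .git dir for bare repos)
g.init()                            # runs `git init`
tracked = g.ls_files()              # runs `git ls-files`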
- """ - super(Git, self).__init__() - self._working_dir = working_dir - - # cached command slots - self.cat_file_header = None - self.cat_file_all = None - - def __getattr__(self, name): - """ - A convenience method as it allows to call the command as if it was - an object. - Returns - Callable object that will execute call _call_process with your arguments. - """ - if name[:1] == '_': - raise AttributeError(name) - return lambda *args, **kwargs: self._call_process(name, *args, **kwargs) - - @property - def working_dir(self): - """ - Returns - Git directory we are working on - """ - return self._working_dir - - def execute(self, command, - istream=None, - with_keep_cwd=False, - with_extended_output=False, - with_exceptions=True, - as_process=False, - output_stream=None, - **subprocess_kwargs - ): - """ - Handles executing the command on the shell and consumes and returns - the returned information (stdout) - - ``command`` - The command argument list to execute. - It should be a string, or a sequence of program arguments. The - program to execute is the first item in the args sequence or string. - - ``istream`` - Standard input filehandle passed to subprocess.Popen. - - ``with_keep_cwd`` - Whether to use the current working directory from os.getcwd(). - The cmd otherwise uses its own working_dir that it has been initialized - with if possible. - - ``with_extended_output`` - Whether to return a (status, stdout, stderr) tuple. - - ``with_exceptions`` - Whether to raise an exception when git returns a non-zero status. - - ``as_process`` - Whether to return the created process instance directly from which - streams can be read on demand. This will render with_extended_output and - with_exceptions ineffective - the caller will have - to deal with the details himself. - It is important to note that the process will be placed into an AutoInterrupt - wrapper that will interrupt the process once it goes out of scope. If you - use the command in iterators, you should pass the whole process instance - instead of a single stream. - - ``output_stream`` - If set to a file-like object, data produced by the git command will be - output to the given stream directly. - This feature only has any effect if as_process is False. Processes will - always be created with a pipe due to issues with subprocess. - This merely is a workaround as data will be copied from the - output pipe to the given output stream directly. - - ``**subprocess_kwargs`` - Keyword arguments to be passed to subprocess.Popen. Please note that - some of the valid kwargs are already set by this method, the ones you - specify may not be the same ones. - - Returns:: - - str(output) # extended_output = False (Default) - tuple(int(status), str(stdout), str(stderr)) # extended_output = True - - if ouput_stream is True, the stdout value will be your output stream: - output_stream # extended_output = False - tuple(int(status), output_stream, str(stderr))# extended_output = True - - Raise - GitCommandError - - NOTE - If you add additional keyword arguments to the signature of this method, - you must update the execute_kwargs tuple housed in this module. - """ - if GIT_PYTHON_TRACE and not GIT_PYTHON_TRACE == 'full': - print ' '.join(command) - - # Allow the user to have the command executed in their working dir. 
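# --- Editor's illustrative sketch, not part of this patch ---
# The result shapes of execute() described in the docstring above, using an
# assumed Git instance for an existing repository.
from git.cmd import Git

g = Git("/path/to/repo")                                                    # assumed location
out = g.execute(["git", "status"])                                          # stdout as a string
status, out, err = g.execute(["git", "status"], with_extended_output=True)  # (int, str, str)

log_file = open("/tmp/git-log.txt", "wb")                                   # hypothetical target
result = g.execute(["git", "log"], output_stream=log_file)                  # result is log_file
log_file.close()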
- if with_keep_cwd or self._working_dir is None: - cwd = os.getcwd() - else: - cwd=self._working_dir - - # Start the process - proc = subprocess.Popen(command, - cwd=cwd, - stdin=istream, - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, - close_fds=(os.name=='posix'),# unsupported on linux - **subprocess_kwargs - ) - if as_process: - return self.AutoInterrupt(proc, command) - - # Wait for the process to return - status = 0 - stdout_value = '' - stderr_value = '' - try: - if output_stream is None: - stdout_value = proc.stdout.read().rstrip() # strip trailing "\n" - else: - max_chunk_size = 1024*64 - while True: - chunk = proc.stdout.read(max_chunk_size) - output_stream.write(chunk) - if len(chunk) < max_chunk_size: - break - # END reading output stream - stdout_value = output_stream - # END stdout handling - stderr_value = proc.stderr.read().rstrip() # strip trailing "\n" - - # waiting here should do nothing as we have finished stream reading - status = proc.wait() - finally: - proc.stdout.close() - proc.stderr.close() - - if with_exceptions and status != 0: - raise GitCommandError(command, status, stderr_value) - - if GIT_PYTHON_TRACE == 'full': - if stderr_value: - print "%s -> %d: '%s' !! '%s'" % (command, status, stdout_value, stderr_value) - elif stdout_value: - print "%s -> %d: '%s'" % (command, status, stdout_value) - else: - print "%s -> %d" % (command, status) - - # Allow access to the command's status code - if with_extended_output: - return (status, stdout_value, stderr_value) - else: - return stdout_value - - def transform_kwargs(self, **kwargs): - """ - Transforms Python style kwargs into git command line options. - """ - args = [] - for k, v in kwargs.items(): - if len(k) == 1: - if v is True: - args.append("-%s" % k) - elif type(v) is not bool: - args.append("-%s%s" % (k, v)) - else: - if v is True: - args.append("--%s" % dashify(k)) - elif type(v) is not bool: - args.append("--%s=%s" % (dashify(k), v)) - return args - - @classmethod - def __unpack_args(cls, arg_list): - if not isinstance(arg_list, (list,tuple)): - return [ str(arg_list) ] - - outlist = list() - for arg in arg_list: - if isinstance(arg_list, (list, tuple)): - outlist.extend(cls.__unpack_args( arg )) - # END recursion - else: - outlist.append(str(arg)) - # END for each arg - return outlist - - def _call_process(self, method, *args, **kwargs): - """ - Run the given git command with the specified arguments and return - the result as a String - - ``method`` - is the command. Contained "_" characters will be converted to dashes, - such as in 'ls_files' to call 'ls-files'. - - ``args`` - is the list of arguments. If None is included, it will be pruned. - This allows your commands to call git more conveniently as None - is realized as non-existent - - ``kwargs`` - is a dict of keyword arguments. - This function accepts the same optional keyword arguments - as execute(). - - Examples:: - git.rev_list('master', max_count=10, header=True) - - Returns - Same as execute() - """ - - # Handle optional arguments prior to calling transform_kwargs - # otherwise these'll end up in args, which is bad. 
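# --- Editor's illustrative sketch, not part of this patch ---
# How keyword arguments become command line options in _call_process and
# transform_kwargs: one-letter keys turn into short options, longer keys are
# dashified into long options, and True values become bare flags.
from git.cmd import Git

g = Git("/path/to/repo")                           # assumed repository location
out = g.rev_list("master", max_count=10, header=True)
# effectively runs: git rev-list --max-count=10 --header master

flags = g.transform_kwargs(n=5, p=True, max_count=10)
# -> ['-n5', '-p', '--max-count=10'] (order follows dict iteration)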
- _kwargs = {} - for kwarg in execute_kwargs: - try: - _kwargs[kwarg] = kwargs.pop(kwarg) - except KeyError: - pass - - # Prepare the argument list - opt_args = self.transform_kwargs(**kwargs) - - ext_args = self.__unpack_args([a for a in args if a is not None]) - args = opt_args + ext_args - - call = ["git", dashify(method)] - call.extend(args) - - return self.execute(call, **_kwargs) - - def _parse_object_header(self, header_line): - """ - ``header_line`` - type_string size_as_int - - Returns - (hex_sha, type_string, size_as_int) - - Raises - ValueError if the header contains indication for an error due to incorrect - input sha - """ - tokens = header_line.split() - if len(tokens) != 3: - raise ValueError("SHA named %s could not be resolved, git returned: %r" % (tokens[0], header_line.strip()) ) - if len(tokens[0]) != 40: - raise ValueError("Failed to parse header: %r" % header_line) - return (tokens[0], tokens[1], int(tokens[2])) - - def __prepare_ref(self, ref): - # required for command to separate refs on stdin - refstr = str(ref) # could be ref-object - if refstr.endswith("\n"): - return refstr - return refstr + "\n" - - def __get_persistent_cmd(self, attr_name, cmd_name, *args,**kwargs): - cur_val = getattr(self, attr_name) - if cur_val is not None: - return cur_val - - options = { "istream" : subprocess.PIPE, "as_process" : True } - options.update( kwargs ) - - cmd = self._call_process( cmd_name, *args, **options ) - setattr(self, attr_name, cmd ) - return cmd - - def __get_object_header(self, cmd, ref): - cmd.stdin.write(self.__prepare_ref(ref)) - cmd.stdin.flush() - return self._parse_object_header(cmd.stdout.readline()) - - def get_object_header(self, ref): - """ - Use this method to quickly examine the type and size of the object behind - the given ref. - - NOTE - The method will only suffer from the costs of command invocation - once and reuses the command in subsequent calls. - - Return: - (hexsha, type_string, size_as_int) - """ - cmd = self.__get_persistent_cmd("cat_file_header", "cat_file", batch_check=True) - return self.__get_object_header(cmd, ref) - - def get_object_data(self, ref): - """ - As get_object_header, but returns object data as well - - Return: - (hexsha, type_string, size_as_int,data_string) - """ - cmd = self.__get_persistent_cmd("cat_file_all", "cat_file", batch=True) - hexsha, typename, size = self.__get_object_header(cmd, ref) - data = cmd.stdout.read(size) - cmd.stdout.read(1) # finishing newlines - - return (hexsha, typename, size, data) - - def clear_cache(self): - """ - Clear all kinds of internal caches to release resources. - - Currently persistent commands will be interrupted. - - Returns - self - """ - self.cat_file_all = None - self.cat_file_header = None - return self + """ + The Git class manages communication with the Git binary. + + It provides a convenient interface to calling the Git binary, such as in:: + + g = Git( git_dir ) + g.init() # calls 'git init' program + rval = g.ls_files() # calls 'git ls-files' program + + ``Debugging`` + Set the GIT_PYTHON_TRACE environment variable print each invocation + of the command to stdout. + Set its value to 'full' to see details about the returned values. + """ + __slots__ = ("_working_dir", "cat_file_all", "cat_file_header") + + class AutoInterrupt(object): + """ + Kill/Interrupt the stored process instance once this instance goes out of scope. It is + used to prevent processes piling up in case iterators stop reading. + Besides all attributes are wired through to the contained process object. 
+ + The wait method was overridden to perform automatic status code checking + and possibly raise. + """ + __slots__= ("proc", "args") + + def __init__(self, proc, args ): + self.proc = proc + self.args = args + + def __del__(self): + # did the process finish already so we have a return code ? + if self.proc.poll() is not None: + return + + # can be that nothing really exists anymore ... + if os is None: + return + + # try to kill it + try: + os.kill(self.proc.pid, 2) # interrupt signal + except AttributeError: + # try windows + # for some reason, providing None for stdout/stderr still prints something. This is why + # we simply use the shell and redirect to nul. Its slower than CreateProcess, question + # is whether we really want to see all these messages. Its annoying no matter what. + subprocess.call(("TASKKILL /F /T /PID %s 2>nul 1>nul" % str(self.proc.pid)), shell=True) + # END exception handling + + def __getattr__(self, attr): + return getattr(self.proc, attr) + + def wait(self): + """ + Wait for the process and return its status code. + + Raise + GitCommandError if the return status is not 0 + """ + status = self.proc.wait() + if status != 0: + raise GitCommandError(self.args, status, self.proc.stderr.read()) + # END status handling + return status + # END auto interrupt + + class CatFileContentStream(object): + """Object representing a sized read-only stream returning the contents of + an object. + It behaves like a stream, but counts the data read and simulates an empty + stream once our sized content region is empty. + If not all data is read to the end of the objects's lifetime, we read the + rest to assure the underlying stream continues to work""" + + __slots__ = ('_stream', '_nbr', '_size') + + def __init__(self, size, stream): + self._stream = stream + self._size = size + self._nbr = 0 # num bytes read + + def read(self, size=-1): + bytes_left = self._size - self._nbr + if bytes_left == 0: + return '' + if size > -1: + # assure we don't try to read past our limit + size = min(bytes_left, size) + else: + # they try to read all, make sure its not more than what remains + size = bytes_left + # END check early depletion + data = self._stream.read(size) + self._nbr += len(data) + + # check for depletion, read our final byte to make the stream usable by others + if self._size - self._nbr == 0: + self._stream.read(1) # final newline + # END finish reading + + return data + + def readline(self, size=-1): + if self._nbr == self._size: + return '' + + if size > -1: + size = min(self._size - self._nbr, size) + + data = self._stream.readline(size) + self._nbr += len(data) + + # handle final byte + # we inline everything, it must be fast ! 
+ if self._size - self._nbr == 0: + self._stream.read(1) + # END finish reading + + return data + + def readlines(self, size=-1): + if self._nbr == self._size: + return list() + + # leave all additional logic to our readline method, we just check the size + out = list() + nbr = 0 + while True: + line = self.readline() + if not line: + break + out.append(line) + if size > -1: + nbr += len(line) + if nbr > size: + break + # END handle size constraint + # END readline loop + return out + + def __iter__(self): + return self + + def next(self): + line = self.readline() + if not line: + raise StopIteration + return line + + def __del__(self): + bytes_left = self._size - self._nbr + if bytes_left: + # seek and discard + self._stream.seek(bytes_left + 1, os.SEEK_CUR) # includes terminating newline + # END handle incomplete read + + + def __init__(self, working_dir=None): + """ + Initialize this instance with: + + ``working_dir`` + Git directory we should work in. If None, we always work in the current + directory as returned by os.getcwd(). + It is meant to be the working tree directory if available, or the + .git directory in case of bare repositories. + """ + super(Git, self).__init__() + self._working_dir = working_dir + + # cached command slots + self.cat_file_header = None + self.cat_file_all = None + + def __getattr__(self, name): + """ + A convenience method as it allows to call the command as if it was + an object. + Returns + Callable object that will execute call _call_process with your arguments. + """ + if name[:1] == '_': + raise AttributeError(name) + return lambda *args, **kwargs: self._call_process(name, *args, **kwargs) + + @property + def working_dir(self): + """ + Returns + Git directory we are working on + """ + return self._working_dir + + def execute(self, command, + istream=None, + with_keep_cwd=False, + with_extended_output=False, + with_exceptions=True, + as_process=False, + output_stream=None, + **subprocess_kwargs + ): + """ + Handles executing the command on the shell and consumes and returns + the returned information (stdout) + + ``command`` + The command argument list to execute. + It should be a string, or a sequence of program arguments. The + program to execute is the first item in the args sequence or string. + + ``istream`` + Standard input filehandle passed to subprocess.Popen. + + ``with_keep_cwd`` + Whether to use the current working directory from os.getcwd(). + The cmd otherwise uses its own working_dir that it has been initialized + with if possible. + + ``with_extended_output`` + Whether to return a (status, stdout, stderr) tuple. + + ``with_exceptions`` + Whether to raise an exception when git returns a non-zero status. + + ``as_process`` + Whether to return the created process instance directly from which + streams can be read on demand. This will render with_extended_output and + with_exceptions ineffective - the caller will have + to deal with the details himself. + It is important to note that the process will be placed into an AutoInterrupt + wrapper that will interrupt the process once it goes out of scope. If you + use the command in iterators, you should pass the whole process instance + instead of a single stream. + + ``output_stream`` + If set to a file-like object, data produced by the git command will be + output to the given stream directly. + This feature only has any effect if as_process is False. Processes will + always be created with a pipe due to issues with subprocess. 
+ This merely is a workaround as data will be copied from the + output pipe to the given output stream directly. + + ``**subprocess_kwargs`` + Keyword arguments to be passed to subprocess.Popen. Please note that + some of the valid kwargs are already set by this method, the ones you + specify may not be the same ones. + + Returns:: + + str(output) # extended_output = False (Default) + tuple(int(status), str(stdout), str(stderr)) # extended_output = True + + if ouput_stream is True, the stdout value will be your output stream: + output_stream # extended_output = False + tuple(int(status), output_stream, str(stderr))# extended_output = True + + Raise + GitCommandError + + NOTE + If you add additional keyword arguments to the signature of this method, + you must update the execute_kwargs tuple housed in this module. + """ + if GIT_PYTHON_TRACE and not GIT_PYTHON_TRACE == 'full': + print ' '.join(command) + + # Allow the user to have the command executed in their working dir. + if with_keep_cwd or self._working_dir is None: + cwd = os.getcwd() + else: + cwd=self._working_dir + + # Start the process + proc = subprocess.Popen(command, + cwd=cwd, + stdin=istream, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + close_fds=(os.name=='posix'),# unsupported on linux + **subprocess_kwargs + ) + if as_process: + return self.AutoInterrupt(proc, command) + + # Wait for the process to return + status = 0 + stdout_value = '' + stderr_value = '' + try: + if output_stream is None: + stdout_value = proc.stdout.read().rstrip() # strip trailing "\n" + else: + max_chunk_size = 1024*64 + while True: + chunk = proc.stdout.read(max_chunk_size) + output_stream.write(chunk) + if len(chunk) < max_chunk_size: + break + # END reading output stream + stdout_value = output_stream + # END stdout handling + stderr_value = proc.stderr.read().rstrip() # strip trailing "\n" + + # waiting here should do nothing as we have finished stream reading + status = proc.wait() + finally: + proc.stdout.close() + proc.stderr.close() + + if with_exceptions and status != 0: + raise GitCommandError(command, status, stderr_value) + + if GIT_PYTHON_TRACE == 'full': + if stderr_value: + print "%s -> %d: '%s' !! '%s'" % (command, status, stdout_value, stderr_value) + elif stdout_value: + print "%s -> %d: '%s'" % (command, status, stdout_value) + else: + print "%s -> %d" % (command, status) + + # Allow access to the command's status code + if with_extended_output: + return (status, stdout_value, stderr_value) + else: + return stdout_value + + def transform_kwargs(self, **kwargs): + """ + Transforms Python style kwargs into git command line options. + """ + args = [] + for k, v in kwargs.items(): + if len(k) == 1: + if v is True: + args.append("-%s" % k) + elif type(v) is not bool: + args.append("-%s%s" % (k, v)) + else: + if v is True: + args.append("--%s" % dashify(k)) + elif type(v) is not bool: + args.append("--%s=%s" % (dashify(k), v)) + return args + + @classmethod + def __unpack_args(cls, arg_list): + if not isinstance(arg_list, (list,tuple)): + return [ str(arg_list) ] + + outlist = list() + for arg in arg_list: + if isinstance(arg_list, (list, tuple)): + outlist.extend(cls.__unpack_args( arg )) + # END recursion + else: + outlist.append(str(arg)) + # END for each arg + return outlist + + def _call_process(self, method, *args, **kwargs): + """ + Run the given git command with the specified arguments and return + the result as a String + + ``method`` + is the command. 
Contained "_" characters will be converted to dashes, + such as in 'ls_files' to call 'ls-files'. + + ``args`` + is the list of arguments. If None is included, it will be pruned. + This allows your commands to call git more conveniently as None + is realized as non-existent + + ``kwargs`` + is a dict of keyword arguments. + This function accepts the same optional keyword arguments + as execute(). + + Examples:: + git.rev_list('master', max_count=10, header=True) + + Returns + Same as execute() + """ + + # Handle optional arguments prior to calling transform_kwargs + # otherwise these'll end up in args, which is bad. + _kwargs = {} + for kwarg in execute_kwargs: + try: + _kwargs[kwarg] = kwargs.pop(kwarg) + except KeyError: + pass + + # Prepare the argument list + opt_args = self.transform_kwargs(**kwargs) + + ext_args = self.__unpack_args([a for a in args if a is not None]) + args = opt_args + ext_args + + call = ["git", dashify(method)] + call.extend(args) + + return self.execute(call, **_kwargs) + + def _parse_object_header(self, header_line): + """ + ``header_line`` + type_string size_as_int + + Returns + (hex_sha, type_string, size_as_int) + + Raises + ValueError if the header contains indication for an error due to incorrect + input sha + """ + tokens = header_line.split() + if len(tokens) != 3: + raise ValueError("SHA named %s could not be resolved, git returned: %r" % (tokens[0], header_line.strip()) ) + if len(tokens[0]) != 40: + raise ValueError("Failed to parse header: %r" % header_line) + return (tokens[0], tokens[1], int(tokens[2])) + + def __prepare_ref(self, ref): + # required for command to separate refs on stdin + refstr = str(ref) # could be ref-object + if refstr.endswith("\n"): + return refstr + return refstr + "\n" + + def __get_persistent_cmd(self, attr_name, cmd_name, *args,**kwargs): + cur_val = getattr(self, attr_name) + if cur_val is not None: + return cur_val + + options = { "istream" : subprocess.PIPE, "as_process" : True } + options.update( kwargs ) + + cmd = self._call_process( cmd_name, *args, **options ) + setattr(self, attr_name, cmd ) + return cmd + + def __get_object_header(self, cmd, ref): + cmd.stdin.write(self.__prepare_ref(ref)) + cmd.stdin.flush() + return self._parse_object_header(cmd.stdout.readline()) + + def get_object_header(self, ref): + """ Use this method to quickly examine the type and size of the object behind + the given ref. + + :note: The method will only suffer from the costs of command invocation + once and reuses the command in subsequent calls. 
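A rough usage sketch of this persistent header lookup (the repository path and refs below are placeholders, not part of the patch)::

    from git import Repo

    repo = Repo("/path/to/repository")
    # the first call spawns `git cat-file --batch-check` and keeps it running
    hexsha, typename, size = repo.git.get_object_header("HEAD")
    # later calls only write another ref to the running process' stdin
    hexsha, typename, size = repo.git.get_object_header("HEAD~1")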
+ + :return: (hexsha, type_string, size_as_int) """ + cmd = self.__get_persistent_cmd("cat_file_header", "cat_file", batch_check=True) + return self.__get_object_header(cmd, ref) + + def get_object_data(self, ref): + """ As get_object_header, but returns object data as well + :return: (hexsha, type_string, size_as_int,data_string) + :note: not threadsafe + """ + hexsha, typename, size, stream = self.stream_object_data(ref) + data = stream.read(size) + del(stream) + return (hexsha, typename, size, data) + + def stream_object_data(self, ref): + """As get_object_header, but returns the data as a stream + :return: (hexsha, type_string, size_as_int, stream) + :note: This method is not threadsafe, you need one independent Command instance + per thread to be safe !""" + cmd = self.__get_persistent_cmd("cat_file_all", "cat_file", batch=True) + hexsha, typename, size = self.__get_object_header(cmd, ref) + return (hexsha, typename, size, self.CatFileContentStream(size, cmd.stdout)) + + def clear_cache(self): + """ + Clear all kinds of internal caches to release resources. + + Currently persistent commands will be interrupted. + + Returns + self + """ + self.cat_file_all = None + self.cat_file_header = None + return self diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index bb15192d..f7043199 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -6,223 +6,223 @@ import os from git.utils import LazyMixin, join_path_native import utils - + _assertion_msg_format = "Created object %r whose python type %r disagrees with the acutal git object type %r" class Object(LazyMixin): - """ - Implements an Object which may be Blobs, Trees, Commits and Tags - - This Object also serves as a constructor for instances of the correct type:: - - inst = Object.new(repo,id) - inst.sha # objects sha in hex - inst.size # objects uncompressed data size - inst.data # byte string containing the whole data of the object - """ - NULL_HEX_SHA = '0'*40 - TYPES = ("blob", "tree", "commit", "tag") - __slots__ = ("repo", "sha", "size", "data" ) - type = None # to be set by subclass - - def __init__(self, repo, id): - """ - Initialize an object by identifying it by its id. All keyword arguments - will be set on demand if None. - - ``repo`` - repository this object is located in - - ``id`` - SHA1 or ref suitable for git-rev-parse - """ - super(Object,self).__init__() - self.repo = repo - self.sha = id + """ + Implements an Object which may be Blobs, Trees, Commits and Tags + + This Object also serves as a constructor for instances of the correct type:: + + inst = Object.new(repo,id) + inst.sha # objects sha in hex + inst.size # objects uncompressed data size + inst.data # byte string containing the whole data of the object + """ + NULL_HEX_SHA = '0'*40 + TYPES = ("blob", "tree", "commit", "tag") + __slots__ = ("repo", "sha", "size", "data" ) + type = None # to be set by subclass + + def __init__(self, repo, id): + """ + Initialize an object by identifying it by its id. All keyword arguments + will be set on demand if None. + + ``repo`` + repository this object is located in + + ``id`` + SHA1 or ref suitable for git-rev-parse + """ + super(Object,self).__init__() + self.repo = repo + self.sha = id - @classmethod - def new(cls, repo, id): - """ - Return - New Object instance of a type appropriate to the object type behind - id. 
The id of the newly created object will be a hexsha even though - the input id may have been a Reference or Rev-Spec - - Note - This cannot be a __new__ method as it would always call __init__ - with the input id which is not necessarily a hexsha. - """ - hexsha, typename, size = repo.git.get_object_header(id) - obj_type = utils.get_object_type_by_name(typename) - inst = obj_type(repo, hexsha) - inst.size = size - return inst - - def _set_self_from_args_(self, args_dict): - """ - Initialize attributes on self from the given dict that was retrieved - from locals() in the calling method. - - Will only set an attribute on self if the corresponding value in args_dict - is not None - """ - for attr, val in args_dict.items(): - if attr != "self" and val is not None: - setattr( self, attr, val ) - # END set all non-None attributes - - def _set_cache_(self, attr): - """ - Retrieve object information - """ - if attr == "size": - hexsha, typename, self.size = self.repo.git.get_object_header(self.sha) - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) - elif attr == "data": - hexsha, typename, self.size, self.data = self.repo.git.get_object_data(self.sha) - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) - else: - super(Object,self)._set_cache_(attr) - - def __eq__(self, other): - """ - Returns - True if the objects have the same SHA1 - """ - return self.sha == other.sha - - def __ne__(self, other): - """ - Returns - True if the objects do not have the same SHA1 - """ - return self.sha != other.sha - - def __hash__(self): - """ - Returns - Hash of our id allowing objects to be used in dicts and sets - """ - return hash(self.sha) - - def __str__(self): - """ - Returns - string of our SHA1 as understood by all git commands - """ - return self.sha - - def __repr__(self): - """ - Returns - string with pythonic representation of our object - """ - return '' % (self.__class__.__name__, self.sha) + @classmethod + def new(cls, repo, id): + """ + Return + New Object instance of a type appropriate to the object type behind + id. The id of the newly created object will be a hexsha even though + the input id may have been a Reference or Rev-Spec + + Note + This cannot be a __new__ method as it would always call __init__ + with the input id which is not necessarily a hexsha. + """ + hexsha, typename, size = repo.git.get_object_header(id) + obj_type = utils.get_object_type_by_name(typename) + inst = obj_type(repo, hexsha) + inst.size = size + return inst + + def _set_self_from_args_(self, args_dict): + """ + Initialize attributes on self from the given dict that was retrieved + from locals() in the calling method. 
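A minimal standalone sketch of this idiom, using an illustrative class that is not part of the library::

    class Example(object):
        def __init__(self, repo, sha, tree=None, message=None):
            # mirrors _set_self_from_args_: copy every non-None local onto self
            for attr, val in locals().items():
                if attr != "self" and val is not None:
                    setattr(self, attr, val)

    e = Example("repo", "abc123", message="hello")
    assert e.message == "hello"
    assert not hasattr(e, "tree")    # left unset, may be filled lazily later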
+ + Will only set an attribute on self if the corresponding value in args_dict + is not None + """ + for attr, val in args_dict.items(): + if attr != "self" and val is not None: + setattr( self, attr, val ) + # END set all non-None attributes + + def _set_cache_(self, attr): + """ + Retrieve object information + """ + if attr == "size": + hexsha, typename, self.size = self.repo.git.get_object_header(self.sha) + assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) + elif attr == "data": + hexsha, typename, self.size, self.data = self.repo.git.get_object_data(self.sha) + assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) + else: + super(Object,self)._set_cache_(attr) + + def __eq__(self, other): + """ + Returns + True if the objects have the same SHA1 + """ + return self.sha == other.sha + + def __ne__(self, other): + """ + Returns + True if the objects do not have the same SHA1 + """ + return self.sha != other.sha + + def __hash__(self): + """ + Returns + Hash of our id allowing objects to be used in dicts and sets + """ + return hash(self.sha) + + def __str__(self): + """ + Returns + string of our SHA1 as understood by all git commands + """ + return self.sha + + def __repr__(self): + """ + Returns + string with pythonic representation of our object + """ + return '' % (self.__class__.__name__, self.sha) - @property - def data_stream(self): - """ - Returns - File Object compatible stream to the uncompressed raw data of the object - """ - proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) - return utils.ProcessStreamAdapter(proc, "stdout") - - def stream_data(self, ostream): - """ - Writes our data directly to the given output stream - - ``ostream`` - File object compatible stream object. - - Returns - self - """ - self.repo.git.cat_file(self.type, self.sha, output_stream=ostream) - return self + @property + def data_stream(self): + """ + Returns + File Object compatible stream to the uncompressed raw data of the object + """ + proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) + return utils.ProcessStreamAdapter(proc, "stdout") + def stream_data(self, ostream): + """ + Writes our data directly to the given output stream + + ``ostream`` + File object compatible stream object. + + Returns + self + """ + self.repo.git.cat_file(self.type, self.sha, output_stream=ostream) + return self + class IndexObject(Object): - """ - Base for all objects that can be part of the index file , namely Tree, Blob and - SubModule objects - """ - __slots__ = ("path", "mode") - - def __init__(self, repo, sha, mode=None, path=None): - """ - Initialize a newly instanced IndexObject - ``repo`` - is the Repo we are located in + """ + Base for all objects that can be part of the index file , namely Tree, Blob and + SubModule objects + """ + __slots__ = ("path", "mode") + + def __init__(self, repo, sha, mode=None, path=None): + """ + Initialize a newly instanced IndexObject + ``repo`` + is the Repo we are located in - ``sha`` : string - is the git object id as hex sha + ``sha`` : string + is the git object id as hex sha - ``mode`` : int - is the file mode as int, use the stat module to evaluate the infomration + ``mode`` : int + is the file mode as int, use the stat module to evaluate the infomration - ``path`` : str - is the path to the file in the file system, relative to the git repository root, i.e. 
- file.ext or folder/other.ext - - NOTE - Path may not be set of the index object has been created directly as it cannot - be retrieved without knowing the parent tree. - """ - super(IndexObject, self).__init__(repo, sha) - self._set_self_from_args_(locals()) - if isinstance(mode, basestring): - self.mode = self._mode_str_to_int(mode) - - def __hash__(self): - """ - Returns - Hash of our path as index items are uniquely identifyable by path, not - by their data ! - """ - return hash(self.path) - - def _set_cache_(self, attr): - if attr in IndexObject.__slots__: - # they cannot be retrieved lateron ( not without searching for them ) - raise AttributeError( "path and mode attributes must have been set during %s object creation" % type(self).__name__ ) - else: - super(IndexObject, self)._set_cache_(attr) - - @classmethod - def _mode_str_to_int(cls, modestr): - """ - ``modestr`` - string like 755 or 644 or 100644 - only the last 6 chars will be used - - Returns - String identifying a mode compatible to the mode methods ids of the - stat module regarding the rwx permissions for user, group and other, - special flags and file system flags, i.e. whether it is a symlink - for example. - """ - mode = 0 - for iteration,char in enumerate(reversed(modestr[-6:])): - mode += int(char) << iteration*3 - # END for each char - return mode - - @property - def name(self): - """ - Returns - Name portion of the path, effectively being the basename - """ - return os.path.basename(self.path) - - @property - def abspath(self): - """ - Returns - Absolute path to this index object in the file system ( as opposed to the - .path field which is a path relative to the git repository ). - - The returned path will be native to the system and contains '\' on windows. - """ - return join_path_native(self.repo.working_tree_dir, self.path) - + ``path`` : str + is the path to the file in the file system, relative to the git repository root, i.e. + file.ext or folder/other.ext + + NOTE + Path may not be set of the index object has been created directly as it cannot + be retrieved without knowing the parent tree. + """ + super(IndexObject, self).__init__(repo, sha) + self._set_self_from_args_(locals()) + if isinstance(mode, basestring): + self.mode = self._mode_str_to_int(mode) + + def __hash__(self): + """ + Returns + Hash of our path as index items are uniquely identifyable by path, not + by their data ! + """ + return hash(self.path) + + def _set_cache_(self, attr): + if attr in IndexObject.__slots__: + # they cannot be retrieved lateron ( not without searching for them ) + raise AttributeError( "path and mode attributes must have been set during %s object creation" % type(self).__name__ ) + else: + super(IndexObject, self)._set_cache_(attr) + + @classmethod + def _mode_str_to_int(cls, modestr): + """ + ``modestr`` + string like 755 or 644 or 100644 - only the last 6 chars will be used + + Returns + String identifying a mode compatible to the mode methods ids of the + stat module regarding the rwx permissions for user, group and other, + special flags and file system flags, i.e. whether it is a symlink + for example. 
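A worked example of the conversion described here, with an illustrative mode string::

    import stat

    modestr = "100644"                        # blob mode as stored in a tree object
    mode = 0
    for iteration, char in enumerate(reversed(modestr[-6:])):
        mode += int(char) << iteration * 3
    assert mode == int(modestr, 8) == 33188   # same as reading the string as octal
    assert stat.S_ISREG(mode)                 # the result works with the stat module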
+ """ + mode = 0 + for iteration,char in enumerate(reversed(modestr[-6:])): + mode += int(char) << iteration*3 + # END for each char + return mode + + @property + def name(self): + """ + Returns + Name portion of the path, effectively being the basename + """ + return os.path.basename(self.path) + + @property + def abspath(self): + """ + Returns + Absolute path to this index object in the file system ( as opposed to the + .path field which is a path relative to the git repository ). + + The returned path will be native to the system and contains '\' on windows. + """ + return join_path_native(self.repo.working_tree_dir, self.path) + diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 87eed49b..948e9a54 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -9,12 +9,14 @@ import git.diff as diff import git.stats as stats from git.actor import Actor from tree import Tree +from cStringIO import StringIO import base import utils import time import os -class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): + +class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Serializable): """ Wraps a git Commit object. @@ -91,7 +93,8 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): self._set_self_from_args_(locals()) if parents is not None: - self.parents = tuple( self.__class__(repo, p) for p in parents ) + cls = type(self) + self.parents = tuple(cls(repo, p) for p in parents if not isinstance(p, cls)) # END for each parent to convert if self.sha and tree is not None: @@ -109,20 +112,9 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): We set all values at once. """ if attr in Commit.__slots__: - # prepare our data lines to match rev-list - data_lines = self.data.splitlines() - data_lines.insert(0, "commit %s" % self.sha) - temp = self._iter_from_process_or_stream(self.repo, iter(data_lines), False).next() - self.parents = temp.parents - self.tree = temp.tree - self.author = temp.author - self.authored_date = temp.authored_date - self.author_tz_offset = temp.author_tz_offset - self.committer = temp.committer - self.committed_date = temp.committed_date - self.committer_tz_offset = temp.committer_tz_offset - self.message = temp.message - self.encoding = temp.encoding + # read the data in a chunk, its faster - then provide a file wrapper + hexsha, typename, size, data = self.repo.git.get_object_data(self) + self._deserialize(StringIO(data)) else: super(Commit, self)._set_cache_(attr) @@ -260,59 +252,18 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): iterator returning Commit objects """ stream = proc_or_stream - if not hasattr(stream,'next'): + if not hasattr(stream,'readline'): stream = proc_or_stream.stdout - for line in stream: - commit_tokens = line.split() + while True: + line = stream.readline() + if not line: + break + commit_tokens = line.split() id = commit_tokens[1] assert commit_tokens[0] == "commit" - tree = stream.next().split()[1] - - parents = [] - next_line = None - for parent_line in stream: - if not parent_line.startswith('parent'): - next_line = parent_line - break - # END abort reading parents - parents.append(parent_line.split()[-1]) - # END for each parent line - - author, authored_date, author_tz_offset = utils.parse_actor_and_date(next_line) - committer, committed_date, committer_tz_offset = utils.parse_actor_and_date(stream.next()) - - # empty line - encoding = stream.next() - encoding.strip() - if encoding: - encoding = 
encoding[encoding.find(' ')+1:] - # END parse encoding - - message_lines = list() - if from_rev_list: - for msg_line in stream: - if not msg_line.startswith(' '): - # and forget about this empty marker - break - # END abort message reading - # strip leading 4 spaces - message_lines.append(msg_line[4:]) - # END while there are message lines - else: - # a stream from our data simply gives us the plain message - for msg_line in stream: - message_lines.append(msg_line) - # END message parsing - message = '\n'.join(message_lines) - - - yield Commit(repo, id, tree, - author, authored_date, author_tz_offset, - committer, committed_date, committer_tz_offset, - message, tuple(parents), - encoding or cls.default_encoding) + yield Commit(repo, id)._deserialize(stream, from_rev_list) # END for each line in stream @@ -393,7 +344,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): # assume utf8 encoding enc_section, enc_option = cls.conf_encoding.split('.') - conf_encoding = cr.get_value(enc_section, enc_option, default_encoding) + conf_encoding = cr.get_value(enc_section, enc_option, cls.default_encoding) author = Actor(author_name, author_email) committer = Actor(committer_name, committer_email) @@ -429,3 +380,61 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): def __repr__(self): return '' % self.sha + #{ Serializable Implementation + + def _serialize(self, stream): + # for now, this is very inefficient and in fact shouldn't be used like this + return super(Commit, self)._serialize(stream) + + def _deserialize(self, stream, from_rev_list=False): + """:param from_rev_list: if true, the stream format is coming from the rev-list command + Otherwise it is assumed to be a plain data stream from our object""" + self.tree = Tree(self.repo, stream.readline().split()[1], 0, '') + + self.parents = list() + next_line = None + while True: + parent_line = stream.readline() + if not parent_line.startswith('parent'): + next_line = parent_line + break + # END abort reading parents + self.parents.append(type(self)(self.repo, parent_line.split()[-1])) + # END for each parent line + self.parents = tuple(self.parents) + + self.author, self.authored_date, self.author_tz_offset = utils.parse_actor_and_date(next_line) + self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(stream.readline()) + + + # empty line + self.encoding = self.default_encoding + enc = stream.readline() + enc.strip() + if enc: + self.encoding = enc[enc.find(' ')+1:] + # END parse encoding + + message_lines = list() + if from_rev_list: + while True: + msg_line = stream.readline() + if not msg_line.startswith(' '): + # and forget about this empty marker + # cut the last newline to get rid of the artificial newline added + # by rev-list command. 
Lets hope its just linux style \n + message_lines[-1] = message_lines[-1][:-1] + break + # END abort message reading + # strip leading 4 spaces + message_lines.append(msg_line[4:]) + # END while there are message lines + self.message = ''.join(message_lines) + else: + # a stream from our data simply gives us the plain message + # The end of our message stream is marked with a newline that we strip + self.message = stream.read()[:-1] + # END message parsing + return self + + #} END serializable implementation diff --git a/lib/git/objects/tree.py b/lib/git/objects/tree.py index a9e60981..285d3b5b 100644 --- a/lib/git/objects/tree.py +++ b/lib/git/objects/tree.py @@ -209,7 +209,7 @@ class Tree(base.IndexObject, diff.Diffable, utils.Traversable): visit_once = False, ignore_self=1 ): """For documentation, see utils.Traversable.traverse - Trees are set to visist_once = False to gain more performance in the traversal""" + Trees are set to visit_once = False to gain more performance in the traversal""" return super(Tree, self).traverse(predicate, prune, depth, branch_first, visit_once, ignore_self) # List protocol diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 7060e293..6d378a72 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -280,3 +280,20 @@ class Traversable(object): addToStack( stack, item, branch_first, nd ) # END for each item on work stack + + +class Serializable(object): + """Defines methods to serialize and deserialize objects from and into a data stream""" + + def _serialize(self, stream): + """Serialize the data of this object into the given data stream + :note: a serialized object would ``_deserialize`` into the same objet + :param stream: a file-like object + :return: self""" + raise NotImplementedError("To be implemented in subclass") + + def _deserialize(self, stream): + """Deserialize all information regarding this object from the stream + :param stream: a file-like object + :return: self""" + raise NotImplementedError("To be implemented in subclass") -- cgit v1.2.1 From ae5a69f67822d81bbbd8f4af93be68703e730b37 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 16:41:28 +0200 Subject: commit: redesigned revlist and commit parsing, commits are always retrieved from their object information directly. This is faster, and resolves issues with the rev-list format and empty commit messages Adjusted many tests to go with the changes, as they were still mocked. The mock was removed if necessary and replaced by code that actually executes --- lib/git/objects/commit.py | 98 +++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 58 deletions(-) (limited to 'lib') diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 948e9a54..98aca360 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -106,13 +106,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri return commit.parents def _set_cache_(self, attr): - """ - Called by LazyMixin superclass when the given uninitialized member needs + """ Called by LazyMixin superclass when the given uninitialized member needs to be set. - We set all values at once. - """ + We set all values at once. 
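For reference, the plain commit data handed to _deserialize has the following shape; the shas, names and timestamps below are made up for illustration::

    from cStringIO import StringIO

    lines = [
        "tree " + "a" * 40,
        "parent " + "b" * 40,
        "author A U Thor <author@example.com> 1275515296 +0200",
        "committer C O Mitter <committer@example.com> 1275515296 +0200",
        "",
        "first line of the commit message",
    ]
    stream = StringIO("\n".join(lines) + "\n")
    assert stream.readline().split()[1] == "a" * 40   # the tree line is read first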
""" if attr in Commit.__slots__: # read the data in a chunk, its faster - then provide a file wrapper + # Could use self.data, but lets try to get it with less calls hexsha, typename, size, data = self.repo.git.get_object_data(self) self._deserialize(StringIO(data)) else: @@ -181,16 +180,16 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri Returns iterator yielding Commit items """ - options = {'pretty': 'raw', 'as_process' : True } - options.update(kwargs) - + if 'pretty' in kwargs: + raise ValueError("--pretty cannot be used as parsing expects single sha's only") + # END handle pretty args = list() if paths: args.extend(('--', paths)) # END if paths - proc = repo.git.rev_list(rev, args, **options) - return cls._iter_from_process_or_stream(repo, proc, True) + proc = repo.git.rev_list(rev, args, as_process=True, **kwargs) + return cls._iter_from_process_or_stream(repo, proc) def iter_parents(self, paths='', **kwargs): """ @@ -235,35 +234,30 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri return stats.Stats._list_from_string(self.repo, text) @classmethod - def _iter_from_process_or_stream(cls, repo, proc_or_stream, from_rev_list): - """ - Parse out commit information into a list of Commit objects - - ``repo`` - is the Repo - - ``proc`` - git-rev-list process instance (raw format) + def _iter_from_process_or_stream(cls, repo, proc_or_stream): + """Parse out commit information into a list of Commit objects + We expect one-line per commit, and parse the actual commit information directly + from our lighting fast object database - ``from_rev_list`` - If True, the stream was created by rev-list in which case we parse - the message differently - Returns - iterator returning Commit objects - """ + :param proc: git-rev-list process instance - one sha per line + :return: iterator returning Commit objects""" stream = proc_or_stream if not hasattr(stream,'readline'): stream = proc_or_stream.stdout + readline = stream.readline while True: - line = stream.readline() + line = readline() if not line: break - commit_tokens = line.split() - id = commit_tokens[1] - assert commit_tokens[0] == "commit" + sha = line.strip() + if len(sha) > 40: + # split additional information, as returned by bisect for instance + sha, rest = line.split(None, 1) + # END handle extra info - yield Commit(repo, id)._deserialize(stream, from_rev_list) + assert len(sha) == 40, "Invalid line: %s" % sha + yield Commit(repo, sha) # END for each line in stream @@ -386,15 +380,16 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri # for now, this is very inefficient and in fact shouldn't be used like this return super(Commit, self)._serialize(stream) - def _deserialize(self, stream, from_rev_list=False): + def _deserialize(self, stream): """:param from_rev_list: if true, the stream format is coming from the rev-list command Otherwise it is assumed to be a plain data stream from our object""" - self.tree = Tree(self.repo, stream.readline().split()[1], 0, '') + readline = stream.readline + self.tree = Tree(self.repo, readline().split()[1], 0, '') self.parents = list() next_line = None while True: - parent_line = stream.readline() + parent_line = readline() if not parent_line.startswith('parent'): next_line = parent_line break @@ -404,37 +399,24 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri self.parents = tuple(self.parents) self.author, self.authored_date, self.author_tz_offset = 
utils.parse_actor_and_date(next_line) - self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(stream.readline()) + self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(readline()) - # empty line + # now we can have the encoding line, or an empty line followed by the optional + # message. self.encoding = self.default_encoding - enc = stream.readline() - enc.strip() + # read encoding or empty line to separate message + enc = readline() + enc = enc.strip() if enc: self.encoding = enc[enc.find(' ')+1:] - # END parse encoding - - message_lines = list() - if from_rev_list: - while True: - msg_line = stream.readline() - if not msg_line.startswith(' '): - # and forget about this empty marker - # cut the last newline to get rid of the artificial newline added - # by rev-list command. Lets hope its just linux style \n - message_lines[-1] = message_lines[-1][:-1] - break - # END abort message reading - # strip leading 4 spaces - message_lines.append(msg_line[4:]) - # END while there are message lines - self.message = ''.join(message_lines) - else: - # a stream from our data simply gives us the plain message - # The end of our message stream is marked with a newline that we strip - self.message = stream.read()[:-1] - # END message parsing + # now comes the message separator + readline() + # END handle encoding + + # a stream from our data simply gives us the plain message + # The end of our message stream is marked with a newline that we strip + self.message = stream.read()[:-1] return self #} END serializable implementation -- cgit v1.2.1 From 538820055ce1bf9dd07ecda48210832f96194504 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 17:39:17 +0200 Subject: git.cmd: added test for stream section constraint used in git command, found bug of course which just didn't kick in yet --- lib/git/cmd.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/git/cmd.py b/lib/git/cmd.py index cef4ea60..aaa27adc 100644 --- a/lib/git/cmd.py +++ b/lib/git/cmd.py @@ -130,8 +130,13 @@ class Git(object): if self._nbr == self._size: return '' + # clamp size to lowest allowed value + bytes_left = self._size - self._nbr if size > -1: - size = min(self._size - self._nbr, size) + size = min(bytes_left, size) + else: + size = bytes_left + # END handle size data = self._stream.readline(size) self._nbr += len(data) -- cgit v1.2.1 From 282018b79cc8df078381097cb3aeb29ff56e83c6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 20:11:00 +0200 Subject: Added first design and frame for object database. In a first step, loose objects will be written using our utilities, and certain object retrieval functionality moves into the GitObjectDatabase which is used by the repo instance Added performance test for object database access, which shows quite respectable tree parsing performance, and okay blob access. Nonetheless, it will be hard to beat the c performance using a pure python implementation, but it can be a nice practice to write it anyway to allow more direct pack manipulations. 
Some could benefit from the ability to write packs as these can serve as local cache if alternates are used --- lib/git/errors.py | 15 +++--- lib/git/odb/__init__.py | 2 + lib/git/odb/db.py | 129 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+), 9 deletions(-) create mode 100644 lib/git/odb/__init__.py create mode 100644 lib/git/odb/db.py (limited to 'lib') diff --git a/lib/git/errors.py b/lib/git/errors.py index f66fb528..ecb1c35b 100644 --- a/lib/git/errors.py +++ b/lib/git/errors.py @@ -8,19 +8,16 @@ Module containing all exceptions thrown througout the git package, """ class InvalidGitRepositoryError(Exception): - """ - Thrown if the given repository appears to have an invalid format. - """ + """ Thrown if the given repository appears to have an invalid format. """ + +class InvalidDBRoot(Exception): + """Thrown if an object database cannot be initialized at the given path""" class NoSuchPathError(OSError): - """ - Thrown if a path could not be access by the system. - """ + """ Thrown if a path could not be access by the system. """ class GitCommandError(Exception): - """ - Thrown if execution of the git command fails with non-zero status code. - """ + """ Thrown if execution of the git command fails with non-zero status code. """ def __init__(self, command, status, stderr=None): self.stderr = stderr self.status = status diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py new file mode 100644 index 00000000..17000244 --- /dev/null +++ b/lib/git/odb/__init__.py @@ -0,0 +1,2 @@ +"""Initialize the object database module""" + diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py new file mode 100644 index 00000000..fd1b640a --- /dev/null +++ b/lib/git/odb/db.py @@ -0,0 +1,129 @@ +"""Contains implementations of database retrieveing objects""" +import os +from git.errors import InvalidDBRoot + + +class iObjectDBR(object): + """Defines an interface for object database lookup. 
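The two identifier forms used throughout the database interfaces convert into each other with binascii, as this small sketch shows::

    import binascii

    hexsha = "0" * 40                         # e.g. Object.NULL_HEX_SHA
    binsha = binascii.a2b_hex(hexsha)         # 20 byte binary form
    assert len(hexsha) == 40 and len(binsha) == 20
    assert binascii.b2a_hex(binsha) == hexsha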
+ Objects are identified either by hex-sha (40 bytes) or + by sha (20 bytes)""" + __slots__ = tuple() + + #{ Query Interface + def has_obj_hex(self, hexsha): + """:return: True if the object identified by the given 40 byte hexsha is + contained in the database""" + raise NotImplementedError("To be implemented in subclass") + + def has_obj_bin(self, sha): + """:return: as ``has_obj_hex``, but takes a 20 byte binary sha""" + raise NotImplementedError("To be implemented in subclass") + + def obj_hex(self, hexsha): + """:return: tuple(type_string, size_in_bytes, stream) a tuple with object + information including its type, its size as well as a stream from which its + contents can be read""" + raise NotImplementedError("To be implemented in subclass") + + def obj_bin(self, sha): + """:return: as in ``obj_hex``, but takes a binary sha""" + raise NotImplementedError("To be implemented in subclass") + + def obj_info_hex(self, hexsha): + """:return: tuple(type_string, size_in_bytes) tuple with the object's type + string as well as its size in bytes""" + raise NotImplementedError("To be implemented in subclass") + + #} END query interface + +class iObjectDBW(object): + """Defines an interface to create objects in the database""" + __slots__ = tuple() + + #{ Edit Interface + + def to_obj(self, type, size, stream, dry_run=False, sha_as_hex=True): + """Create a new object in the database + :return: the sha identifying the object in the database + :param type: type string identifying the object + :param size: size of the data to read from stream + :param stream: stream providing the data + :param dry_run: if True, the object database will not actually be changed + :param sha_as_hex: if True, the returned sha identifying the object will be + hex encoded, not binary""" + raise NotImplementedError("To be implemented in subclass") + + def to_objs(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): + """Create multiple new objects in the database + :return: sequence of shas identifying the created objects in the order in which + they where given. + :param iter_info: iterable yielding tuples containing the type_string + size_in_bytes and the steam with the content data. + :param dry_run: see ``to_obj`` + :param sha_as_hex: see ``to_obj`` + :param max_threads: if < 1, any number of threads may be started while processing + the request, otherwise the given number of threads will be started.""" + # a trivial implementation, ignoring the threads for now + # TODO: add configuration to the class to determine whether we may + # actually use multiple threads, default False of course. 
If the add + shas = list() + for args in iter_info: + shas.append(self.to_obj(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) + return shas + + #} END edit interface + + +class FileDBBase(object): + """Provides basic facilities to retrieve files of interest, including + caching facilities to help mapping hexsha's to objects""" + __slots__ = ('_root_path', ) + + def __init__(self, root_path): + """Initialize this instance to look for its files at the given root path + All subsequent operations will be relative to this path + :raise InvalidDBRoot: + :note: The base will perform basic checking for accessability, but the subclass + is required to verify that the root_path contains the database structure it needs""" + if not os.path.isdir(root_path): + raise InvalidDBRoot(root_path) + self._root_path = root_path + + + #{ Interface + def root_path(self): + """:return: path at which this db operates""" + return self._root_path + + #} END interface + + #{ Utiltities + def _root_rela_path(self, rela_path): + """:return: the given relative path relative to our database root""" + return os.path.join(self._root_path, rela_path) + + #} END utilities + + +class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): + """A database which operates on loose object files""" + + +class PackedDB(FileDBBase, iObjectDBR): + """A database operating on a set of object packs""" + + +class CompoundDB(iObjectDBR): + """A database which delegates calls to sub-databases""" + + +class ReferenceDB(CompoundDB): + """A database consisting of database referred to in a file""" + + +class GitObjectDB(CompoundDB, iObjectDBW): + """A database representing the default git object store, which includes loose + objects, pack files and an alternates file + + It will create objects only in the loose object database.""" + -- cgit v1.2.1 From 8b86f9b399a8f5af792a04025fdeefc02883f3e5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 22:40:52 +0200 Subject: initial version of loose object writing and simple cached object lookup appears to be working --- lib/git/index.py | 4 +- lib/git/odb/db.py | 148 ++++++++++++++++++++++++++++++++++++++++++--------- lib/git/odb/utils.py | 78 +++++++++++++++++++++++++++ lib/git/utils.py | 6 ++- 4 files changed, 208 insertions(+), 28 deletions(-) create mode 100644 lib/git/odb/utils.py (limited to 'lib') diff --git a/lib/git/index.py b/lib/git/index.py index 8ccc3fe3..36428315 100644 --- a/lib/git/index.py +++ b/lib/git/index.py @@ -21,7 +21,7 @@ import git.diff as diff from errors import GitCommandError from git.objects import Blob, Tree, Object, Commit -from git.utils import SHA1Writer, LazyMixin, ConcurrentWriteOperation, join_path_native +from git.utils import IndexFileSHA1Writer, LazyMixin, ConcurrentWriteOperation, join_path_native class CheckoutError( Exception ): @@ -461,7 +461,7 @@ class IndexFile(LazyMixin, diff.Diffable): write_op = ConcurrentWriteOperation(file_path or self._file_path) stream = write_op._begin_writing() - stream = SHA1Writer(stream) + stream = IndexFileSHA1Writer(stream) # header stream.write("DIRC") diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index fd1b640a..204da9ad 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -1,6 +1,21 @@ """Contains implementations of database retrieveing objects""" import os from git.errors import InvalidDBRoot +from git.utils import IndexFileSHA1Writer + +from utils import ( + to_hex_sha, + exists, + hex_to_bin, + FDCompressedSha1Writer, + isdir, + mkdir, + rename, + dirname, + join + ) + +import tempfile class 
iObjectDBR(object): @@ -9,29 +24,29 @@ class iObjectDBR(object): by sha (20 bytes)""" __slots__ = tuple() + def __contains__(self, sha): + return self.has_obj + #{ Query Interface - def has_obj_hex(self, hexsha): - """:return: True if the object identified by the given 40 byte hexsha is - contained in the database""" - raise NotImplementedError("To be implemented in subclass") - - def has_obj_bin(self, sha): - """:return: as ``has_obj_hex``, but takes a 20 byte binary sha""" - raise NotImplementedError("To be implemented in subclass") - - def obj_hex(self, hexsha): - """:return: tuple(type_string, size_in_bytes, stream) a tuple with object - information including its type, its size as well as a stream from which its - contents can be read""" + def has_object(self, sha): + """ + :return: True if the object identified by the given 40 byte hexsha or 20 bytes + binary sha is contained in the database""" raise NotImplementedError("To be implemented in subclass") - def obj_bin(self, sha): - """:return: as in ``obj_hex``, but takes a binary sha""" + def object(self, sha): + """ + :return: tuple(type_string, size_in_bytes, stream) a tuple with object + information including its type, its size as well as a stream from which its + contents can be read + :param sha: 40 bytes hexsha or 20 bytes binary sha """ raise NotImplementedError("To be implemented in subclass") - def obj_info_hex(self, hexsha): - """:return: tuple(type_string, size_in_bytes) tuple with the object's type - string as well as its size in bytes""" + def object_info(self, sha): + """ + :return: tuple(type_string, size_in_bytes) tuple with the object's type + string as well as its size in bytes + :param sha: 40 bytes hexsha or 20 bytes binary sha""" raise NotImplementedError("To be implemented in subclass") #} END query interface @@ -42,7 +57,7 @@ class iObjectDBW(object): #{ Edit Interface - def to_obj(self, type, size, stream, dry_run=False, sha_as_hex=True): + def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): """Create a new object in the database :return: the sha identifying the object in the database :param type: type string identifying the object @@ -53,7 +68,7 @@ class iObjectDBW(object): hex encoded, not binary""" raise NotImplementedError("To be implemented in subclass") - def to_objs(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): + def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): """Create multiple new objects in the database :return: sequence of shas identifying the created objects in the order in which they where given. @@ -68,7 +83,7 @@ class iObjectDBW(object): # actually use multiple threads, default False of course. 
If the add shas = list() for args in iter_info: - shas.append(self.to_obj(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) + shas.append(self.to_object(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) return shas #} END edit interface @@ -95,18 +110,103 @@ class FileDBBase(object): """:return: path at which this db operates""" return self._root_path + def db_path(self, rela_path): + """ + :return: the given relative path relative to our database root, allowing + to pontentially access datafiles""" + return join(self._root_path, rela_path) #} END interface #{ Utiltities - def _root_rela_path(self, rela_path): - """:return: the given relative path relative to our database root""" - return os.path.join(self._root_path, rela_path) + #} END utilities class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): """A database which operates on loose object files""" + __slots__ = ('_hexsha_to_file', ) + + # CONFIGURATION + # chunks in which data will be copied between streams + stream_chunk_size = 1000*1000 + + def __init__(self, root_path): + super(LooseObjectDB, self).__init__(root_path) + self._hexsha_to_file = dict() + + #{ Interface + def hexsha_to_object_path(self, hexsha): + """ + :return: path at which the object with the given hexsha would be stored, + relative to the database root""" + return join(hexsha[:2], hexsha[2:]) + + #} END interface + + def has_object(self, sha): + sha = to_hex_sha(sha) + # try cache + if sha in self._hexsha_to_file: + return True + + # try filesystem + path = self.db_path(self.hexsha_to_object_path(sha)) + if exists(path): + self._hexsha_to_file[sha] = path + return True + # END handle cache + return False + + def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): + # open a tmp file to write the data to + fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) + writer = FDCompressedSha1Writer(fd) + + # WRITE HEADER: type SP size NULL + writer.write("%s %i%s" % (type, size, chr(0))) + + # WRITE ALL DATA + chunksize = self.stream_chunk_size + try: + try: + while True: + data_len = writer.write(stream.read(chunksize)) + if data_len < chunksize: + # WRITE FOOTER + writer.write('\n') + break + # END check for stream end + # END duplicate data + finally: + writer.close() + # END assure file was closed + except: + os.remove(tmp_path) + raise + # END assure tmpfile removal on error + + + # in dry-run mode, we delete the file afterwards + sha = writer.sha(as_hex=True) + + if dry_run: + os.remove(tmp_path) + else: + # rename the file into place + obj_path = self.db_path(self.hexsha_to_object_path(sha)) + obj_dir = dirname(obj_path) + if not isdir(obj_dir): + mkdir(obj_dir) + # END handle destination directory + rename(tmp_path, obj_path) + # END handle dry_run + + if not sha_as_hex: + sha = hex_to_bin(sha) + # END handle sha format + + return sha class PackedDB(FileDBBase, iObjectDBR): diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py new file mode 100644 index 00000000..04d3eaba --- /dev/null +++ b/lib/git/odb/utils.py @@ -0,0 +1,78 @@ +import binascii +import os +import zlib +from git.utils import make_sha + +__all__ = ('FDSha1Writer', ) + +#{ Routines + +hex_to_bin = binascii.a2b_hex +bin_to_hex = binascii.b2a_hex + +def to_hex_sha(sha): + """:return: hexified version of sha""" + if len(sha) == 40: + return sha + return bin_to_hex(sha) + +def to_bin_sha(sha): + if len(sha) == 20: + return sha + return hex_to_bin(sha) + +# os shortcuts +exists = os.path.exists +mkdir = os.mkdir +isdir = os.path.isdir +rename = os.rename +dirname = 
os.path.dirname +join = os.path.join +read = os.read +write = os.write +close = os.close +#} END Routines + + +#{ Classes + +class FDCompressedSha1Writer(object): + """Digests data written to it, making the sha available, then compress the + data and write it to the file descriptor + :note: operates on raw file descriptors + :note: for this to work, you have to use the close-method of this instance""" + __slots__ = ("fd", "sha1", "zip") + + # default exception + exc = IOError("Failed to write all bytes to filedescriptor") + + def __init__(self, fd): + self.fd = fd + self.sha1 = make_sha("") + self.zip = zlib.compressobj() + + def write(self, data): + """:raise IOError: If not all bytes could be written + :return: lenght of incoming data""" + self.sha1.update(data) + cdata = self.zip.compress(data) + bytes_written = write(self.fd, cdata) + if bytes_written != len(cdata): + raise self.exc + return bytes_written + + def sha(self, as_hex = False): + """:return: sha so far + :param as_hex: if True, sha will be hex-encoded, binary otherwise""" + if as_hex: + return self.sha1.hexdigest() + return self.sha1.digest() + + def close(self): + remainder = self.zip.flush() + if write(self.fd, remainder) != len(remainder): + raise self.exc + return close(self.fd) + + +#} END classes diff --git a/lib/git/utils.py b/lib/git/utils.py index c21528b1..360c77c9 100644 --- a/lib/git/utils.py +++ b/lib/git/utils.py @@ -61,12 +61,14 @@ def join_path_native(a, *p): return to_native_path(join_path(a, *p)) -class SHA1Writer(object): +class IndexFileSHA1Writer(object): """ Wrapper around a file-like object that remembers the SHA1 of the data written to it. It will write a sha when the stream is closed or if the asked for explicitly usign write_sha. + Only useful to the indexfile + Note: Based on the dulwich project """ @@ -78,7 +80,7 @@ class SHA1Writer(object): def write(self, data): self.sha1.update(data) - self.f.write(data) + return self.f.write(data) def write_sha(self): sha = self.sha1.digest() -- cgit v1.2.1 From 6f8ce8901e21587cd2320562df412e05b5ab1731 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 23:53:29 +0200 Subject: added frame for object reading, including simple test --- lib/git/errors.py | 8 +++++- lib/git/odb/db.py | 76 ++++++++++++++++++++++++++++++++++++++++++---------- lib/git/odb/utils.py | 1 + 3 files changed, 70 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/git/errors.py b/lib/git/errors.py index ecb1c35b..956e007f 100644 --- a/lib/git/errors.py +++ b/lib/git/errors.py @@ -10,8 +10,14 @@ Module containing all exceptions thrown througout the git package, class InvalidGitRepositoryError(Exception): """ Thrown if the given repository appears to have an invalid format. """ -class InvalidDBRoot(Exception): +class ODBError(Exception): + """All errors thrown by the object database""" + +class InvalidDBRoot(ODBError): """Thrown if an object database cannot be initialized at the given path""" + +class BadObject(ODBError): + """The object with the given SHA does not exist""" class NoSuchPathError(OSError): """ Thrown if a path could not be access by the system. 
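For orientation, the byte layout fed to the FDCompressedSha1Writer above when a loose object is written looks roughly like this; the content is illustrative only::

    import hashlib, zlib

    body = "illustrative blob content\n"
    header = "blob %i\0" % len(body)               # "type SP size NUL", as git expects
    sha = hashlib.sha1(header + body).hexdigest()  # sha1 over header plus content
    compressed = zlib.compress(header + body)      # deflated stream stored on disk
    # the object file would live at objects/<sha[:2]>/<sha[2:]> below the database root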
""" diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 204da9ad..1248a3f4 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -1,9 +1,13 @@ """Contains implementations of database retrieveing objects""" import os -from git.errors import InvalidDBRoot +from git.errors import ( + InvalidDBRoot, + BadObject + ) from git.utils import IndexFileSHA1Writer from utils import ( + getsize, to_hex_sha, exists, hex_to_bin, @@ -16,6 +20,7 @@ from utils import ( ) import tempfile +import mmap class iObjectDBR(object): @@ -136,27 +141,70 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): self._hexsha_to_file = dict() #{ Interface - def hexsha_to_object_path(self, hexsha): + def object_path(self, hexsha): """ :return: path at which the object with the given hexsha would be stored, relative to the database root""" return join(hexsha[:2], hexsha[2:]) - #} END interface - - def has_object(self, sha): - sha = to_hex_sha(sha) - # try cache - if sha in self._hexsha_to_file: - return True + def readable_db_object_path(self, hexsha): + """ + :return: readable object path to the object identified by hexsha + :raise BadObject: If the object file does not exist""" + try: + return self._hexsha_to_file[hexsha] + except KeyError: + pass + # END ignore cache misses # try filesystem - path = self.db_path(self.hexsha_to_object_path(sha)) + path = self.db_path(self.object_path(hexsha)) if exists(path): - self._hexsha_to_file[sha] = path - return True + self._hexsha_to_file[hexsha] = path + return path # END handle cache - return False + raise BadObject(hexsha) + + #} END interface + + def _object_header_info(self, mmap): + """:return: tuple(type_string, uncompressed_size_in_bytes + :param mmap: newly mapped memory map at position 0. It will be + seeked to the actual start of the object contents, which can be used + to initialize a zlib decompress object.""" + raise NotImplementedError("todo") + + def _map_object(self, sha): + """ + :return: tuple(file, mmap) tuple with an opened file for reading, and + a memory map of that file""" + db_path = self.readable_db_object_path(to_hex_sha(sha)) + f = open(db_path, 'rb') + m = mmap.mmap(f.fileno(), getsize(db_path), access=mmap.ACCESS_READ) + return (f, m) + + def object_info(self, sha): + f, m = self._map_object(sha) + try: + type, size = self._object_header_info(m) + finally: + f.close() + m.close() + # END assure release of system resources + + def object(self, sha): + f, m = self._map_object(sha) + type, size = self._object_header_info(m) + # TODO: init a dynamic decompress stream from our memory map + + + def has_object(self, sha): + try: + self.readable_db_object_path(to_hex_sha(sha)) + return True + except BadObject: + return False + # END check existance def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): # open a tmp file to write the data to @@ -194,7 +242,7 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): os.remove(tmp_path) else: # rename the file into place - obj_path = self.db_path(self.hexsha_to_object_path(sha)) + obj_path = self.db_path(self.object_path(sha)) obj_dir = dirname(obj_path) if not isdir(obj_dir): mkdir(obj_dir) diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 04d3eaba..d88dca1a 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -22,6 +22,7 @@ def to_bin_sha(sha): return hex_to_bin(sha) # os shortcuts +getsize = os.path.getsize exists = os.path.exists mkdir = os.mkdir isdir = os.path.isdir -- cgit v1.2.1 From 38d59fc8ccccae8882fa48671377bf40a27915a7 Mon Sep 17 00:00:00 2001 
From: Sebastian Thiel Date: Thu, 3 Jun 2010 16:35:35 +0200 Subject: odb: implemented loose object streaming, which is impossible to do efficiently considering that it copies string buffers all the time --- lib/git/errors.py | 3 + lib/git/objects/base.py | 4 +- lib/git/odb/db.py | 114 ++++++++++++++++++++----------------- lib/git/odb/fun.py | 114 +++++++++++++++++++++++++++++++++++++ lib/git/odb/utils.py | 147 +++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 327 insertions(+), 55 deletions(-) create mode 100644 lib/git/odb/fun.py (limited to 'lib') diff --git a/lib/git/errors.py b/lib/git/errors.py index 956e007f..d8a35e02 100644 --- a/lib/git/errors.py +++ b/lib/git/errors.py @@ -18,6 +18,9 @@ class InvalidDBRoot(ODBError): class BadObject(ODBError): """The object with the given SHA does not exist""" + +class BadObjectType(ODBError): + """The object had an unsupported type""" class NoSuchPathError(OSError): """ Thrown if a path could not be access by the system. """ diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index f7043199..64a5678e 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -125,8 +125,8 @@ class Object(LazyMixin): Returns File Object compatible stream to the uncompressed raw data of the object """ - proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) - return utils.ProcessStreamAdapter(proc, "stdout") + sha, type, size, stream = self.repo.git.stream_object_data(self.sha) + return stream def stream_data(self, ostream): """ diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 1248a3f4..5c50a512 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -1,17 +1,18 @@ """Contains implementations of database retrieveing objects""" -import os +from git.utils import IndexFileSHA1Writer from git.errors import ( InvalidDBRoot, - BadObject + BadObject, + BadObjectType ) -from git.utils import IndexFileSHA1Writer from utils import ( - getsize, + DecompressMemMapReader, + FDCompressedSha1Writer, + ENOENT, to_hex_sha, exists, hex_to_bin, - FDCompressedSha1Writer, isdir, mkdir, rename, @@ -19,8 +20,15 @@ from utils import ( join ) +from fun import ( + chunk_size, + loose_object_header_info, + write_object + ) + import tempfile import mmap +import os class iObjectDBR(object): @@ -36,7 +44,8 @@ class iObjectDBR(object): def has_object(self, sha): """ :return: True if the object identified by the given 40 byte hexsha or 20 bytes - binary sha is contained in the database""" + binary sha is contained in the database + :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def object(self, sha): @@ -44,14 +53,16 @@ class iObjectDBR(object): :return: tuple(type_string, size_in_bytes, stream) a tuple with object information including its type, its size as well as a stream from which its contents can be read - :param sha: 40 bytes hexsha or 20 bytes binary sha """ + :param sha: 40 bytes hexsha or 20 bytes binary sha + :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def object_info(self, sha): """ :return: tuple(type_string, size_in_bytes) tuple with the object's type string as well as its size in bytes - :param sha: 40 bytes hexsha or 20 bytes binary sha""" + :param sha: 40 bytes hexsha or 20 bytes binary sha + :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") #} END query interface @@ -70,7 +81,8 @@ class iObjectDBW(object): :param stream: stream providing the data :param dry_run: if True, the object database will not actually be changed 
:param sha_as_hex: if True, the returned sha identifying the object will be - hex encoded, not binary""" + hex encoded, not binary + :raise IOError: if data could not be written""" raise NotImplementedError("To be implemented in subclass") def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): @@ -82,7 +94,8 @@ class iObjectDBW(object): :param dry_run: see ``to_obj`` :param sha_as_hex: see ``to_obj`` :param max_threads: if < 1, any number of threads may be started while processing - the request, otherwise the given number of threads will be started.""" + the request, otherwise the given number of threads will be started. + :raise IOError: if data could not be written""" # a trivial implementation, ignoring the threads for now # TODO: add configuration to the class to determine whether we may # actually use multiple threads, default False of course. If the add @@ -130,15 +143,19 @@ class FileDBBase(object): class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): """A database which operates on loose object files""" - __slots__ = ('_hexsha_to_file', ) - + __slots__ = ('_hexsha_to_file', '_fd_open_flags') # CONFIGURATION # chunks in which data will be copied between streams - stream_chunk_size = 1000*1000 + stream_chunk_size = chunk_size + def __init__(self, root_path): super(LooseObjectDB, self).__init__(root_path) self._hexsha_to_file = dict() + # Additional Flags - might be set to 0 after the first failure + # Depending on the root, this might work for some mounts, for others not, which + # is why it is per instance + self._fd_open_flags = os.O_NOATIME #{ Interface def object_path(self, hexsha): @@ -167,36 +184,46 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): #} END interface - def _object_header_info(self, mmap): - """:return: tuple(type_string, uncompressed_size_in_bytes - :param mmap: newly mapped memory map at position 0. 
It will be - seeked to the actual start of the object contents, which can be used - to initialize a zlib decompress object.""" - raise NotImplementedError("todo") - - def _map_object(self, sha): + def _map_loose_object(self, sha): """ - :return: tuple(file, mmap) tuple with an opened file for reading, and - a memory map of that file""" - db_path = self.readable_db_object_path(to_hex_sha(sha)) - f = open(db_path, 'rb') - m = mmap.mmap(f.fileno(), getsize(db_path), access=mmap.ACCESS_READ) - return (f, m) + :return: memory map of that file to allow random read access + :raise BadObject: if object could not be located""" + db_path = self.db_path(self.object_path(to_hex_sha(sha))) + try: + fd = os.open(db_path, os.O_RDONLY|self._fd_open_flags) + except OSError,e: + if e.errno != ENOENT: + # try again without noatime + try: + fd = os.open(db_path, os.O_RDONLY) + except OSError: + raise BadObject(to_hex_sha(sha)) + # didn't work because of our flag, don't try it again + self._fd_open_flags = 0 + else: + raise BadObject(to_hex_sha(sha)) + # END handle error + # END exception handling + try: + return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) + finally: + os.close(fd) + # END assure file is closed def object_info(self, sha): - f, m = self._map_object(sha) + m = self._map_loose_object(sha) try: - type, size = self._object_header_info(m) + return loose_object_header_info(m) finally: - f.close() m.close() # END assure release of system resources def object(self, sha): - f, m = self._map_object(sha) - type, size = self._object_header_info(m) - # TODO: init a dynamic decompress stream from our memory map + m = self._map_loose_object(sha) + reader = DecompressMemMapReader(m, close_on_deletion = True) + type, size = reader.initialize() + return type, size, reader def has_object(self, sha): try: @@ -210,25 +237,10 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): # open a tmp file to write the data to fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) writer = FDCompressedSha1Writer(fd) - - # WRITE HEADER: type SP size NULL - writer.write("%s %i%s" % (type, size, chr(0))) - - # WRITE ALL DATA - chunksize = self.stream_chunk_size + try: - try: - while True: - data_len = writer.write(stream.read(chunksize)) - if data_len < chunksize: - # WRITE FOOTER - writer.write('\n') - break - # END check for stream end - # END duplicate data - finally: - writer.close() - # END assure file was closed + write_object(type, size, stream, writer, + close_target_stream=True, chunk_size=self.stream_chunk_size) except: os.remove(tmp_path) raise diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py new file mode 100644 index 00000000..ee7144dd --- /dev/null +++ b/lib/git/odb/fun.py @@ -0,0 +1,114 @@ +"""Contains basic c-functions which usually contain performance critical code +Keeping this code separate from the beginning makes it easier to out-source +it into c later, if required""" + +from git.errors import ( + BadObjectType + ) + +import zlib +decompressobj = zlib.decompressobj + + +# INVARIANTS +type_id_to_type_map = { + 1 : "commit", + 2 : "tree", + 3 : "blob", + 4 : "tag" + } + +# used when dealing with larger streams +chunk_size = 1000*1000 + + +#{ Routines + +def is_loose_object(m): + """:return: True the file contained in memory map m appears to be a loose object. 
+ Only the first two bytes are needed""" + b0, b1 = map(ord, m[:2]) + word = (b0 << 8) + b1 + return b0 == 0x78 and (word % 31) == 0 + +def loose_object_header_info(m): + """:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the + object as well as its uncompressed size in bytes. + :param m: memory map from which to read the compressed object data""" + decompress_size = 8192 # is used in cgit as well + hdr = decompressobj().decompress(m, decompress_size) + type_name, size = hdr[:hdr.find("\0")].split(" ") + return type_name, int(size) + +def object_header_info(m): + """:return: tuple(type_string, uncompressed_size_in_bytes + :param mmap: mapped memory map. It will be + seeked to the actual start of the object contents, which can be used + to initialize a zlib decompress object. + :note: This routine can only handle new-style objects which are assumably contained + in packs + """ + assert not is_loose_object(m), "Use loose_object_header_info instead" + + c = b0 # first byte + i = 1 # next char to read + type_id = (c >> 4) & 7 # numeric type + size = c & 15 # starting size + s = 4 # starting bit-shift size + while c & 0x80: + c = ord(m[i]) + i += 1 + size += (c & 0x7f) << s + s += 7 + # END character loop + + # finally seek the map to the start of the data stream + m.seek(i) + try: + return (type_id_to_type_map[type_id], size) + except KeyError: + # invalid object type - we could try to be smart now and decode part + # of the stream to get the info, problem is that we had trouble finding + # the exact start of the content stream + raise BadObjectType(type_id) + # END handle exceptions + +def write_object(type, size, source_stream, target_stream, close_target_stream=True, + chunk_size=chunk_size): + """Write the object as identified by type, size and source_stream into the + target_stream + + :param type: type string of the object + :param size: amount of bytes to write from source_stream + :param source_stream: stream as file-like object providing at least size bytes + :param target_stream: stream as file-like object to receive the data + :param close_target_stream: if True, the target stream will be closed when + the routine exits, even if an error is thrown + :param chunk_size: size of chunks to read from source. 
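
A side note on object_header_info() above: as shown in the hunk it reads its first byte from a name (b0) that is not defined inside the function, so a working version has to fetch that byte from the map explicitly. A self-contained sketch of the same variable-length pack-object header decoding, with a hypothetical helper name and the first byte read from the buffer::

    def decode_pack_object_header(buf):
        """Return (type_id, size, header_length) decoded from a packed object"""
        c = ord(buf[0])            # the byte the hunk leaves in the undefined 'b0'
        type_id = (c >> 4) & 7     # bits 4-6 carry the numeric object type
        size = c & 15              # the low nibble starts the size
        i, shift = 1, 4
        while c & 0x80:            # high bit set: size continues in the next byte
            c = ord(buf[i])
            size += (c & 0x7f) << shift
            i += 1
            shift += 7
        return type_id, size, i
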
Larger values can be beneficial + for io performance, but cost more memory as well + :return: The actual amount of bytes written to stream, which includes the header and a trailing newline""" + tbw = 0 # total num bytes written + dbw = 0 # num data bytes written + try: + # WRITE HEADER: type SP size NULL + tbw += target_stream.write("%s %i\0" % (type, size)) + + # WRITE ALL DATA UP TO SIZE + while True: + cs = min(chunk_size, size-dbw) + data_len = target_stream.write(source_stream.read(cs)) + dbw += data_len + if data_len < cs or dbw == size: + tbw += dbw + break + # END check for stream end + # END duplicate data + return tbw + finally: + if close_target_stream: + target_stream.close() + # END handle stream closing + # END assure file was closed + + +#} END routines diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index d88dca1a..8a054201 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -1,7 +1,10 @@ import binascii import os import zlib +from cStringIO import StringIO from git.utils import make_sha +import errno +from fun import chunk_size __all__ = ('FDSha1Writer', ) @@ -21,8 +24,10 @@ def to_bin_sha(sha): return sha return hex_to_bin(sha) +# errors +ENOENT = errno.ENOENT + # os shortcuts -getsize = os.path.getsize exists = os.path.exists mkdir = os.mkdir isdir = os.path.isdir @@ -32,6 +37,11 @@ join = os.path.join read = os.read write = os.write close = os.close + +# ZLIB configuration +# used when compressing objects +Z_BEST_SPEED = 1 + #} END Routines @@ -50,7 +60,7 @@ class FDCompressedSha1Writer(object): def __init__(self, fd): self.fd = fd self.sha1 = make_sha("") - self.zip = zlib.compressobj() + self.zip = zlib.compressobj(Z_BEST_SPEED) def write(self, data): """:raise IOError: If not all bytes could be written @@ -76,4 +86,137 @@ class FDCompressedSha1Writer(object): return close(self.fd) +class DecompressMemMapReader(object): + """Reads data in chunks from a memory map and decompresses it. The client sees + only the uncompressed data, respective file-like read calls are handling on-demand + buffered decompression accordingly + + A constraint on the total size of bytes is activated, simulating + a logical file within a possibly larger physical memory area + + To read efficiently, you clearly don't want to read individual bytes, instead, + read a few kilobytes at least. + + :note: The chunk-size should be carefully selected as it will involve quite a bit + of string copying due to the way the zlib is implemented. Its very wasteful, + hence we try to find a good tradeoff between allocation time and number of + times we actually allocate. An own zlib implementation would be good here + to better support streamed reading - it would only need to keep the mmap + and decompress it into chunks, thats all ... """ + __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_cs', '_close') + + def __init__(self, m, close_on_deletion, cs = 128*1024): + """Initialize with mmap and chunk_size for stream reading""" + self._m = m + self._zip = zlib.decompressobj() + self._buf = None # buffer of decompressed bytes + self._buflen = 0 # length of bytes in buffer + self._s = 0 # size of uncompressed data to read in total + self._br = 0 # num uncompressed bytes read + self._cws = 0 # start byte of compression window + self._cwe = 0 # end byte of compression window + self._cs = cs # chunk size (when reading from zip) + self._close = close_on_deletion # close the memmap on deletion ? 
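
For orientation, a minimal, self-contained sketch of what loose_object_header_info() and this reader cooperate to do: map a loose object file and decompress only enough of it to learn type and size before any content is streamed. The helper name and path handling are illustrative, not part of the patch::

    import mmap, os, zlib

    def peek_loose_object(path):
        """Return (type_string, size_in_bytes) of the loose object file at path"""
        fd = os.open(path, os.O_RDONLY)
        try:
            m = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
        finally:
            os.close(fd)    # the mapping remains valid after closing the fd
        try:
            # a few kilobytes comfortably cover the "<type> <size>\0" header
            hdr = zlib.decompressobj().decompress(m[:8192], 8192)
            type_name, size = hdr[:hdr.find("\0")].split(" ")
            return type_name, int(size)
        finally:
            m.close()
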
+ + def __del__(self): + if self._close: + self._m.close() + # END handle resource freeing + + def initialize(self, size=0): + """Initialize this instance for acting as a read-only stream for size bytes. + :param size: size in bytes to be decompresed before being depleted. + If 0, default object header information is parsed from the data, + returning a tuple of (type_string, uncompressed_size) + If not 0, the size will be used, and None is returned. + :note: must only be called exactly once""" + if size: + self._s = size + return + # END handle size + + # read header + maxb = 8192 + self._s = maxb + hdr = self.read(maxb) + hdrend = hdr.find("\0") + type, size = hdr[:hdrend].split(" ") + self._s = int(size) + + # adjust internal state to match actual header length that we ignore + # The buffer will be depleted first on future reads + self._br = 0 + hdrend += 1 # count terminating \0 + self._buf = StringIO(hdr[hdrend:]) + self._buflen = len(hdr) - hdrend + + return type, size + + def read(self, size=-1): + if size < 1: + size = self._s - self._br + else: + size = min(size, self._s - self._br) + # END clamp size + + if size == 0: + return str() + # END handle depletion + + # deplete the buffer, then just continue using the decompress object + # which has an own buffer. We just need this to transparently parse the + # header from the zlib stream + dat = str() + if self._buf: + if self._buflen >= size: + # have enough data + dat = self._buf.read(size) + self._buflen -= size + self._br += size + return dat + else: + dat = self._buf.getvalue() # ouch, duplicates data + size -= self._buflen + self._br += self._buflen + + self._buflen = 0 + self._buf = None + # END handle buffer len + # END handle buffer + + # decompress some data + # Abstract: zlib needs to operate on chunks of our memory map ( which may + # be large ), as it will otherwise and always fill in the 'unconsumed_tail' + # attribute which possible reads our whole map to the end, forcing + # everything to be read from disk even though just a portion was requested. + # As this would be a nogo, we workaround it by passing only chunks of data, + # moving the window into the memory map along as we decompress, which keeps + # the tail smaller than our chunk-size. This causes 'only' the chunk to be + # copied once, and another copy of a part of it when it creates the unconsumed + # tail. We have to use it to hand in the appropriate amount of bytes durin g + # the next read. + if self._zip.unconsumed_tail: + # move the window, make it as large as size demands. For code-clarity, + # we just take the chunk from our map again instead of reusing the unconsumed + # tail. The latter one would safe some memory copying, but we could end up + # with not getting enough data uncompressed, so we had to sort that out as well. + # Now we just assume the worst case, hence the data is uncompressed and the window + # needs to be as large as the uncompressed bytes we want to read. + self._cws = self._cwe - len(self._zip.unconsumed_tail) + self._cwe = self._cws + size + indata = self._m[self._cws:self._cwe] # another copy ... :( + else: + cws = self._cws + self._cws = self._cwe + self._cwe = cws + size + indata = self._m[self._cws:self._cwe] # ... 
copy it again :( + # END handle tail + + dcompdat = self._zip.decompress(indata, size) + self._br += len(dcompdat) + + if dat: + return dat + dcompdat + return dcompdat + #} END classes -- cgit v1.2.1 From 26e138cb47dccc859ff219f108ce9b7d96cbcbcd Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 18:21:05 +0200 Subject: odb: fixed streamed decompression reader ( specific tests would still be missing ) and added performance tests which are extremely promising --- lib/git/odb/db.py | 4 ++-- lib/git/odb/utils.py | 22 ++++++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 5c50a512..e656b2b5 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -91,8 +91,8 @@ class iObjectDBW(object): they where given. :param iter_info: iterable yielding tuples containing the type_string size_in_bytes and the steam with the content data. - :param dry_run: see ``to_obj`` - :param sha_as_hex: see ``to_obj`` + :param dry_run: see ``to_object`` + :param sha_as_hex: see ``to_object`` :param max_threads: if < 1, any number of threads may be started while processing the request, otherwise the given number of threads will be started. :raise IOError: if data could not be written""" diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 8a054201..1e4a8e9d 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -39,7 +39,7 @@ write = os.write close = os.close # ZLIB configuration -# used when compressing objects +# used when compressing objects - 1 to 9 ( slowest ) Z_BEST_SPEED = 1 #} END Routines @@ -70,7 +70,7 @@ class FDCompressedSha1Writer(object): bytes_written = write(self.fd, cdata) if bytes_written != len(cdata): raise self.exc - return bytes_written + return len(data) def sha(self, as_hex = False): """:return: sha so far @@ -175,7 +175,7 @@ class DecompressMemMapReader(object): self._br += size return dat else: - dat = self._buf.getvalue() # ouch, duplicates data + dat = self._buf.read() # ouch, duplicates data size -= self._buflen self._br += self._buflen @@ -195,28 +195,34 @@ class DecompressMemMapReader(object): # copied once, and another copy of a part of it when it creates the unconsumed # tail. We have to use it to hand in the appropriate amount of bytes durin g # the next read. - if self._zip.unconsumed_tail: + tail = self._zip.unconsumed_tail + if tail: # move the window, make it as large as size demands. For code-clarity, # we just take the chunk from our map again instead of reusing the unconsumed # tail. The latter one would safe some memory copying, but we could end up # with not getting enough data uncompressed, so we had to sort that out as well. # Now we just assume the worst case, hence the data is uncompressed and the window # needs to be as large as the uncompressed bytes we want to read. - self._cws = self._cwe - len(self._zip.unconsumed_tail) + self._cws = self._cwe - len(tail) self._cwe = self._cws + size + + indata = self._m[self._cws:self._cwe] # another copy ... :( + # get the actual window end to be sure we don't use it for computations + self._cwe = self._cws + len(indata) else: cws = self._cws self._cws = self._cwe self._cwe = cws + size indata = self._m[self._cws:self._cwe] # ... 
copy it again :( # END handle tail - + dcompdat = self._zip.decompress(indata, size) - self._br += len(dcompdat) + self._br += len(dcompdat) if dat: - return dat + dcompdat + dcompdat = dat + dcompdat + return dcompdat #} END classes -- cgit v1.2.1 From 4b4a514e51fbc7dc6ddcb27c188159d57b5d1fa9 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 19:04:18 +0200 Subject: Added performance comparison to cgit ... and yes, git-python is faster :) --- lib/git/odb/utils.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 1e4a8e9d..94d1cea8 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -103,10 +103,12 @@ class DecompressMemMapReader(object): times we actually allocate. An own zlib implementation would be good here to better support streamed reading - it would only need to keep the mmap and decompress it into chunks, thats all ... """ - __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_cs', '_close') + __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') - def __init__(self, m, close_on_deletion, cs = 128*1024): - """Initialize with mmap and chunk_size for stream reading""" + max_read_size = 512*1024 + + def __init__(self, m, close_on_deletion): + """Initialize with mmap for stream reading""" self._m = m self._zip = zlib.decompressobj() self._buf = None # buffer of decompressed bytes @@ -115,7 +117,6 @@ class DecompressMemMapReader(object): self._br = 0 # num uncompressed bytes read self._cws = 0 # start byte of compression window self._cwe = 0 # end byte of compression window - self._cs = cs # chunk size (when reading from zip) self._close = close_on_deletion # close the memmap on deletion ? def __del__(self): @@ -163,6 +164,28 @@ class DecompressMemMapReader(object): return str() # END handle depletion + # protect from memory peaks + # If he tries to read large chunks, our memory patterns get really bad + # as we end up copying a possibly huge chunk from our memory map right into + # memory. This might not even be possible. Nonetheless, try to dampen the + # effect a bit by reading in chunks, returning a huge string in the end. + # Our performance now depends on StringIO. This way we don't need two large + # buffers in peak times, but only one large one in the end which is + # the return buffer + if size > self.max_read_size: + sio = StringIO() + while size: + read_size = min(self.max_read_size, size) + data = self.read(read_size) + sio.write(data) + size -= len(data) + if len(data) < read_size: + break + # END data loop + sio.seek(0) + return sio.getvalue() + # END handle maxread + # deplete the buffer, then just continue using the decompress object # which has an own buffer. We just need this to transparently parse the # header from the zlib stream -- cgit v1.2.1 From 1e2b46138ba58033738a24dadccc265748fce2ca Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 23:20:34 +0200 Subject: commit.create_from_tree now uses pure python implementation, fixed message parsing which truncated newlines although it was ilegitimate. 
Its up to the reader to truncate therse, nowhere in the git code I could find anyone adding newlines to commits where it is written Added performance tests for serialization, it does about 5k commits per second if writing to tmpfs --- lib/git/cmd.py | 7 +------ lib/git/objects/base.py | 4 ++-- lib/git/objects/commit.py | 42 +++++++++++++++++++++++++++++------------- lib/git/objects/utils.py | 25 ++++++++++++++++++------- lib/git/odb/utils.py | 32 ++++++++++++++++++-------------- lib/git/repo.py | 43 +++++++++++++++++++++---------------------- lib/git/utils.py | 15 +++++++++++++++ 7 files changed, 104 insertions(+), 64 deletions(-) (limited to 'lib') diff --git a/lib/git/cmd.py b/lib/git/cmd.py index aaa27adc..18d1c505 100644 --- a/lib/git/cmd.py +++ b/lib/git/cmd.py @@ -323,12 +323,7 @@ class Git(object): stdout_value = proc.stdout.read().rstrip() # strip trailing "\n" else: max_chunk_size = 1024*64 - while True: - chunk = proc.stdout.read(max_chunk_size) - output_stream.write(chunk) - if len(chunk) < max_chunk_size: - break - # END reading output stream + stream_copy(proc.stdout, output_stream, max_chunk_size) stdout_value = output_stream # END stdout handling stderr_value = proc.stderr.read().rstrip() # strip trailing "\n" diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 64a5678e..f7043199 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -125,8 +125,8 @@ class Object(LazyMixin): Returns File Object compatible stream to the uncompressed raw data of the object """ - sha, type, size, stream = self.repo.git.stream_object_data(self.sha) - return stream + proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) + return utils.ProcessStreamAdapter(proc, "stdout") def stream_data(self, ostream): """ diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 98aca360..d56ce306 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -91,15 +91,6 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri """ super(Commit,self).__init__(repo, sha) self._set_self_from_args_(locals()) - - if parents is not None: - cls = type(self) - self.parents = tuple(cls(repo, p) for p in parents if not isinstance(p, cls)) - # END for each parent to convert - - if self.sha and tree is not None: - self.tree = Tree(repo, tree, path='') - # END id to tree conversion @classmethod def _get_intermediate_items(cls, commit): @@ -350,7 +341,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri committer, committer_time, committer_offset, message, parent_commits, conf_encoding) - # serialize ! 
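
The replacement that follows serializes the commit in git's canonical text form before handing it to the object database; for a hypothetical commit (all shas, names and dates are placeholders) the stream handed to repo.odb would contain::

    tree 1234567890abcdef1234567890abcdef12345678
    parent 89abcdef1234567890abcdef1234567890abcdef
    author A U Thor <author@example.com> 1275602434 +0200
    committer C O Mitter <committer@example.com> 1275602434 +0200

    the commit message, stored exactly as given
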
+ stream = StringIO() + new_commit._serialize(stream) + streamlen = stream.tell() + stream.seek(0) + + new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True) if head: try: @@ -377,8 +373,28 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri #{ Serializable Implementation def _serialize(self, stream): - # for now, this is very inefficient and in fact shouldn't be used like this - return super(Commit, self)._serialize(stream) + write = stream.write + write("tree %s\n" % self.tree) + for p in self.parents: + write("parent %s\n" % p) + + a = self.author + c = self.committer + fmt = "%s %s <%s> %s %s\n" + write(fmt % ("author", a.name, a.email, + self.authored_date, + utils.altz_to_utctz_str(self.author_tz_offset))) + + write(fmt % ("committer", c.name, c.email, + self.committed_date, + utils.altz_to_utctz_str(self.committer_tz_offset))) + + if self.encoding != self.default_encoding: + write("encoding %s\n" % self.encoding) + + write("\n") + write(self.message) + return self def _deserialize(self, stream): """:param from_rev_list: if true, the stream format is coming from the rev-list command @@ -416,7 +432,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri # a stream from our data simply gives us the plain message # The end of our message stream is marked with a newline that we strip - self.message = stream.read()[:-1] + self.message = stream.read() return self #} END serializable implementation diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 6d378a72..c93f2091 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -16,7 +16,8 @@ import time import os __all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', - 'ProcessStreamAdapter', 'Traversable') + 'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz', + 'verify_utctz') def get_object_type_by_name(object_type_name): """ @@ -57,14 +58,24 @@ def get_user_id(): return "%s@%s" % (username, platform.node()) -def _utc_tz_to_altz(utctz): +def utctz_to_altz(utctz): """we convert utctz to the timezone in seconds, it is the format time.altzone returns. Git stores it as UTC timezon which has the opposite sign as well, which explains the -1 * ( that was made explicit here ) :param utctz: git utc timezone string, i.e. 
+0200""" return -1 * int(float(utctz)/100*3600) + +def altz_to_utctz_str(altz): + """As above, but inverses the operation, returning a string that can be used + in commit objects""" + utci = -1 * int((altz / 3600)*100) + utcs = str(abs(utci)) + utcs = "0"*(4-len(utcs)) + utcs + prefix = (utci < 0 and '-') or '+' + return prefix + utcs + -def _verify_utctz(offset): +def verify_utctz(offset): """:raise ValueError: if offset is incorrect :return: offset""" fmt_exc = ValueError("Invalid timezone offset format: %s" % offset) @@ -97,11 +108,11 @@ def parse_date(string_date): if string_date.count(' ') == 1 and string_date.rfind(':') == -1: timestamp, offset = string_date.split() timestamp = int(timestamp) - return timestamp, _utc_tz_to_altz(_verify_utctz(offset)) + return timestamp, utctz_to_altz(verify_utctz(offset)) else: offset = "+0000" # local time by default if string_date[-5] in '-+': - offset = _verify_utctz(string_date[-5:]) + offset = verify_utctz(string_date[-5:]) string_date = string_date[:-6] # skip space as well # END split timezone info @@ -139,7 +150,7 @@ def parse_date(string_date): fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec, dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst)) - return int(time.mktime(fstruct)), _utc_tz_to_altz(offset) + return int(time.mktime(fstruct)), utctz_to_altz(offset) except ValueError: continue # END exception handling @@ -167,7 +178,7 @@ def parse_actor_and_date(line): """ m = _re_actor_epoch.search(line) actor, epoch, offset = m.groups() - return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset)) + return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset)) diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 94d1cea8..fd340962 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -137,7 +137,7 @@ class DecompressMemMapReader(object): # END handle size # read header - maxb = 8192 + maxb = 512 # should really be enough, cgit uses 8192 I believe self._s = maxb hdr = self.read(maxb) hdrend = hdr.find("\0") @@ -172,20 +172,24 @@ class DecompressMemMapReader(object): # Our performance now depends on StringIO. This way we don't need two large # buffers in peak times, but only one large one in the end which is # the return buffer - if size > self.max_read_size: - sio = StringIO() - while size: - read_size = min(self.max_read_size, size) - data = self.read(read_size) - sio.write(data) - size -= len(data) - if len(data) < read_size: - break - # END data loop - sio.seek(0) - return sio.getvalue() - # END handle maxread + # NO: We don't do it - if the user thinks its best, he is right. If he + # has trouble, he will start reading in chunks. According to our tests + # its still faster if we read 10 Mb at once instead of chunking it. + # if size > self.max_read_size: + # sio = StringIO() + # while size: + # read_size = min(self.max_read_size, size) + # data = self.read(read_size) + # sio.write(data) + # size -= len(data) + # if len(data) < read_size: + # break + # # END data loop + # sio.seek(0) + # return sio.getvalue() + # # END handle maxread + # # deplete the buffer, then just continue using the decompress object # which has an own buffer. 
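
Stepping back to the timezone helpers added to objects/utils.py above, a quick round-trip makes the sign convention concrete (values are examples, module path as in this patch)::

    from git.objects.utils import utctz_to_altz, altz_to_utctz_str

    # git writes "+0200" for two hours east of UTC, while time.altzone counts
    # seconds west of UTC - hence the sign flip in both directions
    assert utctz_to_altz("+0200") == -7200
    assert altz_to_utctz_str(-7200) == "+0200"
    assert utctz_to_altz("-0500") == 18000
    assert altz_to_utctz_str(18000) == "-0500"
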
We just need this to transparently parse the # header from the zlib stream diff --git a/lib/git/repo.py b/lib/git/repo.py index f4caa3fb..0bd2249c 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -4,12 +4,6 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php -import os -import sys -import re -import gzip -import StringIO - from errors import InvalidGitRepositoryError, NoSuchPathError from cmd import Git from actor import Actor @@ -19,6 +13,15 @@ from objects import * from config import GitConfigParser from remote import Remote +from odb.db import LooseObjectDB + +import os +import sys +import re +import gzip +import StringIO + + def touch(filename): fp = open(filename, "a") fp.close() @@ -53,7 +56,7 @@ class Repo(object): 'git_dir' is the .git repository directoy, which is always set. """ DAEMON_EXPORT_FILE = 'git-daemon-export-ok' - __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git" ) + __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git", "odb" ) # precompiled regex re_whitespace = re.compile(r'\s+') @@ -65,27 +68,22 @@ class Repo(object): # represents the configuration level of a configuration file config_level = ("system", "global", "repository") - def __init__(self, path=None): - """ - Create a new Repo instance - - ``path`` - is the path to either the root git directory or the bare git repo + def __init__(self, path=None, odbt = LooseObjectDB): + """ Create a new Repo instance - Examples:: + :param path: is the path to either the root git directory or the bare git repo:: repo = Repo("/Users/mtrier/Development/git-python") repo = Repo("/Users/mtrier/Development/git-python.git") repo = Repo("~/Development/git-python.git") repo = Repo("$REPOSITORIES/Development/git-python.git") - - Raises - InvalidGitRepositoryError or NoSuchPathError - - Returns - ``git.Repo`` - """ - + + :param odbt: Object DataBase type - a type which is constructed by providing + the directory containing the database objects, i.e. .git/objects. 
It will + be used to access all object data + :raise InvalidGitRepositoryError: + :raise NoSuchPathError: + :return: git.Repo """ epath = os.path.abspath(os.path.expandvars(os.path.expanduser(path or os.getcwd()))) if not os.path.exists(epath): @@ -130,6 +128,7 @@ class Repo(object): self.working_dir = self._working_tree_dir or self.git_dir self.git = Git(self.working_dir) + self.odb = odbt(os.path.join(self.git_dir, 'objects')) def __eq__(self, rhs): if isinstance(rhs, Repo): diff --git a/lib/git/utils.py b/lib/git/utils.py index 360c77c9..60a7de48 100644 --- a/lib/git/utils.py +++ b/lib/git/utils.py @@ -27,6 +27,21 @@ def make_sha(source=''): sha1 = sha.sha(source) return sha1 +def stream_copy(source, destination, chunk_size=512*1024): + """Copy all data from the source stream into the destination stream in chunks + of size chunk_size + :return: amount of bytes written""" + br = 0 + while True: + chunk = source.read(chunk_size) + destination.write(chunk) + br += len(chunk) + if len(chunk) < chunk_size: + break + # END reading output stream + return br + + def join_path(a, *p): """Join path tokens together similar to os.path.join, but always use '/' instead of possibly '\' on windows.""" -- cgit v1.2.1 From 1906ee4df9ae4e734288c5203cf79894dff76cab Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 23:27:09 +0200 Subject: Fixed compatability issues with python 2.5, made sure all tests run --- lib/git/odb/db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index e656b2b5..c970410d 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -101,7 +101,7 @@ class iObjectDBW(object): # actually use multiple threads, default False of course. If the add shas = list() for args in iter_info: - shas.append(self.to_object(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) + shas.append(self.to_object(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) return shas #} END edit interface @@ -155,7 +155,7 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): # Additional Flags - might be set to 0 after the first failure # Depending on the root, this might work for some mounts, for others not, which # is why it is per instance - self._fd_open_flags = os.O_NOATIME + self._fd_open_flags = getattr(os, 'O_NOATIME', 0) #{ Interface def object_path(self, hexsha): -- cgit v1.2.1 From b01ca6a3e4ae9d944d799743c8ff774e2a7a82b6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 4 Jun 2010 00:09:00 +0200 Subject: db: implemented GitObjectDB using the git command to make sure we can lookup everything. 
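
A quick usage illustration of the stream_copy helper added to git/utils.py above (file names are placeholders)::

    from git.utils import stream_copy

    source = open("/tmp/source.bin", "rb")
    destination = open("/tmp/copy.bin", "wb")
    copied = stream_copy(source, destination, chunk_size=512*1024)  # returns bytes copied
    source.close()
    destination.close()
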
Next is to implement pack-file reading, then alternates which should allow to resolve everything --- lib/git/objects/base.py | 32 +++++++++++++------------------- lib/git/odb/db.py | 22 ++++++++++++++++++++-- lib/git/repo.py | 11 ++++++++--- 3 files changed, 41 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index f7043199..446c4406 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -4,7 +4,7 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php import os -from git.utils import LazyMixin, join_path_native +from git.utils import LazyMixin, join_path_native, stream_copy import utils _assertion_msg_format = "Created object %r whose python type %r disagrees with the acutal git object type %r" @@ -76,10 +76,11 @@ class Object(LazyMixin): Retrieve object information """ if attr == "size": - hexsha, typename, self.size = self.repo.git.get_object_header(self.sha) + typename, self.size = self.repo.odb.object_info(self.sha) assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) elif attr == "data": - hexsha, typename, self.size, self.data = self.repo.git.get_object_data(self.sha) + typename, self.size, stream = self.repo.odb.object(self.sha) + self.data = stream.read() # once we have an own odb, we can delay reading assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) else: super(Object,self)._set_cache_(attr) @@ -121,24 +122,17 @@ class Object(LazyMixin): @property def data_stream(self): - """ - Returns - File Object compatible stream to the uncompressed raw data of the object - """ - proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) - return utils.ProcessStreamAdapter(proc, "stdout") + """ :return: File Object compatible stream to the uncompressed raw data of the object + :note: returned streams must be read in order""" + type, size, stream = self.repo.odb.object(self.sha) + return stream def stream_data(self, ostream): - """ - Writes our data directly to the given output stream - - ``ostream`` - File object compatible stream object. - - Returns - self - """ - self.repo.git.cat_file(self.type, self.sha, output_stream=ostream) + """Writes our data directly to the given output stream + :param ostream: File object compatible stream object. + :return: self""" + type, size, istream = self.repo.odb.object(self.sha) + stream_copy(istream, ostream) return self diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index c970410d..1d1d4c40 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -281,9 +281,27 @@ class ReferenceDB(CompoundDB): """A database consisting of database referred to in a file""" -class GitObjectDB(CompoundDB, iObjectDBW): +#class GitObjectDB(CompoundDB, iObjectDBW): +class GitObjectDB(LooseObjectDB): """A database representing the default git object store, which includes loose objects, pack files and an alternates file - It will create objects only in the loose object database.""" + It will create objects only in the loose object database. 
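
At the API level this commit reroutes all object data access through repo.odb; a hypothetical usage sketch, with method names as of this commit (a later patch in this series renames them to info() and stream()) and a placeholder sha::

    from git.repo import Repo
    from git.odb.db import LooseObjectDB

    repo = Repo("/path/to/repo")                   # GitObjectDB is the default after this commit
    # repo = Repo("/path/to/repo", odbt=LooseObjectDB)  # or: pure-python loose-object access only

    sha40 = "0123456789abcdef0123456789abcdef01234567"   # placeholder - use a real object sha
    typename, size = repo.odb.object_info(sha40)          # header lookup only
    typename, size, stream = repo.odb.object(sha40)       # plus a read()-able content stream
    assert len(stream.read()) == size
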
+ :note: for now, we use the git command to do all the lookup, just until he + have packs and the other implementations + """ + __slots__ = ('_git', ) + def __init__(self, root_path, git): + """Initialize this instance with the root and a git command""" + super(GitObjectDB, self).__init__(root_path) + self._git = git + + def object_info(self, sha): + discard, type, size = self._git.get_object_header(sha) + return type, size + + def object(self, sha): + """For now, all lookup is done by git itself""" + discard, type, size, stream = self._git.stream_object_data(sha) + return type, size, stream diff --git a/lib/git/repo.py b/lib/git/repo.py index 0bd2249c..1afb1eb7 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -13,7 +13,7 @@ from objects import * from config import GitConfigParser from remote import Remote -from odb.db import LooseObjectDB +from odb.db import GitObjectDB import os import sys @@ -68,7 +68,7 @@ class Repo(object): # represents the configuration level of a configuration file config_level = ("system", "global", "repository") - def __init__(self, path=None, odbt = LooseObjectDB): + def __init__(self, path=None, odbt = GitObjectDB): """ Create a new Repo instance :param path: is the path to either the root git directory or the bare git repo:: @@ -128,7 +128,12 @@ class Repo(object): self.working_dir = self._working_tree_dir or self.git_dir self.git = Git(self.working_dir) - self.odb = odbt(os.path.join(self.git_dir, 'objects')) + + # special handling, in special times + args = [os.path.join(self.git_dir, 'objects')] + if issubclass(odbt, GitObjectDB): + args.append(self.git) + self.odb = odbt(*args) def __eq__(self, rhs): if isinstance(rhs, Repo): -- cgit v1.2.1 From a1e80445ad5cb6da4c0070d7cb8af89da3b0803b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 4 Jun 2010 14:41:15 +0200 Subject: initial version of new odb design to facilitate a channel based multi-threading implementation of all odb functions --- lib/git/objects/base.py | 8 +- lib/git/objects/commit.py | 2 +- lib/git/odb/db.py | 114 +++++++++----- lib/git/odb/stream.py | 388 ++++++++++++++++++++++++++++++++++++++++++++++ lib/git/odb/utils.py | 215 ------------------------- 5 files changed, 465 insertions(+), 262 deletions(-) create mode 100644 lib/git/odb/stream.py (limited to 'lib') diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 446c4406..76384888 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -76,10 +76,10 @@ class Object(LazyMixin): Retrieve object information """ if attr == "size": - typename, self.size = self.repo.odb.object_info(self.sha) + typename, self.size = self.repo.odb.info(self.sha) assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) elif attr == "data": - typename, self.size, stream = self.repo.odb.object(self.sha) + typename, self.size, stream = self.repo.odb.stream(self.sha) self.data = stream.read() # once we have an own odb, we can delay reading assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) else: @@ -124,14 +124,14 @@ class Object(LazyMixin): def data_stream(self): """ :return: File Object compatible stream to the uncompressed raw data of the object :note: returned streams must be read in order""" - type, size, stream = self.repo.odb.object(self.sha) + type, size, stream = self.repo.odb.stream(self.sha) return stream def stream_data(self, ostream): """Writes our data directly to the given output stream :param ostream: File object compatible stream object. 
:return: self""" - type, size, istream = self.repo.odb.object(self.sha) + type, size, istream = self.repo.odb.stream(self.sha) stream_copy(istream, ostream) return self diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index d56ce306..dbc0cf27 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -346,7 +346,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri streamlen = stream.tell() stream.seek(0) - new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True) + new_commit.sha = repo.odb.store(cls.type, streamlen, stream, sha_as_hex=True) if head: try: diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 1d1d4c40..7ae8f446 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -6,9 +6,12 @@ from git.errors import ( BadObjectType ) -from utils import ( +from stream import ( DecompressMemMapReader, - FDCompressedSha1Writer, + FDCompressedSha1Writer + ) + +from utils import ( ENOENT, to_hex_sha, exists, @@ -31,7 +34,7 @@ import mmap import os -class iObjectDBR(object): +class ObjectDBR(object): """Defines an interface for object database lookup. Objects are identified either by hex-sha (40 bytes) or by sha (20 bytes)""" @@ -48,62 +51,87 @@ class iObjectDBR(object): :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") - def object(self, sha): - """ - :return: tuple(type_string, size_in_bytes, stream) a tuple with object - information including its type, its size as well as a stream from which its - contents can be read + def info(self, sha): + """ :return: ODB_Info instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") - def object_info(self, sha): - """ - :return: tuple(type_string, size_in_bytes) tuple with the object's type - string as well as its size in bytes + def info_async(self, input_channel): + """Retrieve information of a multitude of objects asynchronously + :param input_channel: Channel yielding the sha's of the objects of interest + :return: Channel yielding ODB_Info|InvalidODB_Info, in any order""" + raise NotImplementedError("To be implemented in subclass") + + def stream(self, sha): + """:return: ODB_OStream instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") + + def stream_async(self, input_channel): + """Retrieve the ODB_OStream of multiple objects + :param input_channel: see ``info`` + :param max_threads: see ``ObjectDBW.store`` + :return: Channel yielding ODB_OStream|InvalidODB_OStream instances in any order""" + raise NotImplementedError("To be implemented in subclass") #} END query interface -class iObjectDBW(object): +class ObjectDBW(object): """Defines an interface to create objects in the database""" - __slots__ = tuple() + __slots__ = "_ostream" + + def __init__(self, *args, **kwargs): + self._ostream = None #{ Edit Interface + def set_ostream(self, stream): + """Adjusts the stream to which all data should be sent when storing new objects + :param stream: if not None, the stream to use, if None the default stream + will be used. 
+ :return: previously installed stream, or None if there was no override + :raise TypeError: if the stream doesn't have the supported functionality""" + cstream = self._ostream + self._ostream = stream + return cstream + + def ostream(self): + """:return: overridden output stream this instance will write to, or None + if it will write to the default stream""" + return self._ostream - def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): + def store(self, istream): """Create a new object in the database - :return: the sha identifying the object in the database - :param type: type string identifying the object - :param size: size of the data to read from stream - :param stream: stream providing the data - :param dry_run: if True, the object database will not actually be changed - :param sha_as_hex: if True, the returned sha identifying the object will be - hex encoded, not binary + :return: the input istream object with its sha set to its corresponding value + :param istream: ODB_IStream compatible instance. If its sha is already set + to a value, the object will just be stored in the our database format, + in which case the input stream is expected to be in object format ( header + contents ). :raise IOError: if data could not be written""" raise NotImplementedError("To be implemented in subclass") - def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): - """Create multiple new objects in the database - :return: sequence of shas identifying the created objects in the order in which - they where given. - :param iter_info: iterable yielding tuples containing the type_string - size_in_bytes and the steam with the content data. - :param dry_run: see ``to_object`` - :param sha_as_hex: see ``to_object`` - :param max_threads: if < 1, any number of threads may be started while processing - the request, otherwise the given number of threads will be started. - :raise IOError: if data could not be written""" + def store_async(self, input_channel): + """Create multiple new objects in the database asynchronously. The method will + return right away, returning an output channel which receives the results as + they are computed. + + :return: Channel yielding your ODB_IStream which served as input, in any order. + The IStreams sha will be set to the sha it received during the process, + or its error attribute will be set to the exception informing about the error. + :param input_channel: Channel yielding ODB_IStream instance. + As the same instances will be used in the output channel, you can create a map + between the id(istream) -> istream + :note:As some ODB implementations implement this operation as atomic, they might + abort the whole operation if one item could not be processed. Hence check how + many items have actually been produced.""" # a trivial implementation, ignoring the threads for now # TODO: add configuration to the class to determine whether we may # actually use multiple threads, default False of course. 
If the add shas = list() for args in iter_info: - shas.append(self.to_object(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) + shas.append(self.store(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) return shas - + #} END edit interface @@ -118,6 +146,7 @@ class FileDBBase(object): :raise InvalidDBRoot: :note: The base will perform basic checking for accessability, but the subclass is required to verify that the root_path contains the database structure it needs""" + super(FileDBBase, self).__init__() if not os.path.isdir(root_path): raise InvalidDBRoot(root_path) self._root_path = root_path @@ -141,7 +170,7 @@ class FileDBBase(object): #} END utilities -class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): +class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): """A database which operates on loose object files""" __slots__ = ('_hexsha_to_file', '_fd_open_flags') # CONFIGURATION @@ -210,7 +239,7 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): os.close(fd) # END assure file is closed - def object_info(self, sha): + def info(self, sha): m = self._map_loose_object(sha) try: return loose_object_header_info(m) @@ -233,8 +262,9 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): return False # END check existance - def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): + def store(self, istream): # open a tmp file to write the data to + # todo: implement ostream properly fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) writer = FDCompressedSha1Writer(fd) @@ -269,11 +299,11 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): return sha -class PackedDB(FileDBBase, iObjectDBR): +class PackedDB(FileDBBase, ObjectDBR): """A database operating on a set of object packs""" -class CompoundDB(iObjectDBR): +class CompoundDB(ObjectDBR): """A database which delegates calls to sub-databases""" @@ -281,7 +311,7 @@ class ReferenceDB(CompoundDB): """A database consisting of database referred to in a file""" -#class GitObjectDB(CompoundDB, iObjectDBW): +#class GitObjectDB(CompoundDB, ObjectDBW): class GitObjectDB(LooseObjectDB): """A database representing the default git object store, which includes loose objects, pack files and an alternates file @@ -296,7 +326,7 @@ class GitObjectDB(LooseObjectDB): super(GitObjectDB, self).__init__(root_path) self._git = git - def object_info(self, sha): + def info(self, sha): discard, type, size = self._git.get_object_header(sha) return type, size diff --git a/lib/git/odb/stream.py b/lib/git/odb/stream.py new file mode 100644 index 00000000..325c1444 --- /dev/null +++ b/lib/git/odb/stream.py @@ -0,0 +1,388 @@ +import zlib +from cStringIO import StringIO +from git.utils import make_sha +import errno + +from utils import ( + to_hex_sha, + to_bin_sha + ) + +__all__ = ('FDCompressedSha1Writer', 'DecompressMemMapReader') + + +# ZLIB configuration +# used when compressing objects - 1 to 9 ( slowest ) +Z_BEST_SPEED = 1 + + +#{ ODB Bases + +class ODB_Info(tuple): + """Carries information about an object in an ODB, provdiing information + about the sha of the object, the type_string as well as the uncompressed size + in bytes. 
+ + It can be accessed using tuple notation and using attribute access notation:: + + assert dbi[0] == dbi.sha + assert dbi[1] == dbi.type + assert dbi[2] == dbi.size + + The type is designed to be as lighteight as possible.""" + __slots__ = tuple() + + def __new__(cls, sha, type, size): + return tuple.__new__(cls, (sha, type, size)) + + def __init__(self, sha, type, size): + pass + + #{ Interface + @property + def sha(self): + return self[0] + + @property + def type(self): + return self[1] + + @property + def size(self): + return self[2] + #} END interface + + +class ODB_OStream(ODB_Info): + """Base for object streams retrieved from the database, providing additional + information about the stream. + Generally, ODB streams are read-only as objects are immutable""" + __slots__ = tuple() + + def __new__(cls, sha, type, size, *args, **kwargs): + """Helps with the initialization of subclasses""" + return tuple.__new__(cls, (sha, type, size)) + + def is_compressed(self): + """:return: True if reads of this stream yield zlib compressed data. + :note: this does not imply anything about the actual internal storage. + Hence the data could be uncompressed, but read compressed, or vice versa""" + raise NotImplementedError("To be implemented by subclass") + + +class ODB_IStream(list): + """Represents an input content stream to be fed into the ODB. It is mutable to allow + the ODB to record information about the operations outcome right in this instance. + + It provides interfaces for the ODB_OStream and a StreamReader to allow the instance + to blend in without prior conversion. + + The only method your content stream must support is 'read'""" + __slots__ = tuple() + + def __new__(cls, type, size, stream, sha=None, compressed=False): + list.__new__(cls, (sha, type, size, stream, compressed, None)) + + def __init__(cls, type, size, stream, sha=None, compressed=None): + pass + + #{ Interface + + def hexsha(self): + """:return: our sha, hex encoded, 40 bytes""" + return to_hex_sha(self[0]) + + def binsha(self): + """:return: our sha as binary, 20 bytes""" + return to_bin_sha(self[0]) + + def _error(self): + """:return: the error that occurred when processing the stream, or None""" + return self[5] + + def _set_error(self, exc): + """Set this input stream to the given exc, may be None to reset the error""" + self[5] = exc + + error = property(_error, _set_error) + + #} END interface + + #{ Stream Reader Interface + + def read(self, size=-1): + """Implements a simple stream reader interface, passing the read call on + to our internal stream""" + return self[3].read(size) + + #} END stream reader interface + + #{ interface + + def _set_sha(self, sha): + self[0] = sha + + def _sha(self): + return self[0] + + sha = property(_sha, _set_sha) + + @property + def type(self): + return self[1] + + @property + def size(self): + return self[2] + + #} END odb info interface + + #{ ODB_OStream interface + + def is_compressed(self): + return self[4] + + #} END ODB_OStream interface + + +class InvalidODB_Info(tuple): + """Carries information about a sha identifying an object which is invalid in + the queried database. 
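
One caveat on the ODB_IStream sketch above: list.__new__ does not populate the list and its return value is dropped, so the constructor as written yields None. A minimal working variant of the same mutable-record idea (class name is illustrative, not the patch's final form)::

    class IStreamRecord(list):
        """Mutable record [sha, type, size, stream, compressed, error] with a read() facade"""
        def __new__(cls, type, size, stream, sha=None, compressed=False):
            return list.__new__(cls)

        def __init__(self, type, size, stream, sha=None, compressed=False):
            list.__init__(self, (sha, type, size, stream, compressed, None))

        def read(self, size=-1):
            return self[3].read(size)
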
The exception attribute provides more information about + the cause of the issue""" + __slots__ = tuple() + + def __new__(cls, sha, exc): + return tuple.__new__(cls, (sha, exc)) + + def __init__(self, sha, exc): + pass + + @property + def sha(self): + return self[0] + + @property + def error(self): + """:return: exception instance explaining the failure""" + return self[1] + +class InvalidODB_OStream(InvalidODB_Info): + """Carries information about an invalid ODB stream""" + __slots__ = tuple() + +#} END ODB Bases + + +#{ RO Streams + +class DecompressMemMapReader(ODB_OStream): + """Reads data in chunks from a memory map and decompresses it. The client sees + only the uncompressed data, respective file-like read calls are handling on-demand + buffered decompression accordingly + + A constraint on the total size of bytes is activated, simulating + a logical file within a possibly larger physical memory area + + To read efficiently, you clearly don't want to read individual bytes, instead, + read a few kilobytes at least. + + :note: The chunk-size should be carefully selected as it will involve quite a bit + of string copying due to the way the zlib is implemented. Its very wasteful, + hence we try to find a good tradeoff between allocation time and number of + times we actually allocate. An own zlib implementation would be good here + to better support streamed reading - it would only need to keep the mmap + and decompress it into chunks, thats all ... """ + # __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') + + max_read_size = 512*1024 + + def __init__(self, sha, type, size, m, close_on_deletion): + """Initialize with mmap for stream reading""" + self._m = m + self._zip = zlib.decompressobj() + self._buf = None # buffer of decompressed bytes + self._buflen = 0 # length of bytes in buffer + self._s = 0 # size of uncompressed data to read in total + self._br = 0 # num uncompressed bytes read + self._cws = 0 # start byte of compression window + self._cwe = 0 # end byte of compression window + self._close = close_on_deletion # close the memmap on deletion ? + + def __del__(self): + if self._close: + self._m.close() + # END handle resource freeing + + def initialize(self, size=0): + """Initialize this instance for acting as a read-only stream for size bytes. + :param size: size in bytes to be decompresed before being depleted. + If 0, default object header information is parsed from the data, + returning a tuple of (type_string, uncompressed_size) + If not 0, the size will be used, and None is returned. 
+ :note: must only be called exactly once""" + if size: + self._s = size + return + # END handle size + + # read header + maxb = 512 # should really be enough, cgit uses 8192 I believe + self._s = maxb + hdr = self.read(maxb) + hdrend = hdr.find("\0") + type, size = hdr[:hdrend].split(" ") + self._s = int(size) + + # adjust internal state to match actual header length that we ignore + # The buffer will be depleted first on future reads + self._br = 0 + hdrend += 1 # count terminating \0 + self._buf = StringIO(hdr[hdrend:]) + self._buflen = len(hdr) - hdrend + + return type, size + + def read(self, size=-1): + if size < 1: + size = self._s - self._br + else: + size = min(size, self._s - self._br) + # END clamp size + + if size == 0: + return str() + # END handle depletion + + # protect from memory peaks + # If he tries to read large chunks, our memory patterns get really bad + # as we end up copying a possibly huge chunk from our memory map right into + # memory. This might not even be possible. Nonetheless, try to dampen the + # effect a bit by reading in chunks, returning a huge string in the end. + # Our performance now depends on StringIO. This way we don't need two large + # buffers in peak times, but only one large one in the end which is + # the return buffer + # NO: We don't do it - if the user thinks its best, he is right. If he + # has trouble, he will start reading in chunks. According to our tests + # its still faster if we read 10 Mb at once instead of chunking it. + + # if size > self.max_read_size: + # sio = StringIO() + # while size: + # read_size = min(self.max_read_size, size) + # data = self.read(read_size) + # sio.write(data) + # size -= len(data) + # if len(data) < read_size: + # break + # # END data loop + # sio.seek(0) + # return sio.getvalue() + # # END handle maxread + # + # deplete the buffer, then just continue using the decompress object + # which has an own buffer. We just need this to transparently parse the + # header from the zlib stream + dat = str() + if self._buf: + if self._buflen >= size: + # have enough data + dat = self._buf.read(size) + self._buflen -= size + self._br += size + return dat + else: + dat = self._buf.read() # ouch, duplicates data + size -= self._buflen + self._br += self._buflen + + self._buflen = 0 + self._buf = None + # END handle buffer len + # END handle buffer + + # decompress some data + # Abstract: zlib needs to operate on chunks of our memory map ( which may + # be large ), as it will otherwise and always fill in the 'unconsumed_tail' + # attribute which possible reads our whole map to the end, forcing + # everything to be read from disk even though just a portion was requested. + # As this would be a nogo, we workaround it by passing only chunks of data, + # moving the window into the memory map along as we decompress, which keeps + # the tail smaller than our chunk-size. This causes 'only' the chunk to be + # copied once, and another copy of a part of it when it creates the unconsumed + # tail. We have to use it to hand in the appropriate amount of bytes durin g + # the next read. + tail = self._zip.unconsumed_tail + if tail: + # move the window, make it as large as size demands. For code-clarity, + # we just take the chunk from our map again instead of reusing the unconsumed + # tail. The latter one would safe some memory copying, but we could end up + # with not getting enough data uncompressed, so we had to sort that out as well. 
+ # Now we just assume the worst case, hence the data is uncompressed and the window + # needs to be as large as the uncompressed bytes we want to read. + self._cws = self._cwe - len(tail) + self._cwe = self._cws + size + + + indata = self._m[self._cws:self._cwe] # another copy ... :( + # get the actual window end to be sure we don't use it for computations + self._cwe = self._cws + len(indata) + else: + cws = self._cws + self._cws = self._cwe + self._cwe = cws + size + indata = self._m[self._cws:self._cwe] # ... copy it again :( + # END handle tail + + dcompdat = self._zip.decompress(indata, size) + + self._br += len(dcompdat) + if dat: + dcompdat = dat + dcompdat + + return dcompdat + +#} END RO streams + + +#{ W Streams + +class FDCompressedSha1Writer(object): + """Digests data written to it, making the sha available, then compress the + data and write it to the file descriptor + :note: operates on raw file descriptors + :note: for this to work, you have to use the close-method of this instance""" + __slots__ = ("fd", "sha1", "zip") + + # default exception + exc = IOError("Failed to write all bytes to filedescriptor") + + def __init__(self, fd): + self.fd = fd + self.sha1 = make_sha("") + self.zip = zlib.compressobj(Z_BEST_SPEED) + + def write(self, data): + """:raise IOError: If not all bytes could be written + :return: lenght of incoming data""" + self.sha1.update(data) + cdata = self.zip.compress(data) + bytes_written = write(self.fd, cdata) + if bytes_written != len(cdata): + raise self.exc + return len(data) + + def sha(self, as_hex = False): + """:return: sha so far + :param as_hex: if True, sha will be hex-encoded, binary otherwise""" + if as_hex: + return self.sha1.hexdigest() + return self.sha1.digest() + + def close(self): + remainder = self.zip.flush() + if write(self.fd, remainder) != len(remainder): + raise self.exc + return close(self.fd) + + +#} END W streams diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index fd340962..61565ba9 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -1,10 +1,6 @@ import binascii import os -import zlib -from cStringIO import StringIO -from git.utils import make_sha import errno -from fun import chunk_size __all__ = ('FDSha1Writer', ) @@ -38,218 +34,7 @@ read = os.read write = os.write close = os.close -# ZLIB configuration -# used when compressing objects - 1 to 9 ( slowest ) -Z_BEST_SPEED = 1 #} END Routines -#{ Classes - -class FDCompressedSha1Writer(object): - """Digests data written to it, making the sha available, then compress the - data and write it to the file descriptor - :note: operates on raw file descriptors - :note: for this to work, you have to use the close-method of this instance""" - __slots__ = ("fd", "sha1", "zip") - - # default exception - exc = IOError("Failed to write all bytes to filedescriptor") - - def __init__(self, fd): - self.fd = fd - self.sha1 = make_sha("") - self.zip = zlib.compressobj(Z_BEST_SPEED) - - def write(self, data): - """:raise IOError: If not all bytes could be written - :return: lenght of incoming data""" - self.sha1.update(data) - cdata = self.zip.compress(data) - bytes_written = write(self.fd, cdata) - if bytes_written != len(cdata): - raise self.exc - return len(data) - - def sha(self, as_hex = False): - """:return: sha so far - :param as_hex: if True, sha will be hex-encoded, binary otherwise""" - if as_hex: - return self.sha1.hexdigest() - return self.sha1.digest() - - def close(self): - remainder = self.zip.flush() - if write(self.fd, remainder) != len(remainder): - 
raise self.exc - return close(self.fd) - - -class DecompressMemMapReader(object): - """Reads data in chunks from a memory map and decompresses it. The client sees - only the uncompressed data, respective file-like read calls are handling on-demand - buffered decompression accordingly - - A constraint on the total size of bytes is activated, simulating - a logical file within a possibly larger physical memory area - - To read efficiently, you clearly don't want to read individual bytes, instead, - read a few kilobytes at least. - - :note: The chunk-size should be carefully selected as it will involve quite a bit - of string copying due to the way the zlib is implemented. Its very wasteful, - hence we try to find a good tradeoff between allocation time and number of - times we actually allocate. An own zlib implementation would be good here - to better support streamed reading - it would only need to keep the mmap - and decompress it into chunks, thats all ... """ - __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') - - max_read_size = 512*1024 - - def __init__(self, m, close_on_deletion): - """Initialize with mmap for stream reading""" - self._m = m - self._zip = zlib.decompressobj() - self._buf = None # buffer of decompressed bytes - self._buflen = 0 # length of bytes in buffer - self._s = 0 # size of uncompressed data to read in total - self._br = 0 # num uncompressed bytes read - self._cws = 0 # start byte of compression window - self._cwe = 0 # end byte of compression window - self._close = close_on_deletion # close the memmap on deletion ? - - def __del__(self): - if self._close: - self._m.close() - # END handle resource freeing - - def initialize(self, size=0): - """Initialize this instance for acting as a read-only stream for size bytes. - :param size: size in bytes to be decompresed before being depleted. - If 0, default object header information is parsed from the data, - returning a tuple of (type_string, uncompressed_size) - If not 0, the size will be used, and None is returned. - :note: must only be called exactly once""" - if size: - self._s = size - return - # END handle size - - # read header - maxb = 512 # should really be enough, cgit uses 8192 I believe - self._s = maxb - hdr = self.read(maxb) - hdrend = hdr.find("\0") - type, size = hdr[:hdrend].split(" ") - self._s = int(size) - - # adjust internal state to match actual header length that we ignore - # The buffer will be depleted first on future reads - self._br = 0 - hdrend += 1 # count terminating \0 - self._buf = StringIO(hdr[hdrend:]) - self._buflen = len(hdr) - hdrend - - return type, size - - def read(self, size=-1): - if size < 1: - size = self._s - self._br - else: - size = min(size, self._s - self._br) - # END clamp size - - if size == 0: - return str() - # END handle depletion - - # protect from memory peaks - # If he tries to read large chunks, our memory patterns get really bad - # as we end up copying a possibly huge chunk from our memory map right into - # memory. This might not even be possible. Nonetheless, try to dampen the - # effect a bit by reading in chunks, returning a huge string in the end. - # Our performance now depends on StringIO. This way we don't need two large - # buffers in peak times, but only one large one in the end which is - # the return buffer - # NO: We don't do it - if the user thinks its best, he is right. If he - # has trouble, he will start reading in chunks. 
According to our tests - # its still faster if we read 10 Mb at once instead of chunking it. - - # if size > self.max_read_size: - # sio = StringIO() - # while size: - # read_size = min(self.max_read_size, size) - # data = self.read(read_size) - # sio.write(data) - # size -= len(data) - # if len(data) < read_size: - # break - # # END data loop - # sio.seek(0) - # return sio.getvalue() - # # END handle maxread - # - # deplete the buffer, then just continue using the decompress object - # which has an own buffer. We just need this to transparently parse the - # header from the zlib stream - dat = str() - if self._buf: - if self._buflen >= size: - # have enough data - dat = self._buf.read(size) - self._buflen -= size - self._br += size - return dat - else: - dat = self._buf.read() # ouch, duplicates data - size -= self._buflen - self._br += self._buflen - - self._buflen = 0 - self._buf = None - # END handle buffer len - # END handle buffer - - # decompress some data - # Abstract: zlib needs to operate on chunks of our memory map ( which may - # be large ), as it will otherwise and always fill in the 'unconsumed_tail' - # attribute which possible reads our whole map to the end, forcing - # everything to be read from disk even though just a portion was requested. - # As this would be a nogo, we workaround it by passing only chunks of data, - # moving the window into the memory map along as we decompress, which keeps - # the tail smaller than our chunk-size. This causes 'only' the chunk to be - # copied once, and another copy of a part of it when it creates the unconsumed - # tail. We have to use it to hand in the appropriate amount of bytes durin g - # the next read. - tail = self._zip.unconsumed_tail - if tail: - # move the window, make it as large as size demands. For code-clarity, - # we just take the chunk from our map again instead of reusing the unconsumed - # tail. The latter one would safe some memory copying, but we could end up - # with not getting enough data uncompressed, so we had to sort that out as well. - # Now we just assume the worst case, hence the data is uncompressed and the window - # needs to be as large as the uncompressed bytes we want to read. - self._cws = self._cwe - len(tail) - self._cwe = self._cws + size - - - indata = self._m[self._cws:self._cwe] # another copy ... :( - # get the actual window end to be sure we don't use it for computations - self._cwe = self._cws + len(indata) - else: - cws = self._cws - self._cws = self._cwe - self._cwe = cws + size - indata = self._m[self._cws:self._cwe] # ... 
copy it again :( - # END handle tail - - dcompdat = self._zip.decompress(indata, size) - - self._br += len(dcompdat) - if dat: - dcompdat = dat + dcompdat - - return dcompdat - -#} END classes -- cgit v1.2.1 From e746f96bcc29238b79118123028ca170adc4ff0f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 4 Jun 2010 17:22:08 +0200 Subject: Fixed implementation after design change to deal with it - all tests run, but next there will have to be more through testing --- lib/git/__init__.py | 3 + lib/git/objects/base.py | 17 ++--- lib/git/objects/commit.py | 4 +- lib/git/odb/__init__.py | 4 ++ lib/git/odb/db.py | 116 ++++++++++++++++---------------- lib/git/odb/fun.py | 46 ++++++------- lib/git/odb/stream.py | 168 +++++++++++++++++++++++++++++++--------------- lib/git/odb/utils.py | 2 - lib/git/repo.py | 2 +- 9 files changed, 211 insertions(+), 151 deletions(-) (limited to 'lib') diff --git a/lib/git/__init__.py b/lib/git/__init__.py index aac539eb..2f17c55b 100644 --- a/lib/git/__init__.py +++ b/lib/git/__init__.py @@ -22,5 +22,8 @@ from git.remote import * from git.index import * from git.utils import LockFile, BlockingLockFile +# odb is NOT imported intentionally - if you really want it, you should get it +# yourself as its part of the core + __all__ = [ name for name, obj in locals().items() if not (name.startswith('_') or inspect.ismodule(obj)) ] diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 76384888..5a3a15a7 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -76,12 +76,14 @@ class Object(LazyMixin): Retrieve object information """ if attr == "size": - typename, self.size = self.repo.odb.info(self.sha) - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) + oinfo = self.repo.odb.info(self.sha) + self.size = oinfo.size + assert oinfo.type == self.type, _assertion_msg_format % (self.sha, oinfo.type, self.type) elif attr == "data": - typename, self.size, stream = self.repo.odb.stream(self.sha) - self.data = stream.read() # once we have an own odb, we can delay reading - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) + ostream = self.repo.odb.stream(self.sha) + self.size = ostream.size + self.data = ostream.read() + assert ostream.type == self.type, _assertion_msg_format % (self.sha, ostream.type, self.type) else: super(Object,self)._set_cache_(attr) @@ -124,14 +126,13 @@ class Object(LazyMixin): def data_stream(self): """ :return: File Object compatible stream to the uncompressed raw data of the object :note: returned streams must be read in order""" - type, size, stream = self.repo.odb.stream(self.sha) - return stream + return self.repo.odb.stream(self.sha) def stream_data(self, ostream): """Writes our data directly to the given output stream :param ostream: File object compatible stream object. 
:return: self""" - type, size, istream = self.repo.odb.stream(self.sha) + istream = self.repo.odb.stream(self.sha) stream_copy(istream, ostream) return self diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index dbc0cf27..9a3c2c95 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -9,6 +9,7 @@ import git.diff as diff import git.stats as stats from git.actor import Actor from tree import Tree +from git.odb import IStream from cStringIO import StringIO import base import utils @@ -346,7 +347,8 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri streamlen = stream.tell() stream.seek(0) - new_commit.sha = repo.odb.store(cls.type, streamlen, stream, sha_as_hex=True) + istream = repo.odb.store(IStream(cls.type, streamlen, stream)) + new_commit.sha = istream.sha if head: try: diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py index 17000244..5789d7eb 100644 --- a/lib/git/odb/__init__.py +++ b/lib/git/odb/__init__.py @@ -1,2 +1,6 @@ """Initialize the object database module""" +# default imports +from db import * +from stream import * + diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 7ae8f446..a8de28ec 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -8,7 +8,10 @@ from git.errors import ( from stream import ( DecompressMemMapReader, - FDCompressedSha1Writer + FDCompressedSha1Writer, + Sha1Writer, + OStream, + OInfo ) from utils import ( @@ -34,11 +37,13 @@ import mmap import os +__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'LooseObjectDB', 'PackedDB', + 'CompoundDB', 'ReferenceDB', 'GitObjectDB' ) + class ObjectDBR(object): """Defines an interface for object database lookup. Objects are identified either by hex-sha (40 bytes) or by sha (20 bytes)""" - __slots__ = tuple() def __contains__(self, sha): return self.has_obj @@ -52,7 +57,7 @@ class ObjectDBR(object): raise NotImplementedError("To be implemented in subclass") def info(self, sha): - """ :return: ODB_Info instance + """ :return: OInfo instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") @@ -60,27 +65,26 @@ class ObjectDBR(object): def info_async(self, input_channel): """Retrieve information of a multitude of objects asynchronously :param input_channel: Channel yielding the sha's of the objects of interest - :return: Channel yielding ODB_Info|InvalidODB_Info, in any order""" + :return: Channel yielding OInfo|InvalidOInfo, in any order""" raise NotImplementedError("To be implemented in subclass") def stream(self, sha): - """:return: ODB_OStream instance + """:return: OStream instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def stream_async(self, input_channel): - """Retrieve the ODB_OStream of multiple objects + """Retrieve the OStream of multiple objects :param input_channel: see ``info`` :param max_threads: see ``ObjectDBW.store`` - :return: Channel yielding ODB_OStream|InvalidODB_OStream instances in any order""" + :return: Channel yielding OStream|InvalidOStream instances in any order""" raise NotImplementedError("To be implemented in subclass") #} END query interface class ObjectDBW(object): """Defines an interface to create objects in the database""" - __slots__ = "_ostream" def __init__(self, *args, **kwargs): self._ostream = None @@ -99,12 +103,12 @@ class ObjectDBW(object): def ostream(self): """:return: overridden output stream this instance will 
write to, or None if it will write to the default stream""" - return self._ostream + return self._ostream def store(self, istream): """Create a new object in the database :return: the input istream object with its sha set to its corresponding value - :param istream: ODB_IStream compatible instance. If its sha is already set + :param istream: IStream compatible instance. If its sha is already set to a value, the object will just be stored in the our database format, in which case the input stream is expected to be in object format ( header + contents ). :raise IOError: if data could not be written""" @@ -115,22 +119,16 @@ class ObjectDBW(object): return right away, returning an output channel which receives the results as they are computed. - :return: Channel yielding your ODB_IStream which served as input, in any order. + :return: Channel yielding your IStream which served as input, in any order. The IStreams sha will be set to the sha it received during the process, or its error attribute will be set to the exception informing about the error. - :param input_channel: Channel yielding ODB_IStream instance. + :param input_channel: Channel yielding IStream instance. As the same instances will be used in the output channel, you can create a map between the id(istream) -> istream :note:As some ODB implementations implement this operation as atomic, they might abort the whole operation if one item could not be processed. Hence check how many items have actually been produced.""" - # a trivial implementation, ignoring the threads for now - # TODO: add configuration to the class to determine whether we may - # actually use multiple threads, default False of course. If the add - shas = list() - for args in iter_info: - shas.append(self.store(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) - return shas + raise NotImplementedError("To be implemented in subclass") #} END edit interface @@ -138,7 +136,6 @@ class ObjectDBW(object): class FileDBBase(object): """Provides basic facilities to retrieve files of interest, including caching facilities to help mapping hexsha's to objects""" - __slots__ = ('_root_path', ) def __init__(self, root_path): """Initialize this instance to look for its files at the given root path @@ -164,15 +161,11 @@ class FileDBBase(object): return join(self._root_path, rela_path) #} END interface - #{ Utiltities - - - #} END utilities class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): """A database which operates on loose object files""" - __slots__ = ('_hexsha_to_file', '_fd_open_flags') + # CONFIGURATION # chunks in which data will be copied between streams stream_chunk_size = chunk_size @@ -238,21 +231,26 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): finally: os.close(fd) # END assure file is closed + + def set_ostream(self, stream): + """:raise TypeError: if the stream does not support the Sha1Writer interface""" + if stream is not None and not isinstance(stream, Sha1Writer): + raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) + return super(LooseObjectDB, self).set_ostream(stream) def info(self, sha): m = self._map_loose_object(sha) try: - return loose_object_header_info(m) + type, size = loose_object_header_info(m) + return OInfo(sha, type, size) finally: m.close() # END assure release of system resources - def object(self, sha): + def stream(self, sha): m = self._map_loose_object(sha) - reader = DecompressMemMapReader(m, close_on_deletion = True) - type, size = reader.initialize() - - return type, size, reader + type, 
size, stream = DecompressMemMapReader.new(m, close_on_deletion = True) + return OStream(sha, type, size, stream) def has_object(self, sha): try: @@ -263,27 +261,33 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): # END check existance def store(self, istream): - # open a tmp file to write the data to - # todo: implement ostream properly - fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) - writer = FDCompressedSha1Writer(fd) + """note: The sha we produce will be hex by nature""" + assert istream.sha is None, "Direct istream writing not yet implemented" + tmp_path = None + writer = self.ostream() + if writer is None: + # open a tmp file to write the data to + fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) + writer = FDCompressedSha1Writer(fd) + # END handle custom writer try: - write_object(type, size, stream, writer, - close_target_stream=True, chunk_size=self.stream_chunk_size) - except: - os.remove(tmp_path) - raise - # END assure tmpfile removal on error - + try: + write_object(istream.type, istream.size, istream.read, writer.write, + chunk_size=self.stream_chunk_size) + except: + if tmp_path: + os.remove(tmp_path) + raise + # END assure tmpfile removal on error + finally: + if tmp_path: + writer.close() + # END assure target stream is closed - # in dry-run mode, we delete the file afterwards sha = writer.sha(as_hex=True) - if dry_run: - os.remove(tmp_path) - else: - # rename the file into place + if tmp_path: obj_path = self.db_path(self.object_path(sha)) obj_dir = dirname(obj_path) if not isdir(obj_dir): @@ -292,11 +296,8 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): rename(tmp_path, obj_path) # END handle dry_run - if not sha_as_hex: - sha = hex_to_bin(sha) - # END handle sha format - - return sha + istream.sha = sha + return istream class PackedDB(FileDBBase, ObjectDBR): @@ -320,18 +321,17 @@ class GitObjectDB(LooseObjectDB): :note: for now, we use the git command to do all the lookup, just until he have packs and the other implementations """ - __slots__ = ('_git', ) def __init__(self, root_path, git): """Initialize this instance with the root and a git command""" super(GitObjectDB, self).__init__(root_path) self._git = git def info(self, sha): - discard, type, size = self._git.get_object_header(sha) - return type, size + t = self._git.get_object_header(sha) + return OInfo(t[0], t[1], t[2]) - def object(self, sha): + def stream(self, sha): """For now, all lookup is done by git itself""" - discard, type, size, stream = self._git.stream_object_data(sha) - return type, size, stream + t = self._git.stream_object_data(sha) + return OStream(t[0], t[1], t[2], t[3]) diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py index ee7144dd..870a6f02 100644 --- a/lib/git/odb/fun.py +++ b/lib/git/odb/fun.py @@ -21,6 +21,8 @@ type_id_to_type_map = { # used when dealing with larger streams chunk_size = 1000*1000 +__all__ = ('is_loose_object', 'loose_object_header_info', 'object_header_info', + 'write_object' ) #{ Routines @@ -73,42 +75,34 @@ def object_header_info(m): raise BadObjectType(type_id) # END handle exceptions -def write_object(type, size, source_stream, target_stream, close_target_stream=True, - chunk_size=chunk_size): +def write_object(type, size, read, write, chunk_size=chunk_size): """Write the object as identified by type, size and source_stream into the target_stream :param type: type string of the object :param size: amount of bytes to write from source_stream - :param source_stream: stream as file-like object providing at 
least size bytes - :param target_stream: stream as file-like object to receive the data + :param read: read method of a stream providing the content data + :param write: write method of the output stream :param close_target_stream: if True, the target stream will be closed when the routine exits, even if an error is thrown - :param chunk_size: size of chunks to read from source. Larger values can be beneficial - for io performance, but cost more memory as well :return: The actual amount of bytes written to stream, which includes the header and a trailing newline""" tbw = 0 # total num bytes written dbw = 0 # num data bytes written - try: - # WRITE HEADER: type SP size NULL - tbw += target_stream.write("%s %i\0" % (type, size)) - - # WRITE ALL DATA UP TO SIZE - while True: - cs = min(chunk_size, size-dbw) - data_len = target_stream.write(source_stream.read(cs)) - dbw += data_len - if data_len < cs or dbw == size: - tbw += dbw - break - # END check for stream end - # END duplicate data - return tbw - finally: - if close_target_stream: - target_stream.close() - # END handle stream closing - # END assure file was closed + # WRITE HEADER: type SP size NULL + tbw += write("%s %i\0" % (type, size)) + + # WRITE ALL DATA UP TO SIZE + while True: + cs = min(chunk_size, size-dbw) + data_len = write(read(cs)) + dbw += data_len + if data_len < cs or dbw == size: + tbw += dbw + break + # END check for stream end + # END duplicate data + return tbw + #} END routines diff --git a/lib/git/odb/stream.py b/lib/git/odb/stream.py index 325c1444..d1181382 100644 --- a/lib/git/odb/stream.py +++ b/lib/git/odb/stream.py @@ -5,10 +5,13 @@ import errno from utils import ( to_hex_sha, - to_bin_sha + to_bin_sha, + write, + close ) -__all__ = ('FDCompressedSha1Writer', 'DecompressMemMapReader') +__all__ = ('OInfo', 'OStream', 'IStream', 'InvalidOInfo', 'InvalidOStream', + 'DecompressMemMapReader', 'FDCompressedSha1Writer') # ZLIB configuration @@ -18,7 +21,7 @@ Z_BEST_SPEED = 1 #{ ODB Bases -class ODB_Info(tuple): +class OInfo(tuple): """Carries information about an object in an ODB, provdiing information about the sha of the object, the type_string as well as the uncompressed size in bytes. @@ -35,8 +38,8 @@ class ODB_Info(tuple): def __new__(cls, sha, type, size): return tuple.__new__(cls, (sha, type, size)) - def __init__(self, sha, type, size): - pass + def __init__(self, *args): + tuple.__init__(self) #{ Interface @property @@ -53,38 +56,52 @@ class ODB_Info(tuple): #} END interface -class ODB_OStream(ODB_Info): +class OStream(OInfo): """Base for object streams retrieved from the database, providing additional information about the stream. Generally, ODB streams are read-only as objects are immutable""" __slots__ = tuple() - def __new__(cls, sha, type, size, *args, **kwargs): + def __new__(cls, sha, type, size, stream, *args, **kwargs): """Helps with the initialization of subclasses""" - return tuple.__new__(cls, (sha, type, size)) + return tuple.__new__(cls, (sha, type, size, stream)) + + + def __init__(self, *args, **kwargs): + tuple.__init__(self) + #{ Interface def is_compressed(self): - """:return: True if reads of this stream yield zlib compressed data. + """:return: True if reads of this stream yield zlib compressed data. Default False :note: this does not imply anything about the actual internal storage. 
Hence the data could be uncompressed, but read compressed, or vice versa""" - raise NotImplementedError("To be implemented by subclass") + raise False + + #} END interface + + #{ Stream Reader Interface + + def read(self, size=-1): + return self[3].read(size) + + #} END stream reader interface -class ODB_IStream(list): +class IStream(list): """Represents an input content stream to be fed into the ODB. It is mutable to allow the ODB to record information about the operations outcome right in this instance. - It provides interfaces for the ODB_OStream and a StreamReader to allow the instance + It provides interfaces for the OStream and a StreamReader to allow the instance to blend in without prior conversion. The only method your content stream must support is 'read'""" __slots__ = tuple() def __new__(cls, type, size, stream, sha=None, compressed=False): - list.__new__(cls, (sha, type, size, stream, compressed, None)) + return list.__new__(cls, (sha, type, size, stream, compressed, None)) - def __init__(cls, type, size, stream, sha=None, compressed=None): - pass + def __init__(self, type, size, stream, sha=None, compressed=None): + list.__init__(self, (sha, type, size, stream, compressed, None)) #{ Interface @@ -127,25 +144,42 @@ class ODB_IStream(list): sha = property(_sha, _set_sha) - @property - def type(self): + + def _type(self): return self[1] + + def _set_type(self, type): + self[1] = type - @property - def size(self): + type = property(_type, _set_type) + + def _size(self): return self[2] + + def _set_size(self, size): + self[2] = size + + size = property(_size, _set_size) + + def _stream(self): + return self[3] + + def _set_stream(self, stream): + self[3] = stream + + stream = property(_stream, _set_stream) #} END odb info interface - #{ ODB_OStream interface + #{ OStream interface def is_compressed(self): return self[4] - #} END ODB_OStream interface + #} END OStream interface -class InvalidODB_Info(tuple): +class InvalidOInfo(tuple): """Carries information about a sha identifying an object which is invalid in the queried database. The exception attribute provides more information about the cause of the issue""" @@ -155,7 +189,7 @@ class InvalidODB_Info(tuple): return tuple.__new__(cls, (sha, exc)) def __init__(self, sha, exc): - pass + tuple.__init__(self, (sha, exc)) @property def sha(self): @@ -166,7 +200,8 @@ class InvalidODB_Info(tuple): """:return: exception instance explaining the failure""" return self[1] -class InvalidODB_OStream(InvalidODB_Info): + +class InvalidOStream(InvalidOInfo): """Carries information about an invalid ODB stream""" __slots__ = tuple() @@ -175,7 +210,7 @@ class InvalidODB_OStream(InvalidODB_Info): #{ RO Streams -class DecompressMemMapReader(ODB_OStream): +class DecompressMemMapReader(object): """Reads data in chunks from a memory map and decompresses it. The client sees only the uncompressed data, respective file-like read calls are handling on-demand buffered decompression accordingly @@ -192,17 +227,17 @@ class DecompressMemMapReader(ODB_OStream): times we actually allocate. An own zlib implementation would be good here to better support streamed reading - it would only need to keep the mmap and decompress it into chunks, thats all ... 
""" - # __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') + __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') max_read_size = 512*1024 - def __init__(self, sha, type, size, m, close_on_deletion): + def __init__(self, m, close_on_deletion, size): """Initialize with mmap for stream reading""" self._m = m self._zip = zlib.decompressobj() self._buf = None # buffer of decompressed bytes self._buflen = 0 # length of bytes in buffer - self._s = 0 # size of uncompressed data to read in total + self._s = size # size of uncompressed data to read in total self._br = 0 # num uncompressed bytes read self._cws = 0 # start byte of compression window self._cwe = 0 # end byte of compression window @@ -213,34 +248,33 @@ class DecompressMemMapReader(ODB_OStream): self._m.close() # END handle resource freeing - def initialize(self, size=0): - """Initialize this instance for acting as a read-only stream for size bytes. - :param size: size in bytes to be decompresed before being depleted. - If 0, default object header information is parsed from the data, - returning a tuple of (type_string, uncompressed_size) - If not 0, the size will be used, and None is returned. - :note: must only be called exactly once""" - if size: - self._s = size - return - # END handle size + @classmethod + def new(self, m, close_on_deletion=False): + """Create a new DecompressMemMapReader instance for acting as a read-only stream + This method parses the object header from m and returns the parsed + type and size, as well as the created stream instance. + :param m: memory map on which to oparate + :param close_on_deletion: if True, the memory map will be closed once we are + being deleted""" + inst = DecompressMemMapReader(m, close_on_deletion, 0) # read header maxb = 512 # should really be enough, cgit uses 8192 I believe - self._s = maxb - hdr = self.read(maxb) + inst._s = maxb + hdr = inst.read(maxb) hdrend = hdr.find("\0") type, size = hdr[:hdrend].split(" ") - self._s = int(size) + size = int(size) + inst._s = size # adjust internal state to match actual header length that we ignore # The buffer will be depleted first on future reads - self._br = 0 + inst._br = 0 hdrend += 1 # count terminating \0 - self._buf = StringIO(hdr[hdrend:]) - self._buflen = len(hdr) - hdrend + inst._buf = StringIO(hdr[hdrend:]) + inst._buflen = len(hdr) - hdrend - return type, size + return type, size, inst def read(self, size=-1): if size < 1: @@ -346,7 +380,35 @@ class DecompressMemMapReader(ODB_OStream): #{ W Streams -class FDCompressedSha1Writer(object): +class Sha1Writer(object): + """Simple stream writer which produces a sha whenever you like as it degests + everything it is supposed to write""" + + def __init__(self): + self.sha1 = make_sha("") + + #{ Stream Interface + + def write(self, data): + """:raise IOError: If not all bytes could be written + :return: lenght of incoming data""" + self.sha1.update(data) + return len(data) + + # END stream interface + + #{ Interface + + def sha(self, as_hex = False): + """:return: sha so far + :param as_hex: if True, sha will be hex-encoded, binary otherwise""" + if as_hex: + return self.sha1.hexdigest() + return self.sha1.digest() + + #} END interface + +class FDCompressedSha1Writer(Sha1Writer): """Digests data written to it, making the sha available, then compress the data and write it to the file descriptor :note: operates on raw file descriptors @@ -357,10 +419,12 @@ class FDCompressedSha1Writer(object): exc = IOError("Failed to write 
all bytes to filedescriptor") def __init__(self, fd): + super(FDCompressedSha1Writer, self).__init__() self.fd = fd - self.sha1 = make_sha("") self.zip = zlib.compressobj(Z_BEST_SPEED) + #{ Stream Interface + def write(self, data): """:raise IOError: If not all bytes could be written :return: lenght of incoming data""" @@ -371,18 +435,12 @@ class FDCompressedSha1Writer(object): raise self.exc return len(data) - def sha(self, as_hex = False): - """:return: sha so far - :param as_hex: if True, sha will be hex-encoded, binary otherwise""" - if as_hex: - return self.sha1.hexdigest() - return self.sha1.digest() - def close(self): remainder = self.zip.flush() if write(self.fd, remainder) != len(remainder): raise self.exc return close(self.fd) + #} END stream interface #} END W streams diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 61565ba9..6863e97b 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -2,8 +2,6 @@ import binascii import os import errno -__all__ = ('FDSha1Writer', ) - #{ Routines hex_to_bin = binascii.a2b_hex diff --git a/lib/git/repo.py b/lib/git/repo.py index 1afb1eb7..78e5f526 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -13,7 +13,7 @@ from objects import * from config import GitConfigParser from remote import Remote -from odb.db import GitObjectDB +from odb import GitObjectDB import os import sys -- cgit v1.2.1
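
A few usage sketches against the interfaces introduced above may help to see how the pieces fit together. They are illustrative only: the repository path is hypothetical, and they assume the module layout of these patches (git.odb, git.odb.stream, git.odb.fun) on the Python 2 series the library targets (cStringIO, byte strings).

The second patch reshapes the database interface around the OInfo/OStream/IStream value types: store() now consumes an IStream and records the resulting sha on it (as commit_from_tree does via repo.odb.store(IStream(...))), while info() and stream() hand back OInfo and OStream tuples instead of bare (type, size, stream) tuples. A minimal round trip might look like this:

    from cStringIO import StringIO
    from git import Repo
    from git.odb import IStream

    repo = Repo("/path/to/repo")                   # hypothetical repository
    data = "hello world"

    # store: the IStream comes back with its sha filled in (hex for the loose DB)
    istream = repo.odb.store(IStream("blob", len(data), StringIO(data)))
    sha = istream.sha

    # info/stream: lightweight tuples carrying sha, type and size; OStream adds read()
    oinfo = repo.odb.info(sha)
    ostream = repo.odb.stream(sha)
    assert (oinfo.type, oinfo.size) == ("blob", len(data))
    assert ostream.read(ostream.size) == data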
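
DecompressMemMapReader is the workhorse behind LooseObjectDB.stream(): it decompresses a zlib stream lazily out of a memory map, sliding a small window over the map so that zlib's unconsumed_tail never drags the whole file into memory. After the second patch the constructor no longer parses the object header itself; the new() classmethod does that and returns (type, size, stream). The sketch below fabricates a loose-object-like file purely for illustration:

    import mmap
    import os
    import tempfile
    import zlib

    from git.odb import DecompressMemMapReader

    # fabricate something shaped like a loose object: zlib("<type> <size>\0<payload>")
    payload = "hello world"
    fd, path = tempfile.mkstemp()
    os.write(fd, zlib.compress("blob %i\0%s" % (len(payload), payload)))
    os.close(fd)

    fd = os.open(path, os.O_RDONLY)
    m = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
    os.close(fd)                                   # the map keeps its own reference

    otype, size, stream = DecompressMemMapReader.new(m, close_on_deletion=True)
    assert (otype, size) == ("blob", len(payload))
    assert stream.read() == payload                # header already consumed, data decompressed on demand

    os.remove(path)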
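
write_object() was reduced to operate on plain read/write callables, which makes it trivial to combine with the new Sha1Writer: feeding the canonical header plus contents through it yields the sha git itself would assign, without touching the disk or the zlib layer. A small sketch, assuming the module paths of this patch (git.odb.fun, git.odb.stream):

    import hashlib
    from cStringIO import StringIO

    from git.odb.fun import write_object
    from git.odb.stream import Sha1Writer

    data = "hello world"
    writer = Sha1Writer()

    # the header "<type> <size>\0" is written first, then the payload in chunks
    write_object("blob", len(data), StringIO(data).read, writer.write)

    # the digest covers header + payload, i.e. what `git hash-object` would print
    assert writer.sha() == hashlib.sha1("blob %i\0%s" % (len(data), data)).digest()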
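
FDCompressedSha1Writer, now a Sha1Writer subclass, combines both halves of loose-object creation: it digests everything it is handed while writing the zlib-compressed bytes to a raw file descriptor, and close() must be called so the final flush reaches the descriptor. A sketch against a throwaway file (the path is purely illustrative):

    import os
    import tempfile
    import zlib

    from git.odb.stream import FDCompressedSha1Writer

    payload = "hello world"
    header = "blob %i\0" % len(payload)

    fd, path = tempfile.mkstemp()
    writer = FDCompressedSha1Writer(fd)
    writer.write(header)                  # the header is hashed and compressed like any other data
    writer.write(payload)
    sha = writer.sha(as_hex=True)
    writer.close()                        # flushes the compressor and closes the descriptor

    assert zlib.decompress(open(path, 'rb').read()) == header + payload
    os.remove(path)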