29 files changed, 3676 insertions, 1863 deletions
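The two 0.2 Beta 2 entries in the CHANGES hunk below amount to the following caller-visible behaviour; a minimal sketch, assuming a repository at the placeholder path /path/to/repo::

    from git import Repo, Commit

    repo = Repo("/path/to/repo")

    # the 'encoding' header of a commit message is now parsed instead of
    # being ignored, and falls back to UTF-8 when absent
    print repo.head.commit.encoding

    # create_from_tree now mimics git-commit-tree in pure python, so no
    # child process is spawned for the commit object itself
    parent = repo.head.commit
    new_commit = Commit.create_from_tree(repo, parent.tree, "a message",
                                         parent_commits=[parent], head=False)
    print new_commit.sha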
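For orientation while reading the large cmd.py hunk below: the Git class maps python calls onto git command lines via transform_kwargs and _call_process. A sketch of that mapping, where the working directory is a placeholder and option order can vary because keyword arguments are unordered::

    from git import Git

    g = Git("/path/to/repo")

    # single-letter kwargs become short options, longer names long options
    print g.transform_kwargs(p=True, max_count=10)
    # ['-p', '--max-count=10']    (order may vary)

    # attribute access is routed through __getattr__ to _call_process,
    # so this runs 'git rev-list --max-count=10 master' and returns stdout
    out = g.rev_list('master', max_count=10)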
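AutoInterrupt, defined in the same hunk, wraps every process handed out with as_process=True; a sketch of its lifecycle, under the same placeholder-repository assumption::

    from git import Git

    g = Git("/path/to/repo")
    proc = g.execute(["git", "rev-list", "HEAD"], as_process=True)
    first_sha = proc.stdout.readline().strip()   # attributes proxy to Popen

    # wait() raises GitCommandError on a non-zero status; merely dropping
    # the wrapper interrupts the child if it is still running, so a
    # half-consumed iterator cannot leave processes piling up
    del proc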
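The new CatFileContentStream backs stream_object_data, which multiplexes all queries over one persistent 'git cat-file --batch' process; a sketch, assuming HEAD resolves in the placeholder repository::

    from git import Git

    g = Git("/path/to/repo")
    hexsha, typename, size, stream = g.stream_object_data("HEAD")

    # every read is clamped to the announced size; once depleted the stream
    # swallows the terminating newline so the shared batch pipe stays usable
    chunk = stream.read(4096)
    while chunk:
        chunk = stream.read(4096)

    # the follow-up query reuses the same long-running process
    print g.get_object_header("HEAD")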
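In the base.py hunk below, Object's lazy size and data attributes stop shelling out to git-cat-file and query the object database instead; a sketch, assuming repo.odb is wired up as the hunk implies (the odb module itself is not part of this diff)::

    from git import Repo

    repo = Repo("/path/to/repo")
    commit = repo.head.commit

    print commit.size         # filled on demand from repo.odb.info(sha)
    print len(commit.data)    # filled on demand via repo.odb.stream(sha)

    # data_stream now hands out the odb stream directly - read it in order
    print commit.data_stream.read(16)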
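commit.py below drops the git-commit-tree round trip in favour of serializing the commit object directly. A sketch of the raw layout _serialize produces, with made-up author and date values::

    from cStringIO import StringIO
    from git import Repo, Commit
    from git.actor import Actor

    repo = Repo("/path/to/repo")
    actor = Actor("A U Thor", "author@example.com")
    commit = Commit(repo, Commit.NULL_HEX_SHA, tree=repo.head.commit.tree,
                    author=actor, authored_date=1273000000,
                    author_tz_offset=-3600,
                    committer=actor, committed_date=1273000000,
                    committer_tz_offset=-3600,
                    message="an example message", parents=(),
                    encoding="UTF-8")

    stream = StringIO()
    commit._serialize(stream)
    print stream.getvalue()
    # tree <hexsha>
    # author A U Thor <author@example.com> 1273000000 +0100
    # committer A U Thor <author@example.com> 1273000000 +0100
    #                                   <- blank line, then the message
    # an example message

Note that the encoding header is only emitted when it differs from the UTF-8 default, which is why it is absent here.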
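create_from_tree also resolves the committer identity itself: environment variables override git configuration values, which in turn override a default derived from the user id. A sketch of the environment taking precedence, again against the placeholder repository::

    import os
    from git import Repo, Commit

    os.environ[Commit.env_author_name] = "Env Author"
    os.environ[Commit.env_author_email] = "env@example.com"

    repo = Repo("/path/to/repo")
    c = Commit.create_from_tree(repo, repo.head.commit.tree, "msg")
    print c.author.name       # 'Env Author' - the environment wins
    print c.committer.name    # falls back to user.name from the config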
@@ -1,7 +1,12 @@ ======= CHANGES ======= - + +0.2 Beta 2 +=========== + * Commit objects now carry the 'encoding' information of their message. It wasn't parsed previously, and defaults to UTF-8 + * Commit.create_from_tree now uses a pure-python implementation, mimicing git-commit-tree + 0.2 ===== General diff --git a/lib/git/__init__.py b/lib/git/__init__.py index aac539eb..2f17c55b 100644 --- a/lib/git/__init__.py +++ b/lib/git/__init__.py @@ -22,5 +22,8 @@ from git.remote import * from git.index import * from git.utils import LockFile, BlockingLockFile +# odb is NOT imported intentionally - if you really want it, you should get it +# yourself as its part of the core + __all__ = [ name for name, obj in locals().items() if not (name.startswith('_') or inspect.ismodule(obj)) ] diff --git a/lib/git/cmd.py b/lib/git/cmd.py index 82daf551..5cae2998 100644 --- a/lib/git/cmd.py +++ b/lib/git/cmd.py @@ -13,427 +13,515 @@ from errors import GitCommandError GIT_PYTHON_TRACE = os.environ.get("GIT_PYTHON_TRACE", False) execute_kwargs = ('istream', 'with_keep_cwd', 'with_extended_output', - 'with_exceptions', 'as_process', - 'output_stream' ) + 'with_exceptions', 'as_process', + 'output_stream' ) def dashify(string): - return string.replace('_', '-') + return string.replace('_', '-') class Git(object): - """ - The Git class manages communication with the Git binary. - - It provides a convenient interface to calling the Git binary, such as in:: - - g = Git( git_dir ) - g.init() # calls 'git init' program - rval = g.ls_files() # calls 'git ls-files' program - - ``Debugging`` - Set the GIT_PYTHON_TRACE environment variable print each invocation - of the command to stdout. - Set its value to 'full' to see details about the returned values. - """ - __slots__ = ("_working_dir", "cat_file_all", "cat_file_header") - - # CONFIGURATION - # The size in bytes read from stdout when copying git's output to another stream - max_chunk_size = 1024*64 - - class AutoInterrupt(object): - """ - Kill/Interrupt the stored process instance once this instance goes out of scope. It is - used to prevent processes piling up in case iterators stop reading. - Besides all attributes are wired through to the contained process object. - - The wait method was overridden to perform automatic status code checking - and possibly raise. - """ - __slots__= ("proc", "args") - - def __init__(self, proc, args ): - self.proc = proc - self.args = args - - def __del__(self): - # did the process finish already so we have a return code ? - if self.proc.poll() is not None: - return - - # can be that nothing really exists anymore ... - if os is None: - return - - # try to kill it - try: - os.kill(self.proc.pid, 2) # interrupt signal - except AttributeError: - # try windows - # for some reason, providing None for stdout/stderr still prints something. This is why - # we simply use the shell and redirect to nul. Its slower than CreateProcess, question - # is whether we really want to see all these messages. Its annoying no matter what. - subprocess.call(("TASKKILL /F /T /PID %s 2>nul 1>nul" % str(self.proc.pid)), shell=True) - # END exception handling - - def __getattr__(self, attr): - return getattr(self.proc, attr) - - def wait(self): - """ - Wait for the process and return its status code. 
- - Raise - GitCommandError if the return status is not 0 - """ - status = self.proc.wait() - if status != 0: - raise GitCommandError(self.args, status, self.proc.stderr.read()) - # END status handling - return status - - - - def __init__(self, working_dir=None): - """ - Initialize this instance with: - - ``working_dir`` - Git directory we should work in. If None, we always work in the current - directory as returned by os.getcwd(). - It is meant to be the working tree directory if available, or the - .git directory in case of bare repositories. - """ - super(Git, self).__init__() - self._working_dir = working_dir - - # cached command slots - self.cat_file_header = None - self.cat_file_all = None - - def __getattr__(self, name): - """ - A convenience method as it allows to call the command as if it was - an object. - Returns - Callable object that will execute call _call_process with your arguments. - """ - if name[:1] == '_': - raise AttributeError(name) - return lambda *args, **kwargs: self._call_process(name, *args, **kwargs) - - @property - def working_dir(self): - """ - Returns - Git directory we are working on - """ - return self._working_dir - - def execute(self, command, - istream=None, - with_keep_cwd=False, - with_extended_output=False, - with_exceptions=True, - as_process=False, - output_stream=None, - **subprocess_kwargs - ): - """ - Handles executing the command on the shell and consumes and returns - the returned information (stdout) - - ``command`` - The command argument list to execute. - It should be a string, or a sequence of program arguments. The - program to execute is the first item in the args sequence or string. - - ``istream`` - Standard input filehandle passed to subprocess.Popen. - - ``with_keep_cwd`` - Whether to use the current working directory from os.getcwd(). - The cmd otherwise uses its own working_dir that it has been initialized - with if possible. - - ``with_extended_output`` - Whether to return a (status, stdout, stderr) tuple. - - ``with_exceptions`` - Whether to raise an exception when git returns a non-zero status. - - ``as_process`` - Whether to return the created process instance directly from which - streams can be read on demand. This will render with_extended_output and - with_exceptions ineffective - the caller will have - to deal with the details himself. - It is important to note that the process will be placed into an AutoInterrupt - wrapper that will interrupt the process once it goes out of scope. If you - use the command in iterators, you should pass the whole process instance - instead of a single stream. - - ``output_stream`` - If set to a file-like object, data produced by the git command will be - output to the given stream directly. - This feature only has any effect if as_process is False. Processes will - always be created with a pipe as subprocess.Popen can only accept system - file descriptors, not python objects ( such as StringIO ). - This merely is a workaround as the data will be copied from the - output pipe to the given output stream directly. - See also: Git.max_chunk_size - - ``**subprocess_kwargs`` - Keyword arguments to be passed to subprocess.Popen. Please note that - some of the valid kwargs are already set by this method, the ones you - specify may not be the same ones. 
- - Returns:: - - str(output) # extended_output = False (Default) - tuple(int(status), str(stdout), str(stderr)) # extended_output = True - - if ouput_stream is True, the stdout value will be your output stream: - output_stream # extended_output = False - tuple(int(status), output_stream, str(stderr))# extended_output = True - - Raise - GitCommandError - - NOTE - If you add additional keyword arguments to the signature of this method, - you must update the execute_kwargs tuple housed in this module. - """ - if GIT_PYTHON_TRACE and not GIT_PYTHON_TRACE == 'full': - print ' '.join(command) - - # Allow the user to have the command executed in their working dir. - if with_keep_cwd or self._working_dir is None: - cwd = os.getcwd() - else: - cwd=self._working_dir - - # Start the process - proc = subprocess.Popen(command, - cwd=cwd, - stdin=istream, - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, - close_fds=(os.name=='posix'),# unsupported on linux - **subprocess_kwargs - ) - if as_process: - return self.AutoInterrupt(proc, command) - - # Wait for the process to return - status = 0 - stdout_value = '' - stderr_value = '' - try: - if output_stream is None: - stdout_value, stderr_value = proc.communicate() - # strip trailing "\n" - if stdout_value.endswith("\n"): - stdout_value = stdout_value[:-1] - if stderr_value.endswith("\n"): - stderr_value = stderr_value[:-1] - status = proc.returncode - else: - max_chunk_size = self.max_chunk_size - while True: - chunk = proc.stdout.read(max_chunk_size) - output_stream.write(chunk) - if len(chunk) < max_chunk_size: - break - # END reading output stream - stdout_value = output_stream - stderr_value = proc.stderr.read() - # strip trailing "\n" - if stderr_value.endswith("\n"): - stderr_value = stderr_value[:-1] - status = proc.wait() - # END stdout handling - finally: - proc.stdout.close() - proc.stderr.close() - - if with_exceptions and status != 0: - raise GitCommandError(command, status, stderr_value) - - if GIT_PYTHON_TRACE == 'full': - if stderr_value: - print "%s -> %d: '%s' !! '%s'" % (command, status, stdout_value, stderr_value) - elif stdout_value: - print "%s -> %d: '%s'" % (command, status, stdout_value) - else: - print "%s -> %d" % (command, status) - - # Allow access to the command's status code - if with_extended_output: - return (status, stdout_value, stderr_value) - else: - return stdout_value - - def transform_kwargs(self, **kwargs): - """ - Transforms Python style kwargs into git command line options. - """ - args = [] - for k, v in kwargs.items(): - if len(k) == 1: - if v is True: - args.append("-%s" % k) - elif type(v) is not bool: - args.append("-%s%s" % (k, v)) - else: - if v is True: - args.append("--%s" % dashify(k)) - elif type(v) is not bool: - args.append("--%s=%s" % (dashify(k), v)) - return args - - @classmethod - def __unpack_args(cls, arg_list): - if not isinstance(arg_list, (list,tuple)): - return [ str(arg_list) ] - - outlist = list() - for arg in arg_list: - if isinstance(arg_list, (list, tuple)): - outlist.extend(cls.__unpack_args( arg )) - # END recursion - else: - outlist.append(str(arg)) - # END for each arg - return outlist - - def _call_process(self, method, *args, **kwargs): - """ - Run the given git command with the specified arguments and return - the result as a String - - ``method`` - is the command. Contained "_" characters will be converted to dashes, - such as in 'ls_files' to call 'ls-files'. - - ``args`` - is the list of arguments. If None is included, it will be pruned. 
- This allows your commands to call git more conveniently as None - is realized as non-existent - - ``kwargs`` - is a dict of keyword arguments. - This function accepts the same optional keyword arguments - as execute(). - - Examples:: - git.rev_list('master', max_count=10, header=True) - - Returns - Same as execute() - """ - - # Handle optional arguments prior to calling transform_kwargs - # otherwise these'll end up in args, which is bad. - _kwargs = {} - for kwarg in execute_kwargs: - try: - _kwargs[kwarg] = kwargs.pop(kwarg) - except KeyError: - pass - - # Prepare the argument list - opt_args = self.transform_kwargs(**kwargs) - - ext_args = self.__unpack_args([a for a in args if a is not None]) - args = opt_args + ext_args - - call = ["git", dashify(method)] - call.extend(args) - - return self.execute(call, **_kwargs) - - def _parse_object_header(self, header_line): - """ - ``header_line`` - <hex_sha> type_string size_as_int - - Returns - (hex_sha, type_string, size_as_int) - - Raises - ValueError if the header contains indication for an error due to incorrect - input sha - """ - tokens = header_line.split() - if len(tokens) != 3: - raise ValueError("SHA named %s could not be resolved, git returned: %r" % (tokens[0], header_line.strip()) ) - if len(tokens[0]) != 40: - raise ValueError("Failed to parse header: %r" % header_line) - return (tokens[0], tokens[1], int(tokens[2])) - - def __prepare_ref(self, ref): - # required for command to separate refs on stdin - refstr = str(ref) # could be ref-object - if refstr.endswith("\n"): - return refstr - return refstr + "\n" - - def __get_persistent_cmd(self, attr_name, cmd_name, *args,**kwargs): - cur_val = getattr(self, attr_name) - if cur_val is not None: - return cur_val - - options = { "istream" : subprocess.PIPE, "as_process" : True } - options.update( kwargs ) - - cmd = self._call_process( cmd_name, *args, **options ) - setattr(self, attr_name, cmd ) - return cmd - - def __get_object_header(self, cmd, ref): - cmd.stdin.write(self.__prepare_ref(ref)) - cmd.stdin.flush() - return self._parse_object_header(cmd.stdout.readline()) - - def get_object_header(self, ref): - """ - Use this method to quickly examine the type and size of the object behind - the given ref. - - NOTE - The method will only suffer from the costs of command invocation - once and reuses the command in subsequent calls. - - Return: - (hexsha, type_string, size_as_int) - """ - cmd = self.__get_persistent_cmd("cat_file_header", "cat_file", batch_check=True) - return self.__get_object_header(cmd, ref) - - def get_object_data(self, ref): - """ - As get_object_header, but returns object data as well - - Return: - (hexsha, type_string, size_as_int,data_string) - """ - cmd = self.__get_persistent_cmd("cat_file_all", "cat_file", batch=True) - hexsha, typename, size = self.__get_object_header(cmd, ref) - data = cmd.stdout.read(size) - cmd.stdout.read(1) # finishing newlines - - return (hexsha, typename, size, data) - - def clear_cache(self): - """ - Clear all kinds of internal caches to release resources. - - Currently persistent commands will be interrupted. - - Returns - self - """ - self.cat_file_all = None - self.cat_file_header = None - return self + """ + The Git class manages communication with the Git binary. 
+ + It provides a convenient interface to calling the Git binary, such as in:: + + g = Git( git_dir ) + g.init() # calls 'git init' program + rval = g.ls_files() # calls 'git ls-files' program + + ``Debugging`` + Set the GIT_PYTHON_TRACE environment variable print each invocation + of the command to stdout. + Set its value to 'full' to see details about the returned values. + """ + __slots__ = ("_working_dir", "cat_file_all", "cat_file_header") + + # CONFIGURATION + # The size in bytes read from stdout when copying git's output to another stream + max_chunk_size = 1024*64 + + class AutoInterrupt(object): + """ + Kill/Interrupt the stored process instance once this instance goes out of scope. It is + used to prevent processes piling up in case iterators stop reading. + Besides all attributes are wired through to the contained process object. + + The wait method was overridden to perform automatic status code checking + and possibly raise. + """ + __slots__= ("proc", "args") + + def __init__(self, proc, args ): + self.proc = proc + self.args = args + + def __del__(self): + # did the process finish already so we have a return code ? + if self.proc.poll() is not None: + return + + # can be that nothing really exists anymore ... + if os is None: + return + + # try to kill it + try: + os.kill(self.proc.pid, 2) # interrupt signal + except AttributeError: + # try windows + # for some reason, providing None for stdout/stderr still prints something. This is why + # we simply use the shell and redirect to nul. Its slower than CreateProcess, question + # is whether we really want to see all these messages. Its annoying no matter what. + subprocess.call(("TASKKILL /F /T /PID %s 2>nul 1>nul" % str(self.proc.pid)), shell=True) + # END exception handling + + def __getattr__(self, attr): + return getattr(self.proc, attr) + + def wait(self): + """ + Wait for the process and return its status code. + + Raise + GitCommandError if the return status is not 0 + """ + status = self.proc.wait() + if status != 0: + raise GitCommandError(self.args, status, self.proc.stderr.read()) + # END status handling + return status + # END auto interrupt + + class CatFileContentStream(object): + """Object representing a sized read-only stream returning the contents of + an object. + It behaves like a stream, but counts the data read and simulates an empty + stream once our sized content region is empty. 
+ If not all data is read to the end of the objects's lifetime, we read the + rest to assure the underlying stream continues to work""" + + __slots__ = ('_stream', '_nbr', '_size') + + def __init__(self, size, stream): + self._stream = stream + self._size = size + self._nbr = 0 # num bytes read + + def read(self, size=-1): + bytes_left = self._size - self._nbr + if bytes_left == 0: + return '' + if size > -1: + # assure we don't try to read past our limit + size = min(bytes_left, size) + else: + # they try to read all, make sure its not more than what remains + size = bytes_left + # END check early depletion + data = self._stream.read(size) + self._nbr += len(data) + + # check for depletion, read our final byte to make the stream usable by others + if self._size - self._nbr == 0: + self._stream.read(1) # final newline + # END finish reading + + return data + + def readline(self, size=-1): + if self._nbr == self._size: + return '' + + # clamp size to lowest allowed value + bytes_left = self._size - self._nbr + if size > -1: + size = min(bytes_left, size) + else: + size = bytes_left + # END handle size + + data = self._stream.readline(size) + self._nbr += len(data) + + # handle final byte + # we inline everything, it must be fast ! + if self._size - self._nbr == 0: + self._stream.read(1) + # END finish reading + + return data + + def readlines(self, size=-1): + if self._nbr == self._size: + return list() + + # leave all additional logic to our readline method, we just check the size + out = list() + nbr = 0 + while True: + line = self.readline() + if not line: + break + out.append(line) + if size > -1: + nbr += len(line) + if nbr > size: + break + # END handle size constraint + # END readline loop + return out + + def __iter__(self): + return self + + def next(self): + line = self.readline() + if not line: + raise StopIteration + return line + + def __del__(self): + bytes_left = self._size - self._nbr + if bytes_left: + # seek and discard + self._stream.seek(bytes_left + 1, os.SEEK_CUR) # includes terminating newline + # END handle incomplete read + + + def __init__(self, working_dir=None): + """ + Initialize this instance with: + + ``working_dir`` + Git directory we should work in. If None, we always work in the current + directory as returned by os.getcwd(). + It is meant to be the working tree directory if available, or the + .git directory in case of bare repositories. + """ + super(Git, self).__init__() + self._working_dir = working_dir + + # cached command slots + self.cat_file_header = None + self.cat_file_all = None + + def __getattr__(self, name): + """ + A convenience method as it allows to call the command as if it was + an object. + Returns + Callable object that will execute call _call_process with your arguments. + """ + if name[:1] == '_': + raise AttributeError(name) + return lambda *args, **kwargs: self._call_process(name, *args, **kwargs) + + @property + def working_dir(self): + """ + Returns + Git directory we are working on + """ + return self._working_dir + + def execute(self, command, + istream=None, + with_keep_cwd=False, + with_extended_output=False, + with_exceptions=True, + as_process=False, + output_stream=None, + **subprocess_kwargs + ): + """ + Handles executing the command on the shell and consumes and returns + the returned information (stdout) + + ``command`` + The command argument list to execute. + It should be a string, or a sequence of program arguments. The + program to execute is the first item in the args sequence or string. 
+ + ``istream`` + Standard input filehandle passed to subprocess.Popen. + + ``with_keep_cwd`` + Whether to use the current working directory from os.getcwd(). + The cmd otherwise uses its own working_dir that it has been initialized + with if possible. + + ``with_extended_output`` + Whether to return a (status, stdout, stderr) tuple. + + ``with_exceptions`` + Whether to raise an exception when git returns a non-zero status. + + ``as_process`` + Whether to return the created process instance directly from which + streams can be read on demand. This will render with_extended_output and + with_exceptions ineffective - the caller will have + to deal with the details himself. + It is important to note that the process will be placed into an AutoInterrupt + wrapper that will interrupt the process once it goes out of scope. If you + use the command in iterators, you should pass the whole process instance + instead of a single stream. + + ``output_stream`` + If set to a file-like object, data produced by the git command will be + output to the given stream directly. + This feature only has any effect if as_process is False. Processes will + always be created with a pipe due to issues with subprocess. + This merely is a workaround as data will be copied from the + output pipe to the given output stream directly. + + ``**subprocess_kwargs`` + Keyword arguments to be passed to subprocess.Popen. Please note that + some of the valid kwargs are already set by this method, the ones you + specify may not be the same ones. + + Returns:: + + str(output) # extended_output = False (Default) + tuple(int(status), str(stdout), str(stderr)) # extended_output = True + + if ouput_stream is True, the stdout value will be your output stream: + output_stream # extended_output = False + tuple(int(status), output_stream, str(stderr))# extended_output = True + + Raise + GitCommandError + + NOTE + If you add additional keyword arguments to the signature of this method, + you must update the execute_kwargs tuple housed in this module. + """ + if GIT_PYTHON_TRACE and not GIT_PYTHON_TRACE == 'full': + print ' '.join(command) + + # Allow the user to have the command executed in their working dir. + if with_keep_cwd or self._working_dir is None: + cwd = os.getcwd() + else: + cwd=self._working_dir + + # Start the process + proc = subprocess.Popen(command, + cwd=cwd, + stdin=istream, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + close_fds=(os.name=='posix'),# unsupported on linux + **subprocess_kwargs + ) + if as_process: + return self.AutoInterrupt(proc, command) + + # Wait for the process to return + status = 0 + stdout_value = '' + stderr_value = '' + try: + if output_stream is None: + stdout_value, stderr_value = proc.communicate() + # strip trailing "\n" + if stdout_value.endswith("\n"): + stdout_value = stdout_value[:-1] + if stderr_value.endswith("\n"): + stderr_value = stderr_value[:-1] + status = proc.returncode + else: + stream_copy(proc.stdout, output_stream, self.max_chunk_size) + stdout_value = output_stream + stderr_value = proc.stderr.read() + # strip trailing "\n" + if stderr_value.endswith("\n"): + stderr_value = stderr_value[:-1] + status = proc.wait() + # END stdout handling + finally: + proc.stdout.close() + proc.stderr.close() + + if with_exceptions and status != 0: + raise GitCommandError(command, status, stderr_value) + + if GIT_PYTHON_TRACE == 'full': + if stderr_value: + print "%s -> %d: '%s' !! 
'%s'" % (command, status, stdout_value, stderr_value) + elif stdout_value: + print "%s -> %d: '%s'" % (command, status, stdout_value) + else: + print "%s -> %d" % (command, status) + + # Allow access to the command's status code + if with_extended_output: + return (status, stdout_value, stderr_value) + else: + return stdout_value + + def transform_kwargs(self, **kwargs): + """ + Transforms Python style kwargs into git command line options. + """ + args = [] + for k, v in kwargs.items(): + if len(k) == 1: + if v is True: + args.append("-%s" % k) + elif type(v) is not bool: + args.append("-%s%s" % (k, v)) + else: + if v is True: + args.append("--%s" % dashify(k)) + elif type(v) is not bool: + args.append("--%s=%s" % (dashify(k), v)) + return args + + @classmethod + def __unpack_args(cls, arg_list): + if not isinstance(arg_list, (list,tuple)): + return [ str(arg_list) ] + + outlist = list() + for arg in arg_list: + if isinstance(arg_list, (list, tuple)): + outlist.extend(cls.__unpack_args( arg )) + # END recursion + else: + outlist.append(str(arg)) + # END for each arg + return outlist + + def _call_process(self, method, *args, **kwargs): + """ + Run the given git command with the specified arguments and return + the result as a String + + ``method`` + is the command. Contained "_" characters will be converted to dashes, + such as in 'ls_files' to call 'ls-files'. + + ``args`` + is the list of arguments. If None is included, it will be pruned. + This allows your commands to call git more conveniently as None + is realized as non-existent + + ``kwargs`` + is a dict of keyword arguments. + This function accepts the same optional keyword arguments + as execute(). + + Examples:: + git.rev_list('master', max_count=10, header=True) + + Returns + Same as execute() + """ + + # Handle optional arguments prior to calling transform_kwargs + # otherwise these'll end up in args, which is bad. 
+ _kwargs = {} + for kwarg in execute_kwargs: + try: + _kwargs[kwarg] = kwargs.pop(kwarg) + except KeyError: + pass + + # Prepare the argument list + opt_args = self.transform_kwargs(**kwargs) + + ext_args = self.__unpack_args([a for a in args if a is not None]) + args = opt_args + ext_args + + call = ["git", dashify(method)] + call.extend(args) + + return self.execute(call, **_kwargs) + + def _parse_object_header(self, header_line): + """ + ``header_line`` + <hex_sha> type_string size_as_int + + Returns + (hex_sha, type_string, size_as_int) + + Raises + ValueError if the header contains indication for an error due to incorrect + input sha + """ + tokens = header_line.split() + if len(tokens) != 3: + raise ValueError("SHA named %s could not be resolved, git returned: %r" % (tokens[0], header_line.strip()) ) + if len(tokens[0]) != 40: + raise ValueError("Failed to parse header: %r" % header_line) + return (tokens[0], tokens[1], int(tokens[2])) + + def __prepare_ref(self, ref): + # required for command to separate refs on stdin + refstr = str(ref) # could be ref-object + if refstr.endswith("\n"): + return refstr + return refstr + "\n" + + def __get_persistent_cmd(self, attr_name, cmd_name, *args,**kwargs): + cur_val = getattr(self, attr_name) + if cur_val is not None: + return cur_val + + options = { "istream" : subprocess.PIPE, "as_process" : True } + options.update( kwargs ) + + cmd = self._call_process( cmd_name, *args, **options ) + setattr(self, attr_name, cmd ) + return cmd + + def __get_object_header(self, cmd, ref): + cmd.stdin.write(self.__prepare_ref(ref)) + cmd.stdin.flush() + return self._parse_object_header(cmd.stdout.readline()) + + def get_object_header(self, ref): + """ Use this method to quickly examine the type and size of the object behind + the given ref. + + :note: The method will only suffer from the costs of command invocation + once and reuses the command in subsequent calls. + + :return: (hexsha, type_string, size_as_int) """ + cmd = self.__get_persistent_cmd("cat_file_header", "cat_file", batch_check=True) + return self.__get_object_header(cmd, ref) + + def get_object_data(self, ref): + """ As get_object_header, but returns object data as well + :return: (hexsha, type_string, size_as_int,data_string) + :note: not threadsafe + """ + hexsha, typename, size, stream = self.stream_object_data(ref) + data = stream.read(size) + del(stream) + return (hexsha, typename, size, data) + + def stream_object_data(self, ref): + """As get_object_header, but returns the data as a stream + :return: (hexsha, type_string, size_as_int, stream) + :note: This method is not threadsafe, you need one independent Command instance + per thread to be safe !""" + cmd = self.__get_persistent_cmd("cat_file_all", "cat_file", batch=True) + hexsha, typename, size = self.__get_object_header(cmd, ref) + return (hexsha, typename, size, self.CatFileContentStream(size, cmd.stdout)) + + def clear_cache(self): + """ + Clear all kinds of internal caches to release resources. + + Currently persistent commands will be interrupted. + + Returns + self + """ + self.cat_file_all = None + self.cat_file_header = None + return self diff --git a/lib/git/errors.py b/lib/git/errors.py index f66fb528..d8a35e02 100644 --- a/lib/git/errors.py +++ b/lib/git/errors.py @@ -8,19 +8,25 @@ Module containing all exceptions thrown througout the git package, """ class InvalidGitRepositoryError(Exception): - """ - Thrown if the given repository appears to have an invalid format. 
- """ + """ Thrown if the given repository appears to have an invalid format. """ + +class ODBError(Exception): + """All errors thrown by the object database""" + +class InvalidDBRoot(ODBError): + """Thrown if an object database cannot be initialized at the given path""" + +class BadObject(ODBError): + """The object with the given SHA does not exist""" + +class BadObjectType(ODBError): + """The object had an unsupported type""" class NoSuchPathError(OSError): - """ - Thrown if a path could not be access by the system. - """ + """ Thrown if a path could not be access by the system. """ class GitCommandError(Exception): - """ - Thrown if execution of the git command fails with non-zero status code. - """ + """ Thrown if execution of the git command fails with non-zero status code. """ def __init__(self, command, status, stderr=None): self.stderr = stderr self.status = status diff --git a/lib/git/index.py b/lib/git/index.py index 8ccc3fe3..36428315 100644 --- a/lib/git/index.py +++ b/lib/git/index.py @@ -21,7 +21,7 @@ import git.diff as diff from errors import GitCommandError from git.objects import Blob, Tree, Object, Commit -from git.utils import SHA1Writer, LazyMixin, ConcurrentWriteOperation, join_path_native +from git.utils import IndexFileSHA1Writer, LazyMixin, ConcurrentWriteOperation, join_path_native class CheckoutError( Exception ): @@ -461,7 +461,7 @@ class IndexFile(LazyMixin, diff.Diffable): write_op = ConcurrentWriteOperation(file_path or self._file_path) stream = write_op._begin_writing() - stream = SHA1Writer(stream) + stream = IndexFileSHA1Writer(stream) # header stream.write("DIRC") diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 6a51eed3..5a3a15a7 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -4,224 +4,220 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php import os -from git.utils import LazyMixin, join_path_native +from git.utils import LazyMixin, join_path_native, stream_copy import utils - + _assertion_msg_format = "Created object %r whose python type %r disagrees with the acutal git object type %r" class Object(LazyMixin): - """ - Implements an Object which may be Blobs, Trees, Commits and Tags - - This Object also serves as a constructor for instances of the correct type:: - - inst = Object.new(repo,id) - inst.sha # objects sha in hex - inst.size # objects uncompressed data size - inst.data # byte string containing the whole data of the object - """ - NULL_HEX_SHA = '0'*40 - TYPES = ("blob", "tree", "commit", "tag") - __slots__ = ("repo", "sha", "size", "data" ) - type = None # to be set by subclass - - def __init__(self, repo, id): - """ - Initialize an object by identifying it by its id. All keyword arguments - will be set on demand if None. 
- - ``repo`` - repository this object is located in - - ``id`` - SHA1 or ref suitable for git-rev-parse - """ - super(Object,self).__init__() - self.repo = repo - self.sha = id + """ + Implements an Object which may be Blobs, Trees, Commits and Tags + + This Object also serves as a constructor for instances of the correct type:: + + inst = Object.new(repo,id) + inst.sha # objects sha in hex + inst.size # objects uncompressed data size + inst.data # byte string containing the whole data of the object + """ + NULL_HEX_SHA = '0'*40 + TYPES = ("blob", "tree", "commit", "tag") + __slots__ = ("repo", "sha", "size", "data" ) + type = None # to be set by subclass + + def __init__(self, repo, id): + """ + Initialize an object by identifying it by its id. All keyword arguments + will be set on demand if None. + + ``repo`` + repository this object is located in + + ``id`` + SHA1 or ref suitable for git-rev-parse + """ + super(Object,self).__init__() + self.repo = repo + self.sha = id - @classmethod - def new(cls, repo, id): - """ - Return - New Object instance of a type appropriate to the object type behind - id. The id of the newly created object will be a hexsha even though - the input id may have been a Reference or Rev-Spec - - Note - This cannot be a __new__ method as it would always call __init__ - with the input id which is not necessarily a hexsha. - """ - hexsha, typename, size = repo.git.get_object_header(id) - obj_type = utils.get_object_type_by_name(typename) - inst = obj_type(repo, hexsha) - inst.size = size - return inst - - def _set_self_from_args_(self, args_dict): - """ - Initialize attributes on self from the given dict that was retrieved - from locals() in the calling method. - - Will only set an attribute on self if the corresponding value in args_dict - is not None - """ - for attr, val in args_dict.items(): - if attr != "self" and val is not None: - setattr( self, attr, val ) - # END set all non-None attributes - - def _set_cache_(self, attr): - """ - Retrieve object information - """ - if attr == "size": - hexsha, typename, self.size = self.repo.git.get_object_header(self.sha) - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) - elif attr == "data": - hexsha, typename, self.size, self.data = self.repo.git.get_object_data(self.sha) - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) - else: - super(Object,self)._set_cache_(attr) - - def __eq__(self, other): - """ - Returns - True if the objects have the same SHA1 - """ - return self.sha == other.sha - - def __ne__(self, other): - """ - Returns - True if the objects do not have the same SHA1 - """ - return self.sha != other.sha - - def __hash__(self): - """ - Returns - Hash of our id allowing objects to be used in dicts and sets - """ - return hash(self.sha) - - def __str__(self): - """ - Returns - string of our SHA1 as understood by all git commands - """ - return self.sha - - def __repr__(self): - """ - Returns - string with pythonic representation of our object - """ - return '<git.%s "%s">' % (self.__class__.__name__, self.sha) + @classmethod + def new(cls, repo, id): + """ + Return + New Object instance of a type appropriate to the object type behind + id. The id of the newly created object will be a hexsha even though + the input id may have been a Reference or Rev-Spec + + Note + This cannot be a __new__ method as it would always call __init__ + with the input id which is not necessarily a hexsha. 
+ """ + hexsha, typename, size = repo.git.get_object_header(id) + obj_type = utils.get_object_type_by_name(typename) + inst = obj_type(repo, hexsha) + inst.size = size + return inst + + def _set_self_from_args_(self, args_dict): + """ + Initialize attributes on self from the given dict that was retrieved + from locals() in the calling method. + + Will only set an attribute on self if the corresponding value in args_dict + is not None + """ + for attr, val in args_dict.items(): + if attr != "self" and val is not None: + setattr( self, attr, val ) + # END set all non-None attributes + + def _set_cache_(self, attr): + """ + Retrieve object information + """ + if attr == "size": + oinfo = self.repo.odb.info(self.sha) + self.size = oinfo.size + assert oinfo.type == self.type, _assertion_msg_format % (self.sha, oinfo.type, self.type) + elif attr == "data": + ostream = self.repo.odb.stream(self.sha) + self.size = ostream.size + self.data = ostream.read() + assert ostream.type == self.type, _assertion_msg_format % (self.sha, ostream.type, self.type) + else: + super(Object,self)._set_cache_(attr) + + def __eq__(self, other): + """ + Returns + True if the objects have the same SHA1 + """ + return self.sha == other.sha + + def __ne__(self, other): + """ + Returns + True if the objects do not have the same SHA1 + """ + return self.sha != other.sha + + def __hash__(self): + """ + Returns + Hash of our id allowing objects to be used in dicts and sets + """ + return hash(self.sha) + + def __str__(self): + """ + Returns + string of our SHA1 as understood by all git commands + """ + return self.sha + + def __repr__(self): + """ + Returns + string with pythonic representation of our object + """ + return '<git.%s "%s">' % (self.__class__.__name__, self.sha) - @property - def data_stream(self): - """ - Returns - File Object compatible stream to the uncompressed raw data of the object - """ - proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) - return utils.ProcessStreamAdapter(proc, "stdout") + @property + def data_stream(self): + """ :return: File Object compatible stream to the uncompressed raw data of the object + :note: returned streams must be read in order""" + return self.repo.odb.stream(self.sha) - def stream_data(self, ostream): - """ - Writes our data directly to the given output stream - - ``ostream`` - File object compatible stream object. - - Returns - self - """ - self.repo.git.cat_file(self.type, self.sha, output_stream=ostream) - return self + def stream_data(self, ostream): + """Writes our data directly to the given output stream + :param ostream: File object compatible stream object. 
+ :return: self""" + istream = self.repo.odb.stream(self.sha) + stream_copy(istream, ostream) + return self + class IndexObject(Object): - """ - Base for all objects that can be part of the index file , namely Tree, Blob and - SubModule objects - """ - __slots__ = ("path", "mode") - - def __init__(self, repo, sha, mode=None, path=None): - """ - Initialize a newly instanced IndexObject - ``repo`` - is the Repo we are located in + """ + Base for all objects that can be part of the index file , namely Tree, Blob and + SubModule objects + """ + __slots__ = ("path", "mode") + + def __init__(self, repo, sha, mode=None, path=None): + """ + Initialize a newly instanced IndexObject + ``repo`` + is the Repo we are located in - ``sha`` : string - is the git object id as hex sha + ``sha`` : string + is the git object id as hex sha - ``mode`` : int - is the file mode as int, use the stat module to evaluate the infomration + ``mode`` : int + is the file mode as int, use the stat module to evaluate the infomration - ``path`` : str - is the path to the file in the file system, relative to the git repository root, i.e. - file.ext or folder/other.ext - - NOTE - Path may not be set of the index object has been created directly as it cannot - be retrieved without knowing the parent tree. - """ - super(IndexObject, self).__init__(repo, sha) - self._set_self_from_args_(locals()) - if isinstance(mode, basestring): - self.mode = self._mode_str_to_int(mode) - - def __hash__(self): - """ - Returns - Hash of our path as index items are uniquely identifyable by path, not - by their data ! - """ - return hash(self.path) - - def _set_cache_(self, attr): - if attr in IndexObject.__slots__: - # they cannot be retrieved lateron ( not without searching for them ) - raise AttributeError( "path and mode attributes must have been set during %s object creation" % type(self).__name__ ) - else: - super(IndexObject, self)._set_cache_(attr) - - @classmethod - def _mode_str_to_int(cls, modestr): - """ - ``modestr`` - string like 755 or 644 or 100644 - only the last 6 chars will be used - - Returns - String identifying a mode compatible to the mode methods ids of the - stat module regarding the rwx permissions for user, group and other, - special flags and file system flags, i.e. whether it is a symlink - for example. - """ - mode = 0 - for iteration,char in enumerate(reversed(modestr[-6:])): - mode += int(char) << iteration*3 - # END for each char - return mode - - @property - def name(self): - """ - Returns - Name portion of the path, effectively being the basename - """ - return os.path.basename(self.path) - - @property - def abspath(self): - """ - Returns - Absolute path to this index object in the file system ( as opposed to the - .path field which is a path relative to the git repository ). - - The returned path will be native to the system and contains '\' on windows. - """ - return join_path_native(self.repo.working_tree_dir, self.path) - + ``path`` : str + is the path to the file in the file system, relative to the git repository root, i.e. + file.ext or folder/other.ext + + NOTE + Path may not be set of the index object has been created directly as it cannot + be retrieved without knowing the parent tree. + """ + super(IndexObject, self).__init__(repo, sha) + self._set_self_from_args_(locals()) + if isinstance(mode, basestring): + self.mode = self._mode_str_to_int(mode) + + def __hash__(self): + """ + Returns + Hash of our path as index items are uniquely identifyable by path, not + by their data ! 
+ """ + return hash(self.path) + + def _set_cache_(self, attr): + if attr in IndexObject.__slots__: + # they cannot be retrieved lateron ( not without searching for them ) + raise AttributeError( "path and mode attributes must have been set during %s object creation" % type(self).__name__ ) + else: + super(IndexObject, self)._set_cache_(attr) + + @classmethod + def _mode_str_to_int(cls, modestr): + """ + ``modestr`` + string like 755 or 644 or 100644 - only the last 6 chars will be used + + Returns + String identifying a mode compatible to the mode methods ids of the + stat module regarding the rwx permissions for user, group and other, + special flags and file system flags, i.e. whether it is a symlink + for example. + """ + mode = 0 + for iteration,char in enumerate(reversed(modestr[-6:])): + mode += int(char) << iteration*3 + # END for each char + return mode + + @property + def name(self): + """ + Returns + Name portion of the path, effectively being the basename + """ + return os.path.basename(self.path) + + @property + def abspath(self): + """ + Returns + Absolute path to this index object in the file system ( as opposed to the + .path field which is a path relative to the git repository ). + + The returned path will be native to the system and contains '\' on windows. + """ + return join_path_native(self.repo.working_tree_dir, self.path) + diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 826f684c..9a3c2c95 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -7,372 +7,434 @@ from git.utils import Iterable import git.diff as diff import git.stats as stats +from git.actor import Actor from tree import Tree +from git.odb import IStream +from cStringIO import StringIO import base import utils -import tempfile +import time import os -class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): - """ - Wraps a git Commit object. - - This class will act lazily on some of its attributes and will query the - value on demand only if it involves calling the git binary. - """ - - # object configuration - type = "commit" - __slots__ = ("tree", - "author", "authored_date", "author_tz_offset", - "committer", "committed_date", "committer_tz_offset", - "message", "parents") - _id_attribute_ = "sha" - - def __init__(self, repo, sha, tree=None, author=None, authored_date=None, author_tz_offset=None, - committer=None, committed_date=None, committer_tz_offset=None, message=None, parents=None): - """ - Instantiate a new Commit. All keyword arguments taking None as default will - be implicitly set if id names a valid sha. - - The parameter documentation indicates the type of the argument after a colon ':'. - - ``sha`` - is the sha id of the commit or a ref - - ``parents`` : tuple( Commit, ... 
) - is a tuple of commit ids or actual Commits - - ``tree`` : Tree - is the corresponding tree id or an actual Tree - - ``author`` : Actor - is the author string ( will be implicitly converted into an Actor object ) - - ``authored_date`` : int_seconds_since_epoch - is the authored DateTime - use time.gmtime() to convert it into a - different format - - ``author_tz_offset``: int_seconds_west_of_utc - is the timezone that the authored_date is in - - ``committer`` : Actor - is the committer string - - ``committed_date`` : int_seconds_since_epoch - is the committed DateTime - use time.gmtime() to convert it into a - different format - - ``committer_tz_offset``: int_seconds_west_of_utc - is the timezone that the authored_date is in - - ``message`` : string - is the commit message - - Returns - git.Commit - """ - super(Commit,self).__init__(repo, sha) - self._set_self_from_args_(locals()) - - if parents is not None: - self.parents = tuple( self.__class__(repo, p) for p in parents ) - # END for each parent to convert - - if self.sha and tree is not None: - self.tree = Tree(repo, tree, path='') - # END id to tree conversion - - @classmethod - def _get_intermediate_items(cls, commit): - return commit.parents - - def _set_cache_(self, attr): - """ - Called by LazyMixin superclass when the given uninitialized member needs - to be set. - We set all values at once. - """ - if attr in Commit.__slots__: - # prepare our data lines to match rev-list - data_lines = self.data.splitlines() - data_lines.insert(0, "commit %s" % self.sha) - temp = self._iter_from_process_or_stream(self.repo, iter(data_lines), False).next() - self.parents = temp.parents - self.tree = temp.tree - self.author = temp.author - self.authored_date = temp.authored_date - self.author_tz_offset = temp.author_tz_offset - self.committer = temp.committer - self.committed_date = temp.committed_date - self.committer_tz_offset = temp.committer_tz_offset - self.message = temp.message - else: - super(Commit, self)._set_cache_(attr) - - @property - def summary(self): - """ - Returns - First line of the commit message. - """ - return self.message.split('\n', 1)[0] - - def count(self, paths='', **kwargs): - """ - Count the number of commits reachable from this commit - - ``paths`` - is an optinal path or a list of paths restricting the return value - to commits actually containing the paths - - ``kwargs`` - Additional options to be passed to git-rev-list. They must not alter - the ouput style of the command, or parsing will yield incorrect results - Returns - int - """ - # yes, it makes a difference whether empty paths are given or not in our case - # as the empty paths version will ignore merge commits for some reason. - if paths: - return len(self.repo.git.rev_list(self.sha, '--', paths, **kwargs).splitlines()) - else: - return len(self.repo.git.rev_list(self.sha, **kwargs).splitlines()) - - - @property - def name_rev(self): - """ - Returns - String describing the commits hex sha based on the closest Reference. - Mostly useful for UI purposes - """ - return self.repo.git.name_rev(self) - - @classmethod - def iter_items(cls, repo, rev, paths='', **kwargs): - """ - Find all commits matching the given criteria. 
- - ``repo`` - is the Repo - - ``rev`` - revision specifier, see git-rev-parse for viable options - - ``paths`` - is an optinal path or list of paths, if set only Commits that include the path - or paths will be considered - - ``kwargs`` - optional keyword arguments to git rev-list where - ``max_count`` is the maximum number of commits to fetch - ``skip`` is the number of commits to skip - ``since`` all commits since i.e. '1970-01-01' - - Returns - iterator yielding Commit items - """ - options = {'pretty': 'raw', 'as_process' : True } - options.update(kwargs) - - args = list() - if paths: - args.extend(('--', paths)) - # END if paths - - proc = repo.git.rev_list(rev, args, **options) - return cls._iter_from_process_or_stream(repo, proc, True) - - def iter_parents(self, paths='', **kwargs): - """ - Iterate _all_ parents of this commit. - - ``paths`` - Optional path or list of paths limiting the Commits to those that - contain at least one of the paths - - ``kwargs`` - All arguments allowed by git-rev-list - - Return: - Iterator yielding Commit objects which are parents of self - """ - # skip ourselves - skip = kwargs.get("skip", 1) - if skip == 0: # skip ourselves - skip = 1 - kwargs['skip'] = skip - - return self.iter_items( self.repo, self, paths, **kwargs ) - - @property - def stats(self): - """ - Create a git stat from changes between this commit and its first parent - or from all changes done if this is the very first commit. - - Return - git.Stats - """ - if not self.parents: - text = self.repo.git.diff_tree(self.sha, '--', numstat=True, root=True) - text2 = "" - for line in text.splitlines()[1:]: - (insertions, deletions, filename) = line.split("\t") - text2 += "%s\t%s\t%s\n" % (insertions, deletions, filename) - text = text2 - else: - text = self.repo.git.diff(self.parents[0].sha, self.sha, '--', numstat=True) - return stats.Stats._list_from_string(self.repo, text) - - @classmethod - def _iter_from_process_or_stream(cls, repo, proc_or_stream, from_rev_list): - """ - Parse out commit information into a list of Commit objects - - ``repo`` - is the Repo - - ``proc`` - git-rev-list process instance (raw format) - - ``from_rev_list`` - If True, the stream was created by rev-list in which case we parse - the message differently - Returns - iterator returning Commit objects - """ - stream = proc_or_stream - if not hasattr(stream,'next'): - stream = proc_or_stream.stdout - - for line in stream: - commit_tokens = line.split() - id = commit_tokens[1] - assert commit_tokens[0] == "commit" - tree = stream.next().split()[1] - - parents = [] - next_line = None - for parent_line in stream: - if not parent_line.startswith('parent'): - next_line = parent_line - break - # END abort reading parents - parents.append(parent_line.split()[-1]) - # END for each parent line - - author, authored_date, author_tz_offset = utils.parse_actor_and_date(next_line) - committer, committed_date, committer_tz_offset = utils.parse_actor_and_date(stream.next()) - - # empty line - stream.next() - - message_lines = [] - if from_rev_list: - for msg_line in stream: - if not msg_line.startswith(' '): - # and forget about this empty marker - break - # END abort message reading - # strip leading 4 spaces - message_lines.append(msg_line[4:]) - # END while there are message lines - else: - # a stream from our data simply gives us the plain message - for msg_line in stream: - message_lines.append(msg_line) - # END message parsing - message = '\n'.join(message_lines) - - yield Commit(repo, id, parents=tuple(parents), tree=tree, - 
author=author, authored_date=authored_date, author_tz_offset=author_tz_offset, - committer=committer, committed_date=committed_date, committer_tz_offset=committer_tz_offset, - message=message) - # END for each line in stream - - - @classmethod - def create_from_tree(cls, repo, tree, message, parent_commits=None, head=False): - """ - Commit the given tree, creating a commit object. - - ``repo`` - is the Repo - - ``tree`` - Sha of a tree or a tree object to become the tree of the new commit - - ``message`` - Commit message. It may be an empty string if no message is provided. - It will be converted to a string in any case. - - ``parent_commits`` - Optional Commit objects to use as parents for the new commit. - If empty list, the commit will have no parents at all and become - a root commit. - If None , the current head commit will be the parent of the - new commit object - - ``head`` - If True, the HEAD will be advanced to the new commit automatically. - Else the HEAD will remain pointing on the previous commit. This could - lead to undesired results when diffing files. - - Returns - Commit object representing the new commit - - Note: - Additional information about hte committer and Author are taken from the - environment or from the git configuration, see git-commit-tree for - more information - """ - parents = parent_commits - if parent_commits is None: - try: - parent_commits = [ repo.head.commit ] - except ValueError: - # empty repositories have no head commit - parent_commits = list() - # END handle parent commits - # END if parent commits are unset - - parent_args = [ ("-p", str(commit)) for commit in parent_commits ] - - # create message stream - tmp_file_path = tempfile.mktemp() - fp = open(tmp_file_path,"wb") - fp.write(str(message)) - fp.close() - fp = open(tmp_file_path,"rb") - fp.seek(0) - - try: - # write the current index as tree - commit_sha = repo.git.commit_tree(tree, parent_args, istream=fp) - new_commit = cls(repo, commit_sha) - - if head: - try: - repo.head.commit = new_commit - except ValueError: - # head is not yet set to the ref our HEAD points to. - import git.refs - master = git.refs.Head.create(repo, repo.head.ref, commit=new_commit) - repo.head.reference = master - # END handle empty repositories - # END advance head handling - - return new_commit - finally: - fp.close() - os.remove(tmp_file_path) - - def __str__(self): - """ Convert commit to string which is SHA1 """ - return self.sha - - def __repr__(self): - return '<git.Commit "%s">' % self.sha +class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Serializable): + """ + Wraps a git Commit object. + + This class will act lazily on some of its attributes and will query the + value on demand only if it involves calling the git binary. 
+ """ + + # ENVIRONMENT VARIABLES + # read when creating new commits + env_author_name = "GIT_AUTHOR_NAME" + env_author_email = "GIT_AUTHOR_EMAIL" + env_author_date = "GIT_AUTHOR_DATE" + env_committer_name = "GIT_COMMITTER_NAME" + env_committer_email = "GIT_COMMITTER_EMAIL" + env_committer_date = "GIT_COMMITTER_DATE" + env_email = "EMAIL" + + # CONFIGURATION KEYS + conf_email = 'email' + conf_name = 'name' + conf_encoding = 'i18n.commitencoding' + + # INVARIANTS + default_encoding = "UTF-8" + + + # object configuration + type = "commit" + __slots__ = ("tree", + "author", "authored_date", "author_tz_offset", + "committer", "committed_date", "committer_tz_offset", + "message", "parents", "encoding") + _id_attribute_ = "sha" + + def __init__(self, repo, sha, tree=None, author=None, authored_date=None, author_tz_offset=None, + committer=None, committed_date=None, committer_tz_offset=None, + message=None, parents=None, encoding=None): + """ + Instantiate a new Commit. All keyword arguments taking None as default will + be implicitly set if id names a valid sha. + + The parameter documentation indicates the type of the argument after a colon ':'. + + :param sha: is the sha id of the commit or a ref + :param parents: tuple( Commit, ... ) + is a tuple of commit ids or actual Commits + :param tree: Tree + is the corresponding tree id or an actual Tree + :param author: Actor + is the author string ( will be implicitly converted into an Actor object ) + :param authored_date: int_seconds_since_epoch + is the authored DateTime - use time.gmtime() to convert it into a + different format + :param author_tz_offset: int_seconds_west_of_utc + is the timezone that the authored_date is in + :param committer: Actor + is the committer string + :param committed_date: int_seconds_since_epoch + is the committed DateTime - use time.gmtime() to convert it into a + different format + :param committer_tz_offset: int_seconds_west_of_utc + is the timezone that the authored_date is in + :param message: string + is the commit message + :param encoding: string + encoding of the message, defaults to UTF-8 + :return: git.Commit + + :note: Timezone information is in the same format and in the same sign + as what time.altzone returns. The sign is inverted compared to git's + UTC timezone. + """ + super(Commit,self).__init__(repo, sha) + self._set_self_from_args_(locals()) + + @classmethod + def _get_intermediate_items(cls, commit): + return commit.parents + + def _set_cache_(self, attr): + """ Called by LazyMixin superclass when the given uninitialized member needs + to be set. + We set all values at once. """ + if attr in Commit.__slots__: + # read the data in a chunk, its faster - then provide a file wrapper + # Could use self.data, but lets try to get it with less calls + hexsha, typename, size, data = self.repo.git.get_object_data(self) + self._deserialize(StringIO(data)) + else: + super(Commit, self)._set_cache_(attr) + + @property + def summary(self): + """ + Returns + First line of the commit message. + """ + return self.message.split('\n', 1)[0] + + def count(self, paths='', **kwargs): + """ + Count the number of commits reachable from this commit + + ``paths`` + is an optinal path or a list of paths restricting the return value + to commits actually containing the paths + + ``kwargs`` + Additional options to be passed to git-rev-list. 
They must not alter + the ouput style of the command, or parsing will yield incorrect results + Returns + int + """ + # yes, it makes a difference whether empty paths are given or not in our case + # as the empty paths version will ignore merge commits for some reason. + if paths: + return len(self.repo.git.rev_list(self.sha, '--', paths, **kwargs).splitlines()) + else: + return len(self.repo.git.rev_list(self.sha, **kwargs).splitlines()) + + + @property + def name_rev(self): + """ + Returns + String describing the commits hex sha based on the closest Reference. + Mostly useful for UI purposes + """ + return self.repo.git.name_rev(self) + + @classmethod + def iter_items(cls, repo, rev, paths='', **kwargs): + """ + Find all commits matching the given criteria. + + ``repo`` + is the Repo + + ``rev`` + revision specifier, see git-rev-parse for viable options + + ``paths`` + is an optinal path or list of paths, if set only Commits that include the path + or paths will be considered + + ``kwargs`` + optional keyword arguments to git rev-list where + ``max_count`` is the maximum number of commits to fetch + ``skip`` is the number of commits to skip + ``since`` all commits since i.e. '1970-01-01' + + Returns + iterator yielding Commit items + """ + if 'pretty' in kwargs: + raise ValueError("--pretty cannot be used as parsing expects single sha's only") + # END handle pretty + args = list() + if paths: + args.extend(('--', paths)) + # END if paths + + proc = repo.git.rev_list(rev, args, as_process=True, **kwargs) + return cls._iter_from_process_or_stream(repo, proc) + + def iter_parents(self, paths='', **kwargs): + """ + Iterate _all_ parents of this commit. + + ``paths`` + Optional path or list of paths limiting the Commits to those that + contain at least one of the paths + + ``kwargs`` + All arguments allowed by git-rev-list + + Return: + Iterator yielding Commit objects which are parents of self + """ + # skip ourselves + skip = kwargs.get("skip", 1) + if skip == 0: # skip ourselves + skip = 1 + kwargs['skip'] = skip + + return self.iter_items( self.repo, self, paths, **kwargs ) + + @property + def stats(self): + """ + Create a git stat from changes between this commit and its first parent + or from all changes done if this is the very first commit. 
+ + Return + git.Stats + """ + if not self.parents: + text = self.repo.git.diff_tree(self.sha, '--', numstat=True, root=True) + text2 = "" + for line in text.splitlines()[1:]: + (insertions, deletions, filename) = line.split("\t") + text2 += "%s\t%s\t%s\n" % (insertions, deletions, filename) + text = text2 + else: + text = self.repo.git.diff(self.parents[0].sha, self.sha, '--', numstat=True) + return stats.Stats._list_from_string(self.repo, text) + + @classmethod + def _iter_from_process_or_stream(cls, repo, proc_or_stream): + """Parse out commit information into a list of Commit objects + We expect one-line per commit, and parse the actual commit information directly + from our lighting fast object database + + :param proc: git-rev-list process instance - one sha per line + :return: iterator returning Commit objects""" + stream = proc_or_stream + if not hasattr(stream,'readline'): + stream = proc_or_stream.stdout + + readline = stream.readline + while True: + line = readline() + if not line: + break + sha = line.strip() + if len(sha) > 40: + # split additional information, as returned by bisect for instance + sha, rest = line.split(None, 1) + # END handle extra info + + assert len(sha) == 40, "Invalid line: %s" % sha + yield Commit(repo, sha) + # END for each line in stream + + + @classmethod + def create_from_tree(cls, repo, tree, message, parent_commits=None, head=False): + """Commit the given tree, creating a commit object. + + :param repo: Repo object the commit should be part of + :param tree: Sha of a tree or a tree object to become the tree of the new commit + :param message: Commit message. It may be an empty string if no message is provided. + It will be converted to a string in any case. + :param parent_commits: + Optional Commit objects to use as parents for the new commit. + If empty list, the commit will have no parents at all and become + a root commit. + If None , the current head commit will be the parent of the + new commit object + :param head: + If True, the HEAD will be advanced to the new commit automatically. + Else the HEAD will remain pointing on the previous commit. This could + lead to undesired results when diffing files. 
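+
+        Illustrative usage (a sketch: assumes an existing ``repo`` and simply reuses
+        the tree of its current head commit; the message is an example only)::
+
+            tree = repo.head.commit.tree
+            new_commit = Commit.create_from_tree(repo, tree, "example message")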
+
+        :return: Commit object representing the new commit
+
+        :note:
+            Additional information about the committer and author is taken from the
+            environment or from the git configuration, see git-commit-tree for
+            more information
+        """
+        if parent_commits is None:
+            try:
+                parent_commits = [ repo.head.commit ]
+            except ValueError:
+                # empty repositories have no head commit
+                parent_commits = list()
+            # END handle parent commits
+        # END if parent commits are unset
+
+        # retrieve all additional information, create a commit object, and
+        # serialize it
+        # Generally:
+        # * Environment variables override configuration values
+        # * Sensible defaults are set according to the git documentation
+
+        # COMMITTER AND AUTHOR INFO
+        cr = repo.config_reader()
+        env = os.environ
+        default_email = utils.get_user_id()
+        default_name = default_email.split('@')[0]
+
+        conf_name = cr.get_value('user', cls.conf_name, default_name)
+        conf_email = cr.get_value('user', cls.conf_email, default_email)
+
+        author_name = env.get(cls.env_author_name, conf_name)
+        author_email = env.get(cls.env_author_email, conf_email)
+
+        committer_name = env.get(cls.env_committer_name, conf_name)
+        committer_email = env.get(cls.env_committer_email, conf_email)
+
+        # PARSE THE DATES
+        unix_time = int(time.time())
+        offset = time.altzone
+
+        author_date_str = env.get(cls.env_author_date, '')
+        if author_date_str:
+            author_time, author_offset = utils.parse_date(author_date_str)
+        else:
+            author_time, author_offset = unix_time, offset
+        # END set author time
+
+        committer_date_str = env.get(cls.env_committer_date, '')
+        if committer_date_str:
+            committer_time, committer_offset = utils.parse_date(committer_date_str)
+        else:
+            committer_time, committer_offset = unix_time, offset
+        # END set committer time
+
+        # use the configured commit encoding, defaulting to UTF-8
+        enc_section, enc_option = cls.conf_encoding.split('.')
+        conf_encoding = cr.get_value(enc_section, enc_option, cls.default_encoding)
+
+        author = Actor(author_name, author_email)
+        committer = Actor(committer_name, committer_email)
+
+        # CREATE NEW COMMIT
+        new_commit = cls(repo, cls.NULL_HEX_SHA, tree,
+                        author, author_time, author_offset,
+                        committer, committer_time, committer_offset,
+                        message, parent_commits, conf_encoding)
+
+        stream = StringIO()
+        new_commit._serialize(stream)
+        streamlen = stream.tell()
+        stream.seek(0)
+
+        istream = repo.odb.store(IStream(cls.type, streamlen, stream))
+        new_commit.sha = istream.sha
+
+        if head:
+            try:
+                repo.head.commit = new_commit
+            except ValueError:
+                # head is not yet set to the ref our HEAD points to
+                # Happens on first commit
+                import git.refs
+                master = git.refs.Head.create(repo, repo.head.ref, commit=new_commit)
+                repo.head.reference = master
+            # END handle empty repositories
+        # END advance head handling
+
+        return new_commit
+
+    def __str__(self):
+        """ Convert commit to string which is SHA1 """
+        return self.sha
+
+    def __repr__(self):
+        return '<git.Commit "%s">' % self.sha
+
+    #{ Serializable Implementation
+
+    def _serialize(self, stream):
+        write = stream.write
+        write("tree %s\n" % self.tree)
+        for p in self.parents:
+            write("parent %s\n" % p)
+
+        a = self.author
+        c = self.committer
+        fmt = "%s %s <%s> %s %s\n"
+        write(fmt % ("author", a.name, a.email,
+                        self.authored_date,
+                        utils.altz_to_utctz_str(self.author_tz_offset)))
+
+        write(fmt % ("committer", c.name, c.email,
+                        self.committed_date,
+                        utils.altz_to_utctz_str(self.committer_tz_offset)))
+
+        if self.encoding != self.default_encoding:
+            write("encoding %s\n" %
self.encoding) + + write("\n") + write(self.message) + return self + + def _deserialize(self, stream): + """:param from_rev_list: if true, the stream format is coming from the rev-list command + Otherwise it is assumed to be a plain data stream from our object""" + readline = stream.readline + self.tree = Tree(self.repo, readline().split()[1], 0, '') + + self.parents = list() + next_line = None + while True: + parent_line = readline() + if not parent_line.startswith('parent'): + next_line = parent_line + break + # END abort reading parents + self.parents.append(type(self)(self.repo, parent_line.split()[-1])) + # END for each parent line + self.parents = tuple(self.parents) + + self.author, self.authored_date, self.author_tz_offset = utils.parse_actor_and_date(next_line) + self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(readline()) + + + # now we can have the encoding line, or an empty line followed by the optional + # message. + self.encoding = self.default_encoding + # read encoding or empty line to separate message + enc = readline() + enc = enc.strip() + if enc: + self.encoding = enc[enc.find(' ')+1:] + # now comes the message separator + readline() + # END handle encoding + + # a stream from our data simply gives us the plain message + # The end of our message stream is marked with a newline that we strip + self.message = stream.read() + return self + + #} END serializable implementation diff --git a/lib/git/objects/tree.py b/lib/git/objects/tree.py index a9e60981..285d3b5b 100644 --- a/lib/git/objects/tree.py +++ b/lib/git/objects/tree.py @@ -209,7 +209,7 @@ class Tree(base.IndexObject, diff.Diffable, utils.Traversable): visit_once = False, ignore_self=1 ): """For documentation, see utils.Traversable.traverse - Trees are set to visist_once = False to gain more performance in the traversal""" + Trees are set to visit_once = False to gain more performance in the traversal""" return super(Tree, self).traverse(predicate, prune, depth, branch_first, visit_once, ignore_self) # List protocol diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 4f17b652..c93f2091 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -9,159 +9,302 @@ Module for general utility functions import re from collections import deque as Deque from git.actor import Actor +import platform + +from string import digits +import time +import os + +__all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', + 'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz', + 'verify_utctz') def get_object_type_by_name(object_type_name): - """ - Returns - type suitable to handle the given object type name. - Use the type to create new instances. - - ``object_type_name`` - Member of TYPES - - Raises - ValueError: In case object_type_name is unknown - """ - if object_type_name == "commit": - import commit - return commit.Commit - elif object_type_name == "tag": - import tag - return tag.TagObject - elif object_type_name == "blob": - import blob - return blob.Blob - elif object_type_name == "tree": - import tree - return tree.Tree - else: - raise ValueError("Cannot handle unknown object type: %s" % object_type_name) - - + """ + Returns + type suitable to handle the given object type name. + Use the type to create new instances. 
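+
+    Example (illustrative)::
+
+        get_object_type_by_name("commit")    # -> commit.Commit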
+ + ``object_type_name`` + Member of TYPES + + Raises + ValueError: In case object_type_name is unknown + """ + if object_type_name == "commit": + import commit + return commit.Commit + elif object_type_name == "tag": + import tag + return tag.TagObject + elif object_type_name == "blob": + import blob + return blob.Blob + elif object_type_name == "tree": + import tree + return tree.Tree + else: + raise ValueError("Cannot handle unknown object type: %s" % object_type_name) + + +def get_user_id(): + """:return: string identifying the currently active system user as name@node + :note: user can be set with the 'USER' environment variable, usually set on windows""" + ukn = 'UNKNOWN' + username = os.environ.get('USER', ukn) + if username == ukn and hasattr(os, 'getlogin'): + username = os.getlogin() + # END get username from login + return "%s@%s" % (username, platform.node()) + + +def utctz_to_altz(utctz): + """we convert utctz to the timezone in seconds, it is the format time.altzone + returns. Git stores it as UTC timezon which has the opposite sign as well, + which explains the -1 * ( that was made explicit here ) + :param utctz: git utc timezone string, i.e. +0200""" + return -1 * int(float(utctz)/100*3600) + +def altz_to_utctz_str(altz): + """As above, but inverses the operation, returning a string that can be used + in commit objects""" + utci = -1 * int((altz / 3600)*100) + utcs = str(abs(utci)) + utcs = "0"*(4-len(utcs)) + utcs + prefix = (utci < 0 and '-') or '+' + return prefix + utcs + + +def verify_utctz(offset): + """:raise ValueError: if offset is incorrect + :return: offset""" + fmt_exc = ValueError("Invalid timezone offset format: %s" % offset) + if len(offset) != 5: + raise fmt_exc + if offset[0] not in "+-": + raise fmt_exc + if offset[1] not in digits or \ + offset[2] not in digits or \ + offset[3] not in digits or \ + offset[4] not in digits: + raise fmt_exc + # END for each char + return offset + +def parse_date(string_date): + """ + Parse the given date as one of the following + * Git internal format: timestamp offset + * RFC 2822: Thu, 07 Apr 2005 22:13:13 +0200. 
+ * ISO 8601 2005-04-07T22:13:13 + The T can be a space as well + + :return: Tuple(int(timestamp), int(offset), both in seconds since epoch + :raise ValueError: If the format could not be understood + :note: Date can also be YYYY.MM.DD, MM/DD/YYYY and DD.MM.YYYY + """ + # git time + try: + if string_date.count(' ') == 1 and string_date.rfind(':') == -1: + timestamp, offset = string_date.split() + timestamp = int(timestamp) + return timestamp, utctz_to_altz(verify_utctz(offset)) + else: + offset = "+0000" # local time by default + if string_date[-5] in '-+': + offset = verify_utctz(string_date[-5:]) + string_date = string_date[:-6] # skip space as well + # END split timezone info + + # now figure out the date and time portion - split time + date_formats = list() + splitter = -1 + if ',' in string_date: + date_formats.append("%a, %d %b %Y") + splitter = string_date.rfind(' ') + else: + # iso plus additional + date_formats.append("%Y-%m-%d") + date_formats.append("%Y.%m.%d") + date_formats.append("%m/%d/%Y") + date_formats.append("%d.%m.%Y") + + splitter = string_date.rfind('T') + if splitter == -1: + splitter = string_date.rfind(' ') + # END handle 'T' and ' ' + # END handle rfc or iso + + assert splitter > -1 + + # split date and time + time_part = string_date[splitter+1:] # skip space + date_part = string_date[:splitter] + + # parse time + tstruct = time.strptime(time_part, "%H:%M:%S") + + for fmt in date_formats: + try: + dtstruct = time.strptime(date_part, fmt) + fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, + tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec, + dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst)) + return int(time.mktime(fstruct)), utctz_to_altz(offset) + except ValueError: + continue + # END exception handling + # END for each fmt + + # still here ? fail + raise ValueError("no format matched") + # END handle format + except Exception: + raise ValueError("Unsupported date format: %s" % string_date) + # END handle exceptions + + # precompiled regex _re_actor_epoch = re.compile(r'^.+? (.*) (\d+) ([+-]\d+).*$') def parse_actor_and_date(line): - """ - Parse out the actor (author or committer) info from a line like:: - - author Tom Preston-Werner <tom@mojombo.com> 1191999972 -0700 - - Returns - [Actor, int_seconds_since_epoch, int_timezone_offset] - """ - m = _re_actor_epoch.search(line) - actor, epoch, offset = m.groups() - return (Actor._from_string(actor), int(epoch), -int(float(offset)/100*3600)) - - - + """ + Parse out the actor (author or committer) info from a line like:: + + author Tom Preston-Werner <tom@mojombo.com> 1191999972 -0700 + + Returns + [Actor, int_seconds_since_epoch, int_timezone_offset] + """ + m = _re_actor_epoch.search(line) + actor, epoch, offset = m.groups() + return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset)) + + + class ProcessStreamAdapter(object): - """ - Class wireing all calls to the contained Process instance. - - Use this type to hide the underlying process to provide access only to a specified - stream. The process is usually wrapped into an AutoInterrupt class to kill - it if the instance goes out of scope. - """ - __slots__ = ("_proc", "_stream") - def __init__(self, process, stream_name): - self._proc = process - self._stream = getattr(process, stream_name) - - def __getattr__(self, attr): - return getattr(self._stream, attr) - - + """ + Class wireing all calls to the contained Process instance. 
+ + Use this type to hide the underlying process to provide access only to a specified + stream. The process is usually wrapped into an AutoInterrupt class to kill + it if the instance goes out of scope. + """ + __slots__ = ("_proc", "_stream") + def __init__(self, process, stream_name): + self._proc = process + self._stream = getattr(process, stream_name) + + def __getattr__(self, attr): + return getattr(self._stream, attr) + + class Traversable(object): - """Simple interface to perforam depth-first or breadth-first traversals - into one direction. - Subclasses only need to implement one function. - Instances of the Subclass must be hashable""" - __slots__ = tuple() - - @classmethod - def _get_intermediate_items(cls, item): - """ - Returns: - List of items connected to the given item. - Must be implemented in subclass - """ - raise NotImplementedError("To be implemented in subclass") - - - def traverse( self, predicate = lambda i,d: True, - prune = lambda i,d: False, depth = -1, branch_first=True, - visit_once = True, ignore_self=1, as_edge = False ): - """ - ``Returns`` - iterator yieling of items found when traversing self - - ``predicate`` - f(i,d) returns False if item i at depth d should not be included in the result - - ``prune`` - f(i,d) return True if the search should stop at item i at depth d. - Item i will not be returned. - - ``depth`` - define at which level the iteration should not go deeper - if -1, there is no limit - if 0, you would effectively only get self, the root of the iteration - i.e. if 1, you would only get the first level of predessessors/successors - - ``branch_first`` - if True, items will be returned branch first, otherwise depth first - - ``visit_once`` - if True, items will only be returned once, although they might be encountered - several times. Loops are prevented that way. - - ``ignore_self`` - if True, self will be ignored and automatically pruned from - the result. Otherwise it will be the first item to be returned. - If as_edge is True, the source of the first edge is None - - ``as_edge`` - if True, return a pair of items, first being the source, second the - destinatination, i.e. tuple(src, dest) with the edge spanning from - source to destination""" - visited = set() - stack = Deque() - stack.append( ( 0 ,self, None ) ) # self is always depth level 0 - - def addToStack( stack, item, branch_first, depth ): - lst = self._get_intermediate_items( item ) - if not lst: - return - if branch_first: - stack.extendleft( ( depth , i, item ) for i in lst ) - else: - reviter = ( ( depth , lst[i], item ) for i in range( len( lst )-1,-1,-1) ) - stack.extend( reviter ) - # END addToStack local method - - while stack: - d, item, src = stack.pop() # depth of item, item, item_source - - if visit_once and item in visited: - continue - - if visit_once: - visited.add(item) - - rval = ( as_edge and (src, item) ) or item - if prune( rval, d ): - continue - - skipStartItem = ignore_self and ( item == self ) - if not skipStartItem and predicate( rval, d ): - yield rval - - # only continue to next level if this is appropriate ! - nd = d + 1 - if depth > -1 and nd > depth: - continue - - addToStack( stack, item, branch_first, nd ) - # END for each item on work stack + """Simple interface to perforam depth-first or breadth-first traversals + into one direction. + Subclasses only need to implement one function. 
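+
+    Illustrative use on a Commit, one such subclass (a sketch: assumes an
+    existing ``repo``)::
+
+        for item in repo.head.commit.traverse(depth=2):
+            print item.sha
+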
+ Instances of the Subclass must be hashable""" + __slots__ = tuple() + + @classmethod + def _get_intermediate_items(cls, item): + """ + Returns: + List of items connected to the given item. + Must be implemented in subclass + """ + raise NotImplementedError("To be implemented in subclass") + + + def traverse( self, predicate = lambda i,d: True, + prune = lambda i,d: False, depth = -1, branch_first=True, + visit_once = True, ignore_self=1, as_edge = False ): + """ + ``Returns`` + iterator yieling of items found when traversing self + + ``predicate`` + f(i,d) returns False if item i at depth d should not be included in the result + + ``prune`` + f(i,d) return True if the search should stop at item i at depth d. + Item i will not be returned. + + ``depth`` + define at which level the iteration should not go deeper + if -1, there is no limit + if 0, you would effectively only get self, the root of the iteration + i.e. if 1, you would only get the first level of predessessors/successors + + ``branch_first`` + if True, items will be returned branch first, otherwise depth first + + ``visit_once`` + if True, items will only be returned once, although they might be encountered + several times. Loops are prevented that way. + + ``ignore_self`` + if True, self will be ignored and automatically pruned from + the result. Otherwise it will be the first item to be returned. + If as_edge is True, the source of the first edge is None + + ``as_edge`` + if True, return a pair of items, first being the source, second the + destinatination, i.e. tuple(src, dest) with the edge spanning from + source to destination""" + visited = set() + stack = Deque() + stack.append( ( 0 ,self, None ) ) # self is always depth level 0 + + def addToStack( stack, item, branch_first, depth ): + lst = self._get_intermediate_items( item ) + if not lst: + return + if branch_first: + stack.extendleft( ( depth , i, item ) for i in lst ) + else: + reviter = ( ( depth , lst[i], item ) for i in range( len( lst )-1,-1,-1) ) + stack.extend( reviter ) + # END addToStack local method + + while stack: + d, item, src = stack.pop() # depth of item, item, item_source + + if visit_once and item in visited: + continue + + if visit_once: + visited.add(item) + + rval = ( as_edge and (src, item) ) or item + if prune( rval, d ): + continue + + skipStartItem = ignore_self and ( item == self ) + if not skipStartItem and predicate( rval, d ): + yield rval + + # only continue to next level if this is appropriate ! 
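+            # (d is the depth of 'item', starting at 0 for self, so nd is the
+            # depth at which the children of 'item' will be visited)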
+ nd = d + 1 + if depth > -1 and nd > depth: + continue + + addToStack( stack, item, branch_first, nd ) + # END for each item on work stack + + +class Serializable(object): + """Defines methods to serialize and deserialize objects from and into a data stream""" + + def _serialize(self, stream): + """Serialize the data of this object into the given data stream + :note: a serialized object would ``_deserialize`` into the same objet + :param stream: a file-like object + :return: self""" + raise NotImplementedError("To be implemented in subclass") + + def _deserialize(self, stream): + """Deserialize all information regarding this object from the stream + :param stream: a file-like object + :return: self""" + raise NotImplementedError("To be implemented in subclass") diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py new file mode 100644 index 00000000..5789d7eb --- /dev/null +++ b/lib/git/odb/__init__.py @@ -0,0 +1,6 @@ +"""Initialize the object database module""" + +# default imports +from db import * +from stream import * + diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py new file mode 100644 index 00000000..a8de28ec --- /dev/null +++ b/lib/git/odb/db.py @@ -0,0 +1,337 @@ +"""Contains implementations of database retrieveing objects""" +from git.utils import IndexFileSHA1Writer +from git.errors import ( + InvalidDBRoot, + BadObject, + BadObjectType + ) + +from stream import ( + DecompressMemMapReader, + FDCompressedSha1Writer, + Sha1Writer, + OStream, + OInfo + ) + +from utils import ( + ENOENT, + to_hex_sha, + exists, + hex_to_bin, + isdir, + mkdir, + rename, + dirname, + join + ) + +from fun import ( + chunk_size, + loose_object_header_info, + write_object + ) + +import tempfile +import mmap +import os + + +__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'LooseObjectDB', 'PackedDB', + 'CompoundDB', 'ReferenceDB', 'GitObjectDB' ) + +class ObjectDBR(object): + """Defines an interface for object database lookup. 
+    Objects are identified either by hex-sha (40 bytes) or
+    by binary sha (20 bytes)"""
+
+    def __contains__(self, sha):
+        return self.has_object(sha)
+
+    #{ Query Interface
+    def has_object(self, sha):
+        """
+        :return: True if the object identified by the given 40 byte hexsha or 20 bytes
+            binary sha is contained in the database
+        :raise BadObject:"""
+        raise NotImplementedError("To be implemented in subclass")
+
+    def info(self, sha):
+        """ :return: OInfo instance
+        :param sha: 40 bytes hexsha or 20 bytes binary sha
+        :raise BadObject:"""
+        raise NotImplementedError("To be implemented in subclass")
+
+    def info_async(self, input_channel):
+        """Retrieve information of a multitude of objects asynchronously
+        :param input_channel: Channel yielding the sha's of the objects of interest
+        :return: Channel yielding OInfo|InvalidOInfo, in any order"""
+        raise NotImplementedError("To be implemented in subclass")
+
+    def stream(self, sha):
+        """:return: OStream instance
+        :param sha: 40 bytes hexsha or 20 bytes binary sha
+        :raise BadObject:"""
+        raise NotImplementedError("To be implemented in subclass")
+
+    def stream_async(self, input_channel):
+        """Retrieve the OStream of multiple objects
+        :param input_channel: see ``info``
+        :param max_threads: see ``ObjectDBW.store``
+        :return: Channel yielding OStream|InvalidOStream instances in any order"""
+        raise NotImplementedError("To be implemented in subclass")
+
+    #} END query interface
+
+class ObjectDBW(object):
+    """Defines an interface to create objects in the database"""
+
+    def __init__(self, *args, **kwargs):
+        self._ostream = None
+
+    #{ Edit Interface
+    def set_ostream(self, stream):
+        """Adjusts the stream to which all data should be sent when storing new objects
+        :param stream: if not None, the stream to use, if None the default stream
+            will be used.
+        :return: previously installed stream, or None if there was no override
+        :raise TypeError: if the stream doesn't have the supported functionality"""
+        cstream = self._ostream
+        self._ostream = stream
+        return cstream
+
+    def ostream(self):
+        """:return: overridden output stream this instance will write to, or None
+            if it will write to the default stream"""
+        return self._ostream
+
+    def store(self, istream):
+        """Create a new object in the database
+        :return: the input istream object with its sha set to its corresponding value
+        :param istream: IStream compatible instance. If its sha is already set
+            to a value, the object will just be stored in our database format,
+            in which case the input stream is expected to be in object format ( header + contents ).
+        :raise IOError: if data could not be written"""
+        raise NotImplementedError("To be implemented in subclass")
+
+    def store_async(self, input_channel):
+        """Create multiple new objects in the database asynchronously. The method will
+        return right away, returning an output channel which receives the results as
+        they are computed.
+
+        :return: Channel yielding your IStream which served as input, in any order.
+            The IStream's sha will be set to the sha it received during the process,
+            or its error attribute will be set to the exception informing about the error.
+        :param input_channel: Channel yielding IStream instances.
+            As the same instances will be used in the output channel, you can create a map
+            between the id(istream) -> istream
+        :note: As some ODB implementations implement this operation atomically, they might
+            abort the whole operation if one item could not be processed.
Hence check how + many items have actually been produced.""" + raise NotImplementedError("To be implemented in subclass") + + #} END edit interface + + +class FileDBBase(object): + """Provides basic facilities to retrieve files of interest, including + caching facilities to help mapping hexsha's to objects""" + + def __init__(self, root_path): + """Initialize this instance to look for its files at the given root path + All subsequent operations will be relative to this path + :raise InvalidDBRoot: + :note: The base will perform basic checking for accessability, but the subclass + is required to verify that the root_path contains the database structure it needs""" + super(FileDBBase, self).__init__() + if not os.path.isdir(root_path): + raise InvalidDBRoot(root_path) + self._root_path = root_path + + + #{ Interface + def root_path(self): + """:return: path at which this db operates""" + return self._root_path + + def db_path(self, rela_path): + """ + :return: the given relative path relative to our database root, allowing + to pontentially access datafiles""" + return join(self._root_path, rela_path) + #} END interface + + + +class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): + """A database which operates on loose object files""" + + # CONFIGURATION + # chunks in which data will be copied between streams + stream_chunk_size = chunk_size + + + def __init__(self, root_path): + super(LooseObjectDB, self).__init__(root_path) + self._hexsha_to_file = dict() + # Additional Flags - might be set to 0 after the first failure + # Depending on the root, this might work for some mounts, for others not, which + # is why it is per instance + self._fd_open_flags = getattr(os, 'O_NOATIME', 0) + + #{ Interface + def object_path(self, hexsha): + """ + :return: path at which the object with the given hexsha would be stored, + relative to the database root""" + return join(hexsha[:2], hexsha[2:]) + + def readable_db_object_path(self, hexsha): + """ + :return: readable object path to the object identified by hexsha + :raise BadObject: If the object file does not exist""" + try: + return self._hexsha_to_file[hexsha] + except KeyError: + pass + # END ignore cache misses + + # try filesystem + path = self.db_path(self.object_path(hexsha)) + if exists(path): + self._hexsha_to_file[hexsha] = path + return path + # END handle cache + raise BadObject(hexsha) + + #} END interface + + def _map_loose_object(self, sha): + """ + :return: memory map of that file to allow random read access + :raise BadObject: if object could not be located""" + db_path = self.db_path(self.object_path(to_hex_sha(sha))) + try: + fd = os.open(db_path, os.O_RDONLY|self._fd_open_flags) + except OSError,e: + if e.errno != ENOENT: + # try again without noatime + try: + fd = os.open(db_path, os.O_RDONLY) + except OSError: + raise BadObject(to_hex_sha(sha)) + # didn't work because of our flag, don't try it again + self._fd_open_flags = 0 + else: + raise BadObject(to_hex_sha(sha)) + # END handle error + # END exception handling + try: + return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) + finally: + os.close(fd) + # END assure file is closed + + def set_ostream(self, stream): + """:raise TypeError: if the stream does not support the Sha1Writer interface""" + if stream is not None and not isinstance(stream, Sha1Writer): + raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) + return super(LooseObjectDB, self).set_ostream(stream) + + def info(self, sha): + m = self._map_loose_object(sha) + try: + type, size = 
loose_object_header_info(m) + return OInfo(sha, type, size) + finally: + m.close() + # END assure release of system resources + + def stream(self, sha): + m = self._map_loose_object(sha) + type, size, stream = DecompressMemMapReader.new(m, close_on_deletion = True) + return OStream(sha, type, size, stream) + + def has_object(self, sha): + try: + self.readable_db_object_path(to_hex_sha(sha)) + return True + except BadObject: + return False + # END check existance + + def store(self, istream): + """note: The sha we produce will be hex by nature""" + assert istream.sha is None, "Direct istream writing not yet implemented" + tmp_path = None + writer = self.ostream() + if writer is None: + # open a tmp file to write the data to + fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) + writer = FDCompressedSha1Writer(fd) + # END handle custom writer + + try: + try: + write_object(istream.type, istream.size, istream.read, writer.write, + chunk_size=self.stream_chunk_size) + except: + if tmp_path: + os.remove(tmp_path) + raise + # END assure tmpfile removal on error + finally: + if tmp_path: + writer.close() + # END assure target stream is closed + + sha = writer.sha(as_hex=True) + + if tmp_path: + obj_path = self.db_path(self.object_path(sha)) + obj_dir = dirname(obj_path) + if not isdir(obj_dir): + mkdir(obj_dir) + # END handle destination directory + rename(tmp_path, obj_path) + # END handle dry_run + + istream.sha = sha + return istream + + +class PackedDB(FileDBBase, ObjectDBR): + """A database operating on a set of object packs""" + + +class CompoundDB(ObjectDBR): + """A database which delegates calls to sub-databases""" + + +class ReferenceDB(CompoundDB): + """A database consisting of database referred to in a file""" + + +#class GitObjectDB(CompoundDB, ObjectDBW): +class GitObjectDB(LooseObjectDB): + """A database representing the default git object store, which includes loose + objects, pack files and an alternates file + + It will create objects only in the loose object database. + :note: for now, we use the git command to do all the lookup, just until he + have packs and the other implementations + """ + def __init__(self, root_path, git): + """Initialize this instance with the root and a git command""" + super(GitObjectDB, self).__init__(root_path) + self._git = git + + def info(self, sha): + t = self._git.get_object_header(sha) + return OInfo(t[0], t[1], t[2]) + + def stream(self, sha): + """For now, all lookup is done by git itself""" + t = self._git.stream_object_data(sha) + return OStream(t[0], t[1], t[2], t[3]) + diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py new file mode 100644 index 00000000..870a6f02 --- /dev/null +++ b/lib/git/odb/fun.py @@ -0,0 +1,108 @@ +"""Contains basic c-functions which usually contain performance critical code +Keeping this code separate from the beginning makes it easier to out-source +it into c later, if required""" + +from git.errors import ( + BadObjectType + ) + +import zlib +decompressobj = zlib.decompressobj + + +# INVARIANTS +type_id_to_type_map = { + 1 : "commit", + 2 : "tree", + 3 : "blob", + 4 : "tag" + } + +# used when dealing with larger streams +chunk_size = 1000*1000 + +__all__ = ('is_loose_object', 'loose_object_header_info', 'object_header_info', + 'write_object' ) + +#{ Routines + +def is_loose_object(m): + """:return: True the file contained in memory map m appears to be a loose object. 
+    Only the first two bytes are needed"""
+    b0, b1 = map(ord, m[:2])
+    word = (b0 << 8) + b1
+    return b0 == 0x78 and (word % 31) == 0
+
+def loose_object_header_info(m):
+    """:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
+        object as well as its uncompressed size in bytes.
+    :param m: memory map from which to read the compressed object data"""
+    decompress_size = 8192        # is used in cgit as well
+    hdr = decompressobj().decompress(m, decompress_size)
+    type_name, size = hdr[:hdr.find("\0")].split(" ")
+    return type_name, int(size)
+
+def object_header_info(m):
+    """:return: tuple(type_string, uncompressed_size_in_bytes)
+    :param m: mapped memory map. It will be
+        seeked to the actual start of the object contents, which can be used
+        to initialize a zlib decompress object.
+    :note: This routine can only handle new-style objects which are assumably contained
+        in packs
+    """
+    assert not is_loose_object(m), "Use loose_object_header_info instead"
+
+    c = ord(m[0])               # first byte
+    i = 1                       # next char to read
+    type_id = (c >> 4) & 7      # numeric type
+    size = c & 15               # starting size
+    s = 4                       # starting bit-shift size
+    while c & 0x80:
+        c = ord(m[i])
+        i += 1
+        size += (c & 0x7f) << s
+        s += 7
+    # END character loop
+
+    # finally seek the map to the start of the data stream
+    m.seek(i)
+    try:
+        return (type_id_to_type_map[type_id], size)
+    except KeyError:
+        # invalid object type - we could try to be smart now and decode part
+        # of the stream to get the info, problem is that we had trouble finding
+        # the exact start of the content stream
+        raise BadObjectType(type_id)
+    # END handle exceptions
+
+def write_object(type, size, read, write, chunk_size=chunk_size):
+    """Write the object as identified by type, size and source_stream into the
+    target_stream
+
+    :param type: type string of the object
+    :param size: amount of bytes to write from source_stream
+    :param read: read method of a stream providing the content data
+    :param write: write method of the output stream
+    :return: The actual amount of bytes written to the stream, including the header"""
+    tbw = 0                                     # total num bytes written
+    dbw = 0                                     # num data bytes written
+
+    # WRITE HEADER: type SP size NULL
+    tbw += write("%s %i\0" % (type, size))
+
+    # WRITE ALL DATA UP TO SIZE
+    while True:
+        cs = min(chunk_size, size-dbw)
+        data_len = write(read(cs))
+        dbw += data_len
+        if data_len < cs or dbw == size:
+            tbw += dbw
+            break
+        # END check for stream end
+    # END duplicate data
+    return tbw
+
+
+#} END routines
diff --git a/lib/git/odb/stream.py b/lib/git/odb/stream.py
new file mode 100644
index 00000000..d1181382
--- /dev/null
+++ b/lib/git/odb/stream.py
@@ -0,0 +1,446 @@
+import zlib
+from cStringIO import StringIO
+from git.utils import make_sha
+import errno
+
+from utils import (
+        to_hex_sha,
+        to_bin_sha,
+        write,
+        close
+    )
+
+__all__ = ('OInfo', 'OStream', 'IStream', 'InvalidOInfo', 'InvalidOStream',
+            'DecompressMemMapReader', 'FDCompressedSha1Writer')
+
+
+# ZLIB configuration
+# compression level used when compressing objects: 1 ( fastest ) to 9 ( slowest )
+Z_BEST_SPEED = 1
+
+
+#{ ODB Bases
+
+class OInfo(tuple):
+    """Carries information about an object in an ODB, providing information
+    about the sha of the object, the type_string as well as the uncompressed size
+    in bytes.
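+
+    Instances are handed out by the object databases, e.g. (a sketch: assumes a
+    LooseObjectDB instance ``ldb`` and a valid ``hexsha``)::
+
+        info = ldb.info(hexsha)
+        info.type, info.size    # e.g. ('blob', 42)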
+
+    It can be accessed using tuple notation and using attribute access notation::
+
+        assert dbi[0] == dbi.sha
+        assert dbi[1] == dbi.type
+        assert dbi[2] == dbi.size
+
+    The type is designed to be as lightweight as possible."""
+    __slots__ = tuple()
+
+    def __new__(cls, sha, type, size):
+        return tuple.__new__(cls, (sha, type, size))
+
+    def __init__(self, *args):
+        tuple.__init__(self)
+
+    #{ Interface
+    @property
+    def sha(self):
+        return self[0]
+
+    @property
+    def type(self):
+        return self[1]
+
+    @property
+    def size(self):
+        return self[2]
+    #} END interface
+
+
+class OStream(OInfo):
+    """Base for object streams retrieved from the database, providing additional
+    information about the stream.
+    Generally, ODB streams are read-only as objects are immutable"""
+    __slots__ = tuple()
+
+    def __new__(cls, sha, type, size, stream, *args, **kwargs):
+        """Helps with the initialization of subclasses"""
+        return tuple.__new__(cls, (sha, type, size, stream))
+
+    def __init__(self, *args, **kwargs):
+        tuple.__init__(self)
+
+    #{ Interface
+
+    def is_compressed(self):
+        """:return: True if reads of this stream yield zlib compressed data. Default False
+        :note: this does not imply anything about the actual internal storage.
+            Hence the data could be uncompressed, but read compressed, or vice versa"""
+        return False
+
+    #} END interface
+
+    #{ Stream Reader Interface
+
+    def read(self, size=-1):
+        return self[3].read(size)
+
+    #} END stream reader interface
+
+
+class IStream(list):
+    """Represents an input content stream to be fed into the ODB. It is mutable to allow
+    the ODB to record information about the operation's outcome right in this instance.
+
+    It provides interfaces for the OStream and a StreamReader to allow the instance
+    to blend in without prior conversion.
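+
+    Illustrative store round-trip (a sketch: assumes an existing ``repo`` carrying
+    the new ``odb`` attribute and a content string ``data``)::
+
+        istream = IStream('blob', len(data), StringIO(data))
+        repo.odb.store(istream)
+        istream.sha        # now set by the database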
+ + The only method your content stream must support is 'read'""" + __slots__ = tuple() + + def __new__(cls, type, size, stream, sha=None, compressed=False): + return list.__new__(cls, (sha, type, size, stream, compressed, None)) + + def __init__(self, type, size, stream, sha=None, compressed=None): + list.__init__(self, (sha, type, size, stream, compressed, None)) + + #{ Interface + + def hexsha(self): + """:return: our sha, hex encoded, 40 bytes""" + return to_hex_sha(self[0]) + + def binsha(self): + """:return: our sha as binary, 20 bytes""" + return to_bin_sha(self[0]) + + def _error(self): + """:return: the error that occurred when processing the stream, or None""" + return self[5] + + def _set_error(self, exc): + """Set this input stream to the given exc, may be None to reset the error""" + self[5] = exc + + error = property(_error, _set_error) + + #} END interface + + #{ Stream Reader Interface + + def read(self, size=-1): + """Implements a simple stream reader interface, passing the read call on + to our internal stream""" + return self[3].read(size) + + #} END stream reader interface + + #{ interface + + def _set_sha(self, sha): + self[0] = sha + + def _sha(self): + return self[0] + + sha = property(_sha, _set_sha) + + + def _type(self): + return self[1] + + def _set_type(self, type): + self[1] = type + + type = property(_type, _set_type) + + def _size(self): + return self[2] + + def _set_size(self, size): + self[2] = size + + size = property(_size, _set_size) + + def _stream(self): + return self[3] + + def _set_stream(self, stream): + self[3] = stream + + stream = property(_stream, _set_stream) + + #} END odb info interface + + #{ OStream interface + + def is_compressed(self): + return self[4] + + #} END OStream interface + + +class InvalidOInfo(tuple): + """Carries information about a sha identifying an object which is invalid in + the queried database. The exception attribute provides more information about + the cause of the issue""" + __slots__ = tuple() + + def __new__(cls, sha, exc): + return tuple.__new__(cls, (sha, exc)) + + def __init__(self, sha, exc): + tuple.__init__(self, (sha, exc)) + + @property + def sha(self): + return self[0] + + @property + def error(self): + """:return: exception instance explaining the failure""" + return self[1] + + +class InvalidOStream(InvalidOInfo): + """Carries information about an invalid ODB stream""" + __slots__ = tuple() + +#} END ODB Bases + + +#{ RO Streams + +class DecompressMemMapReader(object): + """Reads data in chunks from a memory map and decompresses it. The client sees + only the uncompressed data, respective file-like read calls are handling on-demand + buffered decompression accordingly + + A constraint on the total size of bytes is activated, simulating + a logical file within a possibly larger physical memory area + + To read efficiently, you clearly don't want to read individual bytes, instead, + read a few kilobytes at least. + + :note: The chunk-size should be carefully selected as it will involve quite a bit + of string copying due to the way the zlib is implemented. Its very wasteful, + hence we try to find a good tradeoff between allocation time and number of + times we actually allocate. An own zlib implementation would be good here + to better support streamed reading - it would only need to keep the mmap + and decompress it into chunks, thats all ... 
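+
+    Instances are usually created through ``new`` (a sketch: ``m`` is an assumed
+    memory map of a loose object file)::
+
+        type, size, stream = DecompressMemMapReader.new(m, close_on_deletion=True)
+        data = stream.read(size)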
""" + __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') + + max_read_size = 512*1024 + + def __init__(self, m, close_on_deletion, size): + """Initialize with mmap for stream reading""" + self._m = m + self._zip = zlib.decompressobj() + self._buf = None # buffer of decompressed bytes + self._buflen = 0 # length of bytes in buffer + self._s = size # size of uncompressed data to read in total + self._br = 0 # num uncompressed bytes read + self._cws = 0 # start byte of compression window + self._cwe = 0 # end byte of compression window + self._close = close_on_deletion # close the memmap on deletion ? + + def __del__(self): + if self._close: + self._m.close() + # END handle resource freeing + + @classmethod + def new(self, m, close_on_deletion=False): + """Create a new DecompressMemMapReader instance for acting as a read-only stream + This method parses the object header from m and returns the parsed + type and size, as well as the created stream instance. + :param m: memory map on which to oparate + :param close_on_deletion: if True, the memory map will be closed once we are + being deleted""" + inst = DecompressMemMapReader(m, close_on_deletion, 0) + + # read header + maxb = 512 # should really be enough, cgit uses 8192 I believe + inst._s = maxb + hdr = inst.read(maxb) + hdrend = hdr.find("\0") + type, size = hdr[:hdrend].split(" ") + size = int(size) + inst._s = size + + # adjust internal state to match actual header length that we ignore + # The buffer will be depleted first on future reads + inst._br = 0 + hdrend += 1 # count terminating \0 + inst._buf = StringIO(hdr[hdrend:]) + inst._buflen = len(hdr) - hdrend + + return type, size, inst + + def read(self, size=-1): + if size < 1: + size = self._s - self._br + else: + size = min(size, self._s - self._br) + # END clamp size + + if size == 0: + return str() + # END handle depletion + + # protect from memory peaks + # If he tries to read large chunks, our memory patterns get really bad + # as we end up copying a possibly huge chunk from our memory map right into + # memory. This might not even be possible. Nonetheless, try to dampen the + # effect a bit by reading in chunks, returning a huge string in the end. + # Our performance now depends on StringIO. This way we don't need two large + # buffers in peak times, but only one large one in the end which is + # the return buffer + # NO: We don't do it - if the user thinks its best, he is right. If he + # has trouble, he will start reading in chunks. According to our tests + # its still faster if we read 10 Mb at once instead of chunking it. + + # if size > self.max_read_size: + # sio = StringIO() + # while size: + # read_size = min(self.max_read_size, size) + # data = self.read(read_size) + # sio.write(data) + # size -= len(data) + # if len(data) < read_size: + # break + # # END data loop + # sio.seek(0) + # return sio.getvalue() + # # END handle maxread + # + # deplete the buffer, then just continue using the decompress object + # which has an own buffer. 
We just need this to transparently parse the + # header from the zlib stream + dat = str() + if self._buf: + if self._buflen >= size: + # have enough data + dat = self._buf.read(size) + self._buflen -= size + self._br += size + return dat + else: + dat = self._buf.read() # ouch, duplicates data + size -= self._buflen + self._br += self._buflen + + self._buflen = 0 + self._buf = None + # END handle buffer len + # END handle buffer + + # decompress some data + # Abstract: zlib needs to operate on chunks of our memory map ( which may + # be large ), as it will otherwise and always fill in the 'unconsumed_tail' + # attribute which possible reads our whole map to the end, forcing + # everything to be read from disk even though just a portion was requested. + # As this would be a nogo, we workaround it by passing only chunks of data, + # moving the window into the memory map along as we decompress, which keeps + # the tail smaller than our chunk-size. This causes 'only' the chunk to be + # copied once, and another copy of a part of it when it creates the unconsumed + # tail. We have to use it to hand in the appropriate amount of bytes durin g + # the next read. + tail = self._zip.unconsumed_tail + if tail: + # move the window, make it as large as size demands. For code-clarity, + # we just take the chunk from our map again instead of reusing the unconsumed + # tail. The latter one would safe some memory copying, but we could end up + # with not getting enough data uncompressed, so we had to sort that out as well. + # Now we just assume the worst case, hence the data is uncompressed and the window + # needs to be as large as the uncompressed bytes we want to read. + self._cws = self._cwe - len(tail) + self._cwe = self._cws + size + + + indata = self._m[self._cws:self._cwe] # another copy ... :( + # get the actual window end to be sure we don't use it for computations + self._cwe = self._cws + len(indata) + else: + cws = self._cws + self._cws = self._cwe + self._cwe = cws + size + indata = self._m[self._cws:self._cwe] # ... 
copy it again :( + # END handle tail + + dcompdat = self._zip.decompress(indata, size) + + self._br += len(dcompdat) + if dat: + dcompdat = dat + dcompdat + + return dcompdat + +#} END RO streams + + +#{ W Streams + +class Sha1Writer(object): + """Simple stream writer which produces a sha whenever you like as it degests + everything it is supposed to write""" + + def __init__(self): + self.sha1 = make_sha("") + + #{ Stream Interface + + def write(self, data): + """:raise IOError: If not all bytes could be written + :return: lenght of incoming data""" + self.sha1.update(data) + return len(data) + + # END stream interface + + #{ Interface + + def sha(self, as_hex = False): + """:return: sha so far + :param as_hex: if True, sha will be hex-encoded, binary otherwise""" + if as_hex: + return self.sha1.hexdigest() + return self.sha1.digest() + + #} END interface + +class FDCompressedSha1Writer(Sha1Writer): + """Digests data written to it, making the sha available, then compress the + data and write it to the file descriptor + :note: operates on raw file descriptors + :note: for this to work, you have to use the close-method of this instance""" + __slots__ = ("fd", "sha1", "zip") + + # default exception + exc = IOError("Failed to write all bytes to filedescriptor") + + def __init__(self, fd): + super(FDCompressedSha1Writer, self).__init__() + self.fd = fd + self.zip = zlib.compressobj(Z_BEST_SPEED) + + #{ Stream Interface + + def write(self, data): + """:raise IOError: If not all bytes could be written + :return: lenght of incoming data""" + self.sha1.update(data) + cdata = self.zip.compress(data) + bytes_written = write(self.fd, cdata) + if bytes_written != len(cdata): + raise self.exc + return len(data) + + def close(self): + remainder = self.zip.flush() + if write(self.fd, remainder) != len(remainder): + raise self.exc + return close(self.fd) + + #} END stream interface + +#} END W streams diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py new file mode 100644 index 00000000..6863e97b --- /dev/null +++ b/lib/git/odb/utils.py @@ -0,0 +1,38 @@ +import binascii +import os +import errno + +#{ Routines + +hex_to_bin = binascii.a2b_hex +bin_to_hex = binascii.b2a_hex + +def to_hex_sha(sha): + """:return: hexified version of sha""" + if len(sha) == 40: + return sha + return bin_to_hex(sha) + +def to_bin_sha(sha): + if len(sha) == 20: + return sha + return hex_to_bin(sha) + +# errors +ENOENT = errno.ENOENT + +# os shortcuts +exists = os.path.exists +mkdir = os.mkdir +isdir = os.path.isdir +rename = os.rename +dirname = os.path.dirname +join = os.path.join +read = os.read +write = os.write +close = os.close + + +#} END Routines + + diff --git a/lib/git/repo.py b/lib/git/repo.py index f4caa3fb..78e5f526 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -4,12 +4,6 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php -import os -import sys -import re -import gzip -import StringIO - from errors import InvalidGitRepositoryError, NoSuchPathError from cmd import Git from actor import Actor @@ -19,6 +13,15 @@ from objects import * from config import GitConfigParser from remote import Remote +from odb import GitObjectDB + +import os +import sys +import re +import gzip +import StringIO + + def touch(filename): fp = open(filename, "a") fp.close() @@ -53,7 +56,7 @@ class Repo(object): 'git_dir' is the .git repository directoy, which is always set. 
""" DAEMON_EXPORT_FILE = 'git-daemon-export-ok' - __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git" ) + __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git", "odb" ) # precompiled regex re_whitespace = re.compile(r'\s+') @@ -65,27 +68,22 @@ class Repo(object): # represents the configuration level of a configuration file config_level = ("system", "global", "repository") - def __init__(self, path=None): - """ - Create a new Repo instance + def __init__(self, path=None, odbt = GitObjectDB): + """ Create a new Repo instance - ``path`` - is the path to either the root git directory or the bare git repo - - Examples:: + :param path: is the path to either the root git directory or the bare git repo:: repo = Repo("/Users/mtrier/Development/git-python") repo = Repo("/Users/mtrier/Development/git-python.git") repo = Repo("~/Development/git-python.git") repo = Repo("$REPOSITORIES/Development/git-python.git") - - Raises - InvalidGitRepositoryError or NoSuchPathError - - Returns - ``git.Repo`` - """ - + + :param odbt: Object DataBase type - a type which is constructed by providing + the directory containing the database objects, i.e. .git/objects. It will + be used to access all object data + :raise InvalidGitRepositoryError: + :raise NoSuchPathError: + :return: git.Repo """ epath = os.path.abspath(os.path.expandvars(os.path.expanduser(path or os.getcwd()))) if not os.path.exists(epath): @@ -130,6 +128,12 @@ class Repo(object): self.working_dir = self._working_tree_dir or self.git_dir self.git = Git(self.working_dir) + + # special handling, in special times + args = [os.path.join(self.git_dir, 'objects')] + if issubclass(odbt, GitObjectDB): + args.append(self.git) + self.odb = odbt(*args) def __eq__(self, rhs): if isinstance(rhs, Repo): diff --git a/lib/git/utils.py b/lib/git/utils.py index c21528b1..60a7de48 100644 --- a/lib/git/utils.py +++ b/lib/git/utils.py @@ -27,6 +27,21 @@ def make_sha(source=''): sha1 = sha.sha(source) return sha1 +def stream_copy(source, destination, chunk_size=512*1024): + """Copy all data from the source stream into the destination stream in chunks + of size chunk_size + :return: amount of bytes written""" + br = 0 + while True: + chunk = source.read(chunk_size) + destination.write(chunk) + br += len(chunk) + if len(chunk) < chunk_size: + break + # END reading output stream + return br + + def join_path(a, *p): """Join path tokens together similar to os.path.join, but always use '/' instead of possibly '\' on windows.""" @@ -61,12 +76,14 @@ def join_path_native(a, *p): return to_native_path(join_path(a, *p)) -class SHA1Writer(object): +class IndexFileSHA1Writer(object): """ Wrapper around a file-like object that remembers the SHA1 of the data written to it. It will write a sha when the stream is closed or if the asked for explicitly usign write_sha. 
+ Only useful to the indexfile + Note: Based on the dulwich project """ @@ -78,7 +95,7 @@ class SHA1Writer(object): def write(self, data): self.sha1.update(data) - self.f.write(data) + return self.f.write(data) def write_sha(self): sha = self.sha1.digest() diff --git a/test/fixtures/rev_list b/test/fixtures/rev_list index 95a1ebff..1a576118 100644 --- a/test/fixtures/rev_list +++ b/test/fixtures/rev_list @@ -1,24 +1,3 @@ -commit 4c8124ffcf4039d292442eeccabdeca5af5c5017 -tree 672eca9b7f9e09c22dcb128c283e8c3c8d7697a4 -parent 634396b2f541a9f2d58b00be1a07f0c358b999b3 -author Tom Preston-Werner <tom@mojombo.com> 1191999972 -0700 -committer Tom Preston-Werner <tom@mojombo.com> 1191999972 -0700 - - implement Grit#heads - -commit 634396b2f541a9f2d58b00be1a07f0c358b999b3 -tree b35b4bf642d667fdd613eebcfe4e17efd420fb8a -author Tom Preston-Werner <tom@mojombo.com> 1191997100 -0700 -committer Tom Preston-Werner <tom@mojombo.com> 1191997100 -0700 - - initial grit setup - -commit ab25fd8483882c3bda8a458ad2965d2248654335 -tree c20b5ec543bde1e43a931449b196052c06ed8acc -parent 6e64c55896aabb9a7d8e9f8f296f426d21a78c2c -parent 7f874954efb9ba35210445be456c74e037ba6af2 -author Tom Preston-Werner <tom@mojombo.com> 1182645538 -0700 -committer Tom Preston-Werner <tom@mojombo.com> 1182645538 -0700 - - Merge branch 'site' - Some other stuff +4c8124ffcf4039d292442eeccabdeca5af5c5017 +634396b2f541a9f2d58b00be1a07f0c358b999b3 +ab25fd8483882c3bda8a458ad2965d2248654335 diff --git a/test/git/performance/lib.py b/test/git/performance/lib.py new file mode 100644 index 00000000..7d2a9f4a --- /dev/null +++ b/test/git/performance/lib.py @@ -0,0 +1,65 @@ +"""Contains library functions""" +import os +from test.testlib import * +import shutil +import tempfile + +from git import ( + Repo + ) + +#{ Invvariants +k_env_git_repo = "GIT_PYTHON_TEST_GIT_REPO_BASE" +#} END invariants + + +#{ Utilities +def resolve_or_fail(env_var): + """:return: resolved environment variable or raise EnvironmentError""" + try: + return os.environ[env_var] + except KeyError: + raise EnvironmentError("Please set the %r envrionment variable and retry" % env_var) + # END exception handling + +#} END utilities + + +#{ Base Classes + +class TestBigRepoR(TestBase): + """TestCase providing access to readonly 'big' repositories using the following + member variables: + + * gitrepo + + * Read-Only git repository - actually the repo of git itself""" + + #{ Invariants + head_sha_2k = '235d521da60e4699e5bd59ac658b5b48bd76ddca' + head_sha_50 = '32347c375250fd470973a5d76185cac718955fd5' + #} END invariants + + @classmethod + def setUpAll(cls): + super(TestBigRepoR, cls).setUpAll() + cls.gitrorepo = Repo(resolve_or_fail(k_env_git_repo)) + + +class TestBigRepoRW(TestBigRepoR): + """As above, but provides a big repository that we can write to. 
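+
+    The repository location is resolved from the GIT_PYTHON_TEST_GIT_REPO_BASE
+    environment variable (illustrative path)::
+
+        os.environ['GIT_PYTHON_TEST_GIT_REPO_BASE'] = '/path/to/git.git'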
+ + Provides ``self.gitrwrepo``""" + + @classmethod + def setUpAll(cls): + super(TestBigRepoRW, cls).setUpAll() + dirname = tempfile.mktemp() + os.mkdir(dirname) + cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True) + + @classmethod + def tearDownAll(cls): + shutil.rmtree(cls.gitrwrepo.working_dir) + +#} END base classes diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py new file mode 100644 index 00000000..0571d0d9 --- /dev/null +++ b/test/git/performance/test_commit.py @@ -0,0 +1,98 @@ +# test_performance.py +# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors +# +# This module is part of GitPython and is released under +# the BSD License: http://www.opensource.org/licenses/bsd-license.php + +from lib import * +from git import * +from test.git.test_commit import assert_commit_serialization +from cStringIO import StringIO +from time import time +import sys + +class TestPerformance(TestBigRepoRW): + + # ref with about 100 commits in its history + ref_100 = '0.1.6' + + def _query_commit_info(self, c): + c.author + c.authored_date + c.author_tz_offset + c.committer + c.committed_date + c.committer_tz_offset + c.message + c.parents + + def test_iteration(self): + no = 0 + nc = 0 + + # find the first commit containing the given path - always do a full + # iteration ( restricted to the path in question ), but in fact it should + # return quite a lot of commits, we just take one and hence abort the operation + + st = time() + for c in self.rorepo.iter_commits(self.ref_100): + nc += 1 + self._query_commit_info(c) + for obj in c.tree.traverse(): + obj.size + no += 1 + # END for each object + # END for each commit + elapsed_time = time() - st + print >> sys.stderr, "Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )" % (nc, no, elapsed_time, no/elapsed_time) + + def test_commit_traversal(self): + # bound to cat-file parsing performance + nc = 0 + st = time() + for c in self.gitrorepo.commit(self.head_sha_2k).traverse(branch_first=False): + nc += 1 + self._query_commit_info(c) + # END for each traversed commit + elapsed_time = time() - st + print >> sys.stderr, "Traversed %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time) + + def test_commit_iteration(self): + # bound to stream parsing performance + nc = 0 + st = time() + for c in Commit.iter_items(self.gitrorepo, self.head_sha_2k): + nc += 1 + self._query_commit_info(c) + # END for each traversed commit + elapsed_time = time() - st + print >> sys.stderr, "Iterated %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time) + + def test_commit_serialization(self): + assert_commit_serialization(self.gitrwrepo, self.head_sha_2k, True) + + rwrepo = self.gitrwrepo + make_object = rwrepo.odb.store + # direct serialization - deserialization can be tested afterwards + # serialization is probably limited on IO + hc = rwrepo.commit(self.head_sha_2k) + + commits = list() + nc = 5000 + st = time() + for i in xrange(nc): + cm = Commit( rwrepo, Commit.NULL_HEX_SHA, hc.tree, + hc.author, hc.authored_date, hc.author_tz_offset, + hc.committer, hc.committed_date, hc.committer_tz_offset, + str(i), parents=hc.parents, encoding=hc.encoding) + + stream = StringIO() + cm._serialize(stream) + slen = stream.tell() + stream.seek(0) + + cm.sha = make_object(IStream(Commit.type, slen, stream)).sha + # END commit creation + elapsed = time() - st + + print >> sys.stderr, "Serialized %i commits to loose objects in %f s ( %f commits 
/ s )" % (nc, elapsed, nc / elapsed) diff --git a/test/git/performance/test_odb.py b/test/git/performance/test_odb.py new file mode 100644 index 00000000..7b1ee838 --- /dev/null +++ b/test/git/performance/test_odb.py @@ -0,0 +1,61 @@ +"""Performance tests for object store""" + +from time import time +import sys +import stat + +from lib import ( + TestBigRepoR + ) + + +class TestObjDBPerformance(TestBigRepoR): + + def test_random_access(self): + + # GET COMMITS + # TODO: use the actual db for this + st = time() + root_commit = self.gitrorepo.commit(self.head_sha_2k) + commits = list(root_commit.traverse()) + nc = len(commits) + elapsed = time() - st + + print >> sys.stderr, "Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (nc, elapsed, nc / elapsed) + + + # GET TREES + # walk all trees of all commits + st = time() + blobs_per_commit = list() + nt = 0 + for commit in commits: + tree = commit.tree + blobs = list() + for item in tree.traverse(): + nt += 1 + if item.type == 'blob': + blobs.append(item) + # direct access for speed + # END while trees are there for walking + blobs_per_commit.append(blobs) + # END for each commit + elapsed = time() - st + + print >> sys.stderr, "Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (nt, len(commits), elapsed, nt / elapsed) + + # GET BLOBS + st = time() + nb = 0 + too_many = 15000 + for blob_list in blobs_per_commit: + for blob in blob_list: + blob.data + # END for each blobsha + nb += len(blob_list) + if nb > too_many: + break + # END for each bloblist + elapsed = time() - st + + print >> sys.stderr, "Retrieved %i blob and their data in %g s ( %f blobs / s )" % (nb, elapsed, nb / elapsed) diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py new file mode 100644 index 00000000..01ec9fc4 --- /dev/null +++ b/test/git/performance/test_streams.py @@ -0,0 +1,144 @@ +"""Performance data streaming performance""" + +from test.testlib import * +from git.odb import * + +from array import array +from cStringIO import StringIO +from time import time +import os +import sys +import stat +import random +import subprocess + + +from lib import ( + TestBigRepoR + ) + + + +def make_memory_file(size_in_bytes, randomize=False): + """:return: tuple(size_of_stream, stream) + :param randomize: try to produce a very random stream""" + actual_size = size_in_bytes / 4 + producer = xrange(actual_size) + if randomize: + producer = list(producer) + random.shuffle(producer) + # END randomize + a = array('i', producer) + return actual_size*4, StringIO(a.tostring()) + + +class TestObjDBPerformance(TestBigRepoR): + + large_data_size_bytes = 1000*1000*10 # some MiB should do it + moderate_data_size_bytes = 1000*1000*1 # just 1 MiB + + @with_bare_rw_repo + def test_large_data_streaming(self, rwrepo): + ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects')) + + for randomize in range(2): + desc = (randomize and 'random ') or '' + print >> sys.stderr, "Creating %s data ..." 
diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py
new file mode 100644
index 00000000..01ec9fc4
--- /dev/null
+++ b/test/git/performance/test_streams.py
@@ -0,0 +1,144 @@
+"""Performance tests for data streaming"""
+
+from test.testlib import *
+from git.odb import *
+
+from array import array
+from cStringIO import StringIO
+from time import time
+import os
+import sys
+import stat
+import random
+import subprocess
+
+
+from lib import (
+    TestBigRepoR
+    )
+
+
+
+def make_memory_file(size_in_bytes, randomize=False):
+    """:return: tuple(size_of_stream, stream)
+    :param randomize: try to produce a very random stream"""
+    actual_size = size_in_bytes / 4
+    producer = xrange(actual_size)
+    if randomize:
+        producer = list(producer)
+        random.shuffle(producer)
+    # END randomize
+    a = array('i', producer)
+    return actual_size*4, StringIO(a.tostring())
+
+
+class TestObjDBPerformance(TestBigRepoR):
+
+    large_data_size_bytes = 1000*1000*10    # some MiB should do it
+    moderate_data_size_bytes = 1000*1000*1  # just 1 MiB
+
+    @with_bare_rw_repo
+    def test_large_data_streaming(self, rwrepo):
+        ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects'))
+
+        for randomize in range(2):
+            desc = (randomize and 'random ') or ''
+            print >> sys.stderr, "Creating %s data ..." % desc
+            st = time()
+            size, stream = make_memory_file(self.large_data_size_bytes, randomize)
+            elapsed = time() - st
+            print >> sys.stderr, "Done (in %f s)" % elapsed
+
+            # writing - due to the compression it will seem faster than it is
+            st = time()
+            sha = ldb.store(IStream('blob', size, stream)).sha
+            elapsed_add = time() - st
+            assert ldb.has_object(sha)
+            db_file = ldb.readable_db_object_path(sha)
+            fsize_kib = os.path.getsize(db_file) / 1000
+
+
+            size_kib = size / 1000
+            print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed_add, size_kib / elapsed_add)
+
+            # reading all at once
+            st = time()
+            ostream = ldb.stream(sha)
+            shadata = ostream.read()
+            elapsed_readall = time() - st
+
+            stream.seek(0)
+            assert shadata == stream.getvalue()
+            print >> sys.stderr, "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed_readall, size_kib / elapsed_readall)
+
+
+            # reading in chunks of 512 KiB
+            cs = 512*1000
+            chunks = list()
+            st = time()
+            ostream = ldb.stream(sha)
+            while True:
+                data = ostream.read(cs)
+                chunks.append(data)
+                if len(data) < cs:
+                    break
+            # END read in chunks
+            elapsed_readchunks = time() - st
+
+            stream.seek(0)
+            assert ''.join(chunks) == stream.getvalue()
+
+            cs_kib = cs / 1000
+            print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / elapsed_readchunks)
+
+            # delete the db file so git has something to do
+            os.remove(db_file)
+
+            # VS. CGIT
+            ##########
+            # CGIT ! Can using the cgit programs be faster ?
+            proc = rwrepo.git.hash_object('-w', '--stdin', as_process=True, istream=subprocess.PIPE)
+
+            # write file - pump everything in at once to be as fast as possible
+            data = stream.getvalue()    # cache it
+            st = time()
+            proc.stdin.write(data)
+            proc.stdin.close()
+            gitsha = proc.stdout.read().strip()
+            proc.wait()
+            gelapsed_add = time() - st
+            del(data)
+            assert gitsha == sha    # we do it the same way, right ?
+
+            # as it's the same sha, we reuse our path
+            fsize_kib = os.path.getsize(db_file) / 1000
+            print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data using git-hash-object in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, gelapsed_add, size_kib / gelapsed_add)
+
+            # compare ...
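+            # note added for clarity: the percentages printed below are computed
+            # as 100.0 - (git_python_time / cgit_time) * 100, so a positive value
+            # means git-python was faster, a negative value means cgit won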
+ print >> sys.stderr, "Git-Python is %f %% faster than git when adding big %s files" % (100.0 - (elapsed_add / gelapsed_add) * 100, desc) + + + # read all + st = time() + s, t, size, data = rwrepo.git.get_object_data(gitsha) + gelapsed_readall = time() - st + print >> sys.stderr, "Read %i KiB of %s data at once using git-cat-file in %f s ( %f Read KiB / s)" % (size_kib, desc, gelapsed_readall, size_kib / gelapsed_readall) + + # compare + print >> sys.stderr, "Git-Python is %f %% faster than git when reading big %sfiles" % (100.0 - (elapsed_readall / gelapsed_readall) * 100, desc) + + + # read chunks + st = time() + s, t, size, stream = rwrepo.git.stream_object_data(gitsha) + while True: + data = stream.read(cs) + if len(data) < cs: + break + # END read stream + gelapsed_readchunks = time() - st + print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from git-cat-file in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, gelapsed_readchunks, size_kib / gelapsed_readchunks) + + # compare + print >> sys.stderr, "Git-Python is %f %% faster than git when reading big %s files in chunks" % (100.0 - (elapsed_readchunks / gelapsed_readchunks) * 100, desc) + # END for each randomization factor diff --git a/test/git/performance/test_utils.py b/test/git/performance/test_utils.py new file mode 100644 index 00000000..76adffec --- /dev/null +++ b/test/git/performance/test_utils.py @@ -0,0 +1,59 @@ +"""Performance of utilities""" +from time import time +import sys +import stat + +from lib import ( + TestBigRepoR + ) + + +class TestUtilPerformance(TestBigRepoR): + + def test_access(self): + # compare dict vs. slot access + class Slotty(object): + __slots__ = "attr" + def __init__(self): + self.attr = 1 + + class Dicty(object): + def __init__(self): + self.attr = 1 + + class BigSlotty(object): + __slots__ = ('attr', ) + tuple('abcdefghijk') + def __init__(self): + for attr in self.__slots__: + setattr(self, attr, 1) + + class BigDicty(object): + def __init__(self): + for attr in BigSlotty.__slots__: + setattr(self, attr, 1) + + ni = 1000000 + for cls in (Slotty, Dicty, BigSlotty, BigDicty): + cli = cls() + st = time() + for i in xrange(ni): + cli.attr + # END for each access + elapsed = time() - st + print >> sys.stderr, "Accessed %s.attr %i times in %s s ( %f acc / s)" % (cls.__name__, ni, elapsed, ni / elapsed) + # END for each class type + + # check num of sequence-acceses + for cls in (list, tuple): + x = 10 + st = time() + s = cls(range(x)) + for i in xrange(ni): + s[0] + s[1] + s[2] + # END for + elapsed = time() - st + na = ni * 3 + print >> sys.stderr, "Accessed %s[x] %i times in %s s ( %f acc / s)" % (cls.__name__, na, elapsed, na / elapsed) + # END for each sequence diff --git a/test/git/test_commit.py b/test/git/test_commit.py index 48937c93..e65e2e59 100644 --- a/test/git/test_commit.py +++ b/test/git/test_commit.py @@ -6,172 +6,229 @@ from test.testlib import * from git import * +from git.odb import IStream + +from cStringIO import StringIO +import time +import sys -class TestCommit(TestBase): - def test_bake(self): +def assert_commit_serialization(rwrepo, commit_id, print_performance_info=False): + """traverse all commits in the history of commit identified by commit_id and check + if the serialization works. 
+ :param print_performance_info: if True, we will show how fast we are""" + ns = 0 # num serializations + nds = 0 # num deserializations + + st = time.time() + for cm in rwrepo.commit(commit_id).traverse(): + nds += 1 + + # assert that we deserialize commits correctly, hence we get the same + # sha on serialization + stream = StringIO() + cm._serialize(stream) + ns += 1 + streamlen = stream.tell() + stream.seek(0) + + istream = rwrepo.odb.store(IStream(Commit.type, streamlen, stream)) + assert istream.sha == cm.sha + + nc = Commit(rwrepo, Commit.NULL_HEX_SHA, cm.tree.sha, + cm.author, cm.authored_date, cm.author_tz_offset, + cm.committer, cm.committed_date, cm.committer_tz_offset, + cm.message, cm.parents, cm.encoding) + + assert nc.parents == cm.parents + stream = StringIO() + nc._serialize(stream) + ns += 1 + streamlen = stream.tell() + stream.seek(0) + + # reuse istream + istream.size = streamlen + istream.stream = stream + istream.sha = None + nc.sha = rwrepo.odb.store(istream).sha + + # if it worked, we have exactly the same contents ! + assert nc.sha == cm.sha + # END check commits + elapsed = time.time() - st + + if print_performance_info: + print >> sys.stderr, "Serialized %i and deserialized %i commits in %f s ( (%f, %f) commits / s" % (ns, nds, elapsed, ns/elapsed, nds/elapsed) + # END handle performance info + - commit = Commit(self.rorepo, **{'sha': '2454ae89983a4496a445ce347d7a41c0bb0ea7ae'}) - commit.author # bake +class TestCommit(TestBase): - assert_equal("Sebastian Thiel", commit.author.name) - assert_equal("byronimo@gmail.com", commit.author.email) - assert commit.author == commit.committer - assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int) - assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int) - assert commit.message == "Added missing information to docstrings of commit and stats module" + def test_bake(self): + commit = Commit(self.rorepo, '2454ae89983a4496a445ce347d7a41c0bb0ea7ae') + commit.author # bake - def test_stats(self): - commit = Commit(self.rorepo, '33ebe7acec14b25c5f84f35a664803fcab2f7781') - stats = commit.stats - - def check_entries(d): - assert isinstance(d, dict) - for key in ("insertions", "deletions", "lines"): - assert key in d - # END assertion helper - assert stats.files - assert stats.total - - check_entries(stats.total) - assert "files" in stats.total - - for filepath, d in stats.files.items(): - check_entries(d) - # END for each stated file - - # assure data is parsed properly - michael = Actor._from_string("Michael Trier <mtrier@gmail.com>") - assert commit.author == michael - assert commit.committer == michael - assert commit.authored_date == 1210193388 - assert commit.committed_date == 1210193388 - assert commit.author_tz_offset == 14400, commit.author_tz_offset - assert commit.committer_tz_offset == 14400, commit.committer_tz_offset - assert commit.message == "initial project" - - def test_traversal(self): - start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff") - first = self.rorepo.commit("33ebe7acec14b25c5f84f35a664803fcab2f7781") - p0 = start.parents[0] - p1 = start.parents[1] - p00 = p0.parents[0] - p10 = p1.parents[0] - - # basic branch first, depth first - dfirst = start.traverse(branch_first=False) - bfirst = start.traverse(branch_first=True) - assert dfirst.next() == p0 - assert dfirst.next() == p00 - - assert bfirst.next() == p0 - assert bfirst.next() == p1 - assert bfirst.next() == p00 - assert bfirst.next() == p10 - - # at some point, 
both iterations should stop - assert list(bfirst)[-1] == first - stoptraverse = self.rorepo.commit("254d04aa3180eb8b8daf7b7ff25f010cd69b4e7d").traverse(as_edge=True) - l = list(stoptraverse) - assert len(l[0]) == 2 - - # ignore self - assert start.traverse(ignore_self=False).next() == start - - # depth - assert len(list(start.traverse(ignore_self=False, depth=0))) == 1 - - # prune - assert start.traverse(branch_first=1, prune=lambda i,d: i==p0).next() == p1 - - # predicate - assert start.traverse(branch_first=1, predicate=lambda i,d: i==p1).next() == p1 - - # traversal should stop when the beginning is reached - self.failUnlessRaises(StopIteration, first.traverse().next) - - # parents of the first commit should be empty ( as the only parent has a null - # sha ) - assert len(first.parents) == 0 - - def test_iteration(self): - # we can iterate commits - all_commits = Commit.list_items(self.rorepo, self.rorepo.head) - assert all_commits - assert all_commits == list(self.rorepo.iter_commits()) - - # this includes merge commits - mcomit = Commit(self.rorepo, 'd884adc80c80300b4cc05321494713904ef1df2d') - assert mcomit in all_commits - - # we can limit the result to paths - ltd_commits = list(self.rorepo.iter_commits(paths='CHANGES')) - assert ltd_commits and len(ltd_commits) < len(all_commits) - - # show commits of multiple paths, resulting in a union of commits - less_ltd_commits = list(Commit.iter_items(self.rorepo, 'master', paths=('CHANGES', 'AUTHORS'))) - assert len(ltd_commits) < len(less_ltd_commits) - - - @patch_object(Git, '_call_process') - def test_rev_list_bisect_all(self, git): - """ - 'git rev-list --bisect-all' returns additional information - in the commit header. This test ensures that we properly parse it. - """ + assert_equal("Sebastian Thiel", commit.author.name) + assert_equal("byronimo@gmail.com", commit.author.email) + assert commit.author == commit.committer + assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int) + assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int) + assert commit.message == "Added missing information to docstrings of commit and stats module\n" - git.return_value = fixture('rev_list_bisect_all') - revs = self.rorepo.git.rev_list('HEAD', - pretty='raw', - first_parent=True, - bisect_all=True) - assert_true(git.called) + def test_stats(self): + commit = Commit(self.rorepo, '33ebe7acec14b25c5f84f35a664803fcab2f7781') + stats = commit.stats + + def check_entries(d): + assert isinstance(d, dict) + for key in ("insertions", "deletions", "lines"): + assert key in d + # END assertion helper + assert stats.files + assert stats.total + + check_entries(stats.total) + assert "files" in stats.total + + for filepath, d in stats.files.items(): + check_entries(d) + # END for each stated file + + # assure data is parsed properly + michael = Actor._from_string("Michael Trier <mtrier@gmail.com>") + assert commit.author == michael + assert commit.committer == michael + assert commit.authored_date == 1210193388 + assert commit.committed_date == 1210193388 + assert commit.author_tz_offset == 14400, commit.author_tz_offset + assert commit.committer_tz_offset == 14400, commit.committer_tz_offset + assert commit.message == "initial project\n" + + def test_traversal(self): + start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff") + first = self.rorepo.commit("33ebe7acec14b25c5f84f35a664803fcab2f7781") + p0 = start.parents[0] + p1 = start.parents[1] + p00 = p0.parents[0] + p10 = 
p1.parents[0] + + # basic branch first, depth first + dfirst = start.traverse(branch_first=False) + bfirst = start.traverse(branch_first=True) + assert dfirst.next() == p0 + assert dfirst.next() == p00 + + assert bfirst.next() == p0 + assert bfirst.next() == p1 + assert bfirst.next() == p00 + assert bfirst.next() == p10 + + # at some point, both iterations should stop + assert list(bfirst)[-1] == first + stoptraverse = self.rorepo.commit("254d04aa3180eb8b8daf7b7ff25f010cd69b4e7d").traverse(as_edge=True) + l = list(stoptraverse) + assert len(l[0]) == 2 + + # ignore self + assert start.traverse(ignore_self=False).next() == start + + # depth + assert len(list(start.traverse(ignore_self=False, depth=0))) == 1 + + # prune + assert start.traverse(branch_first=1, prune=lambda i,d: i==p0).next() == p1 + + # predicate + assert start.traverse(branch_first=1, predicate=lambda i,d: i==p1).next() == p1 + + # traversal should stop when the beginning is reached + self.failUnlessRaises(StopIteration, first.traverse().next) + + # parents of the first commit should be empty ( as the only parent has a null + # sha ) + assert len(first.parents) == 0 + + def test_iteration(self): + # we can iterate commits + all_commits = Commit.list_items(self.rorepo, self.rorepo.head) + assert all_commits + assert all_commits == list(self.rorepo.iter_commits()) + + # this includes merge commits + mcomit = Commit(self.rorepo, 'd884adc80c80300b4cc05321494713904ef1df2d') + assert mcomit in all_commits + + # we can limit the result to paths + ltd_commits = list(self.rorepo.iter_commits(paths='CHANGES')) + assert ltd_commits and len(ltd_commits) < len(all_commits) + + # show commits of multiple paths, resulting in a union of commits + less_ltd_commits = list(Commit.iter_items(self.rorepo, 'master', paths=('CHANGES', 'AUTHORS'))) + assert len(ltd_commits) < len(less_ltd_commits) + + def test_iter_items(self): + # pretty not allowed + self.failUnlessRaises(ValueError, Commit.iter_items, self.rorepo, 'master', pretty="raw") + + def test_rev_list_bisect_all(self): + """ + 'git rev-list --bisect-all' returns additional information + in the commit header. This test ensures that we properly parse it. 
+ """ + revs = self.rorepo.git.rev_list('933d23bf95a5bd1624fbcdf328d904e1fa173474', + first_parent=True, + bisect_all=True) - commits = Commit._iter_from_process_or_stream(self.rorepo, ListProcessAdapter(revs), True) - expected_ids = ( - 'cf37099ea8d1d8c7fbf9b6d12d7ec0249d3acb8b', - '33ebe7acec14b25c5f84f35a664803fcab2f7781', - 'a6604a00a652e754cb8b6b0b9f194f839fc38d7c', - '8df638c22c75ddc9a43ecdde90c0c9939f5009e7', - 'c231551328faa864848bde6ff8127f59c9566e90', - ) - for sha1, commit in zip(expected_ids, commits): - assert_equal(sha1, commit.sha) + commits = Commit._iter_from_process_or_stream(self.rorepo, StringProcessAdapter(revs)) + expected_ids = ( + '7156cece3c49544abb6bf7a0c218eb36646fad6d', + '1f66cfbbce58b4b552b041707a12d437cc5f400a', + '33ebe7acec14b25c5f84f35a664803fcab2f7781', + '933d23bf95a5bd1624fbcdf328d904e1fa173474' + ) + for sha1, commit in zip(expected_ids, commits): + assert_equal(sha1, commit.sha) - def test_count(self): - assert self.rorepo.tag('refs/tags/0.1.5').commit.count( ) == 143 - - def test_list(self): - assert isinstance(Commit.list_items(self.rorepo, '0.1.5', max_count=5)['5117c9c8a4d3af19a9958677e45cda9269de1541'], Commit) + def test_count(self): + assert self.rorepo.tag('refs/tags/0.1.5').commit.count( ) == 143 + + def test_list(self): + assert isinstance(Commit.list_items(self.rorepo, '0.1.5', max_count=5)['5117c9c8a4d3af19a9958677e45cda9269de1541'], Commit) - def test_str(self): - commit = Commit(self.rorepo, 'abc') - assert_equal ("abc", str(commit)) + def test_str(self): + commit = Commit(self.rorepo, 'abc') + assert_equal ("abc", str(commit)) - def test_repr(self): - commit = Commit(self.rorepo, 'abc') - assert_equal('<git.Commit "abc">', repr(commit)) + def test_repr(self): + commit = Commit(self.rorepo, 'abc') + assert_equal('<git.Commit "abc">', repr(commit)) - def test_equality(self): - commit1 = Commit(self.rorepo, 'abc') - commit2 = Commit(self.rorepo, 'abc') - commit3 = Commit(self.rorepo, 'zyx') - assert_equal(commit1, commit2) - assert_not_equal(commit2, commit3) - - def test_iter_parents(self): - # should return all but ourselves, even if skip is defined - c = self.rorepo.commit('0.1.5') - for skip in (0, 1): - piter = c.iter_parents(skip=skip) - first_parent = piter.next() - assert first_parent != c - assert first_parent == c.parents[0] - # END for each - - def test_base(self): - name_rev = self.rorepo.head.commit.name_rev - assert isinstance(name_rev, basestring) - + def test_equality(self): + commit1 = Commit(self.rorepo, 'abc') + commit2 = Commit(self.rorepo, 'abc') + commit3 = Commit(self.rorepo, 'zyx') + assert_equal(commit1, commit2) + assert_not_equal(commit2, commit3) + + def test_iter_parents(self): + # should return all but ourselves, even if skip is defined + c = self.rorepo.commit('0.1.5') + for skip in (0, 1): + piter = c.iter_parents(skip=skip) + first_parent = piter.next() + assert first_parent != c + assert first_parent == c.parents[0] + # END for each + + def test_base(self): + name_rev = self.rorepo.head.commit.name_rev + assert isinstance(name_rev, basestring) + + @with_bare_rw_repo + def test_serialization(self, rwrepo): + # create all commits of our repo + assert_commit_serialization(rwrepo, '0.1.6') + diff --git a/test/git/test_diff.py b/test/git/test_diff.py index 2f6a19bd..a113b992 100644 --- a/test/git/test_diff.py +++ b/test/git/test_diff.py @@ -20,7 +20,7 @@ class TestDiff(TestBase): return diffs def test_list_from_string_new_mode(self): - output = ListProcessAdapter(fixture('diff_new_mode')) + output = 
StringProcessAdapter(fixture('diff_new_mode')) diffs = Diff._index_from_patch_format(self.rorepo, output.stdout) self._assert_diff_format(diffs) @@ -28,7 +28,7 @@ class TestDiff(TestBase): assert_equal(10, len(diffs[0].diff.splitlines())) def test_diff_with_rename(self): - output = ListProcessAdapter(fixture('diff_rename')) + output = StringProcessAdapter(fixture('diff_rename')) diffs = Diff._index_from_patch_format(self.rorepo, output.stdout) self._assert_diff_format(diffs) @@ -47,7 +47,7 @@ class TestDiff(TestBase): "diff_tree_numstat_root" ) for fixture_name in fixtures: - diff_proc = ListProcessAdapter(fixture(fixture_name)) + diff_proc = StringProcessAdapter(fixture(fixture_name)) diffs = Diff._index_from_patch_format(self.rorepo, diff_proc.stdout) # END for each fixture diff --git a/test/git/test_odb.py b/test/git/test_odb.py new file mode 100644 index 00000000..c3a03714 --- /dev/null +++ b/test/git/test_odb.py @@ -0,0 +1,66 @@ +"""Test for object db""" + +from test.testlib import * +from git.odb import * +from git.odb.stream import Sha1Writer +from git import Blob +from git.errors import BadObject + +from cStringIO import StringIO +import os + + +class TestDB(TestBase): + """Test the different db class implementations""" + + # data + two_lines = "1234\nhello world" + + all_data = (two_lines, ) + + def _assert_object_writing(self, db): + """General tests to verify object writing, compatible to ObjectDBW + :note: requires write access to the database""" + # start in 'dry-run' mode, using a simple sha1 writer + ostreams = (Sha1Writer, None) + for ostreamcls in ostreams: + for data in self.all_data: + dry_run = ostreamcls is not None + ostream = None + if ostreamcls is not None: + ostream = ostreamcls() + # END create ostream + + prev_ostream = db.set_ostream(ostream) + assert type(prev_ostream) in ostreams or prev_ostream in ostreams + + istream = IStream(Blob.type, len(data), StringIO(data)) + my_istream = db.store(istream) + sha = istream.sha + assert my_istream is istream + assert db.has_object(sha) != dry_run + assert len(sha) == 40 # for now we require 40 byte shas as default + + # verify data - the slow way, we want to run code + if not dry_run: + info = db.info(sha) + assert Blob.type == info.type + assert info.size == len(data) + + ostream = db.stream(sha) + assert ostream.read() == data + assert ostream.type == Blob.type + assert ostream.size == len(data) + else: + self.failUnlessRaises(BadObject, db.info, sha) + self.failUnlessRaises(BadObject, db.stream, sha) + # END for each data set + # END for each dry_run mode + + @with_bare_rw_repo + def test_writing(self, rwrepo): + ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects')) + + # write data + self._assert_object_writing(ldb) + diff --git a/test/git/test_performance.py b/test/git/test_performance.py deleted file mode 100644 index 72acfcac..00000000 --- a/test/git/test_performance.py +++ /dev/null @@ -1,52 +0,0 @@ -# test_performance.py -# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors -# -# This module is part of GitPython and is released under -# the BSD License: http://www.opensource.org/licenses/bsd-license.php - -from test.testlib import * -from git import * -from time import time - -class TestPerformance(TestBase): - - def _query_commit_info(self, c): - c.author - c.authored_date - c.author_tz_offset - c.committer - c.committed_date - c.committer_tz_offset - c.message - c.parents - - def test_iteration(self): - num_objs = 0 - num_commits = 0 - - # find the first commit containing the 
given path - always do a full - # iteration ( restricted to the path in question ), but in fact it should - # return quite a lot of commits, we just take one and hence abort the operation - - st = time() - for c in self.rorepo.iter_commits('0.1.6'): - num_commits += 1 - self._query_commit_info(c) - for obj in c.tree.traverse(): - obj.size - num_objs += 1 - # END for each object - # END for each commit - elapsed_time = time() - st - print "Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )" % (num_commits, num_objs, elapsed_time, num_objs/elapsed_time) - - def test_commit_traversal(self): - num_commits = 0 - - st = time() - for c in self.rorepo.commit('0.1.6').traverse(branch_first=False): - num_commits += 1 - self._query_commit_info(c) - # END for each traversed commit - elapsed_time = time() - st - print "Traversed %i Commits in %s [s] ( %f commits/s )" % (num_commits, elapsed_time, num_commits/elapsed_time) diff --git a/test/git/test_repo.py b/test/git/test_repo.py index ce79402a..d2c7c742 100644 --- a/test/git/test_repo.py +++ b/test/git/test_repo.py @@ -10,325 +10,396 @@ from git import * from git.utils import join_path_native import tempfile import shutil +from cStringIO import StringIO class TestRepo(TestBase): - - @raises(InvalidGitRepositoryError) - def test_new_should_raise_on_invalid_repo_location(self): - Repo(tempfile.gettempdir()) - - @raises(NoSuchPathError) - def test_new_should_raise_on_non_existant_path(self): - Repo("repos/foobar") - - def test_repo_creation_from_different_paths(self): - r_from_gitdir = Repo(self.rorepo.git_dir) - assert r_from_gitdir.git_dir == self.rorepo.git_dir - assert r_from_gitdir.git_dir.endswith('.git') - assert not self.rorepo.git.working_dir.endswith('.git') - assert r_from_gitdir.git.working_dir == self.rorepo.git.working_dir - - def test_description(self): - txt = "Test repository" - self.rorepo.description = txt - assert_equal(self.rorepo.description, txt) - - def test_heads_should_return_array_of_head_objects(self): - for head in self.rorepo.heads: - assert_equal(Head, head.__class__) - - def test_heads_should_populate_head_data(self): - for head in self.rorepo.heads: - assert head.name - assert isinstance(head.commit,Commit) - # END for each head - - assert isinstance(self.rorepo.heads.master, Head) - assert isinstance(self.rorepo.heads['master'], Head) - - def test_tree_from_revision(self): - tree = self.rorepo.tree('0.1.6') - assert tree.type == "tree" - assert self.rorepo.tree(tree) == tree - - # try from invalid revision that does not exist - self.failUnlessRaises(ValueError, self.rorepo.tree, 'hello world') - - @patch_object(Git, '_call_process') - def test_commits(self, git): - git.return_value = ListProcessAdapter(fixture('rev_list')) - - commits = list( self.rorepo.iter_commits('master', max_count=10) ) - - c = commits[0] - assert_equal('4c8124ffcf4039d292442eeccabdeca5af5c5017', c.sha) - assert_equal(["634396b2f541a9f2d58b00be1a07f0c358b999b3"], [p.sha for p in c.parents]) - assert_equal("672eca9b7f9e09c22dcb128c283e8c3c8d7697a4", c.tree.sha) - assert_equal("Tom Preston-Werner", c.author.name) - assert_equal("tom@mojombo.com", c.author.email) - assert_equal(1191999972, c.authored_date) - assert_equal("Tom Preston-Werner", c.committer.name) - assert_equal("tom@mojombo.com", c.committer.email) - assert_equal(1191999972, c.committed_date) - assert_equal("implement Grit#heads", c.message) - - c = commits[1] - assert_equal(tuple(), c.parents) - - c = commits[2] - 
assert_equal(["6e64c55896aabb9a7d8e9f8f296f426d21a78c2c", "7f874954efb9ba35210445be456c74e037ba6af2"], map(lambda p: p.sha, c.parents)) - assert_equal("Merge branch 'site'", c.summary) - - assert_true(git.called) - - def test_trees(self): - mc = 30 - num_trees = 0 - for tree in self.rorepo.iter_trees('0.1.5', max_count=mc): - num_trees += 1 - assert isinstance(tree, Tree) - # END for each tree - assert num_trees == mc - - - def _test_empty_repo(self, repo): - # test all kinds of things with an empty, freshly initialized repo. - # It should throw good errors - - # entries should be empty - assert len(repo.index.entries) == 0 - - # head is accessible - assert repo.head - assert repo.head.ref - assert not repo.head.is_valid() - - # we can change the head to some other ref - head_ref = Head.from_path(repo, Head.to_full_path('some_head')) - assert not head_ref.is_valid() - repo.head.ref = head_ref - - # is_dirty can handle all kwargs - for args in ((1, 0, 0), (0, 1, 0), (0, 0, 1)): - assert not repo.is_dirty(*args) - # END for each arg - - # we can add a file to the index ( if we are not bare ) - if not repo.bare: - pass - # END test repos with working tree - - - def test_init(self): - prev_cwd = os.getcwd() - os.chdir(tempfile.gettempdir()) - git_dir_rela = "repos/foo/bar.git" - del_dir_abs = os.path.abspath("repos") - git_dir_abs = os.path.abspath(git_dir_rela) - try: - # with specific path - for path in (git_dir_rela, git_dir_abs): - r = Repo.init(path=path, bare=True) - assert isinstance(r, Repo) - assert r.bare == True - assert os.path.isdir(r.git_dir) - - self._test_empty_repo(r) - shutil.rmtree(git_dir_abs) - # END for each path - - os.makedirs(git_dir_rela) - os.chdir(git_dir_rela) - r = Repo.init(bare=False) - r.bare == False - - self._test_empty_repo(r) - finally: - try: - shutil.rmtree(del_dir_abs) - except OSError: - pass - os.chdir(prev_cwd) - # END restore previous state - - def test_bare_property(self): - self.rorepo.bare - - @patch_object(Repo, '__init__') - @patch_object(Git, '_call_process') - def test_init_with_options(self, git, repo): - git.return_value = True - repo.return_value = None - - r = Repo.init("repos/foo/bar.git", **{'bare' : True,'template': "/baz/sweet"}) - assert isinstance(r, Repo) - - assert_true(git.called) - assert_true(repo.called) - - @patch_object(Repo, '__init__') - @patch_object(Git, '_call_process') - def test_clone(self, git, repo): - git.return_value = None - repo.return_value = None - - self.rorepo.clone("repos/foo/bar.git") - - assert_true(git.called) - path = os.path.join(absolute_project_path(), '.git') - assert_equal(git.call_args, (('clone', path, 'repos/foo/bar.git'), {})) - assert_true(repo.called) - - @patch_object(Repo, '__init__') - @patch_object(Git, '_call_process') - def test_clone_with_options(self, git, repo): - git.return_value = None - repo.return_value = None - - self.rorepo.clone("repos/foo/bar.git", **{'template': '/awesome'}) - - assert_true(git.called) - path = os.path.join(absolute_project_path(), '.git') - assert_equal(git.call_args, (('clone', path, 'repos/foo/bar.git'), - { 'template': '/awesome'})) - assert_true(repo.called) - - - def test_daemon_export(self): - orig_val = self.rorepo.daemon_export - self.rorepo.daemon_export = not orig_val - assert self.rorepo.daemon_export == ( not orig_val ) - self.rorepo.daemon_export = orig_val - assert self.rorepo.daemon_export == orig_val + + @raises(InvalidGitRepositoryError) + def test_new_should_raise_on_invalid_repo_location(self): + Repo(tempfile.gettempdir()) + + 
@raises(NoSuchPathError) + def test_new_should_raise_on_non_existant_path(self): + Repo("repos/foobar") + + def test_repo_creation_from_different_paths(self): + r_from_gitdir = Repo(self.rorepo.git_dir) + assert r_from_gitdir.git_dir == self.rorepo.git_dir + assert r_from_gitdir.git_dir.endswith('.git') + assert not self.rorepo.git.working_dir.endswith('.git') + assert r_from_gitdir.git.working_dir == self.rorepo.git.working_dir + + def test_description(self): + txt = "Test repository" + self.rorepo.description = txt + assert_equal(self.rorepo.description, txt) + + def test_heads_should_return_array_of_head_objects(self): + for head in self.rorepo.heads: + assert_equal(Head, head.__class__) + + def test_heads_should_populate_head_data(self): + for head in self.rorepo.heads: + assert head.name + assert isinstance(head.commit,Commit) + # END for each head + + assert isinstance(self.rorepo.heads.master, Head) + assert isinstance(self.rorepo.heads['master'], Head) + + def test_tree_from_revision(self): + tree = self.rorepo.tree('0.1.6') + assert len(tree.sha) == 40 + assert tree.type == "tree" + assert self.rorepo.tree(tree) == tree + + # try from invalid revision that does not exist + self.failUnlessRaises(ValueError, self.rorepo.tree, 'hello world') + + def test_commits(self): + mc = 10 + commits = list(self.rorepo.iter_commits('0.1.6', max_count=mc)) + assert len(commits) == mc + + c = commits[0] + assert_equal('9a4b1d4d11eee3c5362a4152216376e634bd14cf', c.sha) + assert_equal(["c76852d0bff115720af3f27acdb084c59361e5f6"], [p.sha for p in c.parents]) + assert_equal("ce41fc29549042f1aa09cc03174896cf23f112e3", c.tree.sha) + assert_equal("Michael Trier", c.author.name) + assert_equal("mtrier@gmail.com", c.author.email) + assert_equal(1232829715, c.authored_date) + assert_equal(5*3600, c.author_tz_offset) + assert_equal("Michael Trier", c.committer.name) + assert_equal("mtrier@gmail.com", c.committer.email) + assert_equal(1232829715, c.committed_date) + assert_equal(5*3600, c.committer_tz_offset) + assert_equal("Bumped version 0.1.6\n", c.message) + + c = commits[1] + assert isinstance(c.parents, tuple) + + def test_trees(self): + mc = 30 + num_trees = 0 + for tree in self.rorepo.iter_trees('0.1.5', max_count=mc): + num_trees += 1 + assert isinstance(tree, Tree) + # END for each tree + assert num_trees == mc + + + def _test_empty_repo(self, repo): + # test all kinds of things with an empty, freshly initialized repo. 
+ # It should throw good errors + + # entries should be empty + assert len(repo.index.entries) == 0 + + # head is accessible + assert repo.head + assert repo.head.ref + assert not repo.head.is_valid() + + # we can change the head to some other ref + head_ref = Head.from_path(repo, Head.to_full_path('some_head')) + assert not head_ref.is_valid() + repo.head.ref = head_ref + + # is_dirty can handle all kwargs + for args in ((1, 0, 0), (0, 1, 0), (0, 0, 1)): + assert not repo.is_dirty(*args) + # END for each arg + + # we can add a file to the index ( if we are not bare ) + if not repo.bare: + pass + # END test repos with working tree + + + def test_init(self): + prev_cwd = os.getcwd() + os.chdir(tempfile.gettempdir()) + git_dir_rela = "repos/foo/bar.git" + del_dir_abs = os.path.abspath("repos") + git_dir_abs = os.path.abspath(git_dir_rela) + try: + # with specific path + for path in (git_dir_rela, git_dir_abs): + r = Repo.init(path=path, bare=True) + assert isinstance(r, Repo) + assert r.bare == True + assert os.path.isdir(r.git_dir) + + self._test_empty_repo(r) + shutil.rmtree(git_dir_abs) + # END for each path + + os.makedirs(git_dir_rela) + os.chdir(git_dir_rela) + r = Repo.init(bare=False) + r.bare == False + + self._test_empty_repo(r) + finally: + try: + shutil.rmtree(del_dir_abs) + except OSError: + pass + os.chdir(prev_cwd) + # END restore previous state + + def test_bare_property(self): + self.rorepo.bare + + @patch_object(Repo, '__init__') + @patch_object(Git, '_call_process') + def test_init_with_options(self, git, repo): + git.return_value = True + repo.return_value = None + + r = Repo.init("repos/foo/bar.git", **{'bare' : True,'template': "/baz/sweet"}) + assert isinstance(r, Repo) + + assert_true(git.called) + assert_true(repo.called) + + @patch_object(Repo, '__init__') + @patch_object(Git, '_call_process') + def test_clone(self, git, repo): + git.return_value = None + repo.return_value = None + + self.rorepo.clone("repos/foo/bar.git") + + assert_true(git.called) + path = os.path.join(absolute_project_path(), '.git') + assert_equal(git.call_args, (('clone', path, 'repos/foo/bar.git'), {})) + assert_true(repo.called) + + @patch_object(Repo, '__init__') + @patch_object(Git, '_call_process') + def test_clone_with_options(self, git, repo): + git.return_value = None + repo.return_value = None + + self.rorepo.clone("repos/foo/bar.git", **{'template': '/awesome'}) + + assert_true(git.called) + path = os.path.join(absolute_project_path(), '.git') + assert_equal(git.call_args, (('clone', path, 'repos/foo/bar.git'), + { 'template': '/awesome'})) + assert_true(repo.called) + + + def test_daemon_export(self): + orig_val = self.rorepo.daemon_export + self.rorepo.daemon_export = not orig_val + assert self.rorepo.daemon_export == ( not orig_val ) + self.rorepo.daemon_export = orig_val + assert self.rorepo.daemon_export == orig_val - def test_alternates(self): - cur_alternates = self.rorepo.alternates - # empty alternates - self.rorepo.alternates = [] - assert self.rorepo.alternates == [] - alts = [ "other/location", "this/location" ] - self.rorepo.alternates = alts - assert alts == self.rorepo.alternates - self.rorepo.alternates = cur_alternates - - def test_repr(self): - path = os.path.join(os.path.abspath(GIT_REPO), '.git') - assert_equal('<git.Repo "%s">' % path, repr(self.rorepo)) - - def test_is_dirty_with_bare_repository(self): - self.rorepo._bare = True - assert_false(self.rorepo.is_dirty()) - - def test_is_dirty(self): - self.rorepo._bare = False - for index in (0,1): - for working_tree 
in (0,1): - for untracked_files in (0,1): - assert self.rorepo.is_dirty(index, working_tree, untracked_files) in (True, False) - # END untracked files - # END working tree - # END index - self.rorepo._bare = True - assert self.rorepo.is_dirty() == False - - def test_head(self): - assert self.rorepo.head.reference.object == self.rorepo.active_branch.object - - def test_index(self): - index = self.rorepo.index - assert isinstance(index, IndexFile) - - def test_tag(self): - assert self.rorepo.tag('refs/tags/0.1.5').commit - - def test_archive(self): - tmpfile = os.tmpfile() - self.rorepo.archive(tmpfile, '0.1.5') - assert tmpfile.tell() - - @patch_object(Git, '_call_process') - def test_should_display_blame_information(self, git): - git.return_value = fixture('blame') - b = self.rorepo.blame( 'master', 'lib/git.py') - assert_equal(13, len(b)) - assert_equal( 2, len(b[0]) ) - # assert_equal(25, reduce(lambda acc, x: acc + len(x[-1]), b)) - assert_equal(hash(b[0][0]), hash(b[9][0])) - c = b[0][0] - assert_true(git.called) - assert_equal(git.call_args, (('blame', 'master', '--', 'lib/git.py'), {'p': True})) - - assert_equal('634396b2f541a9f2d58b00be1a07f0c358b999b3', c.sha) - assert_equal('Tom Preston-Werner', c.author.name) - assert_equal('tom@mojombo.com', c.author.email) - assert_equal(1191997100, c.authored_date) - assert_equal('Tom Preston-Werner', c.committer.name) - assert_equal('tom@mojombo.com', c.committer.email) - assert_equal(1191997100, c.committed_date) - assert_equal('initial grit setup', c.message) - - # test the 'lines per commit' entries - tlist = b[0][1] - assert_true( tlist ) - assert_true( isinstance( tlist[0], basestring ) ) - assert_true( len( tlist ) < sum( len(t) for t in tlist ) ) # test for single-char bug - - def test_untracked_files(self): - base = self.rorepo.working_tree_dir - files = ( join_path_native(base, "__test_myfile"), - join_path_native(base, "__test_other_file") ) - num_recently_untracked = 0 - try: - for fpath in files: - fd = open(fpath,"wb") - fd.close() - # END for each filename - untracked_files = self.rorepo.untracked_files - num_recently_untracked = len(untracked_files) - - # assure we have all names - they are relative to the git-dir - num_test_untracked = 0 - for utfile in untracked_files: - num_test_untracked += join_path_native(base, utfile) in files - assert len(files) == num_test_untracked - finally: - for fpath in files: - if os.path.isfile(fpath): - os.remove(fpath) - # END handle files - - assert len(self.rorepo.untracked_files) == (num_recently_untracked - len(files)) - - def test_config_reader(self): - reader = self.rorepo.config_reader() # all config files - assert reader.read_only - reader = self.rorepo.config_reader("repository") # single config file - assert reader.read_only - - def test_config_writer(self): - for config_level in self.rorepo.config_level: - try: - writer = self.rorepo.config_writer(config_level) - assert not writer.read_only - except IOError: - # its okay not to get a writer for some configuration files if we - # have no permissions - pass - # END for each config level - - def test_creation_deletion(self): - # just a very quick test to assure it generally works. 
There are - # specialized cases in the test_refs module - head = self.rorepo.create_head("new_head", "HEAD~1") - self.rorepo.delete_head(head) - - tag = self.rorepo.create_tag("new_tag", "HEAD~2") - self.rorepo.delete_tag(tag) - - remote = self.rorepo.create_remote("new_remote", "git@server:repo.git") - self.rorepo.delete_remote(remote) - - def test_comparison_and_hash(self): - # this is only a preliminary test, more testing done in test_index - assert self.rorepo == self.rorepo and not (self.rorepo != self.rorepo) - assert len(set((self.rorepo, self.rorepo))) == 1 + def test_alternates(self): + cur_alternates = self.rorepo.alternates + # empty alternates + self.rorepo.alternates = [] + assert self.rorepo.alternates == [] + alts = [ "other/location", "this/location" ] + self.rorepo.alternates = alts + assert alts == self.rorepo.alternates + self.rorepo.alternates = cur_alternates + + def test_repr(self): + path = os.path.join(os.path.abspath(GIT_REPO), '.git') + assert_equal('<git.Repo "%s">' % path, repr(self.rorepo)) + + def test_is_dirty_with_bare_repository(self): + self.rorepo._bare = True + assert_false(self.rorepo.is_dirty()) + + def test_is_dirty(self): + self.rorepo._bare = False + for index in (0,1): + for working_tree in (0,1): + for untracked_files in (0,1): + assert self.rorepo.is_dirty(index, working_tree, untracked_files) in (True, False) + # END untracked files + # END working tree + # END index + self.rorepo._bare = True + assert self.rorepo.is_dirty() == False + + def test_head(self): + assert self.rorepo.head.reference.object == self.rorepo.active_branch.object + + def test_index(self): + index = self.rorepo.index + assert isinstance(index, IndexFile) + + def test_tag(self): + assert self.rorepo.tag('refs/tags/0.1.5').commit + + def test_archive(self): + tmpfile = os.tmpfile() + self.rorepo.archive(tmpfile, '0.1.5') + assert tmpfile.tell() + + @patch_object(Git, '_call_process') + def test_should_display_blame_information(self, git): + git.return_value = fixture('blame') + b = self.rorepo.blame( 'master', 'lib/git.py') + assert_equal(13, len(b)) + assert_equal( 2, len(b[0]) ) + # assert_equal(25, reduce(lambda acc, x: acc + len(x[-1]), b)) + assert_equal(hash(b[0][0]), hash(b[9][0])) + c = b[0][0] + assert_true(git.called) + assert_equal(git.call_args, (('blame', 'master', '--', 'lib/git.py'), {'p': True})) + + assert_equal('634396b2f541a9f2d58b00be1a07f0c358b999b3', c.sha) + assert_equal('Tom Preston-Werner', c.author.name) + assert_equal('tom@mojombo.com', c.author.email) + assert_equal(1191997100, c.authored_date) + assert_equal('Tom Preston-Werner', c.committer.name) + assert_equal('tom@mojombo.com', c.committer.email) + assert_equal(1191997100, c.committed_date) + assert_equal('initial grit setup', c.message) + + # test the 'lines per commit' entries + tlist = b[0][1] + assert_true( tlist ) + assert_true( isinstance( tlist[0], basestring ) ) + assert_true( len( tlist ) < sum( len(t) for t in tlist ) ) # test for single-char bug + + def test_untracked_files(self): + base = self.rorepo.working_tree_dir + files = ( join_path_native(base, "__test_myfile"), + join_path_native(base, "__test_other_file") ) + num_recently_untracked = 0 + try: + for fpath in files: + fd = open(fpath,"wb") + fd.close() + # END for each filename + untracked_files = self.rorepo.untracked_files + num_recently_untracked = len(untracked_files) + + # assure we have all names - they are relative to the git-dir + num_test_untracked = 0 + for utfile in untracked_files: + num_test_untracked += 
join_path_native(base, utfile) in files + assert len(files) == num_test_untracked + finally: + for fpath in files: + if os.path.isfile(fpath): + os.remove(fpath) + # END handle files + + assert len(self.rorepo.untracked_files) == (num_recently_untracked - len(files)) + + def test_config_reader(self): + reader = self.rorepo.config_reader() # all config files + assert reader.read_only + reader = self.rorepo.config_reader("repository") # single config file + assert reader.read_only + + def test_config_writer(self): + for config_level in self.rorepo.config_level: + try: + writer = self.rorepo.config_writer(config_level) + assert not writer.read_only + except IOError: + # its okay not to get a writer for some configuration files if we + # have no permissions + pass + # END for each config level + + def test_creation_deletion(self): + # just a very quick test to assure it generally works. There are + # specialized cases in the test_refs module + head = self.rorepo.create_head("new_head", "HEAD~1") + self.rorepo.delete_head(head) + + tag = self.rorepo.create_tag("new_tag", "HEAD~2") + self.rorepo.delete_tag(tag) + + remote = self.rorepo.create_remote("new_remote", "git@server:repo.git") + self.rorepo.delete_remote(remote) + + def test_comparison_and_hash(self): + # this is only a preliminary test, more testing done in test_index + assert self.rorepo == self.rorepo and not (self.rorepo != self.rorepo) + assert len(set((self.rorepo, self.rorepo))) == 1 + + def test_git_cmd(self): + # test CatFileContentStream, just to be very sure we have no fencepost errors + # last \n is the terminating newline that it expects + l1 = "0123456789\n" + l2 = "abcdefghijklmnopqrstxy\n" + l3 = "z\n" + d = "%s%s%s\n" % (l1, l2, l3) + + l1p = l1[:5] + + # full size + # size is without terminating newline + def mkfull(): + return Git.CatFileContentStream(len(d)-1, StringIO(d)) + + ts = 5 + def mktiny(): + return Git.CatFileContentStream(ts, StringIO(d)) + + # readlines no limit + s = mkfull() + lines = s.readlines() + assert len(lines) == 3 and lines[-1].endswith('\n') + assert s._stream.tell() == len(d) # must have scrubbed to the end + + # realines line limit + s = mkfull() + lines = s.readlines(5) + assert len(lines) == 1 + + # readlines on tiny sections + s = mktiny() + lines = s.readlines() + assert len(lines) == 1 and lines[0] == l1p + assert s._stream.tell() == ts+1 + + # readline no limit + s = mkfull() + assert s.readline() == l1 + assert s.readline() == l2 + assert s.readline() == l3 + assert s.readline() == '' + assert s._stream.tell() == len(d) + + # readline limit + s = mkfull() + assert s.readline(5) == l1p + assert s.readline() == l1[5:] + + # readline on tiny section + s = mktiny() + assert s.readline() == l1p + assert s.readline() == '' + assert s._stream.tell() == ts+1 + + # read no limit + s = mkfull() + assert s.read() == d[:-1] + assert s.read() == '' + assert s._stream.tell() == len(d) + + # read limit + s = mkfull() + assert s.read(5) == l1p + assert s.read(6) == l1[5:] + assert s._stream.tell() == 5 + 6 # its not yet done + + # read tiny + s = mktiny() + assert s.read(2) == l1[:2] + assert s._stream.tell() == 2 + assert s.read() == l1[2:ts] + assert s._stream.tell() == ts+1 diff --git a/test/git/test_utils.py b/test/git/test_utils.py index f843c12e..83ef7e4b 100644 --- a/test/git/test_utils.py +++ b/test/git/test_utils.py @@ -9,112 +9,144 @@ import tempfile from test.testlib import * from git.utils import * +from git.objects.utils import * from git import * from git.cmd import dashify import time 
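+
+# For orientation: the LockFile protocol exercised by the tests below boils down
+# to this sketch (these are internal methods, shown for illustration only):
+#
+#     lock = LockFile(tempfile.mktemp())
+#     lock._obtain_lock_or_raise()    # a second holder would raise IOError
+#     assert lock._has_lock()
+#     lock._release_lock()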
class TestUtils(TestCase): - def setup(self): - self.testdict = { - "string": "42", - "int": 42, - "array": [ 42 ], - } + def setup(self): + self.testdict = { + "string": "42", + "int": 42, + "array": [ 42 ], + } - def test_it_should_dashify(self): - assert_equal('this-is-my-argument', dashify('this_is_my_argument')) - assert_equal('foo', dashify('foo')) - - - def test_lock_file(self): - my_file = tempfile.mktemp() - lock_file = LockFile(my_file) - assert not lock_file._has_lock() - # release lock we don't have - fine - lock_file._release_lock() - - # get lock - lock_file._obtain_lock_or_raise() - assert lock_file._has_lock() - - # concurrent access - other_lock_file = LockFile(my_file) - assert not other_lock_file._has_lock() - self.failUnlessRaises(IOError, other_lock_file._obtain_lock_or_raise) - - lock_file._release_lock() - assert not lock_file._has_lock() - - other_lock_file._obtain_lock_or_raise() - self.failUnlessRaises(IOError, lock_file._obtain_lock_or_raise) - - # auto-release on destruction - del(other_lock_file) - lock_file._obtain_lock_or_raise() - lock_file._release_lock() - - def test_blocking_lock_file(self): - my_file = tempfile.mktemp() - lock_file = BlockingLockFile(my_file) - lock_file._obtain_lock() - - # next one waits for the lock - start = time.time() - wait_time = 0.1 - wait_lock = BlockingLockFile(my_file, 0.05, wait_time) - self.failUnlessRaises(IOError, wait_lock._obtain_lock) - elapsed = time.time() - start - assert elapsed <= wait_time + 0.02 # some extra time it may cost - - def _cmp_contents(self, file_path, data): - # raise if data from file at file_path - # does not match data string - fp = open(file_path, "rb") - try: - assert fp.read() == data - finally: - fp.close() - - def test_safe_operation(self): - my_file = tempfile.mktemp() - orig_data = "hello" - new_data = "world" - my_file_fp = open(my_file, "wb") - my_file_fp.write(orig_data) - my_file_fp.close() - - try: - cwrite = ConcurrentWriteOperation(my_file) - - # didn't start writing, doesnt matter - cwrite._end_writing(False) - cwrite._end_writing(True) - assert not cwrite._is_writing() - - # write data and fail - stream = cwrite._begin_writing() - assert cwrite._is_writing() - stream.write(new_data) - cwrite._end_writing(successful=False) - self._cmp_contents(my_file, orig_data) - assert not os.path.exists(stream.name) - - # write data - concurrently - ocwrite = ConcurrentWriteOperation(my_file) - stream = cwrite._begin_writing() - self.failUnlessRaises(IOError, ocwrite._begin_writing) - - stream.write("world") - cwrite._end_writing(successful=True) - self._cmp_contents(my_file, new_data) - assert not os.path.exists(stream.name) - - # could test automatic _end_writing on destruction - finally: - os.remove(my_file) - # END final cleanup - - - - + def test_it_should_dashify(self): + assert_equal('this-is-my-argument', dashify('this_is_my_argument')) + assert_equal('foo', dashify('foo')) + + + def test_lock_file(self): + my_file = tempfile.mktemp() + lock_file = LockFile(my_file) + assert not lock_file._has_lock() + # release lock we don't have - fine + lock_file._release_lock() + + # get lock + lock_file._obtain_lock_or_raise() + assert lock_file._has_lock() + + # concurrent access + other_lock_file = LockFile(my_file) + assert not other_lock_file._has_lock() + self.failUnlessRaises(IOError, other_lock_file._obtain_lock_or_raise) + + lock_file._release_lock() + assert not lock_file._has_lock() + + other_lock_file._obtain_lock_or_raise() + self.failUnlessRaises(IOError, 
lock_file._obtain_lock_or_raise) + + # auto-release on destruction + del(other_lock_file) + lock_file._obtain_lock_or_raise() + lock_file._release_lock() + + def test_blocking_lock_file(self): + my_file = tempfile.mktemp() + lock_file = BlockingLockFile(my_file) + lock_file._obtain_lock() + + # next one waits for the lock + start = time.time() + wait_time = 0.1 + wait_lock = BlockingLockFile(my_file, 0.05, wait_time) + self.failUnlessRaises(IOError, wait_lock._obtain_lock) + elapsed = time.time() - start + assert elapsed <= wait_time + 0.02 # some extra time it may cost + + def _cmp_contents(self, file_path, data): + # raise if data from file at file_path + # does not match data string + fp = open(file_path, "rb") + try: + assert fp.read() == data + finally: + fp.close() + + def test_safe_operation(self): + my_file = tempfile.mktemp() + orig_data = "hello" + new_data = "world" + my_file_fp = open(my_file, "wb") + my_file_fp.write(orig_data) + my_file_fp.close() + + try: + cwrite = ConcurrentWriteOperation(my_file) + + # didn't start writing, doesnt matter + cwrite._end_writing(False) + cwrite._end_writing(True) + assert not cwrite._is_writing() + + # write data and fail + stream = cwrite._begin_writing() + assert cwrite._is_writing() + stream.write(new_data) + cwrite._end_writing(successful=False) + self._cmp_contents(my_file, orig_data) + assert not os.path.exists(stream.name) + + # write data - concurrently + ocwrite = ConcurrentWriteOperation(my_file) + stream = cwrite._begin_writing() + self.failUnlessRaises(IOError, ocwrite._begin_writing) + + stream.write("world") + cwrite._end_writing(successful=True) + self._cmp_contents(my_file, new_data) + assert not os.path.exists(stream.name) + + # could test automatic _end_writing on destruction + finally: + os.remove(my_file) + # END final cleanup + + def test_user_id(self): + assert '@' in get_user_id() + + def test_parse_date(self): + # test all supported formats + def assert_rval(rval, veri_time, offset=0): + assert len(rval) == 2 + assert isinstance(rval[0], int) and isinstance(rval[1], int) + assert rval[0] == veri_time + assert rval[1] == offset + + # now that we are here, test our conversion functions as well + utctz = altz_to_utctz_str(offset) + assert isinstance(utctz, basestring) + assert utctz_to_altz(verify_utctz(utctz)) == offset + # END assert rval utility + + rfc = ("Thu, 07 Apr 2005 22:13:11 +0000", 0) + iso = ("2005-04-07T22:13:11 -0200", 7200) + iso2 = ("2005-04-07 22:13:11 +0400", -14400) + iso3 = ("2005.04.07 22:13:11 -0000", 0) + alt = ("04/07/2005 22:13:11", 0) + alt2 = ("07.04.2005 22:13:11", 0) + veri_time = 1112904791 # the time this represents + for date, offset in (rfc, iso, iso2, iso3, alt, alt2): + assert_rval(parse_date(date), veri_time, offset) + # END for each date type + + # and failure + self.failUnlessRaises(ValueError, parse_date, 'invalid format') + self.failUnlessRaises(ValueError, parse_date, '123456789 -02000') + self.failUnlessRaises(ValueError, parse_date, ' 123456789 -0200') + + diff --git a/test/testlib/helper.py b/test/testlib/helper.py index 9c38ffd5..c9b4c2ac 100644 --- a/test/testlib/helper.py +++ b/test/testlib/helper.py @@ -9,6 +9,7 @@ from git import Repo, Remote, GitCommandError from unittest import TestCase import tempfile import shutil +import cStringIO GIT_REPO = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) @@ -23,40 +24,13 @@ def absolute_project_path(): return os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) -class ListProcessAdapter(object): - 
"""Allows to use lists as Process object as returned by SubProcess.Popen. +class StringProcessAdapter(object): + """Allows to use strings as Process object as returned by SubProcess.Popen. Its tailored to work with the test system only""" - class Stream(object): - """Simple stream emulater meant to work only with tests""" - def __init__(self, data): - self.data = data - self.cur_iter = None - - def __iter__(self): - dat = self.data - if isinstance(dat, basestring): - dat = dat.splitlines() - if self.cur_iter is None: - self.cur_iter = iter(dat) - return self.cur_iter - - def read(self): - dat = self.data - if isinstance(dat, (tuple,list)): - dat = "\n".join(dat) - return dat - - def next(self): - if self.cur_iter is None: - self.cur_iter = iter(self) - return self.cur_iter.next() - - # END stream - - def __init__(self, input_list_or_string): - self.stdout = self.Stream(input_list_or_string) - self.stderr = self.Stream('') + def __init__(self, input_string): + self.stdout = cStringIO.StringIO(input_string) + self.stderr = cStringIO.StringIO() def wait(self): return 0 |