From df0892351a394d768489b5647d47b73c24d3ef5f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 00:48:16 +0200 Subject: commit: initial version of commit_from_tree which could create commit objects if it could serialize itself --- lib/git/objects/utils.py | 411 ++++++++++++++++++++++++++++++----------------- 1 file changed, 263 insertions(+), 148 deletions(-) (limited to 'lib/git/objects/utils.py') diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 4f17b652..7060e293 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -9,159 +9,274 @@ Module for general utility functions import re from collections import deque as Deque from git.actor import Actor +import platform + +from string import digits +import time +import os + +__all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', + 'ProcessStreamAdapter', 'Traversable') def get_object_type_by_name(object_type_name): - """ - Returns - type suitable to handle the given object type name. - Use the type to create new instances. - - ``object_type_name`` - Member of TYPES - - Raises - ValueError: In case object_type_name is unknown - """ - if object_type_name == "commit": - import commit - return commit.Commit - elif object_type_name == "tag": - import tag - return tag.TagObject - elif object_type_name == "blob": - import blob - return blob.Blob - elif object_type_name == "tree": - import tree - return tree.Tree - else: - raise ValueError("Cannot handle unknown object type: %s" % object_type_name) - - + """ + Returns + type suitable to handle the given object type name. + Use the type to create new instances. + + ``object_type_name`` + Member of TYPES + + Raises + ValueError: In case object_type_name is unknown + """ + if object_type_name == "commit": + import commit + return commit.Commit + elif object_type_name == "tag": + import tag + return tag.TagObject + elif object_type_name == "blob": + import blob + return blob.Blob + elif object_type_name == "tree": + import tree + return tree.Tree + else: + raise ValueError("Cannot handle unknown object type: %s" % object_type_name) + + +def get_user_id(): + """:return: string identifying the currently active system user as name@node + :note: user can be set with the 'USER' environment variable, usually set on windows""" + ukn = 'UNKNOWN' + username = os.environ.get('USER', ukn) + if username == ukn and hasattr(os, 'getlogin'): + username = os.getlogin() + # END get username from login + return "%s@%s" % (username, platform.node()) + + +def _utc_tz_to_altz(utctz): + """we convert utctz to the timezone in seconds, it is the format time.altzone + returns. Git stores it as UTC timezon which has the opposite sign as well, + which explains the -1 * ( that was made explicit here ) + :param utctz: git utc timezone string, i.e. +0200""" + return -1 * int(float(utctz)/100*3600) + +def _verify_utctz(offset): + """:raise ValueError: if offset is incorrect + :return: offset""" + fmt_exc = ValueError("Invalid timezone offset format: %s" % offset) + if len(offset) != 5: + raise fmt_exc + if offset[0] not in "+-": + raise fmt_exc + if offset[1] not in digits or \ + offset[2] not in digits or \ + offset[3] not in digits or \ + offset[4] not in digits: + raise fmt_exc + # END for each char + return offset + +def parse_date(string_date): + """ + Parse the given date as one of the following + * Git internal format: timestamp offset + * RFC 2822: Thu, 07 Apr 2005 22:13:13 +0200. + * ISO 8601 2005-04-07T22:13:13 + The T can be a space as well + + :return: Tuple(int(timestamp), int(offset), both in seconds since epoch + :raise ValueError: If the format could not be understood + :note: Date can also be YYYY.MM.DD, MM/DD/YYYY and DD.MM.YYYY + """ + # git time + try: + if string_date.count(' ') == 1 and string_date.rfind(':') == -1: + timestamp, offset = string_date.split() + timestamp = int(timestamp) + return timestamp, _utc_tz_to_altz(_verify_utctz(offset)) + else: + offset = "+0000" # local time by default + if string_date[-5] in '-+': + offset = _verify_utctz(string_date[-5:]) + string_date = string_date[:-6] # skip space as well + # END split timezone info + + # now figure out the date and time portion - split time + date_formats = list() + splitter = -1 + if ',' in string_date: + date_formats.append("%a, %d %b %Y") + splitter = string_date.rfind(' ') + else: + # iso plus additional + date_formats.append("%Y-%m-%d") + date_formats.append("%Y.%m.%d") + date_formats.append("%m/%d/%Y") + date_formats.append("%d.%m.%Y") + + splitter = string_date.rfind('T') + if splitter == -1: + splitter = string_date.rfind(' ') + # END handle 'T' and ' ' + # END handle rfc or iso + + assert splitter > -1 + + # split date and time + time_part = string_date[splitter+1:] # skip space + date_part = string_date[:splitter] + + # parse time + tstruct = time.strptime(time_part, "%H:%M:%S") + + for fmt in date_formats: + try: + dtstruct = time.strptime(date_part, fmt) + fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, + tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec, + dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst)) + return int(time.mktime(fstruct)), _utc_tz_to_altz(offset) + except ValueError: + continue + # END exception handling + # END for each fmt + + # still here ? fail + raise ValueError("no format matched") + # END handle format + except Exception: + raise ValueError("Unsupported date format: %s" % string_date) + # END handle exceptions + + # precompiled regex _re_actor_epoch = re.compile(r'^.+? (.*) (\d+) ([+-]\d+).*$') def parse_actor_and_date(line): - """ - Parse out the actor (author or committer) info from a line like:: - - author Tom Preston-Werner 1191999972 -0700 - - Returns - [Actor, int_seconds_since_epoch, int_timezone_offset] - """ - m = _re_actor_epoch.search(line) - actor, epoch, offset = m.groups() - return (Actor._from_string(actor), int(epoch), -int(float(offset)/100*3600)) - - - + """ + Parse out the actor (author or committer) info from a line like:: + + author Tom Preston-Werner 1191999972 -0700 + + Returns + [Actor, int_seconds_since_epoch, int_timezone_offset] + """ + m = _re_actor_epoch.search(line) + actor, epoch, offset = m.groups() + return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset)) + + + class ProcessStreamAdapter(object): - """ - Class wireing all calls to the contained Process instance. - - Use this type to hide the underlying process to provide access only to a specified - stream. The process is usually wrapped into an AutoInterrupt class to kill - it if the instance goes out of scope. - """ - __slots__ = ("_proc", "_stream") - def __init__(self, process, stream_name): - self._proc = process - self._stream = getattr(process, stream_name) - - def __getattr__(self, attr): - return getattr(self._stream, attr) - - + """ + Class wireing all calls to the contained Process instance. + + Use this type to hide the underlying process to provide access only to a specified + stream. The process is usually wrapped into an AutoInterrupt class to kill + it if the instance goes out of scope. + """ + __slots__ = ("_proc", "_stream") + def __init__(self, process, stream_name): + self._proc = process + self._stream = getattr(process, stream_name) + + def __getattr__(self, attr): + return getattr(self._stream, attr) + + class Traversable(object): - """Simple interface to perforam depth-first or breadth-first traversals - into one direction. - Subclasses only need to implement one function. - Instances of the Subclass must be hashable""" - __slots__ = tuple() - - @classmethod - def _get_intermediate_items(cls, item): - """ - Returns: - List of items connected to the given item. - Must be implemented in subclass - """ - raise NotImplementedError("To be implemented in subclass") - - - def traverse( self, predicate = lambda i,d: True, - prune = lambda i,d: False, depth = -1, branch_first=True, - visit_once = True, ignore_self=1, as_edge = False ): - """ - ``Returns`` - iterator yieling of items found when traversing self - - ``predicate`` - f(i,d) returns False if item i at depth d should not be included in the result - - ``prune`` - f(i,d) return True if the search should stop at item i at depth d. - Item i will not be returned. - - ``depth`` - define at which level the iteration should not go deeper - if -1, there is no limit - if 0, you would effectively only get self, the root of the iteration - i.e. if 1, you would only get the first level of predessessors/successors - - ``branch_first`` - if True, items will be returned branch first, otherwise depth first - - ``visit_once`` - if True, items will only be returned once, although they might be encountered - several times. Loops are prevented that way. - - ``ignore_self`` - if True, self will be ignored and automatically pruned from - the result. Otherwise it will be the first item to be returned. - If as_edge is True, the source of the first edge is None - - ``as_edge`` - if True, return a pair of items, first being the source, second the - destinatination, i.e. tuple(src, dest) with the edge spanning from - source to destination""" - visited = set() - stack = Deque() - stack.append( ( 0 ,self, None ) ) # self is always depth level 0 - - def addToStack( stack, item, branch_first, depth ): - lst = self._get_intermediate_items( item ) - if not lst: - return - if branch_first: - stack.extendleft( ( depth , i, item ) for i in lst ) - else: - reviter = ( ( depth , lst[i], item ) for i in range( len( lst )-1,-1,-1) ) - stack.extend( reviter ) - # END addToStack local method - - while stack: - d, item, src = stack.pop() # depth of item, item, item_source - - if visit_once and item in visited: - continue - - if visit_once: - visited.add(item) - - rval = ( as_edge and (src, item) ) or item - if prune( rval, d ): - continue - - skipStartItem = ignore_self and ( item == self ) - if not skipStartItem and predicate( rval, d ): - yield rval - - # only continue to next level if this is appropriate ! - nd = d + 1 - if depth > -1 and nd > depth: - continue - - addToStack( stack, item, branch_first, nd ) - # END for each item on work stack + """Simple interface to perforam depth-first or breadth-first traversals + into one direction. + Subclasses only need to implement one function. + Instances of the Subclass must be hashable""" + __slots__ = tuple() + + @classmethod + def _get_intermediate_items(cls, item): + """ + Returns: + List of items connected to the given item. + Must be implemented in subclass + """ + raise NotImplementedError("To be implemented in subclass") + + + def traverse( self, predicate = lambda i,d: True, + prune = lambda i,d: False, depth = -1, branch_first=True, + visit_once = True, ignore_self=1, as_edge = False ): + """ + ``Returns`` + iterator yieling of items found when traversing self + + ``predicate`` + f(i,d) returns False if item i at depth d should not be included in the result + + ``prune`` + f(i,d) return True if the search should stop at item i at depth d. + Item i will not be returned. + + ``depth`` + define at which level the iteration should not go deeper + if -1, there is no limit + if 0, you would effectively only get self, the root of the iteration + i.e. if 1, you would only get the first level of predessessors/successors + + ``branch_first`` + if True, items will be returned branch first, otherwise depth first + + ``visit_once`` + if True, items will only be returned once, although they might be encountered + several times. Loops are prevented that way. + + ``ignore_self`` + if True, self will be ignored and automatically pruned from + the result. Otherwise it will be the first item to be returned. + If as_edge is True, the source of the first edge is None + + ``as_edge`` + if True, return a pair of items, first being the source, second the + destinatination, i.e. tuple(src, dest) with the edge spanning from + source to destination""" + visited = set() + stack = Deque() + stack.append( ( 0 ,self, None ) ) # self is always depth level 0 + + def addToStack( stack, item, branch_first, depth ): + lst = self._get_intermediate_items( item ) + if not lst: + return + if branch_first: + stack.extendleft( ( depth , i, item ) for i in lst ) + else: + reviter = ( ( depth , lst[i], item ) for i in range( len( lst )-1,-1,-1) ) + stack.extend( reviter ) + # END addToStack local method + + while stack: + d, item, src = stack.pop() # depth of item, item, item_source + + if visit_once and item in visited: + continue + + if visit_once: + visited.add(item) + + rval = ( as_edge and (src, item) ) or item + if prune( rval, d ): + continue + + skipStartItem = ignore_self and ( item == self ) + if not skipStartItem and predicate( rval, d ): + yield rval + + # only continue to next level if this is appropriate ! + nd = d + 1 + if depth > -1 and nd > depth: + continue + + addToStack( stack, item, branch_first, nd ) + # END for each item on work stack -- cgit v1.2.1 From 8c1a87d11df666d308d14e4ae7ee0e9d614296b6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 12:30:33 +0200 Subject: commit: refactored existing code to decode commits from streams - performance is slightly better git.cmd: added method to provide access to the content stream directly. This is more efficient if large objects are handled, if it is actually used test.helpers: removed unnecessary code --- lib/git/objects/utils.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'lib/git/objects/utils.py') diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 7060e293..6d378a72 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -280,3 +280,20 @@ class Traversable(object): addToStack( stack, item, branch_first, nd ) # END for each item on work stack + + +class Serializable(object): + """Defines methods to serialize and deserialize objects from and into a data stream""" + + def _serialize(self, stream): + """Serialize the data of this object into the given data stream + :note: a serialized object would ``_deserialize`` into the same objet + :param stream: a file-like object + :return: self""" + raise NotImplementedError("To be implemented in subclass") + + def _deserialize(self, stream): + """Deserialize all information regarding this object from the stream + :param stream: a file-like object + :return: self""" + raise NotImplementedError("To be implemented in subclass") -- cgit v1.2.1 From 1e2b46138ba58033738a24dadccc265748fce2ca Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 23:20:34 +0200 Subject: commit.create_from_tree now uses pure python implementation, fixed message parsing which truncated newlines although it was ilegitimate. Its up to the reader to truncate therse, nowhere in the git code I could find anyone adding newlines to commits where it is written Added performance tests for serialization, it does about 5k commits per second if writing to tmpfs --- lib/git/objects/utils.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'lib/git/objects/utils.py') diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 6d378a72..c93f2091 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -16,7 +16,8 @@ import time import os __all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', - 'ProcessStreamAdapter', 'Traversable') + 'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz', + 'verify_utctz') def get_object_type_by_name(object_type_name): """ @@ -57,14 +58,24 @@ def get_user_id(): return "%s@%s" % (username, platform.node()) -def _utc_tz_to_altz(utctz): +def utctz_to_altz(utctz): """we convert utctz to the timezone in seconds, it is the format time.altzone returns. Git stores it as UTC timezon which has the opposite sign as well, which explains the -1 * ( that was made explicit here ) :param utctz: git utc timezone string, i.e. +0200""" return -1 * int(float(utctz)/100*3600) + +def altz_to_utctz_str(altz): + """As above, but inverses the operation, returning a string that can be used + in commit objects""" + utci = -1 * int((altz / 3600)*100) + utcs = str(abs(utci)) + utcs = "0"*(4-len(utcs)) + utcs + prefix = (utci < 0 and '-') or '+' + return prefix + utcs + -def _verify_utctz(offset): +def verify_utctz(offset): """:raise ValueError: if offset is incorrect :return: offset""" fmt_exc = ValueError("Invalid timezone offset format: %s" % offset) @@ -97,11 +108,11 @@ def parse_date(string_date): if string_date.count(' ') == 1 and string_date.rfind(':') == -1: timestamp, offset = string_date.split() timestamp = int(timestamp) - return timestamp, _utc_tz_to_altz(_verify_utctz(offset)) + return timestamp, utctz_to_altz(verify_utctz(offset)) else: offset = "+0000" # local time by default if string_date[-5] in '-+': - offset = _verify_utctz(string_date[-5:]) + offset = verify_utctz(string_date[-5:]) string_date = string_date[:-6] # skip space as well # END split timezone info @@ -139,7 +150,7 @@ def parse_date(string_date): fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec, dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst)) - return int(time.mktime(fstruct)), _utc_tz_to_altz(offset) + return int(time.mktime(fstruct)), utctz_to_altz(offset) except ValueError: continue # END exception handling @@ -167,7 +178,7 @@ def parse_actor_and_date(line): """ m = _re_actor_epoch.search(line) actor, epoch, offset = m.groups() - return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset)) + return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset)) -- cgit v1.2.1