diff options
author | Sebastian Thiel <byronimo@gmail.com> | 2009-10-09 12:14:02 +0200 |
---|---|---|
committer | Sebastian Thiel <byronimo@gmail.com> | 2009-10-09 15:13:05 +0200 |
commit | 52ab307935bd2bbda52f853f9fc6b49f01897727 (patch) | |
tree | 8950b9658b4f0fba902e80077ba5ee88f50c4541 /lib/git | |
parent | 07c20b4231b12fee42d15f1c44c948ce474f5851 (diff) | |
download | gitpython-52ab307935bd2bbda52f853f9fc6b49f01897727.tar.gz |
diff regex are now precompiled on class level, renamed a|b_blob to a|b_blob_id as it better reflects the actual value
actor regex now precompiled on class level
blob regex now precompiled on class level; made blame method more readable and faster although it can still be improved by making assumptions about the blame format and by reading the git command stream directly ( which is a general issue right now )
Diffstat (limited to 'lib/git')
-rw-r--r-- | lib/git/actor.py | 8 | ||||
-rw-r--r-- | lib/git/blob.py | 111 | ||||
-rw-r--r-- | lib/git/diff.py | 45 |
3 files changed, 98 insertions, 66 deletions
diff --git a/lib/git/actor.py b/lib/git/actor.py index bc1a4479..28f50e73 100644 --- a/lib/git/actor.py +++ b/lib/git/actor.py @@ -10,6 +10,10 @@ class Actor(object): """Actors hold information about a person acting on the repository. They can be committers and authors or anything with a name and an email as mentioned in the git log entries.""" + # precompiled regex + name_only_regex = re.compile( r'<.+>' ) + name_email_regex = re.compile( r'(.*) <(.+?)>' ) + def __init__(self, name, email): self.name = name self.email = email @@ -34,8 +38,8 @@ class Actor(object): Returns Actor """ - if re.search(r'<.+>', string): - m = re.search(r'(.*) <(.+?)>', string) + if cls.name_only_regex.search(string): + m = cls.name_email_regex.search(string) name, email = m.groups() return Actor(name, email) else: diff --git a/lib/git/blob.py b/lib/git/blob.py index dac0888f..1e8aa12b 100644 --- a/lib/git/blob.py +++ b/lib/git/blob.py @@ -15,6 +15,12 @@ class Blob(object): """A Blob encapsulates a git blob object""" DEFAULT_MIME_TYPE = "text/plain" + # precompiled regex + re_whitespace = re.compile(r'\s+') + re_hexsha_only = re.compile('^[0-9A-Fa-f]{40}$') + re_author_committer_start = re.compile(r'^(author|committer)') + re_tab_full_line = re.compile(r'^\t(.*)$') + def __init__(self, repo, id, mode=None, path=None): """ Create an unbaked Blob containing just the specified attributes @@ -112,49 +118,68 @@ class Blob(object): info = None for line in data.splitlines(): - parts = re.split(r'\s+', line, 1) - if re.search(r'^[0-9A-Fa-f]{40}$', parts[0]): - if re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+) (\d+)$', line): - m = re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+) (\d+)$', line) - id, origin_line, final_line, group_lines = m.groups() - info = {'id': id} - blames.append([None, []]) - elif re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+)$', line): - m = re.search(r'^([0-9A-Fa-f]{40}) (\d+) (\d+)$', line) - id, origin_line, final_line = m.groups() - info = {'id': id} - elif re.search(r'^(author|committer)', parts[0]): - if re.search(r'^(.+)-mail$', parts[0]): - m = re.search(r'^(.+)-mail$', parts[0]) - info["%s_email" % m.groups()[0]] = parts[-1] - elif re.search(r'^(.+)-time$', parts[0]): - m = re.search(r'^(.+)-time$', parts[0]) - info["%s_date" % m.groups()[0]] = time.gmtime(int(parts[-1])) - elif re.search(r'^(author|committer)$', parts[0]): - m = re.search(r'^(author|committer)$', parts[0]) - info[m.groups()[0]] = parts[-1] - elif re.search(r'^filename', parts[0]): - info['filename'] = parts[-1] - elif re.search(r'^summary', parts[0]): - info['summary'] = parts[-1] - elif parts[0] == '': - if info: - c = commits.has_key(info['id']) and commits[info['id']] - if not c: - c = Commit(repo, id=info['id'], - author=Actor.from_string(info['author'] + ' ' + info['author_email']), - authored_date=info['author_date'], - committer=Actor.from_string(info['committer'] + ' ' + info['committer_email']), - committed_date=info['committer_date'], - message=info['summary']) - commits[info['id']] = c - - m = re.search(r'^\t(.*)$', line) - text, = m.groups() - blames[-1][0] = c - blames[-1][1].append( text ) - info = None - + parts = cls.re_whitespace.split(line, 1) + firstpart = parts[0] + if cls.re_hexsha_only.search(firstpart): + # handles + # 634396b2f541a9f2d58b00be1a07f0c358b999b3 1 1 7 - indicates blame-data start + # 634396b2f541a9f2d58b00be1a07f0c358b999b3 2 2 + digits = parts[-1].split(" ") + if len(digits) == 3: + info = {'id': firstpart} + blames.append([None, []]) + # END blame data initialization + else: + m = cls.re_author_committer_start.search(firstpart) + if m: + # handles: + # author Tom Preston-Werner + # author-mail <tom@mojombo.com> + # author-time 1192271832 + # author-tz -0700 + # committer Tom Preston-Werner + # committer-mail <tom@mojombo.com> + # committer-time 1192271832 + # committer-tz -0700 - IGNORED BY US + role = m.group(0) + if firstpart.endswith('-mail'): + info["%s_email" % role] = parts[-1] + elif firstpart.endswith('-time'): + info["%s_date" % role] = time.gmtime(int(parts[-1])) + elif role == firstpart: + info[role] = parts[-1] + # END distinguish mail,time,name + else: + # handle + # filename lib/grit.rb + # summary add Blob + # <and rest> + if firstpart.startswith('filename'): + info['filename'] = parts[-1] + elif firstpart.startswith('summary'): + info['summary'] = parts[-1] + elif firstpart == '': + if info: + sha = info['id'] + c = commits.get(sha) + if c is None: + c = Commit( repo, id=sha, + author=Actor.from_string(info['author'] + ' ' + info['author_email']), + authored_date=info['author_date'], + committer=Actor.from_string(info['committer'] + ' ' + info['committer_email']), + committed_date=info['committer_date'], + message=info['summary']) + commits[sha] = c + # END if commit objects needs initial creation + m = cls.re_tab_full_line.search(line) + text, = m.groups() + blames[-1][0] = c + blames[-1][1].append( text ) + info = None + # END if we collected commit info + # END distinguish filename,summary,rest + # END distinguish author|committer vs filename,summary,rest + # END distinguish hexsha vs other information return blames def __repr__(self): diff --git a/lib/git/diff.py b/lib/git/diff.py index db12f1e4..75450d70 100644 --- a/lib/git/diff.py +++ b/lib/git/diff.py @@ -29,20 +29,36 @@ class Diff(object): b_mode is None b_blob is NOne """ + + # precompiled regex + re_header = re.compile(r""" + #^diff[ ]--git + [ ]a/(?P<a_path>\S+)[ ]b/(?P<b_path>\S+)\n + (?:^similarity[ ]index[ ](?P<similarity_index>\d+)%\n + ^rename[ ]from[ ](?P<rename_from>\S+)\n + ^rename[ ]to[ ](?P<rename_to>\S+)(?:\n|$))? + (?:^old[ ]mode[ ](?P<old_mode>\d+)\n + ^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))? + (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))? + (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))? + (?:^index[ ](?P<a_blob_id>[0-9A-Fa-f]+) + \.\.(?P<b_blob_id>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))? + """, re.VERBOSE | re.MULTILINE) + re_is_null_hexsha = re.compile( r'^0{40}$' ) - def __init__(self, repo, a_path, b_path, a_blob, b_blob, a_mode, + def __init__(self, repo, a_path, b_path, a_blob_id, b_blob_id, a_mode, b_mode, new_file, deleted_file, rename_from, rename_to, diff): self.repo = repo - if not a_blob or re.search(r'^0{40}$', a_blob): + if not a_blob_id or self.re_is_null_hexsha.search(a_blob_id): self.a_blob = None else: - self.a_blob = blob.Blob(repo, id=a_blob, mode=a_mode, path=a_path) - if not b_blob or re.search(r'^0{40}$', b_blob): + self.a_blob = blob.Blob(repo, id=a_blob_id, mode=a_mode, path=a_path) + if not b_blob_id or self.re_is_null_hexsha.search(b_blob_id): self.b_blob = None else: - self.b_blob = blob.Blob(repo, id=b_blob, mode=b_mode, path=b_path) + self.b_blob = blob.Blob(repo, id=b_blob_id, mode=b_mode, path=b_path) self.a_mode = a_mode self.b_mode = b_mode @@ -68,29 +84,16 @@ class Diff(object): """ diffs = [] - diff_header = re.compile(r""" - #^diff[ ]--git - [ ]a/(?P<a_path>\S+)[ ]b/(?P<b_path>\S+)\n - (?:^similarity[ ]index[ ](?P<similarity_index>\d+)%\n - ^rename[ ]from[ ](?P<rename_from>\S+)\n - ^rename[ ]to[ ](?P<rename_to>\S+)(?:\n|$))? - (?:^old[ ]mode[ ](?P<old_mode>\d+)\n - ^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))? - (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))? - (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))? - (?:^index[ ](?P<a_blob>[0-9A-Fa-f]+) - \.\.(?P<b_blob>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))? - """, re.VERBOSE | re.MULTILINE).match - + diff_header = cls.re_header.match for diff in ('\n' + text).split('\ndiff --git')[1:]: header = diff_header(diff) a_path, b_path, similarity_index, rename_from, rename_to, \ old_mode, new_mode, new_file_mode, deleted_file_mode, \ - a_blob, b_blob, b_mode = header.groups() + a_blob_id, b_blob_id, b_mode = header.groups() new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode) - diffs.append(Diff(repo, a_path, b_path, a_blob, b_blob, + diffs.append(Diff(repo, a_path, b_path, a_blob_id, b_blob_id, old_mode or deleted_file_mode, new_mode or new_file_mode or b_mode, new_file, deleted_file, rename_from, rename_to, diff[header.end():])) |