summaryrefslogtreecommitdiff
path: root/lib/git/diff.py
diff options
context:
space:
mode:
authorPaul Sowden <paul@idontsmoke.co.uk>2008-11-20 00:09:43 -0800
committerMichael Trier <mtrier@gmail.com>2008-12-15 14:12:23 -0500
commit753e908dcea03cf9962cf45d3965cf93b0d30d94 (patch)
tree409595f1b5368461b5afc7ee0db642c6a83fd538 /lib/git/diff.py
parent9e14356d12226cb140b0e070bd079468b4ab599b (diff)
downloadgitpython-753e908dcea03cf9962cf45d3965cf93b0d30d94.tar.gz
implement a far far faster diff parser
The old diff parser in list_from_string took a large amount of time to parse long diffs; on one of my repositories it took over 3 minutes to parse the initial commit. The new parser uses a single regexp to match the header of a diff, and iterates over each individual diff by splitting the entire string by the diff separator, attempting to match the header for each individual diff. With the new parser, parsing the same repository is almost instant, woohoo! (cherry picked from commit 5b6b27f153bdc30380bea12a528ef483571dd57a)
Diffstat (limited to 'lib/git/diff.py')
-rw-r--r--lib/git/diff.py61
1 files changed, 19 insertions, 42 deletions
diff --git a/lib/git/diff.py b/lib/git/diff.py
index 51315fe3..28ebda01 100644
--- a/lib/git/diff.py
+++ b/lib/git/diff.py
@@ -34,52 +34,29 @@ class Diff(object):
@classmethod
def list_from_string(cls, repo, text):
- lines = text.splitlines()
- a_mode = None
- b_mode = None
diffs = []
- while lines:
- m = re.search(r'^diff --git a/(\S+) b/(\S+)$', lines.pop(0))
- if m:
- a_path, b_path = m.groups()
- if re.search(r'^old mode', lines[0]):
- m = re.search(r'^old mode (\d+)', lines.pop(0))
- if m:
- a_mode, = m.groups()
- m = re.search(r'^new mode (\d+)', lines.pop(0))
- if m:
- b_mode, = m.groups()
- if re.search(r'^diff --git', lines[0]):
- diffs.append(Diff(repo, a_path, b_path, None, None, a_mode, b_mode, False, False, None))
- continue
- new_file = False
- deleted_file = False
+ diff_header = re.compile(r"""
+ #^diff[ ]--git
+ [ ]a/(?P<a_path>\S+)[ ]b/(?P<b_path>\S+)\n
+ (?:^old[ ]mode[ ](?P<old_mode>\d+)(?:\n|$))?
+ (?:^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
+ (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
+ (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
+ (?:^index[ ](?P<a_commit>[0-9A-Fa-f]+)
+ \.\.(?P<b_commit>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
+ """, re.VERBOSE | re.MULTILINE).match
- if re.search(r'^new file', lines[0]):
- m = re.search(r'^new file mode (.+)', lines.pop(0))
- if m:
- b_mode, = m.groups()
- a_mode = None
- new_file = True
- elif re.search(r'^deleted file', lines[0]):
- m = re.search(r'^deleted file mode (.+)$', lines.pop(0))
- if m:
- a_mode, = m.groups()
- b_mode = None
- deleted_file = True
+ for diff in ('\n' + text).split('\ndiff --git')[1:]:
+ header = diff_header(diff)
- m = re.search(r'^index ([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+) ?(.+)?$', lines.pop(0))
- if m:
- a_commit, b_commit, b_mode = m.groups()
- if b_mode:
- b_mode = b_mode.strip()
+ a_path, b_path, old_mode, new_mode, new_file_mode, deleted_file_mode, \
+ a_commit, b_commit, b_mode = header.groups()
+ new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode)
- diff_lines = []
- while lines and not re.search(r'^diff', lines[0]):
- diff_lines.append(lines.pop(0))
-
- diff = "\n".join(diff_lines)
- diffs.append(Diff(repo, a_path, b_path, a_commit, b_commit, a_mode, b_mode, new_file, deleted_file, diff))
+ diffs.append(Diff(repo, a_path, b_path, a_commit, b_commit,
+ old_mode or deleted_file_mode, new_mode or new_file_mode or b_mode,
+ new_file, deleted_file, diff[header.end():]))
return diffs
+