summaryrefslogtreecommitdiff
path: root/Lib/difflib.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/difflib.py')
-rw-r--r--Lib/difflib.py183
1 files changed, 109 insertions, 74 deletions
diff --git a/Lib/difflib.py b/Lib/difflib.py
index b43dc970e2..3bbcb76b7e 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -151,7 +151,7 @@ class SequenceMatcher:
Return an upper bound on ratio() very quickly.
"""
- def __init__(self, isjunk=None, a='', b=''):
+ def __init__(self, isjunk=None, a='', b='', autojunk=True):
"""Construct a SequenceMatcher.
Optional arg isjunk is None (the default), or a one-argument
@@ -169,6 +169,10 @@ class SequenceMatcher:
Optional arg b is the second of two sequences to be compared. By
default, an empty string. The elements of b must be hashable. See
also .set_seqs() and .set_seq2().
+
+ Optional arg autojunk should be set to False to disable the
+ "automatic junk heuristic" that treats popular elements as junk
+ (see module documentation for more information).
"""
# Members:
@@ -207,11 +211,13 @@ class SequenceMatcher:
# DOES NOT WORK for x in a!
# isbpopular
# for x in b, isbpopular(x) is true iff b is reasonably long
- # (at least 200 elements) and x accounts for more than 1% of
- # its elements. DOES NOT WORK for x in a!
+ # (at least 200 elements) and x accounts for more than 1 + 1% of
+ # its elements (when autojunk is enabled).
+ # DOES NOT WORK for x in a!
self.isjunk = isjunk
self.a = self.b = None
+ self.autojunk = autojunk
self.set_seqs(a, b)
def set_seqs(self, a, b):
@@ -288,7 +294,7 @@ class SequenceMatcher:
# from starting any matching block at a junk element ...
# also creates the fast isbjunk function ...
# b2j also does not contain entries for "popular" elements, meaning
- # elements that account for more than 1% of the total elements, and
+ # elements that account for more than 1 + 1% of the total elements, and
# when the sequence is reasonably large (>= 200 elements); this can
# be viewed as an adaptive notion of semi-junk, and yields an enormous
# speedup when, e.g., comparing program files with hundreds of
@@ -309,44 +315,37 @@ class SequenceMatcher:
# out the junk later is much cheaper than building b2j "right"
# from the start.
b = self.b
- n = len(b)
self.b2j = b2j = {}
- populardict = {}
- for i, elt in enumerate(b):
- if elt in b2j:
- indices = b2j[elt]
- if n >= 200 and len(indices) * 100 > n:
- populardict[elt] = 1
- del indices[:]
- else:
- indices.append(i)
- else:
- b2j[elt] = [i]
- # Purge leftover indices for popular elements.
- for elt in populardict:
- del b2j[elt]
+ for i, elt in enumerate(b):
+ indices = b2j.setdefault(elt, [])
+ indices.append(i)
- # Now b2j.keys() contains elements uniquely, and especially when
- # the sequence is a string, that's usually a good deal smaller
- # than len(string). The difference is the number of isjunk calls
- # saved.
+ # Purge junk elements
+ junk = set()
isjunk = self.isjunk
- junkdict = {}
if isjunk:
- for d in populardict, b2j:
- for elt in d.keys():
- if isjunk(elt):
- junkdict[elt] = 1
- del d[elt]
-
- # Now for x in b, isjunk(x) == x in junkdict, but the
- # latter is much faster. Note too that while there may be a
- # lot of junk in the sequence, the number of *unique* junk
- # elements is probably small. So the memory burden of keeping
- # this dict alive is likely trivial compared to the size of b2j.
- self.isbjunk = junkdict.__contains__
- self.isbpopular = populardict.__contains__
+ for elt in list(b2j.keys()): # using list() since b2j is modified
+ if isjunk(elt):
+ junk.add(elt)
+ del b2j[elt]
+
+ # Purge popular elements that are not junk
+ popular = set()
+ n = len(b)
+ if self.autojunk and n >= 200:
+ ntest = n // 100 + 1
+ for elt, idxs in list(b2j.items()):
+ if len(idxs) > ntest:
+ popular.add(elt)
+ del b2j[elt]
+
+ # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
+ # Sicne the number of *unique* junk elements is probably small, the
+ # memory burden of keeping this set alive is likely trivial compared to
+ # the size of b2j.
+ self.isbjunk = junk.__contains__
+ self.isbpopular = popular.__contains__
def find_longest_match(self, alo, ahi, blo, bhi):
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -1141,6 +1140,21 @@ def IS_CHARACTER_JUNK(ch, ws=" \t"):
return ch in ws
+########################################################################
+### Unified Diff
+########################################################################
+
+def _format_range_unified(start, stop):
+ 'Convert range to the "ed" format'
+ # Per the diff spec at http://www.unix.org/single_unix_specification/
+ beginning = start + 1 # lines start numbering with one
+ length = stop - start
+ if length == 1:
+ return '{}'.format(beginning)
+ if not length:
+ beginning -= 1 # empty ranges begin at line just before the range
+ return '{},{}'.format(beginning, length)
+
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
tofiledate='', n=3, lineterm='\n'):
r"""
@@ -1161,18 +1175,18 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
The unidiff format normally has a header for filenames and modification
times. Any or all of these may be specified using strings for
- 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. The modification
- times are normally expressed in the format returned by time.ctime().
+ 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
+ The modification times are normally expressed in the ISO 8601 format.
Example:
>>> for line in unified_diff('one two three four'.split(),
... 'zero one tree four'.split(), 'Original', 'Current',
- ... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
+ ... '2005-01-26 23:30:50', '2010-04-02 10:20:52',
... lineterm=''):
- ... print line
- --- Original Sat Jan 26 23:30:50 1991
- +++ Current Fri Jun 06 10:20:52 2003
+ ... print line # doctest: +NORMALIZE_WHITESPACE
+ --- Original 2005-01-26 23:30:50
+ +++ Current 2010-04-02 10:20:52
@@ -1,4 +1,4 @@
+zero
one
@@ -1185,23 +1199,45 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
started = False
for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
if not started:
- yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
- yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
started = True
- i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
- yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)
+ fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
+ todate = '\t{}'.format(tofiledate) if tofiledate else ''
+ yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
+ yield '+++ {}{}{}'.format(tofile, todate, lineterm)
+
+ first, last = group[0], group[-1]
+ file1_range = _format_range_unified(first[1], last[2])
+ file2_range = _format_range_unified(first[3], last[4])
+ yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)
+
for tag, i1, i2, j1, j2 in group:
if tag == 'equal':
for line in a[i1:i2]:
yield ' ' + line
continue
- if tag == 'replace' or tag == 'delete':
+ if tag in ('replace', 'delete'):
for line in a[i1:i2]:
yield '-' + line
- if tag == 'replace' or tag == 'insert':
+ if tag in ('replace', 'insert'):
for line in b[j1:j2]:
yield '+' + line
+
+########################################################################
+### Context Diff
+########################################################################
+
+def _format_range_context(start, stop):
+ 'Convert range to the "ed" format'
+ # Per the diff spec at http://www.unix.org/single_unix_specification/
+ beginning = start + 1 # lines start numbering with one
+ length = stop - start
+ if not length:
+ beginning -= 1 # empty ranges begin at line just before the range
+ if length <= 1:
+ return '{}'.format(beginning)
+ return '{},{}'.format(beginning, beginning + length - 1)
+
# See http://www.unix.org/single_unix_specification/
def context_diff(a, b, fromfile='', tofile='',
fromfiledate='', tofiledate='', n=3, lineterm='\n'):
@@ -1224,16 +1260,15 @@ def context_diff(a, b, fromfile='', tofile='',
The context diff format normally has a header for filenames and
modification times. Any or all of these may be specified using
strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
- The modification times are normally expressed in the format returned
- by time.ctime(). If not specified, the strings default to blanks.
+ The modification times are normally expressed in the ISO 8601 format.
+ If not specified, the strings default to blanks.
Example:
>>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
- ... 'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current',
- ... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:22:46 2003')),
- *** Original Sat Jan 26 23:30:50 1991
- --- Current Fri Jun 06 10:22:46 2003
+ ... 'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')),
+ *** Original
+ --- Current
***************
*** 1,4 ****
one
@@ -1247,36 +1282,36 @@ def context_diff(a, b, fromfile='', tofile='',
four
"""
+ prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ')
started = False
- prefixmap = {'insert':'+ ', 'delete':'- ', 'replace':'! ', 'equal':' '}
for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
if not started:
- yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm)
- yield '--- %s %s%s' % (tofile, tofiledate, lineterm)
started = True
+ fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
+ todate = '\t{}'.format(tofiledate) if tofiledate else ''
+ yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
+ yield '--- {}{}{}'.format(tofile, todate, lineterm)
- yield '***************%s' % (lineterm,)
- if group[-1][2] - group[0][1] >= 2:
- yield '*** %d,%d ****%s' % (group[0][1]+1, group[-1][2], lineterm)
- else:
- yield '*** %d ****%s' % (group[-1][2], lineterm)
- visiblechanges = [e for e in group if e[0] in ('replace', 'delete')]
- if visiblechanges:
+ first, last = group[0], group[-1]
+ yield '***************' + lineterm
+
+ file1_range = _format_range_context(first[1], last[2])
+ yield '*** {} ****{}'.format(file1_range, lineterm)
+
+ if any(tag in ('replace', 'delete') for tag, _, _, _, _ in group):
for tag, i1, i2, _, _ in group:
if tag != 'insert':
for line in a[i1:i2]:
- yield prefixmap[tag] + line
+ yield prefix[tag] + line
- if group[-1][4] - group[0][3] >= 2:
- yield '--- %d,%d ----%s' % (group[0][3]+1, group[-1][4], lineterm)
- else:
- yield '--- %d ----%s' % (group[-1][4], lineterm)
- visiblechanges = [e for e in group if e[0] in ('replace', 'insert')]
- if visiblechanges:
+ file2_range = _format_range_context(first[3], last[4])
+ yield '--- {} ----{}'.format(file2_range, lineterm)
+
+ if any(tag in ('replace', 'insert') for tag, _, _, _, _ in group):
for tag, _, _, j1, j2 in group:
if tag != 'delete':
for line in b[j1:j2]:
- yield prefixmap[tag] + line
+ yield prefix[tag] + line
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
r"""
@@ -1712,7 +1747,7 @@ class HtmlDiff(object):
line = line.replace(' ','\0')
# expand tabs into spaces
line = line.expandtabs(self._tabsize)
- # relace spaces from expanded tabs back into tab characters
+ # replace spaces from expanded tabs back into tab characters
# (we'll replace them with markup after we do differencing)
line = line.replace(' ','\t')
return line.replace('\0',' ').rstrip('\n')