summaryrefslogtreecommitdiff
path: root/Lib/heapq.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/heapq.py')
-rw-r--r--Lib/heapq.py351
1 files changed, 241 insertions, 110 deletions
diff --git a/Lib/heapq.py b/Lib/heapq.py
index d615239b94..0b3e89a3a9 100644
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -54,7 +54,7 @@ representation for a tournament. The numbers below are `k', not a[k]:
In the tree above, each cell `k' is topping `2*k+1' and `2*k+2'. In
-an usual binary tournament we see in sports, each cell is the winner
+a usual binary tournament we see in sports, each cell is the winner
over the two cells it tops, and we can trace the winner down the tree
to see all opponents s/he had. However, in many computer applications
of such tournaments, we do not need to trace the history of a winner.
@@ -127,8 +127,6 @@ From all times, sorting has always been a Great Art! :-)
__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge',
'nlargest', 'nsmallest', 'heappushpop']
-from itertools import islice, count, tee, chain
-
def heappush(heap, item):
"""Push item onto heap, maintaining the heap invariant."""
heap.append(item)
@@ -141,9 +139,8 @@ def heappop(heap):
returnitem = heap[0]
heap[0] = lastelt
_siftup(heap, 0)
- else:
- returnitem = lastelt
- return returnitem
+ return returnitem
+ return lastelt
def heapreplace(heap, item):
"""Pop and return the current smallest value, and add the new item.
@@ -179,12 +176,22 @@ def heapify(x):
for i in reversed(range(n//2)):
_siftup(x, i)
-def _heappushpop_max(heap, item):
- """Maxheap version of a heappush followed by a heappop."""
- if heap and item < heap[0]:
- item, heap[0] = heap[0], item
+def _heappop_max(heap):
+ """Maxheap version of a heappop."""
+ lastelt = heap.pop() # raises appropriate IndexError if heap is empty
+ if heap:
+ returnitem = heap[0]
+ heap[0] = lastelt
_siftup_max(heap, 0)
- return item
+ return returnitem
+ return lastelt
+
+def _heapreplace_max(heap, item):
+ """Maxheap version of a heappop followed by a heappush."""
+ returnitem = heap[0] # raises appropriate IndexError if heap is empty
+ heap[0] = item
+ _siftup_max(heap, 0)
+ return returnitem
def _heapify_max(x):
"""Transform list into a maxheap, in-place, in O(len(x)) time."""
@@ -192,42 +199,6 @@ def _heapify_max(x):
for i in reversed(range(n//2)):
_siftup_max(x, i)
-def nlargest(n, iterable):
- """Find the n largest elements in a dataset.
-
- Equivalent to: sorted(iterable, reverse=True)[:n]
- """
- if n < 0:
- return []
- it = iter(iterable)
- result = list(islice(it, n))
- if not result:
- return result
- heapify(result)
- _heappushpop = heappushpop
- for elem in it:
- _heappushpop(result, elem)
- result.sort(reverse=True)
- return result
-
-def nsmallest(n, iterable):
- """Find the n smallest elements in a dataset.
-
- Equivalent to: sorted(iterable)[:n]
- """
- if n < 0:
- return []
- it = iter(iterable)
- result = list(islice(it, n))
- if not result:
- return result
- _heapify_max(result)
- _heappushpop = _heappushpop_max
- for elem in it:
- _heappushpop(result, elem)
- result.sort()
- return result
-
# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos
# is the index of a leaf with a possibly out-of-order value. Restore the
# heap invariant.
@@ -340,13 +311,7 @@ def _siftup_max(heap, pos):
heap[pos] = newitem
_siftdown_max(heap, startpos, pos)
-# If available, use C implementation
-try:
- from _heapq import *
-except ImportError:
- pass
-
-def merge(*iterables):
+def merge(*iterables, key=None, reverse=False):
'''Merge multiple sorted inputs into a single sorted output.
Similar to sorted(itertools.chain(*iterables)) but returns a generator,
@@ -356,51 +321,158 @@ def merge(*iterables):
>>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
[0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
+ If *key* is not None, applies a key function to each element to determine
+ its sort order.
+
+ >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
+ ['dog', 'cat', 'fish', 'horse', 'kangaroo']
+
'''
- _heappop, _heapreplace, _StopIteration = heappop, heapreplace, StopIteration
- _len = len
h = []
h_append = h.append
- for itnum, it in enumerate(map(iter, iterables)):
+
+ if reverse:
+ _heapify = _heapify_max
+ _heappop = _heappop_max
+ _heapreplace = _heapreplace_max
+ direction = -1
+ else:
+ _heapify = heapify
+ _heappop = heappop
+ _heapreplace = heapreplace
+ direction = 1
+
+ if key is None:
+ for order, it in enumerate(map(iter, iterables)):
+ try:
+ next = it.__next__
+ h_append([next(), order * direction, next])
+ except StopIteration:
+ pass
+ _heapify(h)
+ while len(h) > 1:
+ try:
+ while True:
+ value, order, next = s = h[0]
+ yield value
+ s[0] = next() # raises StopIteration when exhausted
+ _heapreplace(h, s) # restore heap condition
+ except StopIteration:
+ _heappop(h) # remove empty iterator
+ if h:
+ # fast case when only a single iterator remains
+ value, order, next = h[0]
+ yield value
+ yield from next.__self__
+ return
+
+ for order, it in enumerate(map(iter, iterables)):
try:
next = it.__next__
- h_append([next(), itnum, next])
- except _StopIteration:
+ value = next()
+ h_append([key(value), order * direction, value, next])
+ except StopIteration:
pass
- heapify(h)
-
- while _len(h) > 1:
+ _heapify(h)
+ while len(h) > 1:
try:
while True:
- v, itnum, next = s = h[0]
- yield v
- s[0] = next() # raises StopIteration when exhausted
- _heapreplace(h, s) # restore heap condition
- except _StopIteration:
- _heappop(h) # remove empty iterator
+ key_value, order, value, next = s = h[0]
+ yield value
+ value = next()
+ s[0] = key(value)
+ s[2] = value
+ _heapreplace(h, s)
+ except StopIteration:
+ _heappop(h)
if h:
- # fast case when only a single iterator remains
- v, itnum, next = h[0]
- yield v
+ key_value, order, value, next = h[0]
+ yield value
yield from next.__self__
-# Extend the implementations of nsmallest and nlargest to use a key= argument
-_nsmallest = nsmallest
+
+# Algorithm notes for nlargest() and nsmallest()
+# ==============================================
+#
+# Make a single pass over the data while keeping the k most extreme values
+# in a heap. Memory consumption is limited to keeping k values in a list.
+#
+# Measured performance for random inputs:
+#
+# number of comparisons
+# n inputs k-extreme values (average of 5 trials) % more than min()
+# ------------- ---------------- --------------------- -----------------
+# 1,000 100 3,317 231.7%
+# 10,000 100 14,046 40.5%
+# 100,000 100 105,749 5.7%
+# 1,000,000 100 1,007,751 0.8%
+# 10,000,000 100 10,009,401 0.1%
+#
+# Theoretical number of comparisons for k smallest of n random inputs:
+#
+# Step Comparisons Action
+# ---- -------------------------- ---------------------------
+# 1 1.66 * k heapify the first k-inputs
+# 2 n - k compare remaining elements to top of heap
+# 3 k * (1 + lg2(k)) * ln(n/k) replace the topmost value on the heap
+# 4 k * lg2(k) - (k/2) final sort of the k most extreme values
+#
+# Combining and simplifying for a rough estimate gives:
+#
+# comparisons = n + k * (log(k, 2) * log(n/k) + log(k, 2) + log(n/k))
+#
+# Computing the number of comparisons for step 3:
+# -----------------------------------------------
+# * For the i-th new value from the iterable, the probability of being in the
+# k most extreme values is k/i. For example, the probability of the 101st
+# value seen being in the 100 most extreme values is 100/101.
+# * If the value is a new extreme value, the cost of inserting it into the
+# heap is 1 + log(k, 2).
+# * The probability times the cost gives:
+# (k/i) * (1 + log(k, 2))
+# * Summing across the remaining n-k elements gives:
+# sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1))
+# * This reduces to:
+# (H(n) - H(k)) * k * (1 + log(k, 2))
+# * Where H(n) is the n-th harmonic number estimated by:
+# gamma = 0.5772156649
+# H(n) = log(n, e) + gamma + 1 / (2 * n)
+# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
+# * Substituting the H(n) formula:
+# comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2)
+#
+# Worst-case for step 3:
+# ----------------------
+# In the worst case, the input data is reversed sorted so that every new element
+# must be inserted in the heap:
+#
+# comparisons = 1.66 * k + log(k, 2) * (n - k)
+#
+# Alternative Algorithms
+# ----------------------
+# Other algorithms were not used because they:
+# 1) Took much more auxiliary memory,
+# 2) Made multiple passes over the data.
+# 3) Made more comparisons in common cases (small k, large n, semi-random input).
+# See the more detailed comparison of approach at:
+# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
+
def nsmallest(n, iterable, key=None):
"""Find the n smallest elements in a dataset.
Equivalent to: sorted(iterable, key=key)[:n]
"""
- # Short-cut for n==1 is to use min() when len(iterable)>0
+
+ # Short-cut for n==1 is to use min()
if n == 1:
it = iter(iterable)
- head = list(islice(it, 1))
- if not head:
- return []
+ sentinel = object()
if key is None:
- return [min(chain(head, it))]
- return [min(chain(head, it), key=key)]
+ result = min(it, default=sentinel)
+ else:
+ result = min(it, default=sentinel, key=key)
+ return [] if result is sentinel else [result]
# When n>=size, it's faster to use sorted()
try:
@@ -413,32 +485,57 @@ def nsmallest(n, iterable, key=None):
# When key is none, use simpler decoration
if key is None:
- it = zip(iterable, count()) # decorate
- result = _nsmallest(n, it)
- return [r[0] for r in result] # undecorate
+ it = iter(iterable)
+ # put the range(n) first so that zip() doesn't
+ # consume one too many elements from the iterator
+ result = [(elem, i) for i, elem in zip(range(n), it)]
+ if not result:
+ return result
+ _heapify_max(result)
+ top = result[0][0]
+ order = n
+ _heapreplace = _heapreplace_max
+ for elem in it:
+ if elem < top:
+ _heapreplace(result, (elem, order))
+ top = result[0][0]
+ order += 1
+ result.sort()
+ return [r[0] for r in result]
# General case, slowest method
- in1, in2 = tee(iterable)
- it = zip(map(key, in1), count(), in2) # decorate
- result = _nsmallest(n, it)
- return [r[2] for r in result] # undecorate
+ it = iter(iterable)
+ result = [(key(elem), i, elem) for i, elem in zip(range(n), it)]
+ if not result:
+ return result
+ _heapify_max(result)
+ top = result[0][0]
+ order = n
+ _heapreplace = _heapreplace_max
+ for elem in it:
+ k = key(elem)
+ if k < top:
+ _heapreplace(result, (k, order, elem))
+ top = result[0][0]
+ order += 1
+ result.sort()
+ return [r[2] for r in result]
-_nlargest = nlargest
def nlargest(n, iterable, key=None):
"""Find the n largest elements in a dataset.
Equivalent to: sorted(iterable, key=key, reverse=True)[:n]
"""
- # Short-cut for n==1 is to use max() when len(iterable)>0
+ # Short-cut for n==1 is to use max()
if n == 1:
it = iter(iterable)
- head = list(islice(it, 1))
- if not head:
- return []
+ sentinel = object()
if key is None:
- return [max(chain(head, it))]
- return [max(chain(head, it), key=key)]
+ result = max(it, default=sentinel)
+ else:
+ result = max(it, default=sentinel, key=key)
+ return [] if result is sentinel else [result]
# When n>=size, it's faster to use sorted()
try:
@@ -451,26 +548,60 @@ def nlargest(n, iterable, key=None):
# When key is none, use simpler decoration
if key is None:
- it = zip(iterable, count(0,-1)) # decorate
- result = _nlargest(n, it)
- return [r[0] for r in result] # undecorate
+ it = iter(iterable)
+ result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)]
+ if not result:
+ return result
+ heapify(result)
+ top = result[0][0]
+ order = -n
+ _heapreplace = heapreplace
+ for elem in it:
+ if top < elem:
+ _heapreplace(result, (elem, order))
+ top = result[0][0]
+ order -= 1
+ result.sort(reverse=True)
+ return [r[0] for r in result]
# General case, slowest method
- in1, in2 = tee(iterable)
- it = zip(map(key, in1), count(0,-1), in2) # decorate
- result = _nlargest(n, it)
- return [r[2] for r in result] # undecorate
+ it = iter(iterable)
+ result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
+ if not result:
+ return result
+ heapify(result)
+ top = result[0][0]
+ order = -n
+ _heapreplace = heapreplace
+ for elem in it:
+ k = key(elem)
+ if top < k:
+ _heapreplace(result, (k, order, elem))
+ top = result[0][0]
+ order -= 1
+ result.sort(reverse=True)
+ return [r[2] for r in result]
+
+# If available, use C implementation
+try:
+ from _heapq import *
+except ImportError:
+ pass
+try:
+ from _heapq import _heapreplace_max
+except ImportError:
+ pass
+try:
+ from _heapq import _heapify_max
+except ImportError:
+ pass
+try:
+ from _heapq import _heappop_max
+except ImportError:
+ pass
+
if __name__ == "__main__":
- # Simple sanity test
- heap = []
- data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 0]
- for item in data:
- heappush(heap, item)
- sort = []
- while heap:
- sort.append(heappop(heap))
- print(sort)
import doctest
- doctest.testmod()
+ print(doctest.testmod())