summaryrefslogtreecommitdiff
path: root/Lib/heapq.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/heapq.py')
-rw-r--r--Lib/heapq.py61
1 files changed, 59 insertions, 2 deletions
diff --git a/Lib/heapq.py b/Lib/heapq.py
index d615239b94..88c7019abc 100644
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -192,12 +192,69 @@ def _heapify_max(x):
for i in reversed(range(n//2)):
_siftup_max(x, i)
+
+# Algorithm notes for nlargest() and nsmallest()
+# ==============================================
+#
+# Makes just one pass over the data while keeping the n most extreme values
+# in a heap. Memory consumption is limited to keeping n values in a list.
+#
+# Number of comparisons for n random inputs, keeping the k smallest values:
+# -----------------------------------------------------------
+# Step Comparisons Action
+# 1 1.66*k heapify the first k-inputs
+# 2 n - k compare new input elements to top of heap
+# 3 k*lg2(k)*(ln(n)-ln(k)) add new extreme values to the heap
+# 4 k*lg2(k) final sort of the k most extreme values
+#
+# number of comparisons
+# n-random inputs k-extreme values average of 5 trials % more than min()
+# --------------- ---------------- ------------------- -----------------
+# 10,000 100 14,046 40.5%
+# 100,000 100 105,749 5.7%
+# 1,000,000 100 1,007,751 0.8%
+#
+# Computing the number of comparisons for step 3:
+# -----------------------------------------------
+# * For the i-th new value from the iterable, the probability of being in the
+# k most extreme values is k/i. For example, the probability of the 101st
+# value seen being in the 100 most extreme values is 100/101.
+# * If the value is a new extreme value, the cost of inserting it into the
+# heap is log(k, 2).
+# * The probabilty times the cost gives:
+# (k/i) * log(k, 2)
+# * Summing across the remaining n-k elements gives:
+# sum((k/i) * log(k, 2) for xrange(k+1, n+1))
+# * This reduces to:
+# (H(n) - H(k)) * k * log(k, 2)
+# * Where H(n) is the n-th harmonic number estimated by:
+# H(n) = log(n, e) + gamma + 1.0 / (2.0 * n)
+# gamma = 0.5772156649
+# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
+# * Substituting the H(n) formula and ignoring the (1/2*n) fraction gives:
+# comparisons = k * log(k, 2) * (log(n,e) - log(k, e))
+#
+# Worst-case for step 3:
+# ----------------------
+# In the worst case, the input data is reversed sorted so that every new element
+# must be inserted in the heap:
+# comparisons = log(k, 2) * (n - k)
+#
+# Alternative Algorithms
+# ----------------------
+# Other algorithms were not used because they:
+# 1) Took much more auxiliary memory,
+# 2) Made multiple passes over the data.
+# 3) Made more comparisons in common cases (small k, large n, semi-random input).
+# See detailed comparisons at:
+# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
+
def nlargest(n, iterable):
"""Find the n largest elements in a dataset.
Equivalent to: sorted(iterable, reverse=True)[:n]
"""
- if n < 0:
+ if n <= 0:
return []
it = iter(iterable)
result = list(islice(it, n))
@@ -215,7 +272,7 @@ def nsmallest(n, iterable):
Equivalent to: sorted(iterable)[:n]
"""
- if n < 0:
+ if n <= 0:
return []
it = iter(iterable)
result = list(islice(it, n))