diff options
author | Dana Powers <dana.powers@gmail.com> | 2015-03-12 16:53:09 -0700 |
---|---|---|
committer | Dana Powers <dana.powers@gmail.com> | 2015-03-12 16:53:09 -0700 |
commit | a5b1c8d0fc627de228d00db25ad27fa078c9da32 (patch) | |
tree | 1a8d99ce13ca5f9a1df9b58b921a475b82fed837 | |
parent | 92aa7e94288cbfc4aed0dfbd52021d21694bced4 (diff) | |
parent | 01ea3bf968c76a5f7a1999cfca36766d9bbff5e7 (diff) | |
download | kafka-python-a5b1c8d0fc627de228d00db25ad27fa078c9da32.tar.gz |
Merge pull request #335 from scrapinghub/fix-mp-consumer-distribution
Wrong partition distribution logic for MP Consumer
-rw-r--r-- | kafka/consumer/multiprocess.py | 22 |
1 file changed, 12 insertions, 10 deletions
diff --git a/kafka/consumer/multiprocess.py b/kafka/consumer/multiprocess.py index 4dc04dc..bec3100 100644 --- a/kafka/consumer/multiprocess.py +++ b/kafka/consumer/multiprocess.py @@ -123,26 +123,28 @@ class MultiProcessConsumer(Consumer): self.pause = Event() # Requests the consumers to pause fetch self.size = Value('i', 0) # Indicator of number of messages to fetch - partitions = self.offsets.keys() + # dict.keys() returns a view in py3 + it's not a thread-safe operation + # http://blog.labix.org/2008/06/27/watch-out-for-listdictkeys-in-python-3 + # It's safer to copy dict as it only runs during the init. + partitions = list(self.offsets.copy().keys()) - # If unspecified, start one consumer per partition + # By default, start one consumer process for all partitions # The logic below ensures that # * we do not cross the num_procs limit # * we have an even distribution of partitions among processes - if not partitions_per_proc: - partitions_per_proc = round(len(partitions) * 1.0 / num_procs) - if partitions_per_proc < num_procs * 0.5: - partitions_per_proc += 1 + + if partitions_per_proc: + num_procs = len(partitions) / partitions_per_proc + if num_procs * partitions_per_proc < len(partitions): + num_procs += 1 # The final set of chunks - chunker = lambda *x: [] + list(x) - chunks = map(chunker, *[iter(partitions)] * int(partitions_per_proc)) + chunks = [partitions[proc::num_procs] for proc in range(num_procs)] self.procs = [] for chunk in chunks: - chunk = filter(lambda x: x is not None, chunk) args = (client.copy(), - group, topic, list(chunk), + group, topic, chunk, self.queue, self.start, self.exit, self.pause, self.size) |