summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon MacMullen <simon@rabbitmq.com>2014-10-10 16:21:09 +0100
committerSimon MacMullen <simon@rabbitmq.com>2014-10-10 16:21:09 +0100
commit2cb7a0389c023a632e46fb30d3f0dcd044c47e01 (patch)
tree0bb95417eecfa44e47becf8eb291cbac212ed509
parentf2cb9a133a45116ac4c12e52f99bfca36a5a5361 (diff)
parentb12028d78500722af5e7ec7aff162a36ebafce01 (diff)
downloadrabbitmq-server-git-2cb7a0389c023a632e46fb30d3f0dcd044c47e01.tar.gz
Merge in default
-rw-r--r--src/rabbit_mirror_queue_sync.erl24
1 files changed, 18 insertions, 6 deletions
diff --git a/src/rabbit_mirror_queue_sync.erl b/src/rabbit_mirror_queue_sync.erl
index e3fae4c09c..d1ef5f30b9 100644
--- a/src/rabbit_mirror_queue_sync.erl
+++ b/src/rabbit_mirror_queue_sync.erl
@@ -156,18 +156,30 @@ syncer(Ref, Log, MPid, SPids) ->
%% We wait for a reply from the slaves so that we know they are in
%% a receive block and will thus receive messages we send to them
%% *without* those messages ending up in their gen_server2 pqueue.
- case [SPid || SPid <- SPids,
- receive
- {sync_ready, Ref, SPid} -> true;
- {sync_deny, Ref, SPid} -> false;
- {'DOWN', _, process, SPid, _} -> false
- end] of
+ case await_slaves(Ref, SPids) of
[] -> Log("all slaves already synced", []);
SPids1 -> MPid ! {ready, self()},
Log("mirrors ~p to sync", [[node(SPid) || SPid <- SPids1]]),
syncer_loop(Ref, MPid, SPids1)
end.
+await_slaves(Ref, SPids) ->
+ Nodes = rabbit_mnesia:cluster_nodes(running),
+ [SPid || SPid <- SPids,
+ lists:member(node(SPid), Nodes) andalso %% [0]
+ receive
+ {sync_ready, Ref, SPid} -> true;
+ {sync_deny, Ref, SPid} -> false;
+ {'DOWN', _, process, SPid, _} -> false
+ end].
+%% [0] This check is in case there's been a partition which has then
+%% healed in between the master retrieving the slave pids from Mnesia
+%% and sending 'sync_start' over GM. If so there might be slaves on the
+%% other side of the partition which we can monitor (since they have
+%% rejoined the distributed system with us) but which did not get the
+%% 'sync_start' and so will not reply. We need to act as though they are
+%% down.
+
syncer_loop(Ref, MPid, SPids) ->
MPid ! {next, Ref},
receive