summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJean-Sébastien Pédron <jean-sebastien@rabbitmq.com>2015-06-26 17:35:47 +0200
committerJean-Sébastien Pédron <jean-sebastien@rabbitmq.com>2015-06-26 17:42:34 +0200
commitd7b18da002ceab4c1afd31aa6fa3f5828e0f8084 (patch)
treebbc32320cdfc5183305067db89c5926411b83ce0 /src
parentede194b2d5600b3214c7dfbd9310c560a1ce8add (diff)
downloadrabbitmq-server-git-d7b18da002ceab4c1afd31aa6fa3f5828e0f8084.tar.gz
rabbit_amqqueue:on_node_up/1: Check the queue's running slaves list
... before removing the node from the recoverable slaves list. This prevents a race with rabbit_mirror_queue_slave:record_synchronised/1, where a node that is still recoverable could be erroneously removed from the recoverable slaves list. Fixes #200.
Diffstat (limited to 'src')
-rw-r--r--src/rabbit_amqqueue.erl36
1 files changed, 30 insertions, 6 deletions
diff --git a/src/rabbit_amqqueue.erl b/src/rabbit_amqqueue.erl
index 9ce800023f..f3151b0578 100644
--- a/src/rabbit_amqqueue.erl
+++ b/src/rabbit_amqqueue.erl
@@ -782,15 +782,39 @@ on_node_up(Node) ->
fun () ->
Qs = mnesia:match_object(rabbit_queue,
#amqqueue{_ = '_'}, write),
- [case lists:member(Node, RSs) of
- true -> RSs1 = RSs -- [Node],
- store_queue(
- Q#amqqueue{recoverable_slaves = RSs1});
- false -> ok
- end || #amqqueue{recoverable_slaves = RSs} = Q <- Qs],
+ [maybe_clear_recoverable_node(Node, Q) || Q <- Qs],
ok
end).
+maybe_clear_recoverable_node(Node,
+ #amqqueue{sync_slave_pids = SPids, recoverable_slaves = RSs} = Q) ->
+ case lists:member(Node, RSs) of
+ true ->
+ %% This function (called by the master node) races with
+ %% rabbit_mirror_queue_slave:record_synchronised/1 (called
+ %% by the incoming slave node). If this function runs after
+ %% record_synchronised/1, removing Node here would erroneously
+ %% drop it from the recoverable slaves list.
+ %%
+ %% To detect that ordering, look for exactly one sync slave PID
+ %% located on Node and check whether it is alive. If it is, this
+ %% function ran second and the queue record is already correct,
+ %% so leave it untouched. Otherwise (no such PID, several, or a
+ %% dead one) remove Node from the recoverable slaves list.
+ DoClearNode = case [SP || SP <- SPids, node(SP) =:= Node] of
+ [SPid] -> not rabbit_misc:is_process_alive(SPid);
+ _ -> true
+ end,
+ if
+ DoClearNode -> RSs1 = RSs -- [Node],
+ store_queue(
+ Q#amqqueue{recoverable_slaves = RSs1});
+ true -> ok
+ end;
+ false ->
+ ok
+ end.
+
on_node_down(Node) ->
rabbit_misc:execute_mnesia_tx_with_tail(
fun () -> QsDels =