summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Simon MacMullen <simon@rabbitmq.com> 2014-11-21 13:35:22 +0000
committer: Simon MacMullen <simon@rabbitmq.com> 2014-11-21 13:35:22 +0000
commit: edf18aeae192c99d9c54e2b3a381795bec799f3e (patch)
tree: a3d93e843a8854c78f67dbb5e74dafed5233b3ab /src
parent: f351d41768a2548fe35fee381c8ea0f2231ed377 (diff)
download: rabbitmq-server-git-edf18aeae192c99d9c54e2b3a381795bec799f3e.tar.gz
Be a bit more careful before declaring a partial partition.
Diffstat (limited to 'src')
-rw-r--r-- src/rabbit_node_monitor.erl 21
1 files changed, 20 insertions, 1 deletions
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index e606938759..5f4530537c 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -317,10 +317,29 @@ handle_cast({check_partial_partition, Node, Rep, NodeGUID, MyGUID, RepGUID},
node_guids = GUIDs}) ->
case lists:member(Node, rabbit_mnesia:cluster_nodes(running)) andalso
orddict:find(Node, GUIDs) =:= {ok, NodeGUID} of
- true -> cast(Rep, {partial_partition, Node, node(), RepGUID});
+ true -> spawn_link( %%[1]
+ fun () ->
+ case rpc:call(Node, rabbit, is_running, []) of
+ {badrpc, _} ->
+ ok;
+ _ ->
+ cast(Rep, {partial_partition,
+ Node, node(), RepGUID})
+ end
+ end);
false -> ok
end,
{noreply, State};
+%% [1] We checked that we haven't heard the node go down - but we
+%% really should make sure we can actually communicate with
+%% it. Otherwise there's a race where we falsely detect a partial
+%% partition.
+%%
+%% Now of course the rpc:call/4 may take a long time to return if
+%% connectivity with the node is actually interrupted - but that's OK,
+%% we only really want to do something in a timely manner if
+%% connectivity is OK. However, of course as always we must not block
+%% the node monitor, so we do the check in a separate process.
handle_cast({check_partial_partition, _Node, _Reporter,
_NodeGUID, _GUID, _ReporterGUID}, State) ->