diff options
| author | Simon MacMullen <simon@rabbitmq.com> | 2014-11-21 13:35:22 +0000 |
|---|---|---|
| committer | Simon MacMullen <simon@rabbitmq.com> | 2014-11-21 13:35:22 +0000 |
| commit | edf18aeae192c99d9c54e2b3a381795bec799f3e (patch) | |
| tree | a3d93e843a8854c78f67dbb5e74dafed5233b3ab | |
| parent | f351d41768a2548fe35fee381c8ea0f2231ed377 (diff) | |
| download | rabbitmq-server-git-edf18aeae192c99d9c54e2b3a381795bec799f3e.tar.gz | |
Be a bit more careful before declaring a partial partition.
| -rw-r--r-- | src/rabbit_node_monitor.erl | 21 |
1 files changed, 20 insertions, 1 deletions
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index e606938759..5f4530537c 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -317,10 +317,29 @@ handle_cast({check_partial_partition, Node, Rep, NodeGUID, MyGUID, RepGUID}, node_guids = GUIDs}) -> case lists:member(Node, rabbit_mnesia:cluster_nodes(running)) andalso orddict:find(Node, GUIDs) =:= {ok, NodeGUID} of - true -> cast(Rep, {partial_partition, Node, node(), RepGUID}); + true -> spawn_link( %%[1] + fun () -> + case rpc:call(Node, rabbit, is_running, []) of + {badrpc, _} -> + ok; + _ -> + cast(Rep, {partial_partition, + Node, node(), RepGUID}) + end + end); false -> ok end, {noreply, State}; +%% [1] We checked that we haven't heard the node go down - but we +%% really should make sure we can actually communicate with +%% it. Otherwise there's a race where we falsely detect a partial +%% partition. +%% +%% Now of course the rpc:call/4 may take a long time to return if +%% connectivity with the node is actually interrupted - but that's OK, +%% we only really want to do something in a timely manner if +%% connectivity is OK. However, of course as always we must not block +%% the node monitor, so we do the check in a separate process. handle_cast({check_partial_partition, _Node, _Reporter, _NodeGUID, _GUID, _ReporterGUID}, State) -> |
