summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/rabbit.erl | 2
-rw-r--r-- src/rabbit_node_monitor.erl | 40
-rw-r--r-- test/partitions_SUITE.erl | 38
3 files changed, 59 insertions, 21 deletions
diff --git a/src/rabbit.erl b/src/rabbit.erl
index 26a69ddeea..138d03f051 100644
--- a/src/rabbit.erl
+++ b/src/rabbit.erl
@@ -624,7 +624,7 @@ decrypt_list([Value|Tail], Algo, Acc) ->
stop_apps(Apps) ->
rabbit_log:info(
- lists:flatten(["Stopping RabbitMQ applications and their dependencies in the following order: ~n",
+ lists:flatten(["Stopping RabbitMQ applications and their dependencies in the following order:~n",
[" ~p~n" || _ <- Apps]]),
lists:reverse(Apps)),
ok = app_utils:stop_applications(
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index 0eadf0ff59..810df2d1fc 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -344,8 +344,8 @@ init([]) ->
Nodes = possibly_partitioned_nodes(),
startup_log(Nodes),
Monitors = lists:foldl(fun(Node, Monitors0) ->
- pmon:monitor({rabbit, Node}, Monitors0)
- end, pmon:new(), Nodes),
+ pmon:monitor({rabbit, Node}, Monitors0)
+ end, pmon:new(), Nodes),
{ok, ensure_keepalive_timer(#state{monitors = Monitors,
subscribers = pmon:new(),
partitions = [],
@@ -420,12 +420,12 @@ handle_cast({check_partial_partition, Node, Rep, NodeGUID, MyGUID, RepGUID},
fun () ->
case rpc:call(Node, rabbit, is_running, []) of
{badrpc, _} -> ok;
- _ ->
- rabbit_log:warning("Received a 'DOWN' message"
- " from ~p but still can"
- " communicate with it ~n",
- [Node]),
- cast(Rep, {partial_partition,
+ _ ->
+ rabbit_log:warning("Received a 'DOWN' message"
+ " from ~p but still can"
+ " communicate with it ~n",
+ [Node]),
+ cast(Rep, {partial_partition,
Node, node(), RepGUID})
end
end);
@@ -499,18 +499,18 @@ handle_cast({node_up, Node, NodeType},
rabbit_log:info("rabbit on node ~p up~n", [Node]),
{AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
write_cluster_status({add_node(Node, AllNodes),
- case NodeType of
- disc -> add_node(Node, DiscNodes);
- ram -> DiscNodes
- end,
- add_node(Node, RunningNodes)}),
+ case NodeType of
+ disc -> add_node(Node, DiscNodes);
+ ram -> DiscNodes
+ end,
+ add_node(Node, RunningNodes)}),
ok = handle_live_rabbit(Node),
Monitors1 = case pmon:is_monitored({rabbit, Node}, Monitors) of
- true ->
- Monitors;
- false ->
- pmon:monitor({rabbit, Node}, Monitors)
- end,
+ true ->
+ Monitors;
+ false ->
+ pmon:monitor({rabbit, Node}, Monitors)
+ end,
{noreply, maybe_autoheal(State#state{monitors = Monitors1})};
handle_cast({joined_cluster, Node, NodeType}, State) ->
@@ -584,7 +584,7 @@ handle_info({mnesia_system_event,
State1 = case pmon:is_monitored({rabbit, Node}, Monitors) of
true -> State;
false -> State#state{
- monitors = pmon:monitor({rabbit, Node}, Monitors)}
+ monitors = pmon:monitor({rabbit, Node}, Monitors)}
end,
ok = handle_live_rabbit(Node),
Partitions1 = lists:usort([Node | Partitions]),
@@ -893,4 +893,4 @@ startup_log([]) ->
rabbit_log:info("Starting rabbit_node_monitor~n", []);
startup_log(Nodes) ->
rabbit_log:info("Starting rabbit_node_monitor, might be partitioned from ~p~n",
- [Nodes]).
+ [Nodes]).
diff --git a/test/partitions_SUITE.erl b/test/partitions_SUITE.erl
index 8c8a772987..b09d05b550 100644
--- a/test/partitions_SUITE.erl
+++ b/test/partitions_SUITE.erl
@@ -335,16 +335,26 @@ autoheal_unexpected_finish(Config) ->
partial_false_positive(Config) ->
[A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
+ suspend_node_monitor(Config, C),
block([{A, B}]),
timer:sleep(1000),
block([{A, C}]),
timer:sleep(?DELAY),
+ resume_node_monitor(Config, C),
+ timer:sleep(?DELAY),
unblock([{A, B}, {A, C}]),
timer:sleep(?DELAY),
%% When B times out A's connection, it will check with C. C will
%% not have timed out A yet, but already it can't talk to it. We
%% need to not consider this a partial partition; B and C should
%% still talk to each other.
+ %%
+ %% Because there is a chance that C can still talk to A when B
+ %% requests to check for a partial partition, we suspend C's
+ %% rabbit_node_monitor at the beginning and resume it after the
+ %% link between A and C is blocked. This way, when B asks C about
+ %% A, we make sure that the A<->C link is blocked before C's
+ %% rabbit_node_monitor processes B's request.
[B, C] = partitions(A),
[A] = partitions(B),
[A] = partitions(C),
@@ -369,7 +379,19 @@ partial_to_full(Config) ->
partial_pause_minority(Config) ->
[A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
set_mode(Config, pause_minority),
+ %% We suspend rabbit_node_monitor on C while we block the link
+ %% between A and B. This should make sure C's rabbit_node_monitor
+ %% processes both partial partition checks from A and B at about
+ %% the same time, and thus increase the chance that both A and B decide
+ %% there is a partial partition.
+ %%
+ %% Without this, one node may see the partial partition and stop,
+ %% before the other node sees it. In this case, the other node
+ %% doesn't stop and this testcase fails.
+ suspend_node_monitor(Config, C),
block([{A, B}]),
+ timer:sleep(?DELAY),
+ resume_node_monitor(Config, C),
[await_running(N, false) || N <- [A, B]],
await_running(C, true),
unblock([{A, B}]),
@@ -394,6 +416,22 @@ set_mode(Config, Mode) ->
set_mode(Config, Nodes, Mode) ->
rabbit_ct_broker_helpers:set_partition_handling_mode(Config, Nodes, Mode).
+suspend_node_monitor(Config, Node) ->
+ rabbit_ct_broker_helpers:rpc(
+ Config, Node, ?MODULE, suspend_or_resume_node_monitor, [suspend]).
+
+resume_node_monitor(Config, Node) ->
+ rabbit_ct_broker_helpers:rpc(
+ Config, Node, ?MODULE, suspend_or_resume_node_monitor, [resume]).
+
+suspend_or_resume_node_monitor(SuspendOrResume) ->
+ Action = case SuspendOrResume of
+ suspend -> "Suspending";
+ resume -> "Resuming"
+ end,
+ rabbit_log:info("(~s) ~s node monitor~n", [?MODULE, Action]),
+ ok = sys:SuspendOrResume(rabbit_node_monitor).
+
block_unblock(Pairs) ->
block(Pairs),
timer:sleep(?DELAY),