diff options
| -rw-r--r-- | src/rabbit.erl | 2 | ||||
| -rw-r--r-- | src/rabbit_node_monitor.erl | 40 | ||||
| -rw-r--r-- | test/partitions_SUITE.erl | 38 |
3 files changed, 59 insertions, 21 deletions
diff --git a/src/rabbit.erl b/src/rabbit.erl index 26a69ddeea..138d03f051 100644 --- a/src/rabbit.erl +++ b/src/rabbit.erl @@ -624,7 +624,7 @@ decrypt_list([Value|Tail], Algo, Acc) -> stop_apps(Apps) -> rabbit_log:info( - lists:flatten(["Stopping RabbitMQ applications and their dependencies in the following order: ~n", + lists:flatten(["Stopping RabbitMQ applications and their dependencies in the following order:~n", [" ~p~n" || _ <- Apps]]), lists:reverse(Apps)), ok = app_utils:stop_applications( diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index 0eadf0ff59..810df2d1fc 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -344,8 +344,8 @@ init([]) -> Nodes = possibly_partitioned_nodes(), startup_log(Nodes), Monitors = lists:foldl(fun(Node, Monitors0) -> - pmon:monitor({rabbit, Node}, Monitors0) - end, pmon:new(), Nodes), + pmon:monitor({rabbit, Node}, Monitors0) + end, pmon:new(), Nodes), {ok, ensure_keepalive_timer(#state{monitors = Monitors, subscribers = pmon:new(), partitions = [], @@ -420,12 +420,12 @@ handle_cast({check_partial_partition, Node, Rep, NodeGUID, MyGUID, RepGUID}, fun () -> case rpc:call(Node, rabbit, is_running, []) of {badrpc, _} -> ok; - _ -> - rabbit_log:warning("Received a 'DOWN' message" - " from ~p but still can" - " communicate with it ~n", - [Node]), - cast(Rep, {partial_partition, + _ -> + rabbit_log:warning("Received a 'DOWN' message" + " from ~p but still can" + " communicate with it ~n", + [Node]), + cast(Rep, {partial_partition, Node, node(), RepGUID}) end end); @@ -499,18 +499,18 @@ handle_cast({node_up, Node, NodeType}, rabbit_log:info("rabbit on node ~p up~n", [Node]), {AllNodes, DiscNodes, RunningNodes} = read_cluster_status(), write_cluster_status({add_node(Node, AllNodes), - case NodeType of - disc -> add_node(Node, DiscNodes); - ram -> DiscNodes - end, - add_node(Node, RunningNodes)}), + case NodeType of + disc -> add_node(Node, DiscNodes); + ram -> DiscNodes + end, + add_node(Node, RunningNodes)}), ok = handle_live_rabbit(Node), Monitors1 = case pmon:is_monitored({rabbit, Node}, Monitors) of - true -> - Monitors; - false -> - pmon:monitor({rabbit, Node}, Monitors) - end, + true -> + Monitors; + false -> + pmon:monitor({rabbit, Node}, Monitors) + end, {noreply, maybe_autoheal(State#state{monitors = Monitors1})}; handle_cast({joined_cluster, Node, NodeType}, State) -> @@ -584,7 +584,7 @@ handle_info({mnesia_system_event, State1 = case pmon:is_monitored({rabbit, Node}, Monitors) of true -> State; false -> State#state{ - monitors = pmon:monitor({rabbit, Node}, Monitors)} + monitors = pmon:monitor({rabbit, Node}, Monitors)} end, ok = handle_live_rabbit(Node), Partitions1 = lists:usort([Node | Partitions]), @@ -893,4 +893,4 @@ startup_log([]) -> rabbit_log:info("Starting rabbit_node_monitor~n", []); startup_log(Nodes) -> rabbit_log:info("Starting rabbit_node_monitor, might be partitioned from ~p~n", - [Nodes]). + [Nodes]). diff --git a/test/partitions_SUITE.erl b/test/partitions_SUITE.erl index 8c8a772987..b09d05b550 100644 --- a/test/partitions_SUITE.erl +++ b/test/partitions_SUITE.erl @@ -335,16 +335,26 @@ autoheal_unexpected_finish(Config) -> partial_false_positive(Config) -> [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + suspend_node_monitor(Config, C), block([{A, B}]), timer:sleep(1000), block([{A, C}]), timer:sleep(?DELAY), + resume_node_monitor(Config, C), + timer:sleep(?DELAY), unblock([{A, B}, {A, C}]), timer:sleep(?DELAY), %% When B times out A's connection, it will check with C. C will %% not have timed out A yet, but already it can't talk to it. We %% need to not consider this a partial partition; B and C should %% still talk to each other. + %% + %% Because there is a chance that C can still talk to A when B + %% requests to check for a partial partition, we suspend C's + %% rabbit_node_monitor at the beginning and resume it after the + %% link between A and C is blocked. This way, when B asks C about + %% A, we make sure that the A<->C link is blocked before C's + %% rabbit_node_monitor processes B's request. [B, C] = partitions(A), [A] = partitions(B), [A] = partitions(C), @@ -369,7 +379,19 @@ partial_to_full(Config) -> partial_pause_minority(Config) -> [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), set_mode(Config, pause_minority), + %% We suspend rabbit_node_monitor on C while we block the link + %% between A and B. This should make sure C's rabbit_node_monitor + %% processes both partial partition checks from A and B at about + %% the same time, and thus increase the chance both A and B decides + %% there is a partial partition. + %% + %% Without this, one node may see the partial partition and stop, + %% before the other node sees it. In this case, the other node + %% doesn't stop and this testcase fails. + suspend_node_monitor(Config, C), block([{A, B}]), + timer:sleep(?DELAY), + resume_node_monitor(Config, C), [await_running(N, false) || N <- [A, B]], await_running(C, true), unblock([{A, B}]), @@ -394,6 +416,22 @@ set_mode(Config, Mode) -> set_mode(Config, Nodes, Mode) -> rabbit_ct_broker_helpers:set_partition_handling_mode(Config, Nodes, Mode). +suspend_node_monitor(Config, Node) -> + rabbit_ct_broker_helpers:rpc( + Config, Node, ?MODULE, suspend_or_resume_node_monitor, [suspend]). + +resume_node_monitor(Config, Node) -> + rabbit_ct_broker_helpers:rpc( + Config, Node, ?MODULE, suspend_or_resume_node_monitor, [resume]). + +suspend_or_resume_node_monitor(SuspendOrResume) -> + Action = case SuspendOrResume of + suspend -> "Suspending"; + resume -> "Resuming" + end, + rabbit_log:info("(~s) ~s node monitor~n", [?MODULE, Action]), + ok = sys:SuspendOrResume(rabbit_node_monitor). + block_unblock(Pairs) -> block(Pairs), timer:sleep(?DELAY), |
