summary refs log tree commit diff
diff options
context:
space:
mode:
author Michael Klishin <michael@clojurewerkz.org> 2017-09-04 17:44:39 -0700
committer Michael Klishin <michael@clojurewerkz.org> 2017-09-04 17:44:39 -0700
commit cfc5bde7d40c40a96a094ed0f849ef4f98ff6406 (patch)
tree 258e351b64b6f0f095c45222d59f5938063c65db
parent 282bf55214fa5ed7c445d9215416c6d37fb7986a (diff)
parent bee9a4238e2731a6bf371481056f02a940c1beb4 (diff)
download rabbitmq-server-git-cfc5bde7d40c40a96a094ed0f849ef4f98ff6406.tar.gz
Merge branch 'stable'
-rw-r--r-- src/rabbit_autoheal.erl | 26
-rw-r--r-- src/rabbit_node_monitor.erl | 22
2 files changed, 35 insertions, 13 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 58f60bb527..5324858c28 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -17,7 +17,7 @@
-module(rabbit_autoheal).
-export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2,
- handle_msg/3]).
+ handle_msg/3, process_down/2]).
%% The named process we are running in.
-define(SERVER, rabbit_node_monitor).
@@ -196,6 +196,16 @@ node_down(Node, _State) ->
rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]),
not_healing.
+%% If the process that has to restart the node crashes for an unexpected reason,
+%% we go back to a not healing state so the node is able to recover.
+process_down({'EXIT', Pid, Reason}, {restarting, Pid}) when Reason =/= normal ->
+ rabbit_log:info("Autoheal: aborting - the process responsible for restarting the "
+ "node terminated with reason: ~p~n", [Reason]),
+ not_healing;
+
+process_down(_, State) ->
+ State.
+
%% By receiving this message we become the leader
%% TODO should we try to debounce this?
handle_msg({request_start, Node},
@@ -252,17 +262,19 @@ handle_msg({become_winner, _},
handle_msg({winner_is, Winner}, State = not_healing,
_Partitions) ->
%% This node is a loser, nothing else.
- restart_loser(State, Winner),
- restarting;
+ Pid = restart_loser(State, Winner),
+ {restarting, Pid};
handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _},
_Partitions) ->
%% This node is the leader and a loser at the same time.
- restart_loser(State, Winner),
- restarting;
+ Pid = restart_loser(State, Winner),
+ {restarting, Pid};
-handle_msg(_, restarting, _Partitions) ->
+handle_msg(Request, {restarting, Pid} = St, _Partitions) ->
%% ignore, we can contribute no further
- restarting;
+ rabbit_log:info("Autoheal: Received the request ~p while waiting for ~p "
+ "to restart the node. Ignoring it ~n", [Request, Pid]),
+ St;
handle_msg(report_autoheal_status, not_healing, _Partitions) ->
%% The leader is asking about the autoheal status to us (the
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index 810df2d1fc..8bf79f7130 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -623,6 +623,10 @@ handle_info(ping_up_nodes, State) ->
[cast(N, keepalive) || N <- alive_nodes() -- [node()]],
{noreply, ensure_keepalive_timer(State#state{keepalive_timer = undefined})};
+handle_info({'EXIT', _, _} = Info, State = #state{autoheal = AState0}) ->
+ AState = rabbit_autoheal:process_down(Info, AState0),
+ {noreply, State#state{autoheal = AState}};
+
handle_info(_Info, State) ->
{noreply, State}.
@@ -686,12 +690,18 @@ await_cluster_recovery(Condition) ->
ok.
run_outside_applications(Fun, WaitForExistingProcess) ->
- spawn(fun () ->
- %% If our group leader is inside an application we are about
- %% to stop, application:stop/1 does not return.
- group_leader(whereis(init), self()),
- register_outside_app_process(Fun, WaitForExistingProcess)
- end).
+ spawn_link(fun () ->
+ %% Ignore exit messages from the monitor - the link is needed
+ %% to ensure the monitor detects abnormal exits from this process
+ %% and can reset the 'restarting' status on the autoheal, avoiding
+ %% a deadlock. The monitor is restarted when rabbit does, so messages
+ %% in the other direction should be ignored.
+ process_flag(trap_exit, true),
+ %% If our group leader is inside an application we are about
+ %% to stop, application:stop/1 does not return.
+ group_leader(whereis(init), self()),
+ register_outside_app_process(Fun, WaitForExistingProcess)
+ end).
register_outside_app_process(Fun, WaitForExistingProcess) ->
%% Ensure only one such process at a time, the exit(badarg) is