Diff options
 -rw-r--r--  src/rabbit_autoheal.erl     | 26
 -rw-r--r--  src/rabbit_node_monitor.erl | 22
2 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 58f60bb527..5324858c28 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -17,7 +17,7 @@ -module(rabbit_autoheal). -export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2, - handle_msg/3]). + handle_msg/3, process_down/2]). %% The named process we are running in. -define(SERVER, rabbit_node_monitor). @@ -196,6 +196,16 @@ node_down(Node, _State) -> rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]), not_healing. +%% If the process that has to restart the node crashes for an unexpected reason, +%% we go back to a not healing state so the node is able to recover. +process_down({'EXIT', Pid, Reason}, {restarting, Pid}) when Reason =/= normal -> + rabbit_log:info("Autoheal: aborting - the process responsible for restarting the " + "node terminated with reason: ~p~n", [Reason]), + not_healing; + +process_down(_, State) -> + State. + %% By receiving this message we become the leader %% TODO should we try to debounce this? handle_msg({request_start, Node}, @@ -252,17 +262,19 @@ handle_msg({become_winner, _}, handle_msg({winner_is, Winner}, State = not_healing, _Partitions) -> %% This node is a loser, nothing else. - restart_loser(State, Winner), - restarting; + Pid = restart_loser(State, Winner), + {restarting, Pid}; handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _}, _Partitions) -> %% This node is the leader and a loser at the same time. - restart_loser(State, Winner), - restarting; + Pid = restart_loser(State, Winner), + {restarting, Pid}; -handle_msg(_, restarting, _Partitions) -> +handle_msg(Request, {restarting, Pid} = St, _Partitions) -> %% ignore, we can contribute no further - restarting; + rabbit_log:info("Autoheal: Received the request ~p while waiting for ~p " + "to restart the node. Ignoring it ~n", [Request, Pid]), + St;
handle_msg(report_autoheal_status, not_healing, _Partitions) -> %% The leader is asking about the autoheal status to us (the diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index 810df2d1fc..8bf79f7130 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -623,6 +623,10 @@ handle_info(ping_up_nodes, State) -> [cast(N, keepalive) || N <- alive_nodes() -- [node()]], {noreply, ensure_keepalive_timer(State#state{keepalive_timer = undefined})}; +handle_info({'EXIT', _, _} = Info, State = #state{autoheal = AState0}) -> + AState = rabbit_autoheal:process_down(Info, AState0), + {noreply, State#state{autoheal = AState}}; + handle_info(_Info, State) -> {noreply, State}. @@ -686,12 +690,18 @@ await_cluster_recovery(Condition) -> ok. run_outside_applications(Fun, WaitForExistingProcess) -> - spawn(fun () -> - %% If our group leader is inside an application we are about - %% to stop, application:stop/1 does not return. - group_leader(whereis(init), self()), - register_outside_app_process(Fun, WaitForExistingProcess) - end). + spawn_link(fun () -> - %% Ignore exit messages from the monitor - the link is needed + %% to ensure the monitor detects abnormal exits from this process + %% and can reset the 'restarting' status on the autoheal, avoiding + %% a deadlock. The monitor is restarted when rabbit does, so messages + %% in the other direction should be ignored. + process_flag(trap_exit, true), + %% If our group leader is inside an application we are about + %% to stop, application:stop/1 does not return. + group_leader(whereis(init), self()), + register_outside_app_process(Fun, WaitForExistingProcess) + end). register_outside_app_process(Fun, WaitForExistingProcess) -> %% Ensure only one such process at a time, the exit(badarg) is
