diff options
| author | Diana Corbacho <diana@rabbitmq.com> | 2017-08-31 11:24:11 +0100 |
|---|---|---|
| committer | Diana Corbacho <diana@rabbitmq.com> | 2017-08-31 12:05:55 +0100 |
| commit | a04658d53ecb375e43dd2e18c7b74fae9b3039fe (patch) | |
| tree | 885d7c072da4649076cbaef813a04f90b2840a8e | |
| parent | f2afe31fcca09a24c17debac1c07b4bba6de13c4 (diff) | |
| download | rabbitmq-server-git-a04658d53ecb375e43dd2e18c7b74fae9b3039fe.tar.gz | |
Link the process responsible for the restart during autoheal, and abort if needed
If the process crashes unexpectedly, the autoheal process gets into a
deadlock in the 'restarting' state, ignoring any new request from the winner.
If the crash happens before the process is registered, no logs are generated.
Thus, we need to link that process and abort the autoheal if the process exits
with a reason other than 'normal'.
rabbitmq-server#1346
[#150707017]
| -rw-r--r-- | src/rabbit_autoheal.erl | 26 | ||||
| -rw-r--r-- | src/rabbit_node_monitor.erl | 4 |
2 files changed, 23 insertions, 7 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 58f60bb527..5324858c28 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -17,7 +17,7 @@ -module(rabbit_autoheal). -export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2, - handle_msg/3]). + handle_msg/3, process_down/2]). %% The named process we are running in. -define(SERVER, rabbit_node_monitor). @@ -196,6 +196,16 @@ node_down(Node, _State) -> rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]), not_healing. +%% If the process that has to restart the node crashes for an unexpected reason, +%% we go back to a not healing state so the node is able to recover. +process_down({'EXIT', Pid, Reason}, {restarting, Pid}) when Reason =/= normal -> + rabbit_log:info("Autoheal: aborting - the process responsible for restarting the " + "node terminated with reason: ~p~n", [Reason]), + not_healing; + +process_down(_, State) -> + State. + %% By receiving this message we become the leader %% TODO should we try to debounce this? handle_msg({request_start, Node}, @@ -252,17 +262,19 @@ handle_msg({become_winner, _}, handle_msg({winner_is, Winner}, State = not_healing, _Partitions) -> %% This node is a loser, nothing else. - restart_loser(State, Winner), - restarting; + Pid = restart_loser(State, Winner), + {restarting, Pid}; handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _}, _Partitions) -> %% This node is the leader and a loser at the same time. - restart_loser(State, Winner), - restarting; + Pid = restart_loser(State, Winner), + {restarting, Pid}; -handle_msg(_, restarting, _Partitions) -> +handle_msg(Request, {restarting, Pid} = St, _Partitions) -> %% ignore, we can contribute no further - restarting; + rabbit_log:info("Autoheal: Received the request ~p while waiting for ~p " + "to restart the node. 
Ignoring it ~n", [Request, Pid]), + St; handle_msg(report_autoheal_status, not_healing, _Partitions) -> %% The leader is asking about the autoheal status to us (the diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index 810df2d1fc..a8e21f1054 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -623,6 +623,10 @@ handle_info(ping_up_nodes, State) -> [cast(N, keepalive) || N <- alive_nodes() -- [node()]], {noreply, ensure_keepalive_timer(State#state{keepalive_timer = undefined})}; +handle_info({'EXIT', _, _} = Info, State = #state{autoheal = AState0}) -> + AState = rabbit_autoheal:process_down(Info, AState0), + {noreply, State#state{autoheal = AState}}; + handle_info(_Info, State) -> {noreply, State}. |
