Link the process responsible for the restart during autoheal, and abort if needed

If the process responsible for the restart crashes unexpectedly, the autoheal process deadlocks in the 'restarting' state, ignoring any new request from the winner. If the crash happens before the process is registered, no logs are generated. Thus, we need to link to that process and abort the autoheal if it terminates with any reason other than normal. rabbitmq-server#1346 [#150707017]
author: Diana Corbacho <diana@rabbitmq.com> 2017-08-31 11:24:11 +0100
committer: Diana Corbacho <diana@rabbitmq.com> 2017-08-31 12:05:55 +0100
commit: a04658d53ecb375e43dd2e18c7b74fae9b3039fe (patch)
tree: 885d7c072da4649076cbaef813a04f90b2840a8e
parent: f2afe31fcca09a24c17debac1c07b4bba6de13c4 (diff)
download: rabbitmq-server-git-a04658d53ecb375e43dd2e18c7b74fae9b3039fe.tar.gz
2 files changed, 23 insertions, 7 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 58f60bb527..5324858c28 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -17,7 +17,7 @@
 -module(rabbit_autoheal).
 
 -export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2,
-         handle_msg/3]).
+         handle_msg/3, process_down/2]).
 
 %% The named process we are running in.
 -define(SERVER, rabbit_node_monitor).
@@ -196,6 +196,16 @@ node_down(Node, _State) ->
     rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]),
     not_healing.
 
+%% If the process that has to restart the node crashes for an unexpected reason,
+%% we go back to a not healing state so the node is able to recover.
+process_down({'EXIT', Pid, Reason}, {restarting, Pid}) when Reason =/= normal ->
+    rabbit_log:info("Autoheal: aborting - the process responsible for restarting the "
+                    "node terminated with reason: ~p~n", [Reason]),
+    not_healing;
+
+process_down(_, State) ->
+    State.
+
 %% By receiving this message we become the leader
 %% TODO should we try to debounce this?
 handle_msg({request_start, Node},
@@ -252,17 +262,19 @@ handle_msg({become_winner, _},
 handle_msg({winner_is, Winner}, State = not_healing,
            _Partitions) ->
     %% This node is a loser, nothing else.
-    restart_loser(State, Winner),
-    restarting;
+    Pid = restart_loser(State, Winner),
+    {restarting, Pid};
 handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _},
            _Partitions) ->
     %% This node is the leader and a loser at the same time.
-    restart_loser(State, Winner),
-    restarting;
+    Pid = restart_loser(State, Winner),
+    {restarting, Pid};
 
-handle_msg(_, restarting, _Partitions) ->
+handle_msg(Request, {restarting, Pid} = St, _Partitions) ->
     %% ignore, we can contribute no further
-    restarting;
+    rabbit_log:info("Autoheal: Received the request ~p while waiting for ~p "
+                    "to restart the node. Ignoring it ~n", [Request, Pid]),
+    St;
 
 handle_msg(report_autoheal_status, not_healing, _Partitions) ->
     %% The leader is asking about the autoheal status to us (the
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index 810df2d1fc..a8e21f1054 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -623,6 +623,10 @@ handle_info(ping_up_nodes, State) ->
     [cast(N, keepalive) || N <- alive_nodes() -- [node()]],
     {noreply, ensure_keepalive_timer(State#state{keepalive_timer = undefined})};
 
+handle_info({'EXIT', _, _} = Info, State = #state{autoheal = AState0}) ->
+    AState = rabbit_autoheal:process_down(Info, AState0),
+    {noreply, State#state{autoheal = AState}};
+
 handle_info(_Info, State) ->
     {noreply, State}.
author	Diana Corbacho <diana@rabbitmq.com>	2017-08-31 11:24:11 +0100
committer	Diana Corbacho <diana@rabbitmq.com>	2017-08-31 12:05:55 +0100
commit	a04658d53ecb375e43dd2e18c7b74fae9b3039fe (patch)
tree	885d7c072da4649076cbaef813a04f90b2840a8e
parent	f2afe31fcca09a24c17debac1c07b4bba6de13c4 (diff)
download	rabbitmq-server-git-a04658d53ecb375e43dd2e18c7b74fae9b3039fe.tar.gz