Diff options
 -rw-r--r--  src/rabbit_autoheal.erl     | 26
 -rw-r--r--  src/rabbit_node_monitor.erl | 22
2 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 58f60bb527..5324858c28 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -17,7 +17,7 @@ -module(rabbit_autoheal). -export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2, - handle_msg/3]). + handle_msg/3, process_down/2]). %% The named process we are running in. -define(SERVER, rabbit_node_monitor). @@ -196,6 +196,16 @@ node_down(Node, _State) -> rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]), not_healing. +%% If the process that has to restart the node crashes for an unexpected reason, +%% we go back to a not healing state so the node is able to recover. +process_down({'EXIT', Pid, Reason}, {restarting, Pid}) when Reason =/= normal -> + rabbit_log:info("Autoheal: aborting - the process responsible for restarting the " + "node terminated with reason: ~p~n", [Reason]), + not_healing; + +process_down(_, State) -> + State. + %% By receiving this message we become the leader %% TODO should we try to debounce this? handle_msg({request_start, Node}, @@ -252,17 +262,19 @@ handle_msg({become_winner, _}, handle_msg({winner_is, Winner}, State = not_healing, _Partitions) -> %% This node is a loser, nothing else. - restart_loser(State, Winner), - restarting; + Pid = restart_loser(State, Winner), + {restarting, Pid}; handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _}, _Partitions) -> %% This node is the leader and a loser at the same time. - restart_loser(State, Winner), - restarting; + Pid = restart_loser(State, Winner), + {restarting, Pid}; -handle_msg(_, restarting, _Partitions) -> +handle_msg(Request, {restarting, Pid} = St, _Partitions) -> %% ignore, we can contribute no further - restarting; + rabbit_log:info("Autoheal: Received the request ~p while waiting for ~p " + "to restart the node. Ignoring it ~n", [Request, Pid]), + St;
handle_msg(report_autoheal_status, not_healing, _Partitions) -> %% The leader is asking about the autoheal status to us (the diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index 810df2d1fc..8bf79f7130 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -623,6 +623,10 @@ handle_info(ping_up_nodes, State) -> [cast(N, keepalive) || N <- alive_nodes() -- [node()]], {noreply, ensure_keepalive_timer(State#state{keepalive_timer = undefined})}; +handle_info({'EXIT', _, _} = Info, State = #state{autoheal = AState0}) -> + AState = rabbit_autoheal:process_down(Info, AState0), + {noreply, State#state{autoheal = AState}}; + handle_info(_Info, State) -> {noreply, State}. @@ -686,12 +690,18 @@ await_cluster_recovery(Condition) -> ok. run_outside_applications(Fun, WaitForExistingProcess) -> - spawn(fun () -> - %% If our group leader is inside an application we are about - %% to stop, application:stop/1 does not return. - group_leader(whereis(init), self()), - register_outside_app_process(Fun, WaitForExistingProcess) - end). + spawn_link(fun () -> - %% Ignore exit messages from the monitor - the link is needed + %% to ensure the monitor detects abnormal exits from this process + %% and can reset the 'restarting' status on the autoheal, avoiding + %% a deadlock. The monitor is restarted when rabbit does, so messages + %% in the other direction should be ignored. + process_flag(trap_exit, true), + %% If our group leader is inside an application we are about + %% to stop, application:stop/1 does not return. + group_leader(whereis(init), self()), + register_outside_app_process(Fun, WaitForExistingProcess) + end). register_outside_app_process(Fun, WaitForExistingProcess) -> %% Ensure only one such process at a time, the exit(badarg) is
