Eliminate the node_stopped message, since a badly-timed stop_app could cause us to miss it. Instead, rely on detecting that rabbit has stopped on the node: if it stopped for any reason other than autoheal, we simply send it a message that it will ignore, and continue.

author: Simon MacMullen <simon@rabbitmq.com> 2014-02-26 10:18:24 +0000
committer: Simon MacMullen <simon@rabbitmq.com> 2014-02-26 10:18:24 +0000
commit: 354a4927cf2ba39b01804c7b0430b7571b2088b5 (patch)
tree: e10756bdbd6e8f236458da9fe2ef9267fde413c7 /src
parent: 05b62c27143e980529da39c552c90aae0f810539 (diff)
download: rabbitmq-server-git-354a4927cf2ba39b01804c7b0430b7571b2088b5.tar.gz
2 files changed, 22 insertions, 22 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 43d35fb5e4..7dc5e55387 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -16,7 +16,7 @@
 
 -module(rabbit_autoheal).
 
--export([init/0, maybe_start/1, node_down/2, handle_msg/3]).
+-export([init/0, maybe_start/1, rabbit_down/2, node_down/2, handle_msg/3]).
 
 %% The named process we are running in.
 -define(SERVER, rabbit_node_monitor).
@@ -75,6 +75,21 @@ maybe_start(State) ->
 enabled() ->
     {ok, autoheal} =:= application:get_env(rabbit, cluster_partition_handling).
 
+
+%% This is the winner receiving its last notification that a node has
+%% stopped - all nodes can now start again
+rabbit_down(Node, {winner_waiting, [Node], Notify}) ->
+    rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
+    [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify],
+    not_healing;
+
+rabbit_down(Node, {winner_waiting, WaitFor, Notify}) ->
+    {winner_waiting, WaitFor -- [Node], Notify};
+
+rabbit_down(_Node, State) ->
+    %% ignore, we already cancelled the autoheal process
+    State.
+
 node_down(_Node, not_healing) ->
     not_healing;
 node_down(Node, _State) ->
@@ -127,7 +142,6 @@ handle_msg({winner_is, Winner},
       fun () ->
               MRef = erlang:monitor(process, {?SERVER, Winner}),
               rabbit:stop(),
-              send(Winner, {node_stopped, node()}),
               receive
                   {'DOWN', MRef, process, {?SERVER, Winner}, _Reason} -> ok;
                   autoheal_safe_to_start                              -> ok
@@ -137,25 +151,9 @@ handle_msg({winner_is, Winner},
       end),
     restarting;
 
-%% This is the winner receiving its last notification that a node has
-%% stopped - all nodes can now start again
-handle_msg({node_stopped, Node},
-           {winner_waiting, [Node], Notify}, _Partitions) ->
-    rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
-    [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify],
-    not_healing;
-
-handle_msg({node_stopped, Node},
-           {winner_waiting, WaitFor, Notify}, _Partitions) ->
-    {winner_waiting, WaitFor -- [Node], Notify};
-
 handle_msg(_, restarting, _Partitions) ->
     %% ignore, we can contribute no further
-    restarting;
-
-handle_msg({node_stopped, _Node}, State, _Partitions) ->
-    %% ignore, we already cancelled the autoheal process
-    State.
+    restarting.
 
 %%----------------------------------------------------------------------------
 
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index c47e9b24d2..46dbd7b7a1 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -387,7 +387,8 @@ wait_for_cluster_recovery(Nodes) ->
                  wait_for_cluster_recovery(Nodes)
     end.
 
-handle_dead_rabbit(Node, State = #state{partitions = Partitions}) ->
+handle_dead_rabbit(Node, State = #state{partitions = Partitions,
+                                        autoheal   = Autoheal}) ->
     %% TODO: This may turn out to be a performance hog when there are
     %% lots of nodes.  We really only need to execute some of these
     %% statements on *one* node, rather than all of them.
@@ -404,8 +405,9 @@ handle_dead_rabbit(Node, State = #state{partitions = Partitions}) ->
                       [] -> [];
                       _  -> Partitions
                   end,
-    ensure_ping_timer(State#state{partitions = Partitions1}).
-
+    ensure_ping_timer(
+      State#state{partitions = Partitions1,
+                  autoheal   = rabbit_autoheal:rabbit_down(Node, Autoheal)}).
 
 ensure_ping_timer(State) ->
     rabbit_misc:ensure_timer(
author	Simon MacMullen <simon@rabbitmq.com>	2014-02-26 10:18:24 +0000
committer	Simon MacMullen <simon@rabbitmq.com>	2014-02-26 10:18:24 +0000
commit	354a4927cf2ba39b01804c7b0430b7571b2088b5 (patch)
tree	e10756bdbd6e8f236458da9fe2ef9267fde413c7 /src
parent	05b62c27143e980529da39c552c90aae0f810539 (diff)
download	rabbitmq-server-git-354a4927cf2ba39b01804c7b0430b7571b2088b5.tar.gz