summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com> 2014-12-09 19:14:28 +0100
committer Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com> 2014-12-09 19:14:28 +0100
commit 07d93ab1916933c4663f10f91fa705e02e5c7cf0 (patch)
tree 3bde494cd06a95b675af77ad49c9aaf75de940d9
parent 28f82588c0ea05fa9e69b370ce03a0a316ce9f24 (diff)
download rabbitmq-server-git-07d93ab1916933c4663f10f91fa705e02e5c7cf0.tar.gz
Autoheal: The losing leader must wait for the winner_is message
Like any other losing node, the leader must wait for the winner_is message instead of restarting immediately. The previous behaviour caused transient failures in the autoheal process if the leader was in the middle of its restart at the time the winner checked that all losing nodes were up and running.
-rw-r--r-- src/rabbit_autoheal.erl 9
1 files changed, 7 insertions, 2 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 90458741b7..7089911c32 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -54,6 +54,10 @@
%% - we are the winner and are waiting for all losing nodes to stop
%% before telling them they can restart
%%
+%% about_to_heal
+%% - we are the leader, and have already assigned the winner and losers.
+%% We are part of the losers and we wait for the winner_is announcement.
+%%
%% {leader_waiting, OutstandingStops}
%% - we are the leader, and have already assigned the winner and losers.
%% We are neither but need to ignore further requests to autoheal.
@@ -135,7 +139,7 @@ handle_msg({request_start, Node},
true -> Continue({become_winner, Losers});
false -> send(Winner, {become_winner, Losers}), %% [0]
case lists:member(node(), Losers) of
- true -> Continue({winner_is, Winner});
+ true -> about_to_heal;
false -> {leader_waiting, Losers}
end
end
@@ -163,7 +167,8 @@ handle_msg({become_winner, Losers},
end;
handle_msg({winner_is, Winner},
- not_healing, _Partitions) ->
+ State, _Partitions)
+ when State =:= not_healing orelse State =:= about_to_heal ->
rabbit_log:warning(
"Autoheal: we were selected to restart; winner is ~p~n", [Winner]),
rabbit_node_monitor:run_outside_applications(