summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Simon MacMullen <simon@rabbitmq.com> 2014-10-14 15:16:34 +0100
committer Simon MacMullen <simon@rabbitmq.com> 2014-10-14 15:16:34 +0100
commit 621628e07b87181af8ebd609c945e5d551b198d1 (patch)
tree 0ab928780eae02065aa5fe72607ee031b257b345
parent 99912404206130f34b8aa370150b624ee1320219 (diff)
download rabbitmq-server-git-621628e07b87181af8ebd609c945e5d551b198d1.tar.gz
Switch to having the winner, rather than the leader, inform the losers that they need to stop. This fixes a race in which the leader could tell the losers to stop before the partition had healed from the winner's point of view. The result should be both simpler and more correct.
-rw-r--r-- src/rabbit_autoheal.erl 33
1 files changed, 8 insertions, 25 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 5a6cd48f6e..259e6ec25b 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -127,7 +127,6 @@ handle_msg({request_start, Node},
" * Winner: ~p~n"
" * Losers: ~p~n",
[AllPartitions, Winner, Losers]),
- [send(L, {winner_is, Winner}) || L <- Losers],
Continue = fun(Msg) ->
handle_msg(Msg, not_healing, Partitions)
end,
@@ -153,7 +152,14 @@ handle_msg({become_winner, Losers},
not_healing, _Partitions) ->
rabbit_log:info("Autoheal: I am the winner, waiting for ~p to stop~n",
[Losers]),
- filter_already_down_losers(Losers, Losers);
+ %% The leader said everything was ready - do we agree? If not then
+ %% give up.
+ Down = Losers -- rabbit_node_monitor:alive_rabbit_nodes(Losers),
+ case Down of
+ [] -> [send(L, {winner_is, node()}) || L <- Losers],
+ {winner_waiting, Losers, Losers};
+ _ -> abort(Down, Losers)
+ end;
handle_msg({winner_is, Winner},
not_healing, _Partitions) ->
@@ -224,26 +230,3 @@ all_partitions([{Node, CantSee} | Rest], Partitions) ->
_ -> [A, B | Others]
end,
all_partitions(Rest, Partitions1).
-
-%% We could have received and ignored DOWN messages from some losers
-%% before becoming the winner - check for already down nodes.
-filter_already_down_losers(WantStopped, Notify) ->
- Down = WantStopped -- rabbit_node_monitor:alive_nodes(WantStopped),
- case Down of
- [] ->
- Running = rabbit_node_monitor:alive_rabbit_nodes(WantStopped),
- AlreadyStopped = WantStopped -- Running,
- case AlreadyStopped of
- [] -> ok;
- _ -> rabbit_log:info(
- "Autoheal: ~p already down~n", [AlreadyStopped])
- end,
- case Running of
- [] -> rabbit_log:info(
- "Autoheal: final node has stopped, starting...~n",[]),
- winner_finish(Notify);
- _ -> {winner_waiting, Running, Notify}
- end;
- _ ->
- abort(Down, Notify)
- end.