diff options
| author | Simon MacMullen <simon@rabbitmq.com> | 2014-02-26 10:18:24 +0000 |
|---|---|---|
| committer | Simon MacMullen <simon@rabbitmq.com> | 2014-02-26 10:18:24 +0000 |
| commit | 354a4927cf2ba39b01804c7b0430b7571b2088b5 (patch) | |
| tree | e10756bdbd6e8f236458da9fe2ef9267fde413c7 /src | |
| parent | 05b62c27143e980529da39c552c90aae0f810539 (diff) | |
| download | rabbitmq-server-git-354a4927cf2ba39b01804c7b0430b7571b2088b5.tar.gz | |
Eliminate the node_stopped message, since a badly-timed stop_app could cause us to miss it. Instead, just go based on whether the rabbit stops: if it stops for any reason other than autoheal, we just send it a message that it will ignore and continue.
Diffstat (limited to 'src')
| -rw-r--r-- | src/rabbit_autoheal.erl | 36 | ||||
| -rw-r--r-- | src/rabbit_node_monitor.erl | 8 |
2 files changed, 22 insertions, 22 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 43d35fb5e4..7dc5e55387 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -16,7 +16,7 @@ -module(rabbit_autoheal). --export([init/0, maybe_start/1, node_down/2, handle_msg/3]). +-export([init/0, maybe_start/1, rabbit_down/2, node_down/2, handle_msg/3]). %% The named process we are running in. -define(SERVER, rabbit_node_monitor). @@ -75,6 +75,21 @@ maybe_start(State) -> enabled() -> {ok, autoheal} =:= application:get_env(rabbit, cluster_partition_handling). + +%% This is the winner receiving its last notification that a node has +%% stopped - all nodes can now start again +rabbit_down(Node, {winner_waiting, [Node], Notify}) -> + rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]), + [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify], + not_healing; + +rabbit_down(Node, {winner_waiting, WaitFor, Notify}) -> + {winner_waiting, WaitFor -- [Node], Notify}; + +rabbit_down(_Node, State) -> + %% ignore, we already cancelled the autoheal process + State. + node_down(_Node, not_healing) -> not_healing; node_down(Node, _State) -> @@ -127,7 +142,6 @@ handle_msg({winner_is, Winner}, fun () -> MRef = erlang:monitor(process, {?SERVER, Winner}), rabbit:stop(), - send(Winner, {node_stopped, node()}), receive {'DOWN', MRef, process, {?SERVER, Winner}, _Reason} -> ok; autoheal_safe_to_start -> ok @@ -137,25 +151,9 @@ handle_msg({winner_is, Winner}, end), restarting; -%% This is the winner receiving its last notification that a node has -%% stopped - all nodes can now start again -handle_msg({node_stopped, Node}, - {winner_waiting, [Node], Notify}, _Partitions) -> - rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]), - [{rabbit_outside_app_process, N} ! 
autoheal_safe_to_start || N <- Notify], - not_healing; - -handle_msg({node_stopped, Node}, - {winner_waiting, WaitFor, Notify}, _Partitions) -> - {winner_waiting, WaitFor -- [Node], Notify}; - handle_msg(_, restarting, _Partitions) -> %% ignore, we can contribute no further - restarting; - -handle_msg({node_stopped, _Node}, State, _Partitions) -> - %% ignore, we already cancelled the autoheal process - State. + restarting. %%---------------------------------------------------------------------------- diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index c47e9b24d2..46dbd7b7a1 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -387,7 +387,8 @@ wait_for_cluster_recovery(Nodes) -> wait_for_cluster_recovery(Nodes) end. -handle_dead_rabbit(Node, State = #state{partitions = Partitions}) -> +handle_dead_rabbit(Node, State = #state{partitions = Partitions, + autoheal = Autoheal}) -> %% TODO: This may turn out to be a performance hog when there are %% lots of nodes. We really only need to execute some of these %% statements on *one* node, rather than all of them. @@ -404,8 +405,9 @@ handle_dead_rabbit(Node, State = #state{partitions = Partitions}) -> [] -> []; _ -> Partitions end, - ensure_ping_timer(State#state{partitions = Partitions1}). - + ensure_ping_timer( + State#state{partitions = Partitions1, + autoheal = rabbit_autoheal:rabbit_down(Node, Autoheal)}). ensure_ping_timer(State) -> rabbit_misc:ensure_timer( |
