diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/rabbit_autoheal.erl | 9 | ||||
| -rw-r--r-- | src/rabbit_channel.erl | 4 | ||||
| -rw-r--r-- | src/rabbit_node_monitor.erl | 93 |
3 files changed, 77 insertions, 29 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 09e9aa6ae9..af1795f953 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -16,7 +16,8 @@ -module(rabbit_autoheal). --export([init/0, maybe_start/1, rabbit_down/2, node_down/2, handle_msg/3]). +-export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2, + handle_msg/3]). %% The named process we are running in. -define(SERVER, rabbit_node_monitor). @@ -90,7 +91,11 @@ maybe_start(State) -> State. enabled() -> - {ok, autoheal} =:= application:get_env(rabbit, cluster_partition_handling). + case application:get_env(rabbit, cluster_partition_handling) of + {ok, autoheal} -> true; + {ok, {pause_if_all_down, _, autoheal}} -> true; + _ -> false + end. %% This is the winner receiving its last notification that a node has diff --git a/src/rabbit_channel.erl b/src/rabbit_channel.erl index 83b3c53d0c..63a5eb7e79 100644 --- a/src/rabbit_channel.erl +++ b/src/rabbit_channel.erl @@ -1736,7 +1736,7 @@ send_nacks(_, State) -> send_confirms(State = #ch{tx = none, confirmed = []}) -> State; send_confirms(State = #ch{tx = none, confirmed = C}) -> - case rabbit_node_monitor:pause_minority_guard() of + case rabbit_node_monitor:pause_partition_guard() of ok -> MsgSeqNos = lists:foldl( fun ({MsgSeqNo, XName}, MSNs) -> @@ -1748,7 +1748,7 @@ send_confirms(State = #ch{tx = none, confirmed = C}) -> pausing -> State end; send_confirms(State) -> - case rabbit_node_monitor:pause_minority_guard() of + case rabbit_node_monitor:pause_partition_guard() of ok -> maybe_complete_tx(State); pausing -> State end. diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index 82a7a89be9..12ac08120a 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -25,7 +25,7 @@ update_cluster_status/0, reset_cluster_status/0]). -export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]). -export([partitions/0, partitions/1, status/1, subscribe/1]). 
--export([pause_minority_guard/0]). +-export([pause_partition_guard/0]). %% gen_server callbacks -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, @@ -64,7 +64,7 @@ -spec(partitions/1 :: ([node()]) -> [{node(), [node()]}]). -spec(status/1 :: ([node()]) -> {[{node(), [node()]}], [node()]}). -spec(subscribe/1 :: (pid()) -> 'ok'). --spec(pause_minority_guard/0 :: () -> 'ok' | 'pausing'). +-spec(pause_partition_guard/0 :: () -> 'ok' | 'pausing'). -spec(all_rabbit_nodes_up/0 :: () -> boolean()). -spec(run_outside_applications/1 :: (fun (() -> any())) -> pid()). @@ -194,46 +194,66 @@ subscribe(Pid) -> gen_server:cast(?SERVER, {subscribe, Pid}). %%---------------------------------------------------------------------------- -%% pause_minority safety +%% pause_minority/pause_if_all_down safety %%---------------------------------------------------------------------------- %% If we are in a minority and pause_minority mode then a) we are %% going to shut down imminently and b) we should not confirm anything %% until then, since anything we confirm is likely to be lost. %% -%% We could confirm something by having an HA queue see the minority +%% The same principles apply to a node which isn't part of the preferred +%% partition when we are in pause_if_all_down mode. +%% +%% We could confirm something by having an HA queue see the pausing %% state (and fail over into it) before the node monitor stops us, or %% by using unmirrored queues and just having them vanish (and %% confirming messages as thrown away). %% %% So we have channels call in here before issuing confirms, to do a -%% lightweight check that we have not entered a minority state. +%% lightweight check that we have not entered a pausing state. 
-pause_minority_guard() -> - case get(pause_minority_guard) of - not_minority_mode -> +pause_partition_guard() -> + case get(pause_partition_guard) of + not_pause_mode -> ok; undefined -> {ok, M} = application:get_env(rabbit, cluster_partition_handling), case M of - pause_minority -> pause_minority_guard([]); - _ -> put(pause_minority_guard, not_minority_mode), - ok + pause_minority -> + pause_minority_guard([]); + {pause_if_all_down, PreferredNodes, _} -> + pause_if_all_down_guard(PreferredNodes, []); + _ -> + put(pause_partition_guard, not_pause_mode), + ok end; {minority_mode, Nodes} -> - pause_minority_guard(Nodes) + pause_minority_guard(Nodes); + {pause_if_all_down_mode, PreferredNodes, Nodes} -> + pause_if_all_down_guard(PreferredNodes, Nodes) end. pause_minority_guard(LastNodes) -> case nodes() of LastNodes -> ok; - _ -> put(pause_minority_guard, {minority_mode, nodes()}), + _ -> put(pause_partition_guard, {minority_mode, nodes()}), case majority() of false -> pausing; true -> ok end end. +pause_if_all_down_guard(PreferredNodes, LastNodes) -> + case nodes() of + LastNodes -> ok; + _ -> put(pause_partition_guard, + {pause_if_all_down_mode, PreferredNodes, nodes()}), + case in_preferred_partition(PreferredNodes) of + false -> pausing; + true -> ok + end + end. + %%---------------------------------------------------------------------------- %% gen_server callbacks %%---------------------------------------------------------------------------- @@ -289,8 +309,9 @@ handle_cast(notify_node_up, State = #state{guid = GUID}) -> %% 'check_partial_partition' to all the nodes it still thinks are %% alive. If any of those (intermediate) nodes still see the "down" %% node as up, they inform it that this has happened. The original -%% node (in 'ignore' or 'autoheal' mode) will then disconnect from the -%% intermediate node to "upgrade" to a full partition. 
+%% node (in 'ignore', 'pause_if_all_down' or 'autoheal' mode) will then +%% disconnect from the intermediate node to "upgrade" to a full +%% partition. %% %% In pause_minority mode it will instead immediately pause until all %% nodes come back. This is because the contract for pause_minority is @@ -525,10 +546,11 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) -> %% that we can respond in the same way to "rabbitmqctl stop_app" %% and "rabbitmqctl stop" as much as possible. %% - %% However, for pause_minority mode we can't do this, since we - %% depend on looking at whether other nodes are up to decide - %% whether to come back up ourselves - if we decide that based on - %% the rabbit application we would go down and never come back. + %% However, for pause_minority and pause_if_all_down modes we can't do + %% this, since we depend on looking at whether other nodes are up + %% to decide whether to come back up ourselves - if we decide that + %% based on the rabbit application we would go down and never come + %% back. case application:get_env(rabbit, cluster_partition_handling) of {ok, pause_minority} -> case majority() of @@ -536,6 +558,17 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) -> false -> await_cluster_recovery(fun majority/0) end, State; + {ok, {pause_if_all_down, PreferredNodes, HowToRecover}} -> + case in_preferred_partition(PreferredNodes) of + true -> ok; + false -> await_cluster_recovery( + fun in_preferred_partition/0) + end, + case HowToRecover of + autoheal -> State#state{autoheal = + rabbit_autoheal:node_down(Node, Autoheal)}; + _ -> State + end; {ok, ignore} -> State; {ok, autoheal} -> @@ -547,8 +580,8 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) -> end. 
await_cluster_recovery(Condition) -> - rabbit_log:warning("Cluster minority status detected - awaiting recovery~n", - []), + rabbit_log:warning("Cluster minority/secondary status detected - " + "awaiting recovery~n", []), run_outside_applications(fun () -> rabbit:stop(), wait_for_cluster_recovery(Condition) @@ -681,15 +714,25 @@ disconnect(Node) -> %% here. "rabbit" in a function's name implies we test if the rabbit %% application is up, not just the node. -%% As we use these functions to decide what to do in pause_minority -%% state, they *must* be fast, even in the case where TCP connections -%% are timing out. So that means we should be careful about whether we -%% connect to nodes which are currently disconnected. +%% As we use these functions to decide what to do in pause_minority or +%% pause_if_all_down states, they *must* be fast, even in the case where +%% TCP connections are timing out. So that means we should be careful +%% about whether we connect to nodes which are currently disconnected. majority() -> Nodes = rabbit_mnesia:cluster_nodes(all), length(alive_nodes(Nodes)) / length(Nodes) > 0.5. +in_preferred_partition() -> + {ok, {pause_if_all_down, PreferredNodes, _}} = + application:get_env(rabbit, cluster_partition_handling), + in_preferred_partition(PreferredNodes). + +in_preferred_partition(PreferredNodes) -> + Nodes = rabbit_mnesia:cluster_nodes(all), + RealPreferredNodes = [N || N <- PreferredNodes, lists:member(N, Nodes)], + RealPreferredNodes =:= [] orelse alive_nodes(RealPreferredNodes) =/= []. + all_nodes_up() -> Nodes = rabbit_mnesia:cluster_nodes(all), length(alive_nodes(Nodes)) =:= length(Nodes). |
