Add a new "keep_preferred" cluster partition handling method

The syntax is: {cluster_partition_handling, {keep_preferred, node@domain}}. The specified node name is used to determine which partition should keep running and which should be suspended: nodes which can still reach the specified node continue to run, while nodes which can't are suspended. Compared to pause_minority, this allows the admin to determine which nodes to prioritize in case of partitions with an equal number of nodes.
author: Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com> 2014-12-02 15:43:32 +0100
committer: Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com> 2014-12-02 15:43:32 +0100
commit: 5595a265cebebdcfcfc60bbea61c5418a59996a6 (patch)
tree: c12ecc9c72663fea7207dc784a53ba921220794f
parent: 19450507c7eed92dc8c4a8c264faadf797cf040c (diff)
download: rabbitmq-server-git-5595a265cebebdcfcfc60bbea61c5418a59996a6.tar.gz
2 files changed, 74 insertions, 27 deletions
diff --git a/src/rabbit_channel.erl b/src/rabbit_channel.erl
index 13cc925cd0..bc3ee2e0eb 100644
--- a/src/rabbit_channel.erl
+++ b/src/rabbit_channel.erl
@@ -1736,7 +1736,7 @@ send_nacks(_, State) ->
 send_confirms(State = #ch{tx = none, confirmed = []}) ->
     State;
 send_confirms(State = #ch{tx = none, confirmed = C}) ->
-    case rabbit_node_monitor:pause_minority_guard() of
+    case rabbit_node_monitor:pause_partition_guard() of
         ok      -> MsgSeqNos =
                        lists:foldl(
                          fun ({MsgSeqNo, XName}, MSNs) ->
@@ -1748,7 +1748,7 @@ send_confirms(State = #ch{tx = none, confirmed = C}) ->
         pausing -> State
     end;
 send_confirms(State) ->
-    case rabbit_node_monitor:pause_minority_guard() of
+    case rabbit_node_monitor:pause_partition_guard() of
         ok      -> maybe_complete_tx(State);
         pausing -> State
     end.
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index 82a7a89be9..4af896f947 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -25,7 +25,7 @@
          update_cluster_status/0, reset_cluster_status/0]).
 -export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]).
 -export([partitions/0, partitions/1, status/1, subscribe/1]).
--export([pause_minority_guard/0]).
+-export([pause_partition_guard/0]).
 
 %% gen_server callbacks
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
@@ -64,7 +64,7 @@
 -spec(partitions/1 :: ([node()]) -> [{node(), [node()]}]).
 -spec(status/1 :: ([node()]) -> {[{node(), [node()]}], [node()]}).
 -spec(subscribe/1 :: (pid()) -> 'ok').
--spec(pause_minority_guard/0 :: () -> 'ok' | 'pausing').
+-spec(pause_partition_guard/0 :: () -> 'ok' | 'pausing').
 
 -spec(all_rabbit_nodes_up/0 :: () -> boolean()).
 -spec(run_outside_applications/1 :: (fun (() -> any())) -> pid()).
@@ -194,46 +194,66 @@ subscribe(Pid) ->
     gen_server:cast(?SERVER, {subscribe, Pid}).
 
 %%----------------------------------------------------------------------------
-%% pause_minority safety
+%% pause_minority/keep_preferred safety
 %%----------------------------------------------------------------------------
 
 %% If we are in a minority and pause_minority mode then a) we are
 %% going to shut down imminently and b) we should not confirm anything
 %% until then, since anything we confirm is likely to be lost.
 %%
-%% We could confirm something by having an HA queue see the minority
+%% The same principles apply to a node which isn't part of the preferred
+%% partition when we are in keep_preferred mode.
+%%
+%% We could confirm something by having an HA queue see the pausing
 %% state (and fail over into it) before the node monitor stops us, or
 %% by using unmirrored queues and just having them vanish (and
 %% confiming messages as thrown away).
 %%
 %% So we have channels call in here before issuing confirms, to do a
-%% lightweight check that we have not entered a minority state.
+%% lightweight check that we have not entered a pausing state.
 
-pause_minority_guard() ->
-    case get(pause_minority_guard) of
-        not_minority_mode ->
+pause_partition_guard() ->
+    case get(pause_partition_guard) of
+        not_pause_mode ->
             ok;
         undefined ->
             {ok, M} = application:get_env(rabbit, cluster_partition_handling),
             case M of
-                pause_minority -> pause_minority_guard([]);
-                _              -> put(pause_minority_guard, not_minority_mode),
-                                  ok
+                pause_minority ->
+                    pause_minority_guard([]);
+                {keep_preferred, PreferredNode} when is_atom(PreferredNode) ->
+                    keep_preferred_guard(PreferredNode, []);
+                _ ->
+                    put(pause_partition_guard, not_pause_mode),
+                    ok
             end;
         {minority_mode, Nodes} ->
-            pause_minority_guard(Nodes)
+            pause_minority_guard(Nodes);
+        {keep_preferred_mode, PreferredNode, Nodes} ->
+            keep_preferred_guard(PreferredNode, Nodes)
     end.
 
 pause_minority_guard(LastNodes) ->
     case nodes() of
         LastNodes -> ok;
-        _         -> put(pause_minority_guard, {minority_mode, nodes()}),
+        _         -> put(pause_partition_guard, {minority_mode, nodes()}),
                      case majority() of
                          false -> pausing;
                          true  -> ok
                      end
     end.
 
+keep_preferred_guard(PreferredNode, LastNodes) ->
+    case nodes() of
+        LastNodes -> ok;
+        _         -> put(pause_partition_guard,
+                         {keep_preferred_mode, PreferredNode, nodes()}),
+                     case in_preferred_partition(PreferredNode) of
+                         false -> pausing;
+                         true  -> ok
+                     end
+    end.
+
 %%----------------------------------------------------------------------------
 %% gen_server callbacks
 %%----------------------------------------------------------------------------
@@ -289,8 +309,9 @@ handle_cast(notify_node_up, State = #state{guid = GUID}) ->
 %% 'check_partial_partition' to all the nodes it still thinks are
 %% alive. If any of those (intermediate) nodes still see the "down"
 %% node as up, they inform it that this has happened. The original
-%% node (in 'ignore' or 'autoheal' mode) will then disconnect from the
-%% intermediate node to "upgrade" to a full partition.
+%% node (in 'ignore', 'keep_preferred' or 'autoheal' mode) will then
+%% disconnect from the intermediate node to "upgrade" to a full
+%% partition.
 %%
 %% In pause_minority mode it will instead immediately pause until all
 %% nodes come back. This is because the contract for pause_minority is
@@ -525,10 +546,11 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) ->
     %% that we can respond in the same way to "rabbitmqctl stop_app"
     %% and "rabbitmqctl stop" as much as possible.
     %%
-    %% However, for pause_minority mode we can't do this, since we
-    %% depend on looking at whether other nodes are up to decide
-    %% whether to come back up ourselves - if we decide that based on
-    %% the rabbit application we would go down and never come back.
+    %% However, for pause_minority and keep_preferred modes we can't do
+    %% this, since we depend on looking at whether other nodes are up
+    %% to decide whether to come back up ourselves - if we decide that
+    %% based on the rabbit application we would go down and never come
+    %% back.
     case application:get_env(rabbit, cluster_partition_handling) of
         {ok, pause_minority} ->
             case majority() of
@@ -536,6 +558,21 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) ->
                 false -> await_cluster_recovery(fun majority/0)
             end,
             State;
+        {ok, {keep_preferred, PreferredNode}} when is_atom(PreferredNode) ->
+            AllNodes = rabbit_mnesia:cluster_nodes(all),
+            case lists:member(PreferredNode, AllNodes) of
+                true ->
+                    case in_preferred_partition(PreferredNode) of
+                        true  -> ok;
+                        false -> await_cluster_recovery(
+                                    fun in_preferred_partition/0)
+                    end;
+                false ->
+                    rabbit_log:warning("cluster_partition_handling: preferred "
+                                       "node ~s not part of the cluster, "
+                                       "assuming 'ignore'~n", [PreferredNode])
+            end,
+            State;
         {ok, ignore} ->
             State;
         {ok, autoheal} ->
@@ -547,8 +584,8 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) ->
     end.
 
 await_cluster_recovery(Condition) ->
-    rabbit_log:warning("Cluster minority status detected - awaiting recovery~n",
-                       []),
+    rabbit_log:warning("Cluster minority/secondary status detected - "
+                       "awaiting recovery~n", []),
     run_outside_applications(fun () ->
                                      rabbit:stop(),
                                      wait_for_cluster_recovery(Condition)
@@ -681,15 +718,25 @@ disconnect(Node) ->
 %% here. "rabbit" in a function's name implies we test if the rabbit
 %% application is up, not just the node.
 
-%% As we use these functions to decide what to do in pause_minority
-%% state, they *must* be fast, even in the case where TCP connections
-%% are timing out. So that means we should be careful about whether we
-%% connect to nodes which are currently disconnected.
+%% As we use these functions to decide what to do in pause_minority or
+%% keep_preferred states, they *must* be fast, even in the case where
+%% TCP connections are timing out. So that means we should be careful
+%% about whether we connect to nodes which are currently disconnected.
 
 majority() ->
     Nodes = rabbit_mnesia:cluster_nodes(all),
     length(alive_nodes(Nodes)) / length(Nodes) > 0.5.
 
+in_preferred_partition() ->
+    {ok, {keep_preferred, PreferredNode}} =
+        application:get_env(rabbit, cluster_partition_handling),
+    in_preferred_partition(PreferredNode).
+
+in_preferred_partition(PreferredNode) ->
+    Nodes = rabbit_mnesia:cluster_nodes(all),
+    lists:member(PreferredNode, Nodes) andalso
+    alive_nodes([PreferredNode]) =/= [].
+
 all_nodes_up() ->
     Nodes = rabbit_mnesia:cluster_nodes(all),
     length(alive_nodes(Nodes)) =:= length(Nodes).
author	Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com>	2014-12-02 15:43:32 +0100
committer	Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com>	2014-12-02 15:43:32 +0100
commit	5595a265cebebdcfcfc60bbea61c5418a59996a6 (patch)
tree	c12ecc9c72663fea7207dc784a53ba921220794f
parent	19450507c7eed92dc8c4a8c264faadf797cf040c (diff)
download	rabbitmq-server-git-5595a265cebebdcfcfc60bbea61c5418a59996a6.tar.gz