summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrancesco Mazzoli <francesco@rabbitmq.com>2012-07-05 18:02:00 +0100
committerFrancesco Mazzoli <francesco@rabbitmq.com>2012-07-05 18:02:00 +0100
commit48c7fa88e07b145464c69105242a2d03b94be788 (patch)
treeacc20ae81aa421c13cf2b49abb0075dd0887fc36
parent991d7e74919b7c4656e35d967f11953e3035b588 (diff)
downloadrabbitmq-server-git-48c7fa88e07b145464c69105242a2d03b94be788.tar.gz
fix `remove_node/1' - do not remove nodes from offline nodes, unless we're "safe"
We want to remove nodes from offline nodes only when all the cluster nodes are offline, and when the node we are removing from was either the last to go down or the second-to-last after the node we are removing. However, we cannot know that for sure. For this reason rabbit will refuse to remove from offline nodes unless the flag `--offline' is passed. There is more information in the comments around `rabbit_mnesia:remove_node/2'.
-rw-r--r--src/rabbit_control_main.erl10
-rw-r--r--src/rabbit_mnesia.erl108
2 files changed, 81 insertions, 37 deletions
diff --git a/src/rabbit_control_main.erl b/src/rabbit_control_main.erl
index d927206bdb..10bb0f1bd7 100644
--- a/src/rabbit_control_main.erl
+++ b/src/rabbit_control_main.erl
@@ -26,11 +26,13 @@
-define(NODE_OPT, "-n").
-define(VHOST_OPT, "-p").
-define(RAM_OPT, "--ram").
+-define(OFFLINE_OPT, "--offline").
-define(QUIET_DEF, {?QUIET_OPT, flag}).
-define(NODE_DEF(Node), {?NODE_OPT, {option, Node}}).
-define(VHOST_DEF, {?VHOST_OPT, {option, "/"}}).
-define(RAM_DEF, {?RAM_OPT, flag}).
+-define(OFFLINE_DEF, {?OFFLINE_OPT, flag}).
-define(GLOBAL_DEFS(Node), [?QUIET_DEF, ?NODE_DEF(Node)]).
@@ -46,7 +48,7 @@
{join_cluster, [?RAM_DEF]},
change_node_type,
recluster,
- remove_node,
+ {remove_node, [?OFFLINE_OPT]},
cluster_status,
add_user,
@@ -256,10 +258,12 @@ action(recluster, Node, [ClusterNodeS], _Opts, Inform) ->
Inform("Re-clustering ~p with ~p", [Node, ClusterNode]),
rpc_call(Node, rabbit_mnesia, recluster, [ClusterNode]);
-action(remove_node, Node, [ClusterNodeS], _Opts, Inform) ->
+action(remove_node, Node, [ClusterNodeS], Opts, Inform) ->
ClusterNode = list_to_atom(ClusterNodeS),
+ RemoveWhenOffline = proplists:get_bool(?OFFLINE_OPT, Opts),
Inform("Removing node ~p from cluster", [ClusterNode]),
- rpc_call(Node, rabbit_mnesia, remove_node, [ClusterNode]);
+ rpc_call(Node, rabbit_mnesia, remove_node,
+ [ClusterNode, RemoveWhenOffline]);
action(wait, Node, [PidFile], _Opts, Inform) ->
Inform("Waiting for ~p", [Node]),
diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl
index d285123168..400c3d7d0f 100644
--- a/src/rabbit_mnesia.erl
+++ b/src/rabbit_mnesia.erl
@@ -24,7 +24,7 @@
force_reset/0,
recluster/1,
change_node_type/1,
- remove_node/1,
+ remove_node/2,
status/0,
is_db_empty/0,
@@ -77,7 +77,7 @@
-spec(force_reset/0 :: () -> 'ok').
-spec(recluster/1 :: (node()) -> 'ok').
-spec(change_node_type/1 :: (node_type()) -> 'ok').
--spec(remove_node/1 :: (node()) -> 'ok').
+-spec(remove_node/2 :: (node(), boolean()) -> 'ok').
%% Various queries to get the status of the db
-spec(status/0 :: () -> [{'nodes', [{node_type(), [node()]}]} |
@@ -294,44 +294,82 @@ recluster(DiscoveryNode) ->
ok.
-%% We proceed like this: try to remove the node locally. If mnesia is offline
-%% then we try to remove it remotely on some other node. If there are no other
-%% nodes running, then *if the current node is a disc node* we force-load mnesia
-%% and remove the node.
-remove_node(Node) ->
- case ordsets:is_element(Node, all_clustered_nodes()) of
+%% We proceed like this: try to remove the node locally. If the node is offline,
+%% we remove the node if:
+%% * This node is a disc node
+%% * All other nodes are offline
+%% * This node was, to the best of our knowledge (see comment below), the last
+%% or second to last after the node we're removing to go down
+remove_node(Node, RemoveWhenOffline) ->
+ case is_clustered() of
true -> ok;
false -> throw({error, {not_a_cluster_node,
"The node selected is not in the cluster."}})
end,
+ case {mnesia:system_info(is_running), RemoveWhenOffline} of
+ {yes, true} -> throw({error, {online_node_offline_flag,
+ "You set the --offline flag, which is "
+ "used to remove nodes remotely from "
+ "offline nodes, but this node is "
+ "online. "}});
+ _ -> ok
+ end,
case remove_node_if_mnesia_running(Node) of
ok ->
ok;
{error, mnesia_not_running} ->
- case remove_node_remotely(Node) of
- ok ->
- ok;
- {error, no_running_cluster_nodes} ->
- case is_disc_node() of
- false ->
- throw({error,
- {removing_node_from_ram_node,
- "There are no nodes running and this is a "
- "RAM node"}});
- true ->
+ case {ordsets:del_element(Node,
+ running_nodes(all_clustered_nodes())),
+ is_disc_node(), RemoveWhenOffline}
+ of
+ {[], true, true} ->
+ %% Note that while we check if the nodes was the last to go
+ %% down, apart from the node we're removing from, this is
+ %% still unsafe.
+ %% Consider the situation in which A and B are clustered. A
+ %% goes down, and records B as the running node. Then B gets
+ %% clustered with C, C goes down and B goes down. In this
+ %% case, C is the second-to-last, but we don't know that and
+ %% we'll remove B from A anyway, even if that will lead to
+ %% bad things.
+ case ordsets:subtract(running_clustered_nodes(),
+ ordsets:from_list([node(), Node]))
+ of
+ [] ->
start_mnesia(),
try
[mnesia:force_load_table(T) ||
T <- rabbit_mnesia:table_names()],
- remove_node(Node),
+ remove_node(Node, false),
ensure_mnesia_running()
after
stop_mnesia()
- end
- end
+ end;
+ _ ->
+ throw({error,
+ {not_last_node_to_go_down,
+ "The node you're trying to remove was not "
+ "the last to go down (excluding the node "
+ "you are removing). Please use the the "
+ "last node to go down to remove nodes when "
+ "the cluster is offline."}})
+ end;
+ {_, _, false} ->
+ throw({error,
+ {offline_node_no_offline_flag,
+ "You are trying to remove a node from an offline "
+ "node. That's dangerous, but can be done with the "
+ "--offline flag. Please consult the manual for "
+ "rabbitmqctl for more informations."}});
+ {_, _, _} ->
+ throw({error,
+ {removing_node_from_offline_node,
+ "To remove a node remotely from an offline node, "
+ "the node you're removing from must be a disc node "
+ "and all the other nodes must be offline."}})
end;
- {error, Reason} ->
- throw({error, Reason})
+ Err = {error, _} ->
+ throw(Err)
end.
%%----------------------------------------------------------------------------
@@ -1007,18 +1045,17 @@ remove_node_if_mnesia_running(Node) ->
end.
leave_cluster() ->
- remove_node_remotely(node()).
-
-remove_node_remotely(Removee) ->
- case running_clustered_nodes() -- [Removee] of
- [] ->
- {error, no_running_cluster_nodes};
- RunningNodes ->
+ case {is_clustered(),
+ running_nodes(ordsets:del_element(node(), all_clustered_nodes()))}
+ of
+ {false, []} ->
+ ok;
+ {_, AllNodes} ->
case lists:any(
fun (Node) ->
case rpc:call(Node, rabbit_mnesia,
remove_node_if_mnesia_running,
- [Removee])
+ [node()])
of
ok ->
true;
@@ -1030,10 +1067,13 @@ remove_node_remotely(Removee) ->
false
end
end,
- RunningNodes)
+ AllNodes)
of
true -> ok;
- false -> {error, no_running_cluster_nodes}
+ false -> throw({error,
+ {no_running_cluster_nodes,
+ "You cannot leave a cluster if no online "
+ "nodes are present"}})
end
end.