diff options
| author | Francesco Mazzoli <francesco@rabbitmq.com> | 2012-05-14 16:57:28 +0100 |
|---|---|---|
| committer | Francesco Mazzoli <francesco@rabbitmq.com> | 2012-05-14 16:57:28 +0100 |
| commit | da995ecd1dcf6ba8c56b5a2da6ecb185f1a6c456 (patch) | |
| tree | 13a4b4be52f132112e0fdd462c22f1a14e330973 /src | |
| parent | 730212794c1629a2425aebb83a31992411e190cb (diff) | |
| download | rabbitmq-server-git-da995ecd1dcf6ba8c56b5a2da6ecb185f1a6c456.tar.gz | |
change function to find out about cluster nodes
See the comment above `check_mnesia_running/1' in `rabbit_mnesia'. In short,
the functions to find out about all/running nodes in the cluster do not
work if mnesia is down and the node is a ram node, while we assumed
that they would always work. Moreover, the functions to get the cluster
disc nodes require mnesia to be up.
Diffstat (limited to 'src')
| -rw-r--r-- | src/rabbit.erl | 3 | ||||
| -rw-r--r-- | src/rabbit_control.erl | 8 | ||||
| -rw-r--r-- | src/rabbit_mnesia.erl | 184 | ||||
| -rw-r--r-- | src/rabbit_upgrade.erl | 10 |
4 files changed, 150 insertions, 55 deletions
diff --git a/src/rabbit.erl b/src/rabbit.erl index ea9731b68b..bff7af97d2 100644 --- a/src/rabbit.erl +++ b/src/rabbit.erl @@ -513,8 +513,7 @@ boot_step_error({error, {timeout_waiting_for_tables, _}}, _Stacktrace) -> case rabbit_mnesia:read_previously_running_nodes() of [] -> {"Timeout contacting cluster nodes. Since RabbitMQ was" " shut down forcefully~nit cannot determine which nodes" - " are timing out. Details on all nodes will~nfollow.~n", - rabbit_mnesia:all_clustered_nodes() -- [node()]}; + " are timing out.~n"}; Ns -> {rabbit_misc:format( "Timeout contacting cluster nodes: ~p.~n", [Ns]), Ns} diff --git a/src/rabbit_control.erl b/src/rabbit_control.erl index 357073c10a..dc51b76441 100644 --- a/src/rabbit_control.erl +++ b/src/rabbit_control.erl @@ -379,8 +379,14 @@ action(list_parameters, Node, Args = [], _Opts, Inform) -> action(report, Node, _Args, _Opts, Inform) -> io:format("Reporting server status on ~p~n~n", [erlang:universaltime()]), + Nodes = + case unsafe_rpc(Node, rabbit_mnesia, running_clustered_nodes_safe, []) + of + {ok, Res} -> Res; + {error, Reason} -> throw({error, Reason}) + end, [begin ok = action(Action, N, [], [], Inform), io:nl() end || - N <- unsafe_rpc(Node, rabbit_mnesia, running_clustered_nodes, []), + N <- Nodes, Action <- [status, cluster_status, environment]], VHosts = unsafe_rpc(Node, rabbit_vhost, list, []), [print_report(Node, Q) || Q <- ?GLOBAL_QUERIES], diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index 78529d3be6..324a6df4bd 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -19,7 +19,8 @@ -export([ensure_mnesia_dir/0, dir/0, status/0, init/0, is_db_empty/0, join_cluster/1, reset/0, force_reset/0, init_db/3, is_clustered/0, - running_clustered_nodes/0, all_clustered_nodes/0, + running_clustered_nodes/0, running_clustered_nodes_safe/0, + all_clustered_nodes/0, all_clustered_nodes_safe/0, empty_ram_only_tables/0, copy_db/1, wait_for_tables/1, create_cluster_nodes_config/1, 
read_cluster_nodes_config/0, record_running_nodes/0, read_previously_running_nodes/0, @@ -28,6 +29,9 @@ -export([table_names/0]). +%% Used internally in rpc calls, see `discover_nodes/1' +-export([all_clustered_and_disc_nodes/0]). + %% create_tables/0 exported for helping embed RabbitMQ in or alongside %% other mnesia-using Erlang applications, such as ejabberd -export([create_tables/0]). @@ -49,12 +53,15 @@ -spec(init/0 :: () -> 'ok'). -spec(init_db/3 :: (node_config(), boolean(), boolean()) -> 'ok'). -spec(is_db_empty/0 :: () -> boolean()). --spec(join_cluster/1 :: ([node()]) -> 'ok'). +-spec(join_cluster/1 :: ({[node()], boolean()}) -> 'ok'). -spec(reset/0 :: () -> 'ok'). -spec(force_reset/0 :: () -> 'ok'). -spec(is_clustered/0 :: () -> boolean()). -spec(running_clustered_nodes/0 :: () -> [node()]). -spec(all_clustered_nodes/0 :: () -> [node()]). +-spec(running_clustered_nodes_safe/0 :: () -> {'ok', [node()]} | + {'error', any()}). +-spec(all_clustered_nodes_safe/0 :: () -> {'ok', [node()]} | {'error', any()}). -spec(empty_ram_only_tables/0 :: () -> 'ok'). -spec(create_tables/0 :: () -> 'ok'). -spec(copy_db/1 :: (file:filename()) -> rabbit_types:ok_or_error(any())). 
@@ -76,6 +83,10 @@ %%---------------------------------------------------------------------------- status() -> + RamNode = {error, + {stopped_ram_node, + "This is ram node which is not running, and thus " + "information about the cluster can't be retrieved."}}, [{nodes, case mnesia:system_info(is_running) of yes -> [{Key, Nodes} || {Key, CopyType} <- [{disc_only, disc_only_copies}, @@ -85,13 +96,14 @@ status() -> Nodes = nodes_of_type(CopyType), Nodes =/= [] end]; - no -> case all_clustered_nodes() of - [] -> []; - Nodes -> [{unknown, Nodes}] + no -> case all_clustered_nodes_safe() of + {ok, Nodes} -> [{unknown, Nodes}]; + {error, _Reason} -> exit(RamNode) end; Reason when Reason =:= starting; Reason =:= stopping -> exit({rabbit_busy, try_again_later}) end}, + %% If we reached this point running_clustered_nodes() is safe {running_nodes, running_clustered_nodes()}]. init() -> @@ -116,10 +128,7 @@ is_db_empty() -> %% node. If Force is false, only connections to online nodes are %% allowed. 
join_cluster({DiscoveryNodes, DiscNode}) -> - ensure_mnesia_not_running(), - ensure_mnesia_dir(), - - case is_only_disc_node(node(), false) andalso not DiscNode of + case is_disc_and_clustered() andalso is_only_disc_node(node(), false) of true -> throw({error, {standalone_ram_node, "You can't cluster a node if it's the only " @@ -127,11 +136,14 @@ join_cluster({DiscoveryNodes, DiscNode}) -> _ -> ok end, + ensure_mnesia_not_running(), + ensure_mnesia_dir(), + ProperDiscoveryNodes = DiscoveryNodes -- [node()], - ClusterNodes = case discover_cluster(ProperDiscoveryNodes) of - {ok, ClusterNodes0} -> ClusterNodes0; - {error, Reason} -> throw({error, Reason}) - end, + {ClusterNodes, DiscNodes} = case discover_cluster(ProperDiscoveryNodes) of + {ok, Res} -> Res; + {error, Reason} -> throw({error, Reason}) + end, case lists:member(node(), ClusterNodes) of true -> throw({error, {already_clustered, @@ -148,7 +160,7 @@ join_cluster({DiscoveryNodes, DiscNode}) -> rabbit_misc:local_info_msg("Clustering with ~p~s~n", [ClusterNodes]), - Config = {ClusterNodes, DiscNode}, + Config = {DiscNodes, DiscNode}, %% Join the cluster start_mnesia(), @@ -167,20 +179,85 @@ join_cluster({DiscoveryNodes, DiscNode}) -> reset() -> reset(false). force_reset() -> reset(true). +%% This function will fail if mnesia is not running and the node is a ram node. is_clustered() -> RunningNodes = running_clustered_nodes(), [node()] /= RunningNodes andalso [] /= RunningNodes. +%% This function exists since we often want to check if the node is clustered +%% only if the node is a disc node as well, and so we can call `is_clustered/0' +%% safely. +is_disc_and_clustered() -> + is_disc_node() andalso is_clustered(). + +%% The situations with functions that retrieve the nodes in the cluster is +%% messy. +%% +%% * If we want to get all the nodes or the running nodes, we can do that +%% while mnesia is offline *if* the node is a disc node. If the node is ram, +%% the result will always be [node()]. 
+%% `all_clustered_nodes/0' and `running_clustered_nodes/0' will fail if +%% these conditions are not met. `all_clustered_nodes_safe/0' and +%% `running_clustered_nodes_safe/0' won't, but can return an error. +%% +%% * If we want to get the cluster disc nodes (running or not), we need to +%% start mnesia in any case. All the functions related to disc nodes are +%% "safe", in the sense that they should never crash and return either the +%% nodes or an error (much like the _safe function for all the nodes). + +check_mnesia_running(Fun) -> + case mnesia:system_info(is_running) of + yes -> {ok, Fun()}; + no -> {error, {mnesia_not_running, node()}} + end. + +check_disc_or_mnesia_running(Fun) -> + case is_disc_node() of + true -> + {ok, Fun()}; + false -> + case check_mnesia_running(Fun) of + {ok, Res} -> {ok, Res}; + {error, _Reason} -> {error, + {mnesia_not_running, + "Mnesia is not running and this node is " + "a ram node", node()}} + end + end. + +all_clustered_nodes_safe() -> + check_disc_or_mnesia_running(fun () -> mnesia:system_info(db_nodes) end). + all_clustered_nodes() -> - mnesia:system_info(db_nodes). + {ok, Nodes} = all_clustered_nodes_safe(), + Nodes. + +all_clustered_disc_nodes() -> + check_mnesia_running(fun () -> nodes_of_type(disc_copies) end). + +running_clustered_nodes_safe() -> + check_disc_or_mnesia_running( + fun () -> mnesia:system_info(running_db_nodes) end). running_clustered_nodes() -> - mnesia:system_info(running_db_nodes). + {ok, Nodes} = running_clustered_nodes_safe(), + Nodes. running_clustered_disc_nodes() -> - RunningSet = sets:from_list(running_clustered_nodes()), - DiscSet = sets:from_list(nodes_of_type(disc_copies)), - sets:to_list(sets:intersection(RunningSet, DiscSet)). + check_mnesia_running( + fun () -> + Running = running_clustered_nodes(), + {ok, Disc} = all_clustered_disc_nodes(), + sets:to_list(sets:intersection(sets:from_list(Running), + sets:from_list(Disc))) + end). 
+ +all_clustered_and_disc_nodes() -> + check_mnesia_running( + fun () -> + {ok, DiscNodes} = all_clustered_disc_nodes(), + {all_clustered_nodes(), DiscNodes} + end). empty_ram_only_tables() -> Node = node(), @@ -194,11 +271,13 @@ empty_ram_only_tables() -> ok. discover_cluster([]) -> - {error, cannot_discover_cluster}; + {error, {cannot_discover_cluster, + "The cluster nodes provided are either offline or not running."}}; discover_cluster([Node | Nodes]) -> - case rpc:call(Node, rabbit_mnesia, all_clustered_nodes, []) of + case rpc:call(Node, rabbit_mnesia, all_clustered_and_disc_nodes, []) of {badrpc, _Reason} -> discover_cluster(Nodes); - Res -> {ok, Res} + {error, _Reason} -> discover_cluster(Nodes); + {ok, Res} -> {ok, Res} end. %%-------------------------------------------------------------------- @@ -449,7 +528,7 @@ create_cluster_nodes_config(Config) -> read_cluster_nodes_config() -> Convert = fun (Config = {_, _}) -> Config; (ClusterNodes) -> - rabbit_log:warning("reading legacy node config"), + log_both("reading legacy node config"), {ClusterNodes, should_be_disc_node_legacy(ClusterNodes)} end, FileName = cluster_nodes_config_filename(), @@ -728,11 +807,12 @@ wait_for_tables(TableNames) -> end. reset(Force) -> - rabbit_misc:local_info_msg("Resetting Rabbit~s~n", [if Force -> " forcefully"; - true -> "" - end]), + rabbit_misc:local_info_msg("Resetting Rabbit~s~n", + [if Force -> " forcefully"; + true -> "" + end]), ensure_mnesia_not_running(), - case not Force andalso is_clustered() andalso + case not Force andalso is_disc_and_clustered() andalso is_only_disc_node(node(), false) of true -> throw({error, {standalone_ram_node, @@ -742,32 +822,39 @@ reset(Force) -> false -> ok end, Node = node(), - case Force of - true -> - %% mnesia is down, so all_clustered_nodes() will return [node()], so - %% it's useless to try to disconnect from cluster. 
- []; - false -> - ensure_mnesia_dir(), - start_mnesia(), - Nodes = all_clustered_nodes() -- [Node], - RunningNodes = - try - %% Force=true here so that reset still works when - %% clustered with a node which is down - ok = init_db(read_cluster_nodes_config(), true), - running_clustered_nodes() -- [Node] - after - stop_mnesia() - end, - leave_cluster(Nodes, RunningNodes), + MaybeNodes = + case Force of + true -> + all_clustered_nodes_safe(); + false -> + ensure_mnesia_dir(), + start_mnesia(), + Nodes0 = all_clustered_nodes(), + RunningNodes = + try + %% Force=true here so that reset still works when + %% clustered with a node which is down + ok = init_db(read_cluster_nodes_config(), true), + running_clustered_nodes() -- [Node] + after + stop_mnesia() + end, + leave_cluster(Nodes0, RunningNodes), rabbit_misc:ensure_ok(mnesia:delete_schema([Node]), cannot_delete_schema), + {ok, Nodes0} + end, + case MaybeNodes of + {ok, Nodes} -> %% We need to make sure that we don't end up in a distributed Erlang %% system with nodes while not being in an Mnesia cluster with %% them. We don't handle that well. [erlang:disconnect_node(N) || N <- Nodes], - ok = delete_cluster_nodes_config() + ok = delete_cluster_nodes_config(); + {error, _Reason} -> + log_both("Since this ram node is being force reseted, " + "the node hasn't been disconnected from the " + "cluster correctly, bad things might happen.") end, %% remove persisted messages and any other garbage we find ok = rabbit_file:recursive_delete(filelib:wildcard(dir() ++ "/*")), @@ -814,7 +901,8 @@ on_node_down(Node) -> end. 
is_only_disc_node(Node, _MnesiaRunning = true) -> - [Node] =:= running_clustered_disc_nodes(); + {ok, Nodes} = running_clustered_disc_nodes(), + [Node] =:= Nodes; is_only_disc_node(Node, false) -> start_mnesia(), Res = is_only_disc_node(Node, true), diff --git a/src/rabbit_upgrade.erl b/src/rabbit_upgrade.erl index 273129904f..0b3413f115 100644 --- a/src/rabbit_upgrade.erl +++ b/src/rabbit_upgrade.erl @@ -121,10 +121,12 @@ remove_backup() -> info("upgrades: Mnesia backup removed~n", []). maybe_upgrade_mnesia() -> - %% rabbit_mnesia:all_clustered_nodes/0 will return [] at this point - %% if we are a RAM node since Mnesia has not started yet. - {ClusterNodes, _DiscNode} = rabbit_mnesia:read_cluster_nodes_config(), - AllNodes = lists:usort(rabbit_mnesia:all_clustered_nodes() ++ ClusterNodes), + {ClusterNodes1, _DiscNode} = rabbit_mnesia:read_cluster_nodes_config(), + ClusterNodes2 = case rabbit_mnesia:all_clustered_nodes() of + {ok, Res} -> Res; + {error, _Reason} -> [] + end, + AllNodes = lists:usort(ClusterNodes1 ++ ClusterNodes2), case rabbit_version:upgrades_required(mnesia) of {error, starting_from_scratch} -> ok; |
