summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Gerhard Lazu <gerhard@users.noreply.github.com> 2016-12-01 18:01:37 +0000
committer GitHub <noreply@github.com> 2016-12-01 18:01:37 +0000
commit 438bb3b260f9b45a1115e0bb07547ecd079d52a4 (patch)
tree 47c3493ba55322a9cdb9eea879ae95a9b78258f0
parent ad04a73e8ba3157e8b737f1521b1c4946c1d30e6 (diff)
parent c76fcdbe365c304eeef59eae4b9327dea8f4646f (diff)
download rabbitmq-server-git-438bb3b260f9b45a1115e0bb07547ecd079d52a4.tar.gz
Merge pull request #1038 from rabbitmq/rabbitmq-server-1033
Back ports #1022 (#1023) to stable
-rw-r--r-- src/rabbit.app.src 3
-rw-r--r-- src/rabbit.erl 4
-rw-r--r-- src/rabbit_mnesia.erl 30
-rw-r--r-- src/rabbit_mnesia_rename.erl 2
-rw-r--r-- src/rabbit_table.erl 65
-rw-r--r-- test/clustering_management_SUITE.erl 5
6 files changed, 71 insertions, 38 deletions
diff --git a/src/rabbit.app.src b/src/rabbit.app.src
index 9c30876c1d..59c3d4a6b2 100644
--- a/src/rabbit.app.src
+++ b/src/rabbit.app.src
@@ -45,7 +45,8 @@
{server_properties, []},
{collect_statistics, none},
{collect_statistics_interval, 5000},
- {mnesia_table_loading_timeout, 30000},
+ {mnesia_table_loading_retry_timeout, 30000},
+ {mnesia_table_loading_retry_limit, 10},
{auth_mechanisms, ['PLAIN', 'AMQPLAIN']},
{auth_backends, [rabbit_auth_backend_internal]},
{delegate_count, 16},
diff --git a/src/rabbit.erl b/src/rabbit.erl
index ddc41bee0c..e5a6f09c56 100644
--- a/src/rabbit.erl
+++ b/src/rabbit.erl
@@ -1116,9 +1116,9 @@ ensure_working_fhc() ->
end,
TestPid = spawn_link(TestFun),
%% Because we are waiting for the test fun, abuse the
- %% 'mnesia_table_loading_timeout' parameter to find a sane timeout
+ %% 'mnesia_table_loading_retry_timeout' parameter to find a sane timeout
%% value.
- Timeout = rabbit_table:wait_timeout(),
+ Timeout = rabbit_table:retry_timeout(),
receive
fhc_ok -> ok;
{'EXIT', TestPid, Exception} -> throw({ensure_working_fhc, Exception})
diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl
index 596eb62b03..6bb7e093f6 100644
--- a/src/rabbit_mnesia.erl
+++ b/src/rabbit_mnesia.erl
@@ -104,7 +104,7 @@ init() ->
false ->
NodeType = node_type(),
init_db_and_upgrade(cluster_nodes(all), NodeType,
- NodeType =:= ram)
+ NodeType =:= ram, _Retry = true)
end,
%% We intuitively expect the global name server to be synced when
%% Mnesia is up. In fact that's not guaranteed to be the case -
@@ -138,7 +138,7 @@ init_from_config() ->
e(invalid_cluster_nodes_conf)
end,
case TryNodes of
- [] -> init_db_and_upgrade([node()], disc, false);
+ [] -> init_db_and_upgrade([node()], disc, false, _Retry = true);
_ -> auto_cluster(TryNodes, NodeType)
end.
@@ -147,13 +147,13 @@ auto_cluster(TryNodes, NodeType) ->
{ok, Node} ->
rabbit_log:info("Node '~p' selected for auto-clustering~n", [Node]),
{ok, {_, DiscNodes, _}} = discover_cluster0(Node),
- init_db_and_upgrade(DiscNodes, NodeType, true),
+ init_db_and_upgrade(DiscNodes, NodeType, true, _Retry = true),
rabbit_node_monitor:notify_joined_cluster();
none ->
rabbit_log:warning(
"Could not find any node for auto-clustering from: ~p~n"
"Starting blank node...~n", [TryNodes]),
- init_db_and_upgrade([node()], disc, false)
+ init_db_and_upgrade([node()], disc, false, _Retry = true)
end.
%% Make the node join a cluster. The node will be reset automatically
@@ -193,7 +193,7 @@ join_cluster(DiscoveryNode, NodeType) ->
rabbit_log:info("Clustering with ~p as ~p node~n",
[ClusterNodes, NodeType]),
ok = init_db_with_mnesia(ClusterNodes, NodeType,
- true, true),
+ true, true, _Retry = true),
rabbit_node_monitor:notify_joined_cluster(),
ok;
{error, Reason} ->
@@ -232,7 +232,7 @@ reset_gracefully() ->
%% need to check for consistency because we are resetting.
%% Force=true here so that reset still works when clustered with a
%% node which is down.
- init_db_with_mnesia(AllNodes, node_type(), false, false),
+ init_db_with_mnesia(AllNodes, node_type(), false, false, _Retry = false),
case is_only_clustered_disc_node() of
true -> e(resetting_only_disc_node);
false -> ok
@@ -281,7 +281,7 @@ update_cluster_nodes(DiscoveryNode) ->
rabbit_node_monitor:write_cluster_status(Status),
rabbit_log:info("Updating cluster nodes from ~p~n",
[DiscoveryNode]),
- init_db_with_mnesia(AllNodes, node_type(), true, true);
+ init_db_with_mnesia(AllNodes, node_type(), true, true, _Retry = false);
false ->
e(inconsistent_cluster)
end,
@@ -325,7 +325,7 @@ remove_node_offline_node(Node) ->
%% is by force loading the table, and making sure that
%% they are loaded.
rabbit_table:force_load(),
- rabbit_table:wait_for_replicated(),
+ rabbit_table:wait_for_replicated(_Retry = false),
forget_cluster_node(Node, false),
force_load_next_boot()
after
@@ -470,7 +470,7 @@ init_db(ClusterNodes, NodeType, CheckOtherNodes) ->
{[_ | _], _, _} ->
%% Subsequent node in cluster, catch up
maybe_force_load(),
- ok = rabbit_table:wait_for_replicated(),
+ ok = rabbit_table:wait_for_replicated(_Retry = true),
ok = rabbit_table:create_local_copy(NodeType)
end,
ensure_schema_integrity(),
@@ -480,7 +480,7 @@ init_db(ClusterNodes, NodeType, CheckOtherNodes) ->
init_db_unchecked(ClusterNodes, NodeType) ->
init_db(ClusterNodes, NodeType, false).
-init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) ->
+init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes, Retry) ->
ok = init_db(ClusterNodes, NodeType, CheckOtherNodes),
ok = case rabbit_upgrade:maybe_upgrade_local() of
ok -> ok;
@@ -495,14 +495,14 @@ init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) ->
disc -> ok
end,
%% ...and all nodes will need to wait for tables
- rabbit_table:wait_for_replicated(),
+ rabbit_table:wait_for_replicated(Retry),
ok.
init_db_with_mnesia(ClusterNodes, NodeType,
- CheckOtherNodes, CheckConsistency) ->
+ CheckOtherNodes, CheckConsistency, Retry) ->
start_mnesia(CheckConsistency),
try
- init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes)
+ init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes, Retry)
after
stop_mnesia()
end.
@@ -539,7 +539,7 @@ ensure_mnesia_not_running() ->
end.
ensure_schema_integrity() ->
- case rabbit_table:check_schema_integrity() of
+ case rabbit_table:check_schema_integrity(_Retry = true) of
ok ->
ok;
{error, Reason} ->
@@ -670,7 +670,7 @@ discover_cluster0(Node) ->
rpc:call(Node, rabbit_mnesia, cluster_status_from_mnesia, []).
schema_ok_or_move() ->
- case rabbit_table:check_schema_integrity() of
+ case rabbit_table:check_schema_integrity(_Retry = false) of
ok ->
ok;
{error, Reason} ->
diff --git a/src/rabbit_mnesia_rename.erl b/src/rabbit_mnesia_rename.erl
index 0945e31522..6600b2fb2e 100644
--- a/src/rabbit_mnesia_rename.erl
+++ b/src/rabbit_mnesia_rename.erl
@@ -187,7 +187,7 @@ delete_rename_files() -> ok = rabbit_file:recursive_delete([dir()]).
start_mnesia() -> rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia),
rabbit_table:force_load(),
- rabbit_table:wait_for_replicated().
+ rabbit_table:wait_for_replicated(_Retry = false).
stop_mnesia() -> stopped = mnesia:stop().
convert_backup(NodeMap, FromBackup, ToBackup) ->
diff --git a/src/rabbit_table.erl b/src/rabbit_table.erl
index 3909096964..43a744d315 100644
--- a/src/rabbit_table.erl
+++ b/src/rabbit_table.erl
@@ -16,24 +16,25 @@
-module(rabbit_table).
--export([create/0, create_local_copy/1, wait_for_replicated/0, wait/1,
+-export([create/0, create_local_copy/1, wait_for_replicated/1, wait/1,
force_load/0, is_present/0, is_empty/0, needs_default_data/0,
- check_schema_integrity/0, clear_ram_only_tables/0, wait_timeout/0]).
+ check_schema_integrity/1, clear_ram_only_tables/0, retry_timeout/0]).
-include("rabbit.hrl").
%%----------------------------------------------------------------------------
+-type retry() :: boolean().
-spec create() -> 'ok'.
-spec create_local_copy('disc' | 'ram') -> 'ok'.
--spec wait_for_replicated() -> 'ok'.
+-spec wait_for_replicated(retry()) -> 'ok'.
-spec wait([atom()]) -> 'ok'.
--spec wait_timeout() -> non_neg_integer() | infinity.
+-spec retry_timeout() -> non_neg_integer() | infinity. % bare timeout in ms, not a {Timeout, Retries} pair
-spec force_load() -> 'ok'.
-spec is_present() -> boolean().
-spec is_empty() -> boolean().
-spec needs_default_data() -> boolean().
--spec check_schema_integrity() -> rabbit_types:ok_or_error(any()).
+-spec check_schema_integrity(retry()) -> rabbit_types:ok_or_error(any()).
-spec clear_ram_only_tables() -> 'ok'.
%%----------------------------------------------------------------------------
@@ -63,25 +64,53 @@ create_local_copy(ram) ->
create_local_copies(ram),
create_local_copy(schema, ram_copies).
-wait_for_replicated() ->
+wait_for_replicated(Retry) ->
wait([Tab || {Tab, TabDef} <- definitions(),
- not lists:member({local_content, true}, TabDef)]).
+ not lists:member({local_content, true}, TabDef)], Retry).
wait(TableNames) ->
+ wait(TableNames, _Retry = false).
+
+wait(TableNames, Retry) ->
+ {Timeout, Retries} = retry_timeout(Retry),
+ wait(TableNames, Timeout, Retries).
+
+wait(TableNames, Timeout, Retries) ->
%% We might be in ctl here for offline ops, in which case we can't
%% get_env() for the rabbit app.
- Timeout = wait_timeout(),
- case mnesia:wait_for_tables(TableNames, Timeout) of
- ok ->
+ rabbit_log:info("Waiting for Mnesia tables for ~p ms, ~p retries left~n",
+ [Timeout, Retries - 1]),
+ Result = case mnesia:wait_for_tables(TableNames, Timeout) of
+ ok ->
+ ok;
+ {timeout, BadTabs} ->
+ {error, {timeout_waiting_for_tables, BadTabs}};
+ {error, Reason} ->
+ {error, {failed_waiting_for_tables, Reason}}
+ end,
+ case {Retries, Result} of
+ {_, ok} ->
ok;
- {timeout, BadTabs} ->
- throw({error, {timeout_waiting_for_tables, BadTabs}});
- {error, Reason} ->
- throw({error, {failed_waiting_for_tables, Reason}})
+ {1, {error, _} = Error} ->
+ throw(Error);
+ {_, {error, Error}} ->
+ rabbit_log:warning("Error while waiting for Mnesia tables: ~p~n", [Error]),
+ wait(TableNames, Timeout, Retries - 1);
+ _ ->
+ wait(TableNames, Timeout, Retries - 1)
end.
-wait_timeout() ->
- case application:get_env(rabbit, mnesia_table_loading_timeout) of
+retry_timeout(_Retry = false) ->
+ {retry_timeout(), 1};
+retry_timeout(_Retry = true) ->
+ Retries = case application:get_env(rabbit, mnesia_table_loading_retry_limit) of
+ {ok, T} -> T;
+ undefined -> 10
+ end,
+ {retry_timeout(), Retries}.
+
+retry_timeout() ->
+ case application:get_env(rabbit, mnesia_table_loading_retry_timeout) of
{ok, T} -> T;
undefined -> 30000
end.
@@ -98,7 +127,7 @@ is_empty(Names) ->
lists:all(fun (Tab) -> mnesia:dirty_first(Tab) == '$end_of_table' end,
Names).
-check_schema_integrity() ->
+check_schema_integrity(Retry) ->
Tables = mnesia:system_info(tables),
case check(fun (Tab, TabDef) ->
case lists:member(Tab, Tables) of
@@ -106,7 +135,7 @@ check_schema_integrity() ->
true -> check_attributes(Tab, TabDef)
end
end) of
- ok -> ok = wait(names()),
+ ok -> wait(names(), Retry),
check(fun check_content/2);
Other -> Other
end.
diff --git a/test/clustering_management_SUITE.erl b/test/clustering_management_SUITE.erl
index 00ddfa48a2..923b523f00 100644
--- a/test/clustering_management_SUITE.erl
+++ b/test/clustering_management_SUITE.erl
@@ -72,7 +72,10 @@ suite() ->
init_per_suite(Config) ->
rabbit_ct_helpers:log_environment(),
- rabbit_ct_helpers:run_setup_steps(Config).
+ Config1 = rabbit_ct_helpers:merge_app_env(
+ Config,
+ {rabbit, [{mnesia_table_loading_retry_limit, 1}]}),
+ rabbit_ct_helpers:run_setup_steps(Config1).
end_per_suite(Config) ->
rabbit_ct_helpers:run_teardown_steps(Config).